In [None]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Tokenizing words and sentences 

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# tokenizing - word tokenizers ... sentence tokenizers
# lexicon and corporas
# corporea - body of text, ex: medical journals, presidential speeches, English language.
# lexicon - words and their means

In [None]:

text_sample = "Hello Mr. Johnson, how are you doing? The weather is great and python is awesom. The sky is pinkish blue."

print(sent_tokenize(text_sample))
print(word_tokenize(text_sample))

['Hello Mr. Johnson, how are you doing?', 'The weather is great and python is awesom.', 'The sky is pinkish blue.']
['Hello', 'Mr.', 'Johnson', ',', 'how', 'are', 'you', 'doing', '?', 'The', 'weather', 'is', 'great', 'and', 'python', 'is', 'awesom', '.', 'The', 'sky', 'is', 'pinkish', 'blue', '.']


# Stop words

In [None]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
example_sentence = "This is an example showing off stop word filteration."
stop_words = set(stopwords.words("english"))
#print(stop_words)

words = word_tokenize(example_sentence)

filtered_sentence = [w for w in words if w not in stop_words]
print(filtered_sentence)

['This', 'example', 'showing', 'stop', 'word', 'filteration', '.']


# Stemming

In [None]:
# reading, reads ==(stemmed to)==> read

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
ps = PorterStemmer()

example_words = ["developer", "developing", "develops"]

for w in example_words: 
  print(ps.stem(w))

develop
develop
develop


In [None]:
new_text = "You have to be the best developer to develop the apps; while developing take care of ;"

words = word_tokenize(new_text)

post_text = " ".join([ps.stem(w) for w in words])
print(post_text)

you have to be the best develop to develop the app ; while develop take care of ;


# Part of Speech

In [None]:
import nltk
nltk.download('tagsets')
nltk.download('state_union')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.
[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
#help(state_union)
train_text = state_union.raw("/content/sample_data/joe-biden-march-2022")
sample_text = state_union.raw("/content/sample_data/joe-biden-sep-2022")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text) # train 

tokenized = custom_sent_tokenizer.tokenize(sample_text) # 

In [None]:
def process_content():
   try: 
    for i in tokenized:
      words = nltk.word_tokenize(i)
      tagged = nltk.pos_tag(words)
      print(tagged)   
   except Exception as e:
     print(e)

process_content()

[('My', 'PRP$'), ('fellow', 'JJ'), ('Americans', 'NNPS'), (',', ','), ('please', 'VB'), (',', ','), ('if', 'IN'), ('you', 'PRP'), ('have', 'VBP'), ('a', 'DT'), ('seat', 'NN'), (',', ','), ('take', 'VB'), ('it', 'PRP'), ('.', '.')]
[('I', 'PRP'), ('speak', 'VBP'), ('to', 'TO'), ('you', 'PRP'), ('tonight', 'VBN'), ('from', 'IN'), ('sacred', 'VBN'), ('ground', 'NN'), ('in', 'IN'), ('America', 'NNP'), (':', ':'), ('Independence', 'NNP'), ('Hall', 'NNP'), ('in', 'IN'), ('Philadelphia', 'NNP'), (',', ','), ('Pennsylvania', 'NNP'), ('.', '.')]
[('This', 'DT'), ('is', 'VBZ'), ('where', 'WRB'), ('America', 'NNP'), ('made', 'VBD'), ('its', 'PRP$'), ('Declaration', 'NNP'), ('of', 'IN'), ('Independence', 'NNP'), ('to', 'TO'), ('the', 'DT'), ('world', 'NN'), ('more', 'JJR'), ('than', 'IN'), ('two', 'CD'), ('centuries', 'NNS'), ('ago', 'IN'), ('with', 'IN'), ('an', 'DT'), ('idea', 'NN'), (',', ','), ('unique', 'JJ'), ('among', 'IN'), ('nations', 'NNS'), (',', ','), ('that', 'IN'), ('in', 'IN'), ('Am

In [None]:
# TO KNOW THE ENGLISH AVAILABLE PART-OF-SPEECHES
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

# Chunking

In [None]:
import nltk
nltk.download('state_union')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Unzipping corpora/state_union.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# https://www.nltk.org/book_1ed/ch07.html
#help(state_union)
train_text = state_union.raw("/content/sample_data/joe-biden-march-2022")
sample_text = state_union.raw("/content/sample_data/joe-biden-sep-2022")


custom_sent_tokenizer = PunktSentenceTokenizer(train_text) # train 
tokenized = custom_sent_tokenizer.tokenize(sample_text) # 


def process_content():
   try: 
    for i in tokenized:
      words = nltk.word_tokenize(i)
      tagged = nltk.pos_tag(words)

      chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
      chunkParser = nltk.RegexpParser(chunkGram)
      
      chunked = chunkParser.parse(tagged)
      #print(chunked)
   except Exception as e:
     print(e)

process_content()

# Chinking

In [None]:
# https://www.nltk.org/book_1ed/ch07.html
#help(state_union)
train_text = state_union.raw("/content/sample_data/joe-biden-march-2022")
sample_text = state_union.raw("/content/sample_data/joe-biden-sep-2022")


custom_sent_tokenizer = PunktSentenceTokenizer(train_text) # train 
tokenized = custom_sent_tokenizer.tokenize(sample_text) # 


def process_content():
   try: 
    for i in tokenized:
      words = nltk.word_tokenize(i)
      tagged = nltk.pos_tag(words)

      chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}
                             }<VB.?|IN|DT|TO>+{"""
      chunkParser = nltk.RegexpParser(chunkGram)
      
      chunked = chunkParser.parse(tagged)
      #print(chunked)
   except Exception as e:
     print(e)

process_content()

NameError: ignored

# Named Entitiy Recogantion 

In [None]:
import nltk
nltk.download('state_union')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')

from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
# https://www.nltk.org/book/ch07.html#named-entity-recognition

"""
Commonly Used Types of Named Entity:

ORGANIZATION	Georgia-Pacific Corp., WHO
PERSON	Eddy Bonte, President Obama
LOCATION	Murray River, Mount Everest
DATE	June, 2008-06-29
TIME	two fifty a m, 1:30 p.m.
MONEY	175 million Canadian Dollars, GBP 10.40
PERCENT	twenty pct, 18.75 %
FACILITY	Washington Monument, Stonehenge
GPE	South East Asia, Midlothian
"""

'\nNE Type:\nOrgnaization\nPerson\nLocation\nDate\nTime \nMoney\nPrecent\nFacility\nGPE\n'

In [None]:
# https://www.nltk.org/book_1ed/ch07.html
#help(state_union)
train_text = state_union.raw("/content/biden-may-2022.txt")
sample_text = state_union.raw("/content/biden-sep-2022.txt")


custom_sent_tokenizer = PunktSentenceTokenizer(train_text) # train 
tokenized = custom_sent_tokenizer.tokenize(sample_text) # 


def process_content():
   try: 
    for i in tokenized:
      words = nltk.word_tokenize(i)
      tagged = nltk.pos_tag(words)

      namedEnt = nltk.ne_chunk(tagged) # ,binary=True

      print(namedEnt)
   except Exception as e:
     print(e)

process_content()

# Lemmatizing

Lemmatization is the process of grouping together the different inflected forms of a word so they can be analyzed as a single item.

In [None]:
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:

lemmatizer = WordNetLemmatizer()

# default pos='n' = noun
# other options such, a=adj

print(lemmatizer.lemmatize('better'))
print(lemmatizer.lemmatize('better', pos="a"))
print(lemmatizer.lemmatize('best', pos="a"), '\n')
print(lemmatizer.lemmatize("corpora"))


print(lemmatizer.lemmatize('run'))
print(lemmatizer.lemmatize('run', pos="a"))

better
good
best 

corpus
run
run


# NLTK Corpora

In [None]:
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

import nltk
nltk.download('gutenberg')

print("nltk path:: ", nltk.__file__)

path::  /usr/local/lib/python3.7/dist-packages/nltk/__init__.py


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [None]:
sample = gutenberg.raw('melville-moby_dick.txt')

tok = sent_tokenize(sample)

print(tok[1:5])

['(Supplied by a Late Consumptive Usher to a Grammar School)\r\n\r\nThe pale Usher--threadbare in coat, heart, body, and brain; I see him\r\nnow.', 'He was ever dusting his old lexicons and grammars, with a queer\r\nhandkerchief, mockingly embellished with all the gay flags of all the\r\nknown nations of the world.', 'He loved to dust his old grammars; it\r\nsomehow mildly reminded him of his mortality.', '"While you take in hand to school others, and to teach them by what\r\nname a whale-fish is to be called in our tongue leaving out, through\r\nignorance, the letter H, which almost alone maketh the signification\r\nof the word, you deliver that which is not true."']


# WordNet

In [None]:
from nltk.corpus import wordnet

In [None]:
keyword = 'good'

syns = wordnet.synsets(keyword)

# sysnsel
print(syns[0].name)

#just the word
print(syns[0].lemmas()[0].name())

#defination
print(syns[0].definition())

#examples
print(syns[0].examples())


# synonyms and antonyms
synonyms = []
antonyms = []

for syn in wordnet.synsets(keyword):
  for l in syn.lemmas():
    synonyms.append(l.name())
    if l.antonyms():
      antonyms.append(l.antonyms()[0].name())

print(set(synonyms))
print(set(antonyms))


# check word similarity
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')

print(w1.wup_similarity(w2))


w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('cat.n.01')

print(w1.wup_similarity(w2))


w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('car.n.01')

print(w1.wup_similarity(w2))

<bound method Synset.name of Synset('good.n.01')>
good
benefit
['for your own good', "what's the good of worrying?"]
{'goodness', 'respectable', 'right', 'serious', 'unspoiled', 'sound', 'salutary', 'in_effect', 'soundly', 'trade_good', 'expert', 'well', 'just', 'honest', 'near', 'upright', 'proficient', 'good', 'effective', 'commodity', 'full', 'unspoilt', 'secure', 'estimable', 'dear', 'safe', 'undecomposed', 'in_force', 'honorable', 'ripe', 'practiced', 'adept', 'skilful', 'thoroughly', 'beneficial', 'skillful', 'dependable'}
{'ill', 'evilness', 'badness', 'bad', 'evil'}
0.9090909090909091
0.32
0.6956521739130435


# Text Classifcation


In [None]:
import nltk 
import random
from nltk.corpus import movie_reviews

nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [None]:
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

#print(documents[1])

all_words = []
for w in movie_reviews.words():
  all_words.append(w.lower())


all_words = nltk.FreqDist(all_words)

#print(all_words.most_common(15))
#print(all_words['stupid'])


In [None]:
# words as features for learning

word_features = list(all_words.keys())[:3000]

def find_features(document):
  words = set(document)
  features = {}
  for w in word_features:
    features[w] = w in words
  return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

featuresets = [(find_features(rev), category) for (rev, category) in documents]

#print(featuresets)


# NAIVE BAYES
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bays Algo accuracy: ", (nltk.classify.accuracy(classifier, testing_set))*100)
print(classifier.show_most_informative_features(15))

Naive Bays Algo accuracy:  81.0
Most Informative Features
                   sucks = True              neg : pos    =     10.4 : 1.0
                 frances = True              pos : neg    =      9.1 : 1.0
                  annual = True              pos : neg    =      8.5 : 1.0
           unimaginative = True              neg : pos    =      8.2 : 1.0
             silverstone = True              neg : pos    =      7.6 : 1.0
               atrocious = True              neg : pos    =      6.9 : 1.0
                    mena = True              neg : pos    =      6.9 : 1.0
              schumacher = True              neg : pos    =      6.9 : 1.0
                  suvari = True              neg : pos    =      6.9 : 1.0
                 idiotic = True              neg : pos    =      6.9 : 1.0
                  regard = True              pos : neg    =      6.7 : 1.0
                 cunning = True              pos : neg    =      6.4 : 1.0
              metropolis = True           

In [None]:
import pickle

# export model using pickle

pickle_file = 'naivebayes.pickle'
# save_classifier = open(pickle_file,'wb')
#pickle.dump(classifier, save_classifier)
#save_classifier.close()

# import model from pickle
#classifier_f = open(pickle_file)
#classifier = pickle.load(classifier_f)
#classifier_f.close()

## Scikit-learn incorporation

In [None]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [None]:
from nltk.classify.scikitlearn import SklearnClassifier

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent: ", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

#GNB_classifier = SklearnClassifier(GaussianNB())
#GNB_classifier.train(training_set)
#print("GNB_classifier accuracy percent: ", (nltk.classify.accuracy(GNB_classifier, testing_set))*100)

BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BNB_classifier accuracy percent: ", (nltk.classify.accuracy(BNB_classifier, testing_set))*100)


#SVM 

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent: ", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)


LSVC_classifier = SklearnClassifier(LinearSVC())
LSVC_classifier.train(training_set)
print("LSVC_classifier accuracy percent: ", (nltk.classify.accuracy(LSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent: ", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

# 
LR_classifier = SklearnClassifier(LogisticRegression())
LR_classifier.train(training_set)
print("LR_classifier accuracy percent: ", (nltk.classify.accuracy(LR_classifier, testing_set))*100)

SGD_classifier = SklearnClassifier(SGDClassifier())
SGD_classifier.train(training_set)
print("SGD_classifier accuracy percent: ", (nltk.classify.accuracy(SGD_classifier, testing_set))*100)



MNB_classifier accuracy percent:  85.0
BNB_classifier accuracy percent:  82.0
SVC_classifier accuracy percent:  86.0
LSVC_classifier accuracy percent:  81.0
NuSVC_classifier accuracy percent:  84.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LR_classifier accuracy percent:  82.0
SGD_classifier accuracy percent:  80.0


In [None]:
# Combining Algos with a Vote

from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf


voted_classifier = VoteClassifier(#classifier,
                                  MNB_classifier, BNB_classifier, 
                                  SVC_classifier, LSVC_classifier , NuSVC_classifier,
                                  LR_classifier, SGD_classifier)
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

print("Classification: ", voted_classifier.classify(testing_set[0][0]), 
      "Confidence %:", voted_classifier.confidence(testing_set[0][0]))

voted_classifier accuracy percent: 84.0
Classification:  pos Confidence %: 1.0
