In [None]:
import nltk

## Tokenizing
1. Word Tokenizer
2. Sentence Tokenizer

-----------------------------------------------------------------
Corpora  - a body of text (Ex: medical journals, speech)

Lexicon - a dictionary (words and their meaning
Ex: Bull for investor is someone who is positive about the market whereas in general speaking, it refers to an animal

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
example_text = "Hello Mr. Ram, how are you doing today? The weather is great and Python is awesome. The sky is orangish-blue. You should not eat Cardboard."

In [None]:
print(sent_tokenize(example_text))

['Hello Mr. Ram, how are you doing today?', 'The weather is great and Python is awesome.', 'The sky is orangish-blue.', 'You should not eat Cardboard.']


In [None]:
print(word_tokenize(example_text))

['Hello', 'Mr.', 'Ram', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'orangish-blue', '.', 'You', 'should', 'not', 'eat', 'Cardboard', '.']


In [None]:
for i in word_tokenize(example_text):
    print(i)

Hello
Mr.
Ram
,
how
are
you
doing
today
?
The
weather
is
great
and
Python
is
awesome
.
The
sky
is
orangish-blue
.
You
should
not
eat
Cardboard
.


## Stop Words

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
ex_sen = "The Natural Language Toolkit, or more commonly NLTK, is a suite of libraries and programs for symbolic and statistical natural language processing (NLP) for English written in the Python programming language. It was developed by Steven Bird and Edward Loper in the Department of Computer and Information Science at the University of Pennsylvania.[4] NLTK includes graphical demonstrations and sample data. It is accompanied by a book that explains the underlying concepts behind the language processing tasks supported by the toolkit,[5] plus a cookbook."

In [None]:
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [None]:
words = word_tokenize(ex_sen)

filtered_sentence = []

for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)
        
print(filtered_sentence)

# words

['The', 'Natural', 'Language', 'Toolkit', ',', 'commonly', 'NLTK', ',', 'suite', 'libraries', 'programs', 'symbolic', 'statistical', 'natural', 'language', 'processing', '(', 'NLP', ')', 'English', 'written', 'Python', 'programming', 'language', '.', 'It', 'developed', 'Steven', 'Bird', 'Edward', 'Loper', 'Department', 'Computer', 'Information', 'Science', 'University', 'Pennsylvania', '.', '[', '4', ']', 'NLTK', 'includes', 'graphical', 'demonstrations', 'sample', 'data', '.', 'It', 'accompanied', 'book', 'explains', 'underlying', 'concepts', 'behind', 'language', 'processing', 'tasks', 'supported', 'toolkit', ',', '[', '5', ']', 'plus', 'cookbook', '.']


In [None]:
# above code in 1 line

filter_senetence = [w for w in words if not w in stop_words]
print(filter_senetence)

['The', 'Natural', 'Language', 'Toolkit', ',', 'commonly', 'NLTK', ',', 'suite', 'libraries', 'programs', 'symbolic', 'statistical', 'natural', 'language', 'processing', '(', 'NLP', ')', 'English', 'written', 'Python', 'programming', 'language', '.', 'It', 'developed', 'Steven', 'Bird', 'Edward', 'Loper', 'Department', 'Computer', 'Information', 'Science', 'University', 'Pennsylvania', '.', '[', '4', ']', 'NLTK', 'includes', 'graphical', 'demonstrations', 'sample', 'data', '.', 'It', 'accompanied', 'book', 'explains', 'underlying', 'concepts', 'behind', 'language', 'processing', 'tasks', 'supported', 'toolkit', ',', '[', '5', ']', 'plus', 'cookbook', '.']


## Stemming

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
ps = PorterStemmer()

In [None]:
ex_words = ["walk", "walker", "walking", "walked", "walks"]

In [None]:
for w in ex_words:
    print(ps.stem(w))

walk
walker
walk
walk
walk


In [None]:
new_text = "NLTK is intended to support research and teaching in NLP or closely related areas, including empirical linguistics, cognitive science, artificial intelligence, information retrieval, and machine learning.[7] NLTK has been used successfully as a teaching tool, as an individual study tool, and as a platform for prototyping and building research systems. There are 32 universities in the US and 25 countries using NLTK in their courses. NLTK supports classification, tokenization, stemming, tagging, parsing, and semantic reasoning functionalities.[8]"

In [None]:
words = word_tokenize(new_text)

for w in words:
    print(ps.stem(w))

nltk
is
intend
to
support
research
and
teach
in
nlp
or
close
relat
area
,
includ
empir
linguist
,
cognit
scienc
,
artifici
intellig
,
inform
retriev
,
and
machin
learn
.
[
7
]
nltk
ha
been
use
success
as
a
teach
tool
,
as
an
individu
studi
tool
,
and
as
a
platform
for
prototyp
and
build
research
system
.
there
are
32
univers
in
the
us
and
25
countri
use
nltk
in
their
cours
.
nltk
support
classif
,
token
,
stem
,
tag
,
pars
,
and
semant
reason
function
.
[
8
]


## Parts of Speech Tagging

In [None]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer  # this is a unsupervised machine learning tokenizer

nltk.download('state_union')

[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Unzipping corpora/state_union.zip.


True

In [None]:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

In [None]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [None]:
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [None]:
# craeting a function to generate tuples having words and POS tags
nltk.download('averaged_perceptron_tagger')

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            print(tagged)
            
    except Exception as e:
        print(str(e))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# run below function to create tuples for each words with its POS tag
process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

In [None]:
#### POS tag list:
'''
- CC coordinating conjunction
- CD cardinal digit
- DT determiner
- EX existential there (like: "there is" ... think of it like "there exists")
- FW foreign word
- IN preposition/subordinating conjunction
- JJ adjective 'big'
- JJR adjective, comparative 'bigger'
- JJS adjective, superlative 'biggest'
- LS list marker 1)
- MD modal could, will
- NN noun, singular 'desk'
- NNS noun plural 'desks'
- NNP proper noun, singular 'Harrison'
- NNPS proper noun, plural 'Americans'
- PDT predeterminer 'all the kids'
- POS possessive ending parent's
- PRP personal pronoun I, he, she
- PRP$ possessive pronoun my, his, hers
- RB adverb very, silently,
- RBR adverb, comparative better
- RBS adverb, superlative best
- RP particle give up
- TO to go 'to' the store.
- UH interjection errrrrrrrm
- VB verb, base form take
- VBD verb, past tense took
- VBG verb, gerund/present participle taking
- VBN verb, past participle taken
- VBP verb, sing. present, non-3d take
- VBZ verb, 3rd person sing. present takes
- WDT wh-determiner which
- WP wh-pronoun who, what
- WP$ possessive wh-pronoun whose
- WRB wh-abverb where, when
'''

'\n- CC coordinating conjunction\n- CD cardinal digit\n- DT determiner\n- EX existential there (like: "there is" ... think of it like "there exists")\n- FW foreign word\n- IN preposition/subordinating conjunction\n- JJ adjective \'big\'\n- JJR adjective, comparative \'bigger\'\n- JJS adjective, superlative \'biggest\'\n- LS list marker 1)\n- MD modal could, will\n- NN noun, singular \'desk\'\n- NNS noun plural \'desks\'\n- NNP proper noun, singular \'Harrison\'\n- NNPS proper noun, plural \'Americans\'\n- PDT predeterminer \'all the kids\'\n- POS possessive ending parent\'s\n- PRP personal pronoun I, he, she\n- PRP$ possessive pronoun my, his, hers\n- RB adverb very, silently,\n- RBR adverb, comparative better\n- RBS adverb, superlative best\n- RP particle give up\n- TO to go \'to\' the store.\n- UH interjection errrrrrrrm\n- VB verb, base form take\n- VBD verb, past tense took\n- VBG verb, gerund/present participle taking\n- VBN verb, past participle taken\n- VBP verb, sing. present, 

## Chunking

In [None]:
import re

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for item in tokenized:
            words = nltk.word_tokenize(item)
            tagged = nltk.pos_tag(words)

            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            print(chunked)
            chunked.draw()
            
    except Exception as e:
        print(str(e))

In [None]:
process_content()

(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
no display name and no $DISPLAY environment variable


## Chinking

similar to chunking where chunking is done with a condition of exception

i.e. chunk these expect for some

In [None]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer  # this is a unsupervised machine learning tokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized =  custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT>+{"""
            
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            chunked.draw()
            
    except Exception as e:
        print(str(e))

In [None]:
process_content()

no display name and no $DISPLAY environment variable


## Named Entity Recognition

In [None]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer  # this is a unsupervised machine learning tokenizer

nltk.download('maxent_ne_chunker')
nltk.download('words')

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized =  custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized[4:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            namedEnt = nltk.ne_chunk(tagged)
            namedEnt.draw()
            
    except Exception as e:
        print(str(e))

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
process_content()

no display name and no $DISPLAY environment variable


In [None]:
# NE Type and Examples
'''
ORGANIZATION - Georgia-Pacific Corp., WHO
PERSON - Eddy Bonte, President Obama
LOCATION - Murray River, Mount Everest
DATE - June, 2008-06-29
TIME - two fifty a m, 1:30 p.m.
MONEY - 175 million Canadian Dollars, GBP 10.40
PERCENT - twenty pct, 18.75 %
FACILITY - Washington Monument, Stonehenge
GPE - South East Asia, Midlothian
'''

'\nORGANIZATION - Georgia-Pacific Corp., WHO\nPERSON - Eddy Bonte, President Obama\nLOCATION - Murray River, Mount Everest\nDATE - June, 2008-06-29\nTIME - two fifty a m, 1:30 p.m.\nMONEY - 175 million Canadian Dollars, GBP 10.40\nPERCENT - twenty pct, 18.75 %\nFACILITY - Washington Monument, Stonehenge\nGPE - South East Asia, Midlothian\n'

In [None]:
## If we do not want the classification, we can make binary = True

def process_content():
    try:
        for i in tokenized[4:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            namedEnt = nltk.ne_chunk(tagged, binary=True)
            namedEnt.draw()
            
    except Exception as e:
        print(str(e))

In [None]:
process_content()

no display name and no $DISPLAY environment variable


## Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# default parameter id pos='n' i.e. noun
print(lemmatizer.lemmatize("cats"))

print(lemmatizer.lemmatize("better", pos='a')) # gives base adjective
print(lemmatizer.lemmatize("best", pos='a')) 

print(lemmatizer.lemmatize("ran", pos='v'))  # pos --> verb

cat
good
best
run


## Corpora

This is a collection of datasets in nltk module which we can work with. To navigate go to below link
- C:\Users\user_name\AppData\Roaming\nltk_data\corpora

In [None]:
import nltk

print(nltk.__file__)

/usr/local/lib/python3.9/dist-packages/nltk/__init__.py


In [None]:
nltk.download('gutenberg')

from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

sample = gutenberg.raw("bible-kjv.txt")

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


In [None]:
tok = sent_tokenize(sample)
print(tok[5:15])

['1:5 And God called the light Day, and the darkness he called Night.', 'And the evening and the morning were the first day.', '1:6 And God said, Let there be a firmament in the midst of the waters,\nand let it divide the waters from the waters.', '1:7 And God made the firmament, and divided the waters which were\nunder the firmament from the waters which were above the firmament:\nand it was so.', '1:8 And God called the firmament Heaven.', 'And the evening and the\nmorning were the second day.', '1:9 And God said, Let the waters under the heaven be gathered together\nunto one place, and let the dry land appear: and it was so.', '1:10 And God called the dry land Earth; and the gathering together of\nthe waters called he Seas: and God saw that it was good.', '1:11 And God said, Let the earth bring forth grass, the herb yielding\nseed, and the fruit tree yielding fruit after his kind, whose seed is\nin itself, upon the earth: and it was so.', '1:12 And the earth brought forth grass, and

## WordNet

with this we can look for synonyms, definition, relations, context, etc. of a word

In [5]:
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')

syns = wordnet.synsets('program')
print(syns)

print(syns[5])

print(syns[2].lemmas())

print(syns[2].lemmas()[0].name()) 

[nltk_data] Downloading package wordnet to /root/nltk_data...


[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]
Synset('course_of_study.n.01')
[Lemma('broadcast.n.02.broadcast'), Lemma('broadcast.n.02.program'), Lemma('broadcast.n.02.programme')]
broadcast


In [6]:
print(syns[0].lemmas())
print(syns[0].lemmas()[0].name())
print(syns[0].definition())
print(syns[0].examples())

[Lemma('plan.n.01.plan'), Lemma('plan.n.01.program'), Lemma('plan.n.01.programme')]
plan
a series of steps to be carried out or goals to be accomplished
['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [7]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        # print("l:",l)
        synonyms.append(l.name())
        if l.antonyms():
            # print("ant l:", l)
            antonyms.append(l.antonyms()[0].name())
            
print(set(synonyms))
print(set(antonyms))

{'honorable', 'respectable', 'skillful', 'in_force', 'honest', 'goodness', 'upright', 'commodity', 'just', 'right', 'practiced', 'trade_good', 'near', 'proficient', 'estimable', 'in_effect', 'adept', 'safe', 'beneficial', 'serious', 'dependable', 'well', 'undecomposed', 'full', 'secure', 'unspoiled', 'thoroughly', 'ripe', 'sound', 'good', 'skilful', 'soundly', 'unspoilt', 'salutary', 'expert', 'dear', 'effective'}
{'evil', 'badness', 'bad', 'ill', 'evilness'}


In [8]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")

print(w1.wup_similarity(w2))

0.9090909090909091


In [9]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("car.n.01")

print(w1.wup_similarity(w2))

0.6956521739130435


In [10]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("cat.n.01")

print(w1.wup_similarity(w2))

0.32


## Text Classification

In [11]:
import nltk
import random
from nltk.corpus import movie_reviews

In [13]:
nltk.download('movie_reviews')

documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

# above code is same as
'''
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append(list(movie_reviews.words(fileid)), category)
'''

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


'\nfor category in movie_reviews.categories():\n    for fileid in movie_reviews.fileids(category):\n        documents.append(list(movie_reviews.words(fileid)), category)\n'

In [14]:
random.shuffle(documents)

In [15]:
# print(documents[2])

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

# all_words is a list now. Converting to nltk frequency distribution
all_words = nltk.FreqDist(all_words)
print(all_words.most_common(10))

print("frequency of bad in the list is: ", all_words['bad'])
    

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822)]
frequency of bad in the list is:  1395


## Converting words to features

In [17]:
word_features = list(all_words.keys())[:3000]   # using only top 3000 words

def find_features(documents):
    words = set(documents)                  # no duplicates in set
    features = {}
    for w in word_features:
        features[w] = (w in words)
        
    return features

In [18]:
# print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

features_set = [(find_features(rev), category) for (rev, category) in documents]

## Naive Bayes Algo

In [19]:
training_set = features_set[:1900]
testing_set = features_set[1900:]

In [20]:
# posterior  = prior occurrences * liklihood / evidence

classifier = nltk.NaiveBayesClassifier.train(training_set)

print("Naive Bayes Algo accuracy percentage", (nltk.classify.accuracy(classifier, testing_set))*100)

Naive Bayes Algo accuracy percentage 83.0


In [21]:
classifier.show_most_informative_features(10)

Most Informative Features
                   sucks = True              neg : pos    =      9.7 : 1.0
                bothered = True              neg : pos    =      9.6 : 1.0
                 frances = True              pos : neg    =      9.1 : 1.0
                  annual = True              pos : neg    =      8.4 : 1.0
           unimaginative = True              neg : pos    =      8.3 : 1.0
              uninspired = True              neg : pos    =      8.1 : 1.0
              schumacher = True              neg : pos    =      7.4 : 1.0
                 idiotic = True              neg : pos    =      7.2 : 1.0
                  sexist = True              neg : pos    =      7.0 : 1.0
             silverstone = True              neg : pos    =      7.0 : 1.0


## Saving Classifier with Pickle

In [23]:
import pickle

save_classifier = open("naive_bayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [24]:
classifier_f = open("naive_bayes.pickle", "rb")
classifier1 = pickle.load(classifier_f)
classifier_f.close()

In [25]:
print("Naive Bayes Algo accuracy percentage", (nltk.classify.accuracy(classifier1, testing_set))*100)
classifier1.show_most_informative_features(10)

Naive Bayes Algo accuracy percentage 83.0
Most Informative Features
                   sucks = True              neg : pos    =      9.7 : 1.0
                bothered = True              neg : pos    =      9.6 : 1.0
                 frances = True              pos : neg    =      9.1 : 1.0
                  annual = True              pos : neg    =      8.4 : 1.0
           unimaginative = True              neg : pos    =      8.3 : 1.0
              uninspired = True              neg : pos    =      8.1 : 1.0
              schumacher = True              neg : pos    =      7.4 : 1.0
                 idiotic = True              neg : pos    =      7.2 : 1.0
                  sexist = True              neg : pos    =      7.0 : 1.0
             silverstone = True              neg : pos    =      7.0 : 1.0


## sklearn with nltk

In [26]:
from nltk.classify.scikitlearn import SklearnClassifier  # wrapper to include sklearn algorithms in nltk

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [27]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)

print("MNB Algo accuracy percentage", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)


BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)

print("BNB Algo accuracy percentage", (nltk.classify.accuracy(BNB_classifier, testing_set))*100)


from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC


LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)

print("LogisticRegression Algo accuracy percentage", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)


SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)

print("SGDClassifier Algo accuracy percentage", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)


SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)

print("SVC Algo accuracy percentage", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)


LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)

print("LinearSVC Algo accuracy percentage", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)


NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)

print("NuSVC Algo accuracy percentage", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

MNB Algo accuracy percentage 86.0
BNB Algo accuracy percentage 83.0
LogisticRegression Algo accuracy percentage 79.0
SGDClassifier Algo accuracy percentage 81.0
SVC Algo accuracy percentage 83.0
LinearSVC Algo accuracy percentage 77.0
NuSVC Algo accuracy percentage 83.0


## Voting System for algo

In [28]:
from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    
    def __init__(self, *classifiers):
        self.classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self.classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self.classifiers:
            v = c.classify(features)
            votes.append(v)
        
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        
        return conf
      

In [29]:
voted_classifier = VoteClassifier(NuSVC_classifier,
                                 LinearSVC_classifier,
                                 SVC_classifier,
                                 SGDClassifier_classifier,
                                 LogisticRegression_classifier,
                                 BNB_classifier,
                                 MNB_classifier)

print("Voted Classifier Algo accuracy percentage", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

print('Classifiecation: ', voted_classifier.classify(testing_set[0][0]), 
      'Confidence percentage: ', voted_classifier.confidence(testing_set[0][0]))

Voted Classifier Algo accuracy percentage 82.0
Classifiecation:  pos Confidence percentage:  1.0


In [30]:
print('Classifiecation: ', voted_classifier.classify(testing_set[1][0]), 
      'Confidence percentage: ', voted_classifier.confidence(testing_set[1][0]))

print('Classifiecation: ', voted_classifier.classify(testing_set[2][0]), 
      'Confidence percentage: ', voted_classifier.confidence(testing_set[2][0]))

print('Classifiecation: ', voted_classifier.classify(testing_set[3][0]), 
      'Confidence percentage: ', voted_classifier.confidence(testing_set[3][0]))

print('Classifiecation: ', voted_classifier.classify(testing_set[4][0]), 
      'Confidence percentage: ', voted_classifier.confidence(testing_set[4][0]))

print('Classifiecation: ', voted_classifier.classify(testing_set[5][0]), 
      'Confidence percentage: ', voted_classifier.confidence(testing_set[5][0]))

print('Classifiecation: ', voted_classifier.classify(testing_set[6][0]), 
      'Confidence percentage: ', voted_classifier.confidence(testing_set[6][0]))

Classifiecation:  neg Confidence percentage:  1.0
Classifiecation:  neg Confidence percentage:  1.0
Classifiecation:  pos Confidence percentage:  1.0
Classifiecation:  neg Confidence percentage:  1.0
Classifiecation:  neg Confidence percentage:  0.5714285714285714
Classifiecation:  neg Confidence percentage:  1.0


## Working with new dataset


In [31]:
from nltk.classify.scikitlearn import SklearnClassifier  # wrapper to include sklearn algorithms in nltk
from nltk.classify import ClassifierI

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from statistics import mode

class VoteClassifier(ClassifierI):
    
    def __init__(self, *classifiers):
        self.classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self.classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self.classifiers:
            v = c.classify(features)
            votes.append(v)
        
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        
        return conf


In [32]:
# read the files from github
pos_rev = open("https://raw.githubusercontent.com/Coder9494/NLP/main/Datasets/nltk_nlp_dataset/positive.txt", 'r').read()
neg_rev = open("https://raw.githubusercontent.com/Coder9494/NLP/main/Datasets/nltk_nlp_dataset/negative.txt", 'r').read()

FileNotFoundError: ignored

In [None]:
documents = []

for r in pos_rev.split('\n'):
    documents.append((r, 'pos'))

for r in neg_rev.split('\n'):
    documents.append((r, 'neg'))
    


In [None]:
from nltk.tokenize import word_tokenize
import nltk

all_words = []

pos_rev_words = word_tokenize(pos_rev)
neg_rev_words = word_tokenize(neg_rev)

for w in pos_rev_words:
    all_words.append(w.lower())
    
for w in neg_rev_words:
    all_words.append(w.lower())
    
# all_words is a list now. Converting to nltk frequency distribution
all_words = nltk.FreqDist(all_words)
print(all_words.most_common(10))

In [None]:
word__features = list(all_words.keys())[:5000]   # using only top 5000 words

def find_features(documents):
    words = word_tokenize(documents)                 # no duplicates in set
    features = {}
    for w in word__features:
        features[w] = (w in words)
        
    return features

In [None]:
features_set = [(find_features(rev), category) for (rev, category) in documents]

import random
random.shuffle(features_set)

In [None]:
training_set = features_set[:10000]
testing_set = features_set[10000:]

In [None]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB Algo accuracy percentage", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)


BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BNB Algo accuracy percentage", (nltk.classify.accuracy(BNB_classifier, testing_set))*100)


LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression Algo accuracy percentage", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)


SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier Algo accuracy percentage", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)


SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC Algo accuracy percentage", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)


LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC Algo accuracy percentage", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC Algo accuracy percentage", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

In [None]:
voted_classifier = VoteClassifier(NuSVC_classifier,
                                 LinearSVC_classifier,
                                 SVC_classifier,
                                 SGDClassifier_classifier,
                                 LogisticRegression_classifier,
                                 BNB_classifier,
                                 MNB_classifier)

print("Voted Classifier Algo accuracy percentage", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

print('Classifiecation: ', voted_classifier.classify(testing_set[0][0]), 
      'Confidence percentage: ', voted_classifier.confidence(testing_set[0][0]))