# NLTK - Natural Language Tool Kit

In [1]:
import nltk

In [2]:
#nltk.download()
# if a pop up does not show do that on pycharm!

## Preprocessing the text

### Tokenizing

- Word tokenizers
- Sentence tokenizers
- Corpora: body of the text and language or topic
- Lexicon: investor speak "bull" = positive while for english speaker "bull" = scary animal

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
eg = "Im here working on this NLTK packages. in ten minutes i will have the CSS lesson, what A"

#### Split by sentences.
We can see at the point, capital letter and so on. but if we have as as Mr.Smith? that is not a new sentence

In [4]:
sent_tokenize(eg)

['Im here working on this NLTK packages.',
 'in ten minutes i will have the CSS lesson, what A']

### Stopwords

In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
print(set(stopwords.words("english")))
stop = set(stopwords.words("english"))

{'y', 'been', "didn't", 'themselves', 'you', "haven't", 'between', 'at', 'can', 'such', 'wasn', 'and', 'mustn', 'most', 'they', 'itself', 'how', 'weren', 'herself', 'few', 'd', 'again', "it's", 'doesn', 'all', 'very', 'no', 'on', 'having', 'won', 'my', 'in', 'doing', "weren't", "wouldn't", 'only', 'll', 'aren', 'nor', 'over', 'should', "isn't", 'it', 'up', 'out', 'each', 'is', 'being', 'he', "shan't", 'will', 'was', 'its', 'there', 'shan', 'from', 'were', 'these', 'because', 'hadn', 'own', "she's", 'does', 'o', 'are', 'then', "aren't", 'against', 'didn', 'couldn', 'ours', 're', "you're", 'than', 'with', 'why', 'an', 'below', 'down', 'who', 've', 'too', "couldn't", 'both', "you'll", 'we', 'where', 'had', 'their', 'through', 'as', "that'll", 'during', 'for', 'to', 'after', 'be', 'once', 'when', 'mightn', 'above', 'ma', 'needn', 'or', 'has', 'me', 'hers', "hadn't", 'i', 't', "doesn't", 'while', 'his', 'here', "needn't", 'which', 'not', 'isn', "hasn't", 'more', 'whom', "you'd", 'shouldn', 

To be sure that we are not using word inside the sed we must perform a check one by one:

In [6]:
filtered = [word for word in stop]
filtered[0:3]

['y', 'been', "didn't"]

### Stemming

- take the root stemm of the word! reading read


In [7]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer() # Create the Obj PorterStemmer!
print(type(ps))

<class 'nltk.stem.porter.PorterStemmer'>


In [8]:
example = ["try","trying","tryed","tryly"]
for word in example:
    print(ps.stem(word))

tri
tri
tri
tryli


### Part of Speech tagging

Create a tuple with the word and what is in the prashe!

In [9]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [10]:
train = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

In [11]:
print(train)

PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION
 
February 2, 2005


9:10 P.M. EST 

THE PRESIDENT: Mr. Speaker, Vice President Cheney, members of Congress, fellow citizens: 

As a new Congress gathers, all of us in the elected branches of government share a great privilege: We've been placed in office by the votes of the people we serve. And tonight that is a privilege we share with newly-elected leaders of Afghanistan, the Palestinian Territories, Ukraine, and a free and sovereign Iraq. (Applause.) 

Two weeks ago, I stood on the steps of this Capitol and renewed the commitment of our nation to the guiding ideal of liberty for all. This evening I will set forth policies to advance that ideal at home and around the world. 

Tonight, with a healthy, growing economy, with more Americans going back to work, with our nation an active force for good in the world -- the state of our union is confident and strong. (Applause.) 

Our generati

In [12]:
custom = PunktSentenceTokenizer(train) # we can skip this part
tokenized = custom.tokenize(sample_text)

In [13]:
for i in tokenized:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    print(tagged)

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

[('They', 'PRP'), ('seek', 'VBP'), ('to', 'TO'), ('impose', 'VB'), ('a', 'DT'), ('heartless', 'NN'), ('system', 'NN'), ('of', 'IN'), ('totalitarian', 'JJ'), ('control', 'NN'), ('throughout', 'IN'), ('the', 'DT'), ('Middle', 'NNP'), ('East', 'NNP'), (',', ','), ('and', 'CC'), ('arm', 'NN'), ('themselves', 'PRP'), ('with', 'IN'), ('weapons', 'NNS'), ('of', 'IN'), ('mass', 'NN'), ('murder', 'NN'), ('.', '.')]
[('Their', 'PRP$'), ('aim', 'NN'), ('is', 'VBZ'), ('to', 'TO'), ('seize', 'VB'), ('power', 'NN'), ('in', 'IN'), ('Iraq', 'NNP'), (',', ','), ('and', 'CC'), ('use', 'VB'), ('it', 'PRP'), ('as', 'IN'), ('a', 'DT'), ('safe', 'JJ'), ('haven', 'NN'), ('to', 'TO'), ('launch', 'VB'), ('attacks', 'NNS'), ('against', 'IN'), ('America', 'NNP'), ('and', 'CC'), ('the', 'DT'), ('world', 'NN'), ('.', '.')]
[('Lacking', 'VBG'), ('the', 'DT'), ('military', 'JJ'), ('strength', 'NN'), ('to', 'TO'), ('challenge', 'VB'), ('us', 'PRP'), ('directly', 'RB'), (',', ','), ('the', 'DT'), ('terrorists', 'NNS')

[('Members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('however', 'RB'), ('we', 'PRP'), ('feel', 'VBP'), ('about', 'IN'), ('the', 'DT'), ('decisions', 'NNS'), ('and', 'CC'), ('debates', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('past', 'NN'), (',', ','), ('our', 'PRP$'), ('nation', 'NN'), ('has', 'VBZ'), ('only', 'RB'), ('one', 'CD'), ('option', 'NN'), (':', ':'), ('We', 'PRP'), ('must', 'MD'), ('keep', 'VB'), ('our', 'PRP$'), ('word', 'NN'), (',', ','), ('defeat', 'VB'), ('our', 'PRP$'), ('enemies', 'NNS'), (',', ','), ('and', 'CC'), ('stand', 'VBP'), ('behind', 'IN'), ('the', 'DT'), ('American', 'JJ'), ('military', 'NN'), ('in', 'IN'), ('this', 'DT'), ('vital', 'JJ'), ('mission', 'NN'), ('.', '.')]
[('(', '('), ('Applause', 'NNP'), ('.', '.'), (')', ')')]
[('Laura', 'NNP'), ('Bush', 'NNP'), ('is', 'VBZ'), ('applauded', 'VBN'), ('as', 'IN'), ('she', 'PRP'), ('is', 'VBZ'), ('introduced', 'VBN'), ('Tuesday', 'NNP'), ('evening', 'NN'), (',', ','), ('Jan', 'NNP'), ('.', '.')]
[(

[('Isolationism', 'NNP'), ('would', 'MD'), ('not', 'RB'), ('only', 'RB'), ('tie', 'VB'), ('our', 'PRP$'), ('hands', 'NNS'), ('in', 'IN'), ('fighting', 'VBG'), ('enemies', 'NNS'), (',', ','), ('it', 'PRP'), ('would', 'MD'), ('keep', 'VB'), ('us', 'PRP'), ('from', 'IN'), ('helping', 'VBG'), ('our', 'PRP$'), ('friends', 'NNS'), ('in', 'IN'), ('desperate', 'JJ'), ('need', 'NN'), ('.', '.')]
[('We', 'PRP'), ('show', 'VBP'), ('compassion', 'JJ'), ('abroad', 'RB'), ('because', 'IN'), ('Americans', 'NNPS'), ('believe', 'VBP'), ('in', 'IN'), ('the', 'DT'), ('God-given', 'NNP'), ('dignity', 'NN'), ('and', 'CC'), ('worth', 'NN'), ('of', 'IN'), ('a', 'DT'), ('villager', 'NN'), ('with', 'IN'), ('HIV/AIDS', 'NNP'), (',', ','), ('or', 'CC'), ('an', 'DT'), ('infant', 'NN'), ('with', 'IN'), ('malaria', 'NNS'), (',', ','), ('or', 'CC'), ('a', 'DT'), ('refugee', 'JJ'), ('fleeing', 'NN'), ('genocide', 'NN'), (',', ','), ('or', 'CC'), ('a', 'DT'), ('young', 'JJ'), ('girl', 'NN'), ('sold', 'VBN'), ('into', 

[('Yet', 'RB'), ('the', 'DT'), ('tax', 'NN'), ('relief', 'NN'), ('is', 'VBZ'), ('set', 'VBN'), ('to', 'TO'), ('expire', 'VB'), ('in', 'IN'), ('the', 'DT'), ('next', 'JJ'), ('few', 'JJ'), ('years', 'NNS'), ('.', '.')]
[('If', 'IN'), ('we', 'PRP'), ('do', 'VBP'), ('nothing', 'NN'), (',', ','), ('American', 'NNP'), ('families', 'NNS'), ('will', 'MD'), ('face', 'VB'), ('a', 'DT'), ('massive', 'JJ'), ('tax', 'NN'), ('increase', 'NN'), ('they', 'PRP'), ('do', 'VBP'), ('not', 'RB'), ('expect', 'VB'), ('and', 'CC'), ('will', 'MD'), ('not', 'RB'), ('welcome', 'VB'), ('.', '.')]
[('Because', 'IN'), ('America', 'NNP'), ('needs', 'VBZ'), ('more', 'JJR'), ('than', 'IN'), ('a', 'DT'), ('temporary', 'JJ'), ('expansion', 'NN'), (',', ','), ('we', 'PRP'), ('need', 'VBP'), ('more', 'JJR'), ('than', 'IN'), ('temporary', 'JJ'), ('tax', 'NN'), ('relief', 'NN'), ('.', '.')]
[('I', 'PRP'), ('urge', 'VBP'), ('the', 'DT'), ('Congress', 'NNP'), ('to', 'TO'), ('act', 'VB'), ('responsibly', 'RB'), (',', ','), ('a

[('(', '('), ('Applause', 'NNP'), ('.', '.'), (')', ')')]
[('We', 'PRP'), ('must', 'MD'), ('also', 'RB'), ('change', 'VB'), ('how', 'WRB'), ('we', 'PRP'), ('power', 'NN'), ('our', 'PRP$'), ('automobiles', 'NNS'), ('.', '.')]
[('We', 'PRP'), ('will', 'MD'), ('increase', 'VB'), ('our', 'PRP$'), ('research', 'NN'), ('in', 'IN'), ('better', 'JJR'), ('batteries', 'NNS'), ('for', 'IN'), ('hybrid', 'JJ'), ('and', 'CC'), ('electric', 'JJ'), ('cars', 'NNS'), (',', ','), ('and', 'CC'), ('in', 'IN'), ('pollution-free', 'JJ'), ('cars', 'NNS'), ('that', 'WDT'), ('run', 'VBP'), ('on', 'IN'), ('hydrogen', 'NN'), ('.', '.')]
[('We', 'PRP'), ("'ll", 'MD'), ('also', 'RB'), ('fund', 'VB'), ('additional', 'JJ'), ('research', 'NN'), ('in', 'IN'), ('cutting-edge', 'JJ'), ('methods', 'NNS'), ('of', 'IN'), ('producing', 'VBG'), ('ethanol', 'NN'), (',', ','), ('not', 'RB'), ('just', 'RB'), ('from', 'IN'), ('corn', 'NN'), (',', ','), ('but', 'CC'), ('from', 'IN'), ('wood', 'NN'), ('chips', 'NNS'), ('and', 'CC')

[('Honorable', 'JJ'), ('people', 'NNS'), ('in', 'IN'), ('both', 'DT'), ('parties', 'NNS'), ('are', 'VBP'), ('working', 'VBG'), ('on', 'IN'), ('reforms', 'NNS'), ('to', 'TO'), ('strengthen', 'VB'), ('the', 'DT'), ('ethical', 'JJ'), ('standards', 'NNS'), ('of', 'IN'), ('Washington', 'NNP'), ('--', ':'), ('I', 'PRP'), ('support', 'VBP'), ('your', 'PRP$'), ('efforts', 'NNS'), ('.', '.')]
[('Each', 'DT'), ('of', 'IN'), ('us', 'PRP'), ('has', 'VBZ'), ('made', 'VBN'), ('a', 'DT'), ('pledge', 'NN'), ('to', 'TO'), ('be', 'VB'), ('worthy', 'JJ'), ('of', 'IN'), ('public', 'JJ'), ('responsibility', 'NN'), ('--', ':'), ('and', 'CC'), ('that', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('pledge', 'NN'), ('we', 'PRP'), ('must', 'MD'), ('never', 'RB'), ('forget', 'VB'), (',', ','), ('never', 'RB'), ('dismiss', 'NN'), (',', ','), ('and', 'CC'), ('never', 'RB'), ('betray', 'NN'), ('.', '.')]
[('(', '('), ('Applause', 'NNP'), ('.', '.'), (')', ')')]
[('As', 'IN'), ('we', 'PRP'), ('renew', 'VBP'), ('the', 'DT'), 

- CC coordinating conjunction
- CD cardinal digit
- DT determiner
- EX existential there (like: “there is” … think of it like “there exists”)
- FW foreign word
- IN preposition/subordinating conjunction
- JJ adjective ‘big’
- JJR adjective, comparative ‘bigger’
- JJS adjective, superlative ‘biggest’
- LS list marker 1)
- MD modal could, will
- NN noun, singular ‘desk’
- NNS noun plural ‘desks’
- NNP proper noun, singular ‘Harrison’
- NNPS proper noun, plural ‘Americans’
- PDT predeterminer ‘all the kids’
- POS possessive ending parent’s
- PRP personal pronoun I, he, she
- PRP possessive pronoun my, his, hers
- RB adverb very, silently,
- RBR adverb, comparative better
- RBS adverb, superlative best
- RP particle give up
- TO, to go ‘to’ the store.
- UH interjection, errrrrrrrm
- VB verb, base form take
- VBD verb, past tense, took
- VBG verb, gerund/present participle taking
- VBN verb, past participle is taken
- VBP verb, sing. present, known-3d take
- VBZ verb, 3rd person sing. present takes
- WDT wh-determiner which
- WP wh-pronoun who, what
- WP possessive wh-pronoun whose
- WRB wh-adverb where, when

### Chunking

In [14]:
custom = PunktSentenceTokenizer(train) 
tokenized = custom.tokenize(sample_text)
for i in tokenized:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    chunk = r"""Chunk: {<VB.?>} """ 
    chunkParser = nltk.RegexpParser(chunk)
    chunked = chunkParser.parse(tagged)
    print(words, "\n\n", chunked)
    break

['PRESIDENT', 'GEORGE', 'W.', 'BUSH', "'S", 'ADDRESS', 'BEFORE', 'A', 'JOINT', 'SESSION', 'OF', 'THE', 'CONGRESS', 'ON', 'THE', 'STATE', 'OF', 'THE', 'UNION', 'January', '31', ',', '2006', 'THE', 'PRESIDENT', ':', 'Thank', 'you', 'all', '.'] 

 (S
  PRESIDENT/NNP
  GEORGE/NNP
  W./NNP
  BUSH/NNP
  'S/POS
  ADDRESS/NNP
  BEFORE/IN
  A/NNP
  JOINT/NNP
  SESSION/NNP
  OF/IN
  THE/NNP
  CONGRESS/NNP
  ON/NNP
  THE/NNP
  STATE/NNP
  OF/IN
  THE/NNP
  UNION/NNP
  January/NNP
  31/CD
  ,/,
  2006/CD
  THE/NNP
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP
  all/DT
  ./.)


### Lemmatazing

it's similar to stemming but in this case it return a real word!

In [15]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("goodest", pos="a"))
print(lemmatizer.lemmatize("good", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))

good
good
good
best


the main argument of pos = "n" which stay for noun. if you are passing something different you must pass it!

## WordNet
- Find synonimis, meaning and so on!

In [16]:
from nltk.corpus import wordnet

### Synset

In [17]:
syns = wordnet.synsets("love")
print(syns)

[Synset('love.n.01'), Synset('love.n.02'), Synset('beloved.n.01'), Synset('love.n.04'), Synset('love.n.05'), Synset('sexual_love.n.02'), Synset('love.v.01'), Synset('love.v.02'), Synset('love.v.03'), Synset('sleep_together.v.01')]


In [18]:
print(syns[9].lemmas())

[Lemma('sleep_together.v.01.sleep_together'), Lemma('sleep_together.v.01.roll_in_the_hay'), Lemma('sleep_together.v.01.love'), Lemma('sleep_together.v.01.make_out'), Lemma('sleep_together.v.01.make_love'), Lemma('sleep_together.v.01.sleep_with'), Lemma('sleep_together.v.01.get_laid'), Lemma('sleep_together.v.01.have_sex'), Lemma('sleep_together.v.01.know'), Lemma('sleep_together.v.01.do_it'), Lemma('sleep_together.v.01.be_intimate'), Lemma('sleep_together.v.01.have_intercourse'), Lemma('sleep_together.v.01.have_it_away'), Lemma('sleep_together.v.01.have_it_off'), Lemma('sleep_together.v.01.screw'), Lemma('sleep_together.v.01.fuck'), Lemma('sleep_together.v.01.jazz'), Lemma('sleep_together.v.01.eff'), Lemma('sleep_together.v.01.hump'), Lemma('sleep_together.v.01.lie_with'), Lemma('sleep_together.v.01.bed'), Lemma('sleep_together.v.01.have_a_go_at_it'), Lemma('sleep_together.v.01.bang'), Lemma('sleep_together.v.01.get_it_on'), Lemma('sleep_together.v.01.bonk')]


### Definition

In [19]:
print(syns[7])
print(syns[7].lemmas())
print(syns[7])
print(syns[7].definition())

Synset('love.v.02')
[Lemma('love.v.02.love'), Lemma('love.v.02.enjoy')]
Synset('love.v.02')
get pleasure from


### Examples

In [20]:
print(syns[7].examples())

['I love cooking']


In [21]:
sinonimi = []
contrari = []
for syn in wordnet.synsets("love"):
    for l in syn.lemmas():
        sinonimi.append(l)
        if l.antonyms():
            contrari.append(l.antonyms()[0].name())

In [22]:
sinonimi[10:15]

[Lemma('love.n.04.erotic_love'),
 Lemma('love.n.05.love'),
 Lemma('sexual_love.n.02.sexual_love'),
 Lemma('sexual_love.n.02.lovemaking'),
 Lemma('sexual_love.n.02.making_love')]

### Find semantic similarity

In [23]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")
print(w1.lemmas())
print(w2.lemmas())

[Lemma('ship.n.01.ship')]
[Lemma('boat.n.01.boat')]


In [24]:
print(w1.wup_similarity(w2)) # nice similarity

0.9090909090909091


## Text Classifier

In this case we will cover binary situation.

In [25]:
import nltk 
import random as rd # we will use that to shuffle the dt
from nltk.corpus import movie_reviews # fancy list of reviews labelled!

### Load pre-labelled reviews

In [26]:
dir(movie_reviews)[-15:]

['__lt__',
 '__module__',
 '__name__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_unload',
 'subdir']

In [27]:
documents = []
i = 0
for category in movie_reviews.categories():
    if i == 0:
        print(category)
    for fileid in movie_reviews.fileids(category):
        if i == 0:
            print(fileid)
        documents.append((list(movie_reviews.words(fileid)), category))
        if i == 0:
            print(documents[0][0][0:20], documents[0][1])
        i+=1

neg
neg/cv000_29416.txt
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an'] neg


In [28]:
rd.shuffle(documents)
print(documents[0][0][0:10], documents[0][1])

['there', 'may', 'not', 'be', 'a', 'critic', 'alive', 'who', 'harbors', 'as'] neg


In [29]:
all_words= []
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
common = set(stopwords.words("english"))
punctuation = set([",",".",":",";","(",")","!","?","'"," \" ", "-" ])
for w in movie_reviews.words():
    if w not in punctuation and w not in common:
        all_words.append(w.lower())

### Nltk FrequencyDistribution

In [30]:
all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))
print(len(all_words)) # 40000 different words

[('"', 17612), ('film', 9517), ('one', 5852), ('movie', 5771), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2169), ('would', 2109), ('much', 2049), ('character', 2020), ('also', 1967), ('get', 1949), ('two', 1911)]
39607


In [31]:
all_words["stupid"] # is like a dictionary

253

#### Limit the important words

In [32]:
word_features = list(all_words.keys())[:3000]

In [33]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

In [34]:
feature_sets = [(find_features(rev),category) for (rev,category) in documents]

This feature_sets is a list of tuple having the first element a dictionary having a dictionary with all the 3000 most common words and value a boolean if they are contained in that reviews and as a second element if the reviews was positive or negative!

In [35]:
feature_sets[0] 

({'plot': False,
  'two': False,
  'teen': False,
  'couples': False,
  'go': True,
  'church': False,
  'party': False,
  'drink': False,
  'drive': False,
  'get': True,
  'accident': False,
  'one': True,
  'guys': False,
  'dies': False,
  'girlfriend': False,
  'continues': False,
  'see': True,
  'life': False,
  'nightmares': False,
  'deal': False,
  'watch': False,
  'movie': True,
  '"': False,
  'sorta': False,
  'find': False,
  'critique': False,
  'mind': False,
  'fuck': False,
  'generation': False,
  'touches': False,
  'cool': False,
  'idea': False,
  'presents': False,
  'bad': False,
  'package': False,
  'makes': True,
  'review': False,
  'even': True,
  'harder': False,
  'write': False,
  'since': False,
  'generally': True,
  'applaud': False,
  'films': True,
  'attempt': False,
  'break': False,
  'mold': False,
  'mess': False,
  'head': False,
  'lost': False,
  'highway': False,
  '&': False,
  'memento': False,
  'good': True,
  'ways': False,
  'making'

### Fit the NaiveBayes Classifier

In [36]:
train_x = feature_sets[:1900]
test_x = feature_sets[1900:]

In [37]:
classifier = nltk.NaiveBayesClassifier.train(train_x)

In [38]:
print("accuracy:", nltk.classify.accuracy(classifier, test_x))

accuracy: 0.82


In [39]:
classifier.show_most_informative_features(20)

Most Informative Features
                   sucks = True              neg : pos    =     10.7 : 1.0
               atrocious = True              neg : pos    =     10.4 : 1.0
                 idiotic = True              neg : pos    =      9.1 : 1.0
                  justin = True              neg : pos    =      9.1 : 1.0
                 frances = True              pos : neg    =      8.9 : 1.0
                  annual = True              pos : neg    =      8.3 : 1.0
                 martian = True              neg : pos    =      7.7 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                  sexist = True              neg : pos    =      7.0 : 1.0
                  shoddy = True              neg : pos    =      7.0 : 1.0
           unimaginative = True              neg : pos    =      7.0 : 1.0
                  regard = True              pos : neg    =      6.6 : 1.0
                  crappy = True              neg : pos    =      6.4 : 1.0

## Pickle to save the algorithm

In [40]:
import pickle
save_classifier = open("naivebayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

### Load

In [41]:
classifier_f= open("naivebayes.pickle","rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

In [42]:
nltk.classify.accuracy(classifier, test_x)

0.82

## Integration with Sklearn

In [43]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB # not binary but shade

### Multinomial Naive Bayes

In [44]:
Multinom_classifier = SklearnClassifier(MultinomialNB())
Multinom_classifier.train(train_x)
print("accuracy", nltk.classify.accuracy(Multinom_classifier, test_x))

accuracy 0.85


### Bernoulli Naive Bayes

In [45]:
Bernoulli_classifier = SklearnClassifier(BernoulliNB())
Bernoulli_classifier.train(train_x)
print("accuracy", nltk.classify.accuracy(Bernoulli_classifier, test_x))

accuracy 0.83


### Whatever models

In [46]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [47]:
SVC_class = SklearnClassifier(SVC())
SVC_class.train(train_x)
print("accuracy", nltk.classify.accuracy(SVC_class, test_x))

accuracy 0.82


### Combining models

In [48]:
from nltk.classify import ClassifierI
from statistics import mode

In [49]:
Multinom_classifier = SklearnClassifier(MultinomialNB())
Multinom_classifier.train(train_x)

Bernoulli_classifier = SklearnClassifier(BernoulliNB())
Bernoulli_classifier.train(train_x)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(train_x)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(train_x)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(train_x)

<SklearnClassifier(NuSVC())>

In [50]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self.classifiers = classifiers
    
    def classify(self,features):
        votes = []
        for c in self.classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self.classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_vote =  votes.count(mode(votes))
        return (choice_vote/len(votes))
    
voted_classifier = VoteClassifier(Multinom_classifier, 
                                  Bernoulli_classifier, 
                                  SGDClassifier_classifier, 
                                  LinearSVC_classifier, 
                                  NuSVC_classifier)

In [51]:
voted_classifier.classifiers

(<SklearnClassifier(MultinomialNB())>,
 <SklearnClassifier(BernoulliNB())>,
 <SklearnClassifier(SGDClassifier())>,
 <SklearnClassifier(LinearSVC())>,
 <SklearnClassifier(NuSVC())>)

In [52]:
print("Accuracy of bouch of", nltk.classify.accuracy(voted_classifier, test_x))

Accuracy of bouch of 0.89


In [53]:
new_obs = {"suks":True, "works":False, "good":True, "uni":True,"palace":True}
print(voted_classifier.classify(new_obs), voted_classifier.confidence(new_obs))

neg 0.8


to understand which word are relevant to our analysis we have to see what are inside the dictionary:

In [54]:
classifier.show_most_informative_features(1500)

Most Informative Features
                   sucks = True              neg : pos    =     10.7 : 1.0
               atrocious = True              neg : pos    =     10.4 : 1.0
                 idiotic = True              neg : pos    =      9.1 : 1.0
                  justin = True              neg : pos    =      9.1 : 1.0
                 frances = True              pos : neg    =      8.9 : 1.0
                  annual = True              pos : neg    =      8.3 : 1.0
                 martian = True              neg : pos    =      7.7 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                  sexist = True              neg : pos    =      7.0 : 1.0
                  shoddy = True              neg : pos    =      7.0 : 1.0
           unimaginative = True              neg : pos    =      7.0 : 1.0
                  regard = True              pos : neg    =      6.6 : 1.0
                  crappy = True              neg : pos    =      6.4 : 1.0

                goldblum = True              neg : pos    =      2.6 : 1.0
                     wig = True              neg : pos    =      2.6 : 1.0
               effective = True              pos : neg    =      2.6 : 1.0
                  enters = True              pos : neg    =      2.6 : 1.0
               thrilling = True              pos : neg    =      2.5 : 1.0
                    hide = True              neg : pos    =      2.5 : 1.0
            conventional = True              pos : neg    =      2.5 : 1.0
               endlessly = True              pos : neg    =      2.5 : 1.0
                 kingdom = True              pos : neg    =      2.5 : 1.0
                     nod = True              pos : neg    =      2.5 : 1.0
                  openly = True              pos : neg    =      2.5 : 1.0
              recognizes = True              pos : neg    =      2.5 : 1.0
                thrilled = True              pos : neg    =      2.5 : 1.0
                 vehicle 

## Short sentece dataset - Recap

In [55]:
"https://pythonprogramming.net/static/downloads/short_reviews/"
import requests
negative = requests.get("https://pythonprogramming.net/static/downloads/short_reviews/negative.txt").text
positive = requests.get("https://pythonprogramming.net/static/downloads/short_reviews/positive.txt").text

### Create whole documents with associated label

In [56]:
documents = []
for row in negative.split("\n"):
    documents.append((row, "Negative"))
for row in positive.split("\n"):
    documents.append((row, "Positive"))

In [57]:

rd.shuffle(documents)
documents[0]

('a crude teen-oriented variation on a theme that the playwright craig lucas explored with infinitely more grace and eloquence in his prelude to a kiss . ',
 'Negative')

### Create a Frequency distribution of the words

In [58]:
all_words = []
negative_word = nltk.tokenize.word_tokenize(negative)
positive_word = nltk.tokenize.word_tokenize(positive)

In [59]:

common = set(stopwords.words("english"))
punctuation = set([",",".",":",";","(",")","!","?","'"," \" ", "-" ])
for w in negative_word:
    if w not in punctuation and w not in common:
        all_words.append(w.lower())
for w in positive_word:
    if w not in punctuation and w not in common:
        all_words.append(w.lower())

In [60]:
all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))
print(len(all_words))

word_features = list(all_words.keys())[:6000] # we take the first 6000
word_features

[("'s", 3537), ('film', 1590), ('movie', 1336), ("n't", 940), ('one', 739), ('like', 720), ('--', 670), ('``', 656), ('story', 493), ('much', 386), ('even', 382), ('good', 377), ('comedy', 356), ('time', 341), ('characters', 330)]
20156


['simplistic',
 'silly',
 'tedious',
 "'s",
 'laddish',
 'juvenile',
 'teenage',
 'boys',
 'could',
 'possibly',
 'find',
 'funny',
 'exploitative',
 'largely',
 'devoid',
 'depth',
 'sophistication',
 'would',
 'make',
 'watching',
 'graphic',
 'treatment',
 'crimes',
 'bearable',
 '[',
 'garbus',
 ']',
 'discards',
 'potential',
 'pathological',
 'study',
 'exhuming',
 'instead',
 'skewed',
 'melodrama',
 'circumstantial',
 'situation',
 'visually',
 'flashy',
 'narratively',
 'opaque',
 'emotionally',
 'vapid',
 'exercise',
 'style',
 'mystification',
 'story',
 'also',
 'unoriginal',
 'come',
 'already',
 'recycled',
 'times',
 "'d",
 'care',
 'count',
 'thing',
 'give',
 'movie',
 'points',
 'bravado',
 '--',
 'take',
 'entirely',
 'stale',
 'concept',
 'push',
 'audience',
 'meat',
 'grinder',
 'one',
 'time',
 'much',
 'farcical',
 'sour',
 'unfortunately',
 'actors',
 'served',
 'hack',
 'script',
 'disquieting',
 'relatively',
 'gore-free',
 'allusions',
 'serial',
 'murders',

### Create a feature set

In [61]:
def find_features(document):
    words = nltk.word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

feature_set = [(find_features(rev), category) for (rev, category) in documents ]

In [62]:
print(documents[0]) 
print(find_features(documents[0][0]))
print(feature_set[0])

('a crude teen-oriented variation on a theme that the playwright craig lucas explored with infinitely more grace and eloquence in his prelude to a kiss . ', 'Negative')


In [63]:
train_x = feature_set[:10000]
test_x = feature_set[10000:]
#18

In [64]:
# it took around 30m to fit all the models.

Multinom_classifier = SklearnClassifier(MultinomialNB())
Multinom_classifier.train(train_x)
Bernoulli_classifier = SklearnClassifier(BernoulliNB())
Bernoulli_classifier.train(train_x)
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(train_x)
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(train_x)
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(train_x)
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self.classifiers = classifiers
    def classify(self,features):
        votes = []
        for c in self.classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    def confidence(self, features):
        votes = []
        for c in self.classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_vote =  votes.count(mode(votes))
        return (choice_vote/len(votes))
    
voted_classifier = VoteClassifier(Multinom_classifier, 
                                  Bernoulli_classifier, 
                                  SGDClassifier_classifier, 
                                  LinearSVC_classifier, 
                                  NuSVC_classifier)

## Experiments

## Already implemented a trained algortimhs

it will be slower as f. but at least it consider multiple words! 
Nope does not work, the accuracy is under 0.5

In [66]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
print(sid.polarity_scores("i just want something that works with italian words"))
print(sid.polarity_scores("i just do not love you"))
x = sid.polarity_scores(" get my weed my from california that’s that shit")
print(x)

{'neg': 0.0, 'neu': 0.843, 'pos': 0.157, 'compound': 0.0772}
{'neg': 0.457, 'neu': 0.543, 'pos': 0.0, 'compound': -0.5216}
{'neg': 0.31, 'neu': 0.69, 'pos': 0.0, 'compound': -0.5574}


In [67]:
score,value = 0,0
for tupla in list(x.items()):
    if tupla[1] > score:
        score = tupla[1]
        value = tupla[0]
print(score,value)      

0.69 neu
