In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [None]:
# Launch NLTK's downloader so the corpora and models used by the later cells
# can be fetched.
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all
    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package brown to /root/nltk_data...
       |   Unzipping corpora/brown.zip.
       | Downloading package brown_tei to /root/nltk_data...
       |   Unzipping corpora/brown_tei.zip.
       | Downloading package cess_cat to /root/nltk_data...
       |   Unzipping corpora/cess_cat.zip.
       | Downloading package

True

Corpus - Body of text, singular. Corpora is the plural of this. Example: A collection of medical journals.<br>

Lexicon - Words and their meanings. Example: English dictionary. Consider, however, that various fields will have different lexicons. For example: To a financial investor, the first meaning for the word "Bull" is someone who is confident about the market, as compared to the common English lexicon, where the first meaning for the word "Bull" is an animal. As such, there is a special lexicon for financial investors, doctors, children, mechanics, and so on.<br>

Token - Each "entity" that is a part of whatever was split up based on rules. For example, each word is a token when a sentence is "tokenized" into words. Each sentence can also be a token, if you tokenize the sentences out of a paragraph.

Tokenizing - Splitting sentences and words from the body of text.

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Sample paragraph shared by the sentence- and word-tokenizing examples.
EXAMPLE_TEXT = (
    "Hello Mr. Smith, how are you doing today? "
    "The weather is great, and Python is awesome. "
    "The sky is pinkish-blue. "
    "You shouldn't eat cardboard."
)

# Split the paragraph into sentences and show the resulting list.
sentences = sent_tokenize(EXAMPLE_TEXT)
print(sentences)

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and Python is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard."]


In [None]:
# Split the same paragraph into word and punctuation tokens.
word_level_tokens = word_tokenize(EXAMPLE_TEXT)
print(word_level_tokens)

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard', '.']


In [None]:
example_sent = "This is a sample sentence, showing off the stop words filtration."

# A set keeps the membership test cheap while filtering.
stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example_sent)

# Keep every token that is not in the stop-word set.  The comparison is
# case-sensitive, so a capitalised token such as "This" is kept; lower-case
# the token before testing if that is not wanted.
filtered_sentence = [token for token in word_tokens if token not in stop_words]

print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


The idea of stemming is a sort of normalizing method. Many variations of words carry the same meaning, other than when tense is involved.

The reason why we stem is to shorten the lookup, and normalize sentences.

Consider:

1. I was taking a ride in the car.
2. I was riding in the car.

These two sentences mean the same thing. "in the car" is the same. "I was" is the same. The "-ing" form after "was" denotes a clear past-tense action in both cases, so is it truly necessary to differentiate between "ride" and "riding" when we are just trying to figure out the meaning of this past-tense activity?

In [None]:
ps = PorterStemmer()

example_words = ['Study', 'Studious', 'Studyness']

# Print the Porter stem of each example word, one per line.
for stem in map(ps.stem, example_words):
    print(stem)

studi
studiou
study


A very similar operation to stemming is called lemmatizing. The major difference between these is, as you saw earlier, stemming can often create non-existent words, whereas lemmas are actual words.

So, your root stem, meaning the word you end up with, is not something you can just look up in a dictionary, but you can look up a lemma.

In [None]:
lemmatizer = WordNetLemmatizer()

# (word, part-of-speech) pairs.  None means "call lemmatize() with the word
# only", so the lemmatizer's own default part of speech applies.
lemma_examples = [
    ("cats", None),
    ("cacti", None),
    ("geese", None),
    ("rocks", None),
    ("python", None),
    ("better", "a"),
    ("best", "a"),
    ("run", None),
    ("run", "v"),
]

for word, pos in lemma_examples:
    if pos is None:
        print(lemmatizer.lemmatize(word))
    else:
        print(lemmatizer.lemmatize(word, pos=pos))

cat
cactus
goose
rock
python
good
best
run
run


One of the more powerful aspects of the NLTK module is the Part of Speech tagging that it can do for you. This means labeling words in a sentence as nouns, adjectives, verbs...etc. Even more impressive, it also labels by tense, and more. Here's a list of the tags, what they mean, and some examples:

POS tag list:<br>

CC	coordinating conjunction<br>
CD	cardinal digit<br>
DT	determiner<br>
EX	existential there (like: "there is" ... think of it like "there exists")<br>
FW	foreign word<br>
IN	preposition/subordinating conjunction<br>
JJ	adjective	'big'<br>
JJR	adjective, comparative	'bigger'<br>
JJS	adjective, superlative	'biggest'<br>
LS	list marker	1)<br>
MD	modal	could, will<br>
NN	noun, singular 'desk'<br>
NNS	noun plural	'desks'<br>
NNP	proper noun, singular	'Harrison'<br>
NNPS	proper noun, plural	'Americans'<br>
PDT	predeterminer	'all the kids'<br>
POS	possessive ending	parent's<br>
PRP	personal pronoun	I, he, she<br>
PRP$	possessive pronoun	my, his, hers<br>
RB	adverb	very, silently,<br>
RBR	adverb, comparative	better<br>
RBS	adverb, superlative	best<br>
RP	particle	give up<br>
TO	to	go 'to' the store.<br>
UH	interjection	errrrrrrrm<br>
VB	verb, base form	take<br>
VBD	verb, past tense	took<br>
VBG	verb, gerund/present participle	taking<br>
VBN	verb, past participle	taken<br>
VBP	verb, sing. present, non-3rd person	take<br>
VBZ	verb, 3rd person sing. present	takes<br>
WDT	wh-determiner	which<br>
WP	wh-pronoun	who, what<br>
WP$	possessive wh-pronoun	whose<br>
WRB	wh-adverb	where, when<br>

In [None]:
# Raw text of the 2005 corpus file; passed to the sentence tokenizer as its
# training text in a later cell.
train_text = state_union.raw("2005-GWBush.txt")
# Raw text of the 2006 corpus file; this is the text that gets tokenized and
# tagged in the later cells.
sample_text = state_union.raw("2006-GWBush.txt")

In [None]:
# Build a Punkt sentence tokenizer, handing it train_text as its training text.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [None]:
# Bare expression: in a notebook this echoes the tokenizer object's repr.
custom_sent_tokenizer

<nltk.tokenize.punkt.PunktSentenceTokenizer at 0x7f9a28193610>

In [None]:
# Split the 2006 text into sentences with the tokenizer built above; the
# result is iterated sentence by sentence in the cells that follow.
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [None]:
def process_content():
    """Print the part-of-speech tags for the first five sentences.

    Reads the module-level ``tokenized`` sequence, word-tokenizes each of
    its first five sentences and prints the result of ``nltk.pos_tag`` for
    that sentence.  Any exception raised along the way is caught and its
    message printed, which also ends the loop early.
    """
    try:
        for sentence in tokenized[:5]:
            print(nltk.pos_tag(nltk.word_tokenize(sentence)))
    except Exception as err:
        print(str(err))


process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

One of the main goals of chunking is to group into what are known as "noun phrases." These are phrases of one or more words that contain a noun, maybe some descriptive words, maybe a verb, and maybe something like an adverb. The idea is to group nouns with the words that are in relation to them.

In order to chunk, we combine the part of speech tags with regular expressions. Mainly from regular expressions, we are going to utilize the following:

1. (+) = match 1 or more
2. (?) = match 0 or 1 repetitions.
3. (*) = match 0 or MORE repetitions	  
4. (.) = match any character except a new line

In [None]:
# Re-create the inputs so this cell can be run on its own.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    """Print a chunk tree for every sentence in ``tokenized``.

    Each sentence is word-tokenized and POS-tagged, then parsed with a
    single "Chunk" rule: optional adverbs, optional verbs, one or more
    proper nouns and an optional trailing singular noun.  Any exception is
    caught and its message printed, which also ends the loop early.
    """
    try:
        # The grammar and its parser do not depend on the sentence, so they
        # are built once here instead of once per loop iteration.
        chunk_grammar = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunk_parser = nltk.RegexpParser(chunk_grammar)

        for sentence in tokenized:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            print(chunk_parser.parse(tagged))

    except Exception as e:
        print(str(e))

process_content()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  dramatic/JJ
  progress/NN
  of/IN
  a/DT
  new/JJ
  democracy/NN
  ./.)
(S
  In/IN
  less/JJR
  than/IN
  three/CD
  years/NNS
  ,/,
  the/DT
  nation/NN
  has/VBZ
  gone/VBN
  from/IN
  dictatorship/NN
  to/TO
  liberation/NN
  ,/,
  to/TO
  sovereignty/VB
  ,/,
  to/TO
  a/DT
  constitution/NN
  ,/,
  to/TO
  national/JJ
  elections/NNS
  ./.)
(S
  At/IN
  the/DT
  same/JJ
  time/NN
  ,/,
  our/PRP$
  coalition/NN
  has/VBZ
  been/VBN
  relentless/VBN
  in/IN
  shutting/VBG
  off/RP
  terrorist/JJ
  infiltration/NN
  ,/,
  clearing/VBG
  out/RP
  insurgent/JJ
  strongholds/NNS
  ,/,
  and/CC
  turning/VBG
  over/RP
  territory/NN
  to/TO
  (Chunk Iraqi/NNP security/NN)
  forces/NNS
  ./.)
(S
  I/PRP
  am/VBP
  confident/JJ
  in/IN
  our/PRP$
  plan/NN
  for/IN
  victory/NN
  ;/:
  I/PRP
  am/VBP
  confident/JJ
  in/IN
  the/DT
  will/MD
  of/IN
  the/DT
  (Chunk Iraqi/NNP)
  people/NNS
  ;/:
  I/PRP
  am/VBP
  confide

chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""<br>
This line, broken down:

<RB.?>* = "0 or more of any tense of adverb," followed by:

<VB.?>* = "0 or more of any tense of verb," followed by:

<NNP>+ = "One or more proper nouns," followed by

<NN>? = "zero or one singular noun."

You may find that, after a lot of chunking, you have some words in your chunk you still do not want, but you have no idea how to get rid of them by chunking. You may find that chinking is your solution.

Chinking is a lot like chunking, it is basically a way for you to remove a chunk from a chunk. The chunk that you remove from your chunk is your chink.

The code is very similar, you just denote the chink, after the chunk, with }{ instead of the chunk's {}.

In [None]:
# Re-create the inputs so this cell can be run on its own.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    """Print a chunk-then-chink tree for each sentence from the sixth onward.

    Each sentence is word-tokenized and POS-tagged.  The grammar first
    chunks every token, then chinks (removes from the chunk) runs of verbs,
    prepositions, determiners and "to".  Any exception is caught and its
    message printed, which also ends the loop early.
    """
    try:
        # The grammar and its parser do not depend on the sentence, so they
        # are built once here instead of once per loop iteration.
        chunk_grammar = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""
        chunk_parser = nltk.RegexpParser(chunk_grammar)

        for sentence in tokenized[5:]:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            print(chunk_parser.parse(tagged))

    except Exception as e:
        print(str(e))

process_content()

(S (Chunk 31/CD ,/, 2006/CD ./.))
(S
  (Chunk White/NNP House/NNP photo/NN)
  by/IN
  (Chunk Eric/NNP DraperEvery/NNP time/NN I/PRP)
  'm/VBP
  (Chunk invited/JJ)
  to/TO
  this/DT
  (Chunk rostrum/NN ,/, I/PRP)
  'm/VBP
  humbled/VBN
  by/IN
  the/DT
  (Chunk privilege/NN ,/, and/CC mindful/NN)
  of/IN
  the/DT
  (Chunk history/NN we/PRP)
  've/VBP
  seen/VBN
  (Chunk together/RB ./.))
(S
  (Chunk We/PRP)
  have/VBP
  gathered/VBN
  under/IN
  this/DT
  (Chunk Capitol/NNP dome/NN)
  in/IN
  (Chunk moments/NNS)
  of/IN
  (Chunk
    national/JJ
    mourning/NN
    and/CC
    national/JJ
    achievement/NN
    ./.))
(S
  (Chunk We/PRP)
  have/VBP
  served/VBN
  (Chunk America/NNP)
  through/IN
  (Chunk one/CD)
  of/IN
  the/DT
  (Chunk most/RBS consequential/JJ periods/NNS)
  of/IN
  (Chunk our/PRP$ history/NN --/: and/CC it/PRP)
  has/VBZ
  been/VBN
  (Chunk my/PRP$ honor/NN)
  to/TO
  serve/VB
  with/IN
  (Chunk you/PRP ./.))
(S
  In/IN
  a/DT
  (Chunk system/NN)
  of/IN
  (Chunk
    t

Now, the main difference here is:

}<VB.?|IN|DT|TO>+{<br>
This means we're removing from the chunk one or more verbs, prepositions, determiners, or the word 'to'.

One of the most major forms of chunking in natural language processing is called "Named Entity Recognition." The idea is to have the machine immediately be able to pull out "entities" like people, places, things, locations, monetary figures, and more.

This can be a bit of a challenge, but NLTK has this built in for us. There are two major options with NLTK's named entity recognition: either recognize all named entities, or recognize named entities as their respective type, like people, places, locations, etc.

In [None]:
# Re-create the inputs so this cell can be run on its own.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    """Print a named-entity tree for each sentence from the sixth onward.

    Each sentence is word-tokenized, POS-tagged and passed to
    ``nltk.ne_chunk`` with ``binary=True``.  Any exception is caught and
    its message printed, which also ends the loop early.
    """
    try:
        for sentence in tokenized[5:]:
            tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
            print(nltk.ne_chunk(tagged, binary=True))
    except Exception as err:
        print(str(err))


process_content()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  a/DT
  constitution/NN
  ,/,
  to/TO
  national/JJ
  elections/NNS
  ./.)
(S
  At/IN
  the/DT
  same/JJ
  time/NN
  ,/,
  our/PRP$
  coalition/NN
  has/VBZ
  been/VBN
  relentless/VBN
  in/IN
  shutting/VBG
  off/RP
  terrorist/JJ
  infiltration/NN
  ,/,
  clearing/VBG
  out/RP
  insurgent/JJ
  strongholds/NNS
  ,/,
  and/CC
  turning/VBG
  over/RP
  territory/NN
  to/TO
  (NE Iraqi/NNP)
  security/NN
  forces/NNS
  ./.)
(S
  I/PRP
  am/VBP
  confident/JJ
  in/IN
  our/PRP$
  plan/NN
  for/IN
  victory/NN
  ;/:
  I/PRP
  am/VBP
  confident/JJ
  in/IN
  the/DT
  will/MD
  of/IN
  the/DT
  (NE Iraqi/NNP)
  people/NNS
  ;/:
  I/PRP
  am/VBP
  confident/JJ
  in/IN
  the/DT
  skill/NN
  and/CC
  spirit/NN
  of/IN
  our/PRP$
  military/JJ
  ./.)
(S
  (NE Fellow/NNP)
  citizens/NNS
  ,/,
  we/PRP
  are/VBP
  in/IN
  this/DT
  fight/NN
  to/TO
  win/VB
  ,/,
  and/CC
  we/PRP
  are/VBP
  winning/VBG
  ./.)
(S (/( (NE Applause/N

Here, with the option of binary = True, this means either something is a named entity, or not. 

When Binary is False, it picked up the same things, but wound up splitting up terms like White House into "White" and "House" as if they were different, whereas we could see in the binary = True option, the named entity recognition was correct to say White House was part of the same named entity.

Depending on your goals, you may use the binary option how you see fit. Here are the types of Named Entities that you can get if you have binary as false:

NE Type and Examples
ORGANIZATION - Georgia-Pacific Corp., WHO
PERSON - Eddy Bonte, President Obama
LOCATION - Murray River, Mount Everest
DATE - June, 2008-06-29
TIME - two fifty a m, 1:30 p.m.
MONEY - 175 million Canadian Dollars, GBP 10.40
PERCENT - twenty pct, 18.75 %
FACILITY - Washington Monument, Stonehenge
GPE - South East Asia, Midlothian

In [None]:
import random
from nltk.corpus import movie_reviews

# Pair every review's token list with the category it was filed under.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle so that a later positional train/test split mixes the categories.
random.shuffle(documents)

print(documents[1])

# Frequency distribution over every lower-cased token in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Use the 3000 most frequent words as the feature vocabulary.  Slicing
# all_words.keys() does not do this: the keys are not ordered by frequency,
# so the slice would return an arbitrary 3000 words instead.
word_features = [word for word, _count in all_words.most_common(3000)]



In [None]:
def find_features(document):
    """Map each word in ``word_features`` to whether ``document`` contains it.

    Args:
        document: An iterable of word tokens.

    Returns:
        A dict keyed by every entry of the module-level ``word_features``,
        with ``True`` where the word occurs in ``document`` and ``False``
        otherwise.
    """
    # Build the set once so each membership test does not rescan the document.
    present = set(document)
    return {feature: feature in present for feature in word_features}

In [None]:
# Feature dictionary for one review file, printed for inspection.
sample_features = find_features(movie_reviews.words('neg/cv000_29416.txt'))
print(sample_features)



In [None]:
# One (feature dict, category) pair per review, in the order of ``documents``.
featuresets = [
    (find_features(review_words), label)
    for review_words, label in documents
]

In [None]:
# The first 1900 feature sets train the classifier; the rest are held out
# for testing.
split_at = 1900
training_set, testing_set = featuresets[:split_at], featuresets[split_at:]

In [None]:
# Train a Naive Bayes classifier on the training feature sets.
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [None]:
# Score the classifier on the held-out set and report it scaled by 100.
accuracy = nltk.classify.accuracy(classifier, testing_set)
print("Classifier accuracy percent:", accuracy * 100)

Classifier accuracy percent: 83.0
