In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

nltk.download()
# Separating by Sentence with Parragraph identifier.
# Tokenizing: word tokenizers... Sentence tokenizers.
# Lexicon and corpora.
# corpora - body of text. ex: medical journals, presidential speeches, 
# English language.
# Lexicon - words and their means.

# investor-speak ... regular english-speak

# investor speak 'bull' = 'someone who is positive about the market'.
# english-speak 'bull' = scary animal you don't want running at you

example_text = "Hello Mr. Smith, how are you doing today? The weather is great and python is awesome. The sky is pinkish-blue. You should not eat cardboard"

#print("Tokenize by sentence:")
#print(sent_tokenize(example_text))
#print("Tokenize by word: ")
#print(word_tokenize(example_text))

for i in word_tokenize(example_text):
    print(i)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
Hello
Mr.
Smith
,
how
are
you
doing
today
?
The
weather
is
great
and
python
is
awesome
.
The
sky
is
pinkish-blue
.
You
should
not
eat
cardboard


In [9]:
# Tutorial 2. Stop Words. 
# Words that make you leave a text analysis. 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sentence = "This is an example showing off stop word filtration."
stop_words = set(stopwords.words("english")) #Considerar que stopwords trabaja con diferentes lenguajes. 
words = word_tokenize(example_sentence)
filtered_sentence = []
for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)
print(filtered_sentence)

# One Liner.
filtered_sentence_OL = [w for w in words if not w in stop_words]
print(filtered_sentence_OL)

['This', 'example', 'showing', 'stop', 'word', 'filtration', '.']
['This', 'example', 'showing', 'stop', 'word', 'filtration', '.']


In [10]:
# T3.Stemming
# Takes the root stem of the word.
# You can have different words with the same roots that mean the same.
# Useful to economize space. 

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]

#for w in example_words:
    #print(ps.stem(w))
    
new_text = "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
words = word_tokenize(new_text)
for w in words:
    print(ps.stem(w))

python
python
python
python
pythonli
It
is
veri
import
to
be
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


In [14]:
# Part 4. Part of Speech tagging.
#Creates tuples with the word and the parts of speech. 
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer #Unsupervised learning tokenizer.

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try: 
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            print(tagged)
            
    except Exception as e:
        print(str(e))
        
process_content()

Z'), ('been', 'VBN'), ('falling', 'VBG'), ('for', 'IN'), ('a', 'DT'), ('dozen', 'NN'), ('years', 'NNS'), ('in', 'IN'), ('a', 'DT'), ('row', 'NN'), ('.', '.')]
[('(', '('), ('Applause', 'NNP'), ('.', '.'), (')', ')')]
[('These', 'DT'), ('gains', 'NNS'), ('are', 'VBP'), ('evidence', 'NN'), ('of', 'IN'), ('a', 'DT'), ('quiet', 'JJ'), ('transformation', 'NN'), ('--', ':'), ('a', 'DT'), ('revolution', 'NN'), ('of', 'IN'), ('conscience', 'NN'), (',', ','), ('in', 'IN'), ('which', 'WDT'), ('a', 'DT'), ('rising', 'VBG'), ('generation', 'NN'), ('is', 'VBZ'), ('finding', 'VBG'), ('that', 'IN'), ('a', 'DT'), ('life', 'NN'), ('of', 'IN'), ('personal', 'JJ'), ('responsibility', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('life', 'NN'), ('of', 'IN'), ('fulfillment', 'NN'), ('.', '.')]
[('Government', 'NNP'), ('has', 'VBZ'), ('played', 'VBN'), ('a', 'DT'), ('role', 'NN'), ('.', '.')]
[('Wise', 'NNP'), ('policies', 'NNS'), (',', ','), ('such', 'JJ'), ('as', 'IN'), ('welfare', 'NN'), ('reform', 'NN'), ('and', 

In [12]:
# Part 5. Chunking
#Creates tuples with the word and the parts of speech. 
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer #Unsupervised learning tokenizer.

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try: 
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            print(chunked)

            
    except Exception as e:
        print(str(e))
        
process_content()


  of/IN
  producing/VBG
  ethanol/NN
  ,/,
  not/RB
  just/RB
  from/IN
  corn/NN
  ,/,
  but/CC
  from/IN
  wood/NN
  chips/NNS
  and/CC
  stalks/NNS
  ,/,
  or/CC
  switch/VB
  grass/NN
  ./.)
(S
  Our/PRP$
  goal/NN
  is/VBZ
  to/TO
  make/VB
  this/DT
  new/JJ
  kind/NN
  of/IN
  ethanol/JJ
  practical/JJ
  and/CC
  competitive/JJ
  within/IN
  six/CD
  years/NNS
  ./.)
(S (/( (Chunk Applause/NNP) ./. )/))
(S
  Breakthroughs/NNS
  on/IN
  this/DT
  and/CC
  other/JJ
  new/JJ
  technologies/NNS
  will/MD
  help/VB
  us/PRP
  reach/VB
  another/DT
  great/JJ
  goal/NN
  :/:
  to/TO
  replace/VB
  more/JJR
  than/IN
  75/CD
  percent/NN
  of/IN
  our/PRP$
  oil/NN
  imports/NNS
  from/IN
  the/DT
  (Chunk Middle/NNP East/NNP)
  by/IN
  2025/CD
  ./.)
(S (/( (Chunk Applause/NNP) ./. )/))
(S
  By/IN
  applying/VBG
  the/DT
  talent/NN
  and/CC
  technology/NN
  of/IN
  (Chunk America/NNP)
  ,/,
  this/DT
  country/NN
  can/MD
  dramatically/RB
  improve/VB
  our/PRP$
  environment/NN
 

In [13]:
# Part 6. Chinking
#Creates tuples with the word and the parts of speech. 
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer #Unsupervised learning tokenizer.

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try: 
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            chunked.draw()

            
    except Exception as e:
        print(str(e))
        
process_content()

KeyboardInterrupt: 

In [16]:
# Part 7. Name Entity Recognotion
#Creates tuples with the word and the parts of speech. 
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer #Unsupervised learning tokenizer.

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try: 
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            nameEnt = nltk.ne_chunk(tagged, binary= True)

            nameEnt.draw()

            
    except Exception as e:
        print(str(e))
         
process_content()

KeyboardInterrupt: 

In [23]:
# Part 8  - Lemmatizing
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

words_to_lemmatize = ['cats', 'cacti', 'geese', 'rocks', 'python']

for word in words_to_lemmatize:
    print(lemmatizer.lemmatize(word))

print(lemmatizer.lemmatize("better", pos="a")) #Adjetive
print(lemmatizer.lemmatize("best", pos="a"))


cat
cactus
goose
rock
python
good
best


In [25]:
# Part 9 - NLTK Corpora
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

sample = gutenberg.raw("bible-kjv.txt")

tok = sent_tokenize(sample)

print(tok[5:15])


['1:5 And God called the light Day, and the darkness he called Night.', 'And the evening and the morning were the first day.', '1:6 And God said, Let there be a firmament in the midst of the waters,\nand let it divide the waters from the waters.', '1:7 And God made the firmament, and divided the waters which were\nunder the firmament from the waters which were above the firmament:\nand it was so.', '1:8 And God called the firmament Heaven.', 'And the evening and the\nmorning were the second day.', '1:9 And God said, Let the waters under the heaven be gathered together\nunto one place, and let the dry land appear: and it was so.', '1:10 And God called the dry land Earth; and the gathering together of\nthe waters called he Seas: and God saw that it was good.', '1:11 And God said, Let the earth bring forth grass, the herb yielding\nseed, and the fruit tree yielding fruit after his kind, whose seed is\nin itself, upon the earth: and it was so.', '1:12 And the earth brought forth grass, and

In [39]:
# Part 10 - WordNet
from nltk.corpus import wordnet

syns = wordnet.synsets("program")

#synset
print(syns[0].name())

#Word
print(syns[0].lemmas()[0].name())

# definitions
print(syns[0].definition())

#examples
print(syns[0].examples())

synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))
print(set(antonyms))

#Similarity

w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")

print(w1.wup_similarity(w2))

w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("car.n.01")

print(w1.wup_similarity(w2))

w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("cat.n.01")

print(w1.wup_similarity(w2))

plan.n.01
plan
a series of steps to be carried out or goals to be accomplished
['they drew up a six-step plan', 'they discussed plans for a new bond issue']
{'effective', 'skillful', 'honorable', 'right', 'in_force', 'upright', 'sound', 'secure', 'just', 'unspoiled', 'estimable', 'near', 'well', 'respectable', 'unspoilt', 'undecomposed', 'goodness', 'skilful', 'adept', 'good', 'commodity', 'beneficial', 'full', 'soundly', 'expert', 'thoroughly', 'safe', 'ripe', 'proficient', 'dear', 'trade_good', 'dependable', 'practiced', 'in_effect', 'salutary', 'serious', 'honest'}
{'evil', 'evilness', 'ill', 'bad', 'badness'}
0.9090909090909091
0.6956521739130435
0.32


In [45]:
# Part 11 Text Classification
import nltk
import random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))
print(all_words['stupid'])


[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]
253


In [48]:
# Part 12 Words as Features for Learning
import nltk
import random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = []
    for w in word_features:
        features[w] = (w in words)
    return features

print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
featuresets = [(find_features(rev), category) for (rev, category) in documents]

TypeError: list indices must be integers or slices, not str