In [2]:
import numpy as np
import pandas as pd

In [3]:
import nltk
# Calling download() with no arguments opens NLTK's interactive downloader
# (the recorded output shows it loading the package index).
# NOTE(review): this blocks an unattended "Restart & Run All"; consider passing
# explicit package ids instead (e.g. nltk.download('punkt')) — confirm which
# resources the rest of the notebook needs.
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# Tokenizing Words and Sentences with NLTK

In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Sample paragraph of four sentences; it contains an abbreviation ("Mr."),
# a hyphenated word and a contraction so the tokenizers have something to do.
EXAMPLE_TEXT = (
    "Hello Mr. Smith, how are you doing today? "
    "The weather is great, and Python is awesome. "
    "The sky is pinkish-blue. "
    "You shouldn't eat cardboard."
)

# Sentence tokenizer: split the paragraph into a list of sentences.
print(sent_tokenize(EXAMPLE_TEXT))

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and Python is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard."]


In [6]:
# Word tokenizer: split EXAMPLE_TEXT into word-level tokens and print them.
# In the recorded output punctuation comes back as separate tokens and
# "shouldn't" is split into 'should' and "n't".
print(word_tokenize(EXAMPLE_TEXT))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard', '.']


# Stop words with NLTK

In [7]:
from nltk.corpus import stopwords


In [14]:
# Collect the English stop words into a set so membership tests are cheap.
stop_words = {*stopwords.words('english')}
# Report how many there are, then a sample of ten on the next line.
# Set iteration order is arbitrary, so the sample can differ between runs.
print(len(stop_words), [*stop_words][:10], sep="\n")

179
["aren't", 'them', 'in', 'is', 'are', 'o', 'the', 'over', 'where', 'only']


In [17]:
example_sent = "This is a sample sentence, showing off the stop words filtration."
# Split the sentence into word-level tokens.
word_tokens = word_tokenize(example_sent)
# Keep only the tokens that are not stop words.
# NOTE(review): the membership test is case-sensitive, so the capitalised
# 'This' is kept (see the recorded output); compare on token.lower() if
# that is not intended.
filtered_sentence = [token for token in word_tokens if token not in stop_words]
print(filtered_sentence)

['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


# Stemming words with NLTK

In [19]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
# Create one Porter stemmer instance; the stemming cells below reuse it as `ps`.
ps = PorterStemmer()

In [24]:
# Stemming example words: run each variant of "python" through the Porter
# stemmer and print the resulting stems.
example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
stemmed_words = [ps.stem(word) for word in example_words]
print(stemmed_words)

['python', 'python', 'python', 'python', 'pythonli']


In [25]:
# Stemming sentences: tokenize the text first, then stem every token
# (punctuation tokens are passed through the stemmer as well).
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
words = word_tokenize(new_text)
sentence_stemmed_words = list(map(ps.stem, words))
print(sentence_stemmed_words)

['it', 'is', 'import', 'to', 'by', 'veri', 'pythonli', 'while', 'you', 'are', 'python', 'with', 'python', '.', 'all', 'python', 'have', 'python', 'poorli', 'at', 'least', 'onc', '.']


# Part of Speech Tagging with NLTK

In [26]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [27]:
# Load the raw text of two State of the Union addresses from the NLTK corpus:
# the 2005 address is used below to train the sentence tokenizer, the 2006
# address is the text that gets tokenized and tagged.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

In [30]:
# Train a PunktSentenceTokenizer on the training text (the 2005 address);
# the text is handed straight to the constructor.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [34]:
# Test the tokenizer: split the 2006 address into sentences, then print the
# sentence count and the first five sentences as a sanity check.
# NOTE: in the recorded output the fifth sentence stops right after "Jan.",
# i.e. that abbreviation was treated as a sentence end.
tokenized = custom_sent_tokenizer.tokenize(sample_text)
print(len(tokenized), tokenized[:5])

346 ["PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all.", 'Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream.', 'Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King.', '(Applause.)', 'President George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan.']


In [41]:
def process_content(sentences=None, limit=5):
    """Print the part-of-speech tags for the first few sentences.

    Args:
        sentences: Sequence of sentence strings to tag. When omitted, the
            notebook-level ``tokenized`` list is used.
        limit: Maximum number of sentences to tag (default 5). ``None`` tags
            every sentence.

    Returns:
        None. For each sentence the list of ``(token, tag)`` pairs is printed
        on its own line.

    Any exception raised while tokenizing or tagging is caught and reported
    as ``ExceptionType: message`` instead of propagating; processing stops at
    the first failure.
    """
    try:
        if sentences is None:
            # Looked up at call time, inside the try, so a missing global is
            # reported the same way as any other failure.
            sentences = tokenized
        for sentence in sentences[:limit]:
            words = nltk.word_tokenize(sentence)
            print(nltk.pos_tag(words))
    except Exception as e:
        # Keep the best-effort behaviour, but include the exception type:
        # str(e) alone can be empty or ambiguous (e.g. a bare KeyError).
        print("{}: {}".format(type(e).__name__, e))
process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

# Chunking with NLTK