In [None]:
import nltk


In [None]:
# Fetch only the NLTK resources this notebook uses. Calling nltk.download()
# with no arguments opens the interactive downloader, which waits for keyboard
# input and therefore stalls "Restart Kernel -> Run All".
NLTK_PACKAGES = [
    "punkt",                           # models behind sent_tokenize / word_tokenize
    "punkt_tab",                       # same models as packaged for newer NLTK releases
    "stopwords",
    "averaged_perceptron_tagger",      # model behind nltk.pos_tag
    "averaged_perceptron_tagger_eng",  # newer-release name of the tagger model
    "maxent_ne_chunker",               # model behind nltk.ne_chunk
    "maxent_ne_chunker_tab",           # newer-release name of the chunker model
    "words",
    "udhr",
    "state_union",
    "movie_reviews",
]
for package in NLTK_PACKAGES:
    # quiet=True suppresses the progress log. Identifiers that the installed
    # NLTK's index does not know are reported as a failed download (False)
    # instead of raising, so listing both old and new names is safe.
    nltk.download(package, quiet=True)

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> Download
Command 'Download' unrecognized

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> l
Packages:
  [ ] abc................. Australian Broadcasting Commission 2006
  [ ] alpino.............. Alpino Dutch Treebank
  [ ] averaged_perceptron_tagger Averaged Perceptron Tagger
  [ ] averaged_perceptron_tagger_ru Averaged Perceptron Tagger (Russian)
  [ ] basque_grammars..... Grammars for Basque
  [ ] bcp47............... BCP-47 Language Tags
  [ ] biocreative_ppi..... BioCreAtIvE (Critical Assessm

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Sample paragraph; the word-tokenization cell further down reuses `text`.
text = (
    "Hello students, how are you doing today? "
    "The olympics are inspiring, and Python is awesome. "
    "You look nice today."
)

# Break the paragraph into a list of sentences and show it.
sentences = sent_tokenize(text)
print(sentences)

In [None]:
# Tokenize the same paragraph at the word level and show the token list.
tokens = word_tokenize(text)
print(tokens)

In [None]:
from nltk.corpus import stopwords

# Show the English stop-word list as a set (so the printed order is arbitrary).
english_stopwords = set(stopwords.words('english'))
print(english_stopwords)

In [None]:
# Remove English stop words from a sample sentence.
example_sent = "This is some sample text, showing off the stop words filtration."
# The language name must be a string literal; a bare `english` is an undefined name.
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)

# Keep every token that is not in the stop-word set. The comparison uses the
# token exactly as written (no case folding is applied here).
filtered_sentence = [w for w in word_tokens if w not in stop_words]

print(word_tokens)
print(filtered_sentence)

## Stemming Words With NLTK

In [None]:
from nltk.stem import PorterStemmer

# Stem several inflections of the same word with the Porter stemmer.
# `ps` is reused by the sentence-stemming cell that follows.
ps = PorterStemmer()
example_words = ["ride", "riding", "rider", "rides"]
for stem in map(ps.stem, example_words):
  print(stem)


In [None]:
# Now let's try stemming every token of an entire sentence.
new_text = "When riders are riding their horses, they often think of how cowboys rode horses."
for token in word_tokenize(new_text):
  print(ps.stem(token))

In [None]:
# Download the corpora and models the remaining cells read. A bare
# nltk.download() would open the interactive downloader and wait for input,
# which blocks unattended execution of the notebook.
for package in (
    "udhr",
    "state_union",
    "averaged_perceptron_tagger",      # model behind nltk.pos_tag
    "averaged_perceptron_tagger_eng",  # newer-release name of the tagger model
    "maxent_ne_chunker",               # model behind nltk.ne_chunk
    "maxent_ne_chunker_tab",           # newer-release name of the chunker model
    "words",
    "movie_reviews",
):
    # quiet=True suppresses the progress log; unknown identifiers are
    # reported as a failed download (False) instead of raising.
    nltk.download(package, quiet=True)

In [None]:
# We can load documents from nltk.corpus. As an example, let's load the
# English text of the universal declaration from the udhr corpus.
from nltk.corpus import udhr

declaration_text = udhr.raw('English-Latin1')
print(declaration_text)

In [None]:
# Let's import some sample and training text: George Bush's 2005 and 2006
# State of the Union addresses.
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

TRAIN_FILEID = "2005-GWBush.txt"
SAMPLE_FILEID = "2006-GWBush.txt"

# The 2005 address is the training text; the 2006 address is the sample text.
train_text = state_union.raw(TRAIN_FILEID)
sample_text = state_union.raw(SAMPLE_FILEID)

In [None]:
# Now that we have some text, we can train the PunktSentenceTokenizer.
# The class name is spelled "Punkt" (as imported above); passing the training
# text to the constructor trains the tokenizer on it.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [None]:
# Split the sample address into sentences with a Punkt tokenizer trained on
# the training address, so that `tokenized` exists before the function runs.
tokenized = PunktSentenceTokenizer(train_text).tokenize(sample_text)


# This function will tag each tokenized word with a part of speech
def process_content():
  """Print the part-of-speech tags of the first five sentences in ``tokenized``.

  Each sentence is split into words with ``nltk.word_tokenize`` and tagged
  with ``nltk.pos_tag``; the resulting list of (word, tag) pairs is printed.
  Any exception raised along the way is caught and its message printed.
  """
  try:
    for sentence in tokenized[:5]:
      words = nltk.word_tokenize(sentence)
      tagged = nltk.pos_tag(words)
      print(tagged)

  # `except` must line up with `try`; nested inside the try body it is a SyntaxError.
  except Exception as e:
    print(str(e))


process_content()


## Chunking With NLTK

In [None]:
'''
+ = match 1 or more repetitions
? = match 0 or 1 repetitions
* = match 0 or more repetitions
. = any character except a newline
'''

In [None]:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# Train a Punkt sentence tokenizer on the 2005 address, then use it to split
# the 2006 address into sentences.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content(draw=False):
  """POS-tag every sentence in ``tokenized`` and chunk it with a tag-pattern grammar.

  Args:
    draw: when True, call ``chunked.draw()`` for each sentence to draw the
      chunks with nltk. Defaults to False, which matches the original cell
      where the draw call was commented out.

  Any exception raised along the way is caught and its message printed.
  """
  try:
    # combine the part-of-speech tags with a regular expression:
    # <RB.?>* then <VB.?>* then <NNP>+ then <NN>?
    chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
    # The grammar does not change between sentences, so build the parser once.
    chunkParser = nltk.RegexpParser(chunkGram)

    for sentence in tokenized:
      words = nltk.word_tokenize(sentence)
      tagged = nltk.pos_tag(words)
      chunked = chunkParser.parse(tagged)

      # draw the chunks with nltk (opt-in)
      if draw:
        chunked.draw()

  except Exception as e:
    print(str(e))

process_content()

The main line in question is:

In [None]:
'''
chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
'''

Broken down piece by piece, this line reads:

In [None]:
'''
<RB.?>* = "0 or more adverbs of any form (RB, RBR, RBS)," followed by:

<VB.?>* = "0 or more verbs of any tense or form (VB, VBD, VBG, VBN, VBP, VBZ)," followed by:

<NNP>+ = "One or more proper nouns," followed by:

<NN>? = "zero or one singular noun."

'''

In [None]:
def process_content():
  """Chunk every sentence in ``tokenized`` and print each subtree labelled 'Chunk'.

  Sentences are word-tokenized, POS-tagged, and parsed with a RegexpParser
  built from the grammar ``<RB.?>*<VB.?>*<NNP>+<NN>?``. Any exception raised
  along the way is caught and its message printed.
  """
  try:
    # combine the part-of-speech tags with a regular expression
    chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
    # The grammar does not change between sentences, so build the parser once.
    chunkParser = nltk.RegexpParser(chunkGram)

    for sentence in tokenized:
      words = nltk.word_tokenize(sentence)
      tagged = nltk.pos_tag(words)
      chunked = chunkParser.parse(tagged)

      # Only the subtrees whose label is 'Chunk' are the grammar's matches.
      for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
        print(subtree)

  except Exception as e:
    print(str(e))

process_content()

In [None]:
def process_content():
  """Chunk everything, chink out selected tags, and print the remaining chunks.

  The grammar's first rule ``{<.*>+}`` chunks every run of tags; the second
  rule, written with inverted braces ``}<VB.?|IN|DT|TO>+{``, removes runs of
  VB*, IN, DT and TO tags from those chunks. Each remaining subtree labelled
  'Chunk' is printed. Any exception is caught and its message printed.
  """
  try:
    chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""
    # The grammar does not change between sentences, so build the parser once.
    chunkParser = nltk.RegexpParser(chunkGram)

    for sentence in tokenized:
      words = nltk.word_tokenize(sentence)
      tagged = nltk.pos_tag(words)
      chunked = chunkParser.parse(tagged)

      # The Tree method is `subtrees` (plural); it takes a `filter` predicate.
      for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
        print(subtree)

  except Exception as e:
    print(str(e))

process_content()


In [None]:
def process_content():
    """Run binary named-entity chunking over every sentence after the first five.

    Each sentence in ``tokenized[5:]`` is word-tokenized, POS-tagged, and passed
    to ``nltk.ne_chunk(..., binary=True)``. The resulting tree is not shown;
    uncomment the draw call to display it. Any exception is caught and its
    message printed.
    """
    try:
        remaining_sentences = tokenized[5:]
        for sentence in remaining_sentences:
            pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
            named_entities = nltk.ne_chunk(pos_tagged, binary=True)
            # named_entities.draw()

    except Exception as err:
        print(str(err))


process_content()

## Text Classification Using Natural Language Processing

In [None]:
import random
import nltk
from nltk.corpus import movie_reviews

# Pair each review's word list with the category it is filed under.
# The loop variable must be spelled `category`, since that is the name used
# both in the tuple and in the inner `fileids(category)` call.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# shuffle the documents; seed first so the order is the same on every run
random.seed(1)
random.shuffle(documents)

print('Number of Documents: {}'.format(len(documents)))
# Index 0 is the first review (index 1 would be the second).
print('First Review: {}'.format(documents[0]))

# Frequency distribution over every lower-cased token in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

print('Most common words: {}'.format(all_words.most_common(15)))
print('The word happy: {}'.format(all_words["happy"]))


In [None]:
# We'll use the 4000 most common words as features
print(len(all_words))
# most_common(n) returns (word, count) pairs ordered by descending count.
# Slicing all_words.keys() would instead take 4000 words in whatever order
# the keys happen to be stored, not the most frequent ones.
word_features = [word for word, _count in all_words.most_common(4000)]

In [None]:
# The find_features function determines which of the word features are contained in a review
def find_features(document):
  """Map every word in ``word_features`` to whether it occurs in ``document``.

  Args:
    document: iterable of the word tokens of one review.

  Returns:
    dict with one ``{word: bool}`` entry per element of ``word_features``;
    the value is True when that word appears in ``document``.
  """
  words = set(document)  # a set makes the repeated membership checks cheap
  return {w: (w in words) for w in word_features}


# Show which feature words are present in one review.
features = find_features(movie_reviews.words('neg/cv000_29416.txt'))
for key, value in features.items():
  if value:
    print(key)

In [None]:
# Now let's do it for all the documents: one (feature dict, category) pair each.
featuresets = [
    (find_features(review_words), label)
    for review_words, label in documents
]


In [None]:
from sklearn import model_selection

# fixed random_state value so the split can be reproduced
seed = 1

# hold out 25% of the feature sets as the testing dataset; the rest is training
split = model_selection.train_test_split(
    featuresets,
    test_size=0.25,
    random_state=seed,
)
training, testing = split

In [None]:
# Report the size of each split: training first, then testing.
for subset in (training, testing):
  print(len(subset))

In [None]:
# We can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

# Wrap a linear-kernel SVC in NLTK's scikit-learn adapter. The class is
# spelled SklearnClassifier, matching the import above.
model = SklearnClassifier(SVC(kernel='linear'))
# train the model
model.train(training)

# and test on the test set; accuracy() returns a fraction, so scale it to a percentage
accuracy = nltk.classify.accuracy(model, testing) * 100
print("SVC Accuracy: {}".format(accuracy))