In [None]:
import nltk
nltk.download('brown')
from nltk.corpus import brown
# an nltk.FreqDist() is like a dictionary,

# but it is ordered by frequency.

# Also, nltk automatically fills the dictionary

# with counts when given a list of words.


In [None]:
freq_brown = nltk.FreqDist(brown.words())

list(freq_brown.keys())[:20]


In [None]:
freq_brown.most_common(20)


In [None]:
# an nltk.ConditionalFreqDist() counts frequencies of pairs.

# When given a list of bigrams, it maps each first word of a bigram

# to a FreqDist over the second words of the bigram.


cfreq_brown_2gram = nltk.ConditionalFreqDist(nltk.bigrams(brown.words()))
# conditions() in a ConditionalFreqDist are like keys()

# in a dictionary

cfreq_brown_2gram.conditions()


In [None]:


# the cfreq_brown_2gram entry for "my" is a FreqDist.

cfreq_brown_2gram["my"]
# here are the words that can follow after "my".


In [None]:
# We first access the FreqDist associated with "my",

# then the keys in that FreqDist

cfreq_brown_2gram["my"].keys()


In [None]:
# here are the 20 most frequent words to come after "my", with their frequencies

cfreq_brown_2gram["my"].most_common(20)


In [None]:
# an nltk.ConditionalProbDist() maps pairs to probabilities.

# One way in which we can do this is by using Maximum Likelihood Estimation (MLE)

cprob_brown_2gram = nltk.ConditionalProbDist(cfreq_brown_2gram, nltk.MLEProbDist)


In [None]:
# This again has conditions() wihch are like dictionary keys

cprob_brown_2gram.conditions()


In [None]:
# Here is what we find for "my": a Maximum Likelihood Estimation-based probability distribution,

# as a MLEProbDist object.

cprob_brown_2gram["my"]


In [None]:
# We can find the words that can come after "my" by using the function samples()

cprob_brown_2gram["my"].samples()


In [None]:
# Here is the probability of a particular pair:

cprob_brown_2gram["my"].prob("own")


In [None]:
#####

# We can also compute unigram probabilities (probabilities of individual words)

freq_brown_1gram = nltk.FreqDist(brown.words())

len_brown = len(brown.words())


def unigram_prob(word):

   return freq_brown_1gram[word] / len_brown

#############


In [None]:
# The contents of cprob_brown_2gram, all these probabilities, now form a

# trained bigram language model. The typical use for a language model is

# to ask it for the probabillity of a word sequence

# P(how do you do) = P(how) * P(do|how) * P(you|do) * P(do | you)

prob_sentence = unigram_prob("how") * cprob_brown_2gram["how"].prob("do") * cprob_brown_2gram["do"].prob("you") * \
    cprob_brown_2gram["you"].prob("do")

# result: 1.5639033871961e-09

###############


In [None]:
# We can also use a language model in another way:

# We can let it generate text at random

# This can provide insight into what it is that

# the language model has been learning

cprob_brown_2gram["my"].generate()


In [None]:

# We can use this to generate text at random

# based on a given text of bigrams.

# Let's do this for the Sam "corpus"
with open('./seuss_script.txt', 'r') as seuss_script:
    # open txt file and read to string, string to lower
    seussical = seuss_script.read()
corpus = """<s> I am Sam </s>

<s> Sam I am </s>

<s> I do not like green eggs and ham </s>"""

words = corpus.split()

cfreq_sam = nltk.ConditionalFreqDist(nltk.bigrams(words))

cprob_sam = nltk.ConditionalProbDist(cfreq_sam, nltk.MLEProbDist)

word = "<s>"

for index in range(50):

   word = cprob_sam[word].generate()

   print(word, end=" ")

print("\n")


In [None]:
# Not a lot of variety. We need a bigger corpus.

# What kind of genres do we have in the Brown corpus?

brown.categories()


In [None]:
# Let's try Science Fiction.

cfreq_scifi = nltk.ConditionalFreqDist(
    nltk.bigrams(brown.words(categories="science_fiction")))

cprob_scifi = nltk.ConditionalProbDist(cfreq_scifi, nltk.MLEProbDist)

word = "in"

for index in range(50):

   word = cprob_scifi[word].generate()

   print(word, end=" ")

print


In [None]:


# try this with other Brown corpus categories.

# Here is how to do this with NLTK books:

import nltk

from nltk.book import *

def generate_text(text, initialword, numwords):

   bigrams = list(nltk.ngrams(text, 2))

   cpd = nltk.ConditionalProbDist(nltk.ConditionalFreqDist(bigrams), nltk.MLEProbDist)

   word = initialword

   for i in range(numwords):

     print(word, end = " ")

     word = cpd[ word].generate()

   print(word)

# Holy Grail

generate_text(text6, "I", 100)

# sense and sensibility

generate_text(text2, "I", 100)

In [None]:

import string


# Word bigrams are just pairs of words.



with open('./seuss_script.txt', 'r') as seuss_script:
    # open txt file and read to string, string to lower
    seussical = seuss_script.read()

words_punct = seussical.split()


# count bigrams

bigrams = {}



# strip all punctuation at the beginning and end of words, and

# convert all words to lowercase.


words = [w.strip(string.punctuation).lower() for w in words_punct]

# add special START, END tokens

words = ["START"] + words + ["END"]

for index, word in enumerate(words):

    if index < len(words) - 1:

        # we only look at indices up to the

        # next-to-last word, as this is

        # the last one at which a bigram starts

        w1 = words[index]

        w2 = words[index + 1]

        # bigram is a tuple,

        # like a list, but fixed.

        # Tuples can be keys in a dictionary

        bigram = (w1, w2)

        if bigram in bigrams:

            bigrams[bigram] = bigrams[bigram] + 1

        else:

            bigrams[bigram] = 1

        # or, more simply, like this:

        # bigrams[bigram] = bigrams.get(bigram, 0) + 1

# sort bigrams by their counts

sorted_bigrams = sorted(
    bigrams.items(), key=lambda pair: pair[1], reverse=True)

for bigram, count in sorted_bigrams:

    print(bigram, ":", count)


In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import nltk
# nltk.download()

nltk.download('movie_reviews')


In [None]:
import re
from contextlib import redirect_stdout
from io import StringIO


def token_to_words(str):
    f = StringIO()
    with redirect_stdout(f):
        for i in str:
            regex_of_word = re.findall('([\w]{0,})', i)
            regex_of_word = [x for x in regex_of_word if x is not '']
            for word in regex_of_word:
                print(regex_of_word)
        words = (f.getvalue()).split('\n')


words = token_to_words(sentence)


In [18]:
from nltk.tokenize import word_tokenize
import re
import nltk 
import nltk.data
seuss_script = open('./seuss_script.txt', 'r')
    # open txt file and read to string, string to lower
seussical = seuss_script.read()
# seussical = seussical.lower()
# green_eggs = re.findall(r"[\w']+|[.,!?;]", seussical)
tokenlist = word_tokenize(seussical)


# print(tok)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/home/fossa/nltk_data'
    - '/home/fossa/data/dont_worry_about_it/dr_seuss_markov/venv/nltk_data'
    - '/home/fossa/data/dont_worry_about_it/dr_seuss_markov/venv/share/nltk_data'
    - '/home/fossa/data/dont_worry_about_it/dr_seuss_markov/venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [20]:
import nltk
import random
import re
from nltk.corpus import movie_reviews

with open('./seuss_script.txt', 'r') as seuss_script:
    # open txt file and read to string, string to lower
    seussical = seuss_script.read()
    seussical = seussical.lower()

with open('extra_text.txt', 'r') as lots_o_books:
    # open txt file and read to string, string to lower
    read_books = lots_o_books.read()

seuss_books = re.findall(r"[\w']+|[.,!?;]", read_books)
green_eggs = re.findall(r"[\w']+|[.,!?;]", seussical)

green_egg_words = []

for word in green_eggs:
    green_egg_words.append(word)

green_egg_words = nltk.FreqDist(green_egg_words)

common_words = list(green_egg_words.keys())
# print(common_words)


def find_features(seuss_books):
    words = set(seuss_books)
    features = {}
    for w in common_words:
        features[w] = (w in words)

    return features
print(find_features(seuss_books))

# print(features)
tokenlist = word_tokenize(seussical)

seuss_training = seuss_books
# classifier = nltk.NaiveBayesClassifier.train(seuss_training)
# # random.shuffle(documents)
# print("Classifier accuracy percent:",
#       (nltk.classify.accuracy(classifier, seuss_training))*100)
# print(documents[1])

# all_words = []
# for w in movie_reviews.words():
#     all_words.append(w.lower())

# all_words = nltk.FreqDist(all_words)
# print(all_words.most_common(15))
# print(all_words["stupid"])
# # Mostly the same as before, only with now a new variable, word_features, which contains the top 3, 000 most common words
# word_features = list(all_words.keys())[:3000]


{'green': True, 'eggs': False, 'and': True, 'ham': False, 'i': True, 'am': True, 'sam': False, '.': True, 'that': True, '!': True, 'do': True, 'not': True, 'like': True, 'would': True, 'you': True, '?': True, 'them': True, ',': True, 'here': True, 'or': True, 'there': True, 'anywhere': False, 'in': True, 'a': True, 'house': True, 'then': True, 'with': True, 'mouse': True, 'eat': False, 'box': True, 'fox': True, 'could': True, 'car': True, 'they': True, 'are': True, 'may': True, 'will': True, 'see': True, 'tree': True, 'let': True, 'me': True, 'be': True, 'train': True, 'on': True, 'say': True, 'the': True, 'dark': True, 'rain': True, 'goat': False, 'boat': False, 'so': True, 'try': False, 'if': True, 'good': True, 'thank': False}


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/home/fossa/nltk_data'
    - '/home/fossa/data/dont_worry_about_it/dr_seuss_markov/venv/nltk_data'
    - '/home/fossa/data/dont_worry_about_it/dr_seuss_markov/venv/share/nltk_data'
    - '/home/fossa/data/dont_worry_about_it/dr_seuss_markov/venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [None]:
# Next, we're going to build a quick function that will find these top 3,000 words in our positive and negative documents, marking their presence as either positive or negative:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

# Next, we can print one feature set like:
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))


In [None]:
"""Then we can do this for all of our documents, saving the feature existence booleans and their respective positive or negative categories by doing:"""

featuresets = [(find_features(rev), category) for (rev, category) in documents]


In [None]:
from nltk.corpus import stopwords
# Stop Words removal:
# When we use the features from a text to model, we will encounter a lot of noise. These are the stop words like the, he, her, etc… which don’t help us and , just be removed before processing for cleaner processing inside the model. With NLTK we can see all the stop words available in the English language.


print(stopwords.words("english"))
# Remove stop words
words = [w for w in words if w not in stopwords.words(“english”)]
print(words)


In [None]:
"""Stemming:
In our text we may find many words like playing, played, playfully, etc… 
which have a root word, play all of these convey the same meaning. So we 
can just extract the root word and remove the rest. Here the root word formed i
s called ‘stem’ and it is not necessarily that stem needs to exist and have a meaning."""
#  Just by committing the suffix and prefix, we generate the stems.
# NLTK provides us with PorterStemmer LancasterStemmer and SnowballStemmer packages
from nltk.stem.porter import PorterStemmer
# Reduce words to their stems
stemmed = [PorterStemmer().stem(w) for w in words]
print(stemmed)


In [None]:
"""Lemmatization:
We want to extract the base form of the word here. The word extracted 
here is called Lemma and it is available in the dictionary. We have the 
WordNet corpus and the lemma generated will be available in this corpus. 
NLTK provides us with the WordNet Lemmatizer that makes use of the WordNet 
Database to lookup lemmas of words."""

from nltk.stem.wordnet import WordNetLemmatizer
# Reduce words to their root form
lemmed = [WordNetLemmatizer().lemmatize(w) for w in words]
print(lemmed)


In [None]:
"""Parse tree or Syntax Tree generation :
We can define grammar and then use NLTK RegexpParser to extract all parts of 
speech from the sentence and draw functions to visualize it."""

# Import required libraries
from nltk import pos_tag, word_tokenize, RegexpParser
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Example text
sample_text = "The quick brown fox jumps over the lazy dog"
# Find all parts of speech in above sentence
tagged = pos_tag(word_tokenize(sample_text))
#Extract all parts of speech from any text
chunker = RegexpParser("""NP: {?*} #To extract Noun Phrases
					    P: {}			 #To extract Prepositions
					    V: {}			 #To extract Verbs
					    PP: {} #To extract Prepositional Phrases
                        VP: { *} #To extract Verb Phrases""")
# Print all parts of speech in above sentence
output = chunker.parse(tagged)
print(“After Extractingn”, output)


In [None]:
"""POS Tagging:
Part of Speech tagging is used in text processing to avoid confusion between 
two same words that have different meanings. With respect to the definition 
and context, we give each word a particular tag and process them. Two Steps 
are used here:"""

# Tokenize text(word_tokenize).
# Apply the pos_tag from NLTK to the above step.
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words('english'))
txt = "Natural language processing is an exciting area."
" Huge budget have been allocated for this."
# sent_tokenize is one of instances of
# PunktSentenceTokenizer from the nltk.tokenize.punkt module
tokenized = sent_tokenize(txt)
for i in tokenized:
  # Word tokenizers is used to find the words
  # and punctuation in a string
  wordsList = nltk.word_tokenize(i)
  # removing stop words from wordList
  wordsList = [w for w in wordsList if not w in stop_words]
  # Using a Tagger. Which is part-of-speech
  # tagger or POS-tagger.
  tagged = nltk.pos_tag(wordsList)
  print(tagged)
