In [6]:
import nltk
nltk.download('brown')
from nltk.corpus import brown
# an nltk.FreqDist() is like a dictionary,

# but it is ordered by frequency.

# Also, nltk automatically fills the dictionary

# with counts when given a list of words.


[nltk_data] Downloading package brown to /home/fossa/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [None]:
freq_brown = nltk.FreqDist(brown.words())

list(freq_brown.keys())[:20]


In [8]:
freq_brown.most_common(20)


[('the', 62713),
 (',', 58334),
 ('.', 49346),
 ('of', 36080),
 ('and', 27915),
 ('to', 25732),
 ('a', 21881),
 ('in', 19536),
 ('that', 10237),
 ('is', 10011),
 ('was', 9777),
 ('for', 8841),
 ('``', 8837),
 ("''", 8789),
 ('The', 7258),
 ('with', 7012),
 ('it', 6723),
 ('as', 6706),
 ('he', 6566),
 ('his', 6466)]

In [10]:
# an nltk.ConditionalFreqDist() counts frequencies of pairs.

# When given a list of bigrams, it maps each first word of a bigram

# to a FreqDist over the second words of the bigram.


cfreq_brown_2gram = nltk.ConditionalFreqDist(nltk.bigrams(brown.words()))
# conditions() in a ConditionalFreqDist are like keys()

# in a dictionary

cfreq_brown_2gram.conditions()


['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.',
 'jury',
 'further',
 'in',
 'term-end',
 'presentments',
 'the',
 'City',
 'Executive',
 'Committee',
 ',',
 'which',
 'had',
 'over-all',
 'charge',
 'deserves',
 'praise',
 'and',
 'thanks',
 'Atlanta',
 'for',
 'manner',
 'was',
 'conducted',
 'September-October',
 'term',
 'been',
 'charged',
 'by',
 'Superior',
 'Court',
 'Judge',
 'Durwood',
 'Pye',
 'to',
 'investigate',
 'reports',
 'possible',
 'hard-fought',
 'won',
 'Mayor-nominate',
 'Ivan',
 'Allen',
 'Jr.',
 'Only',
 'a',
 'relative',
 'handful',
 'such',
 'received',
 'considering',
 'widespread',
 'interest',
 'number',
 'voters',
 'size',
 'this',
 'city',
 'it',
 'did',
 'find',
 'many',
 "Georgia's",
 'registration',
 'laws',
 'are',
 'outmoded',
 'or',
 'inadeq

In [11]:


# the cfreq_brown_2gram entry for "my" is a FreqDist.

cfreq_brown_2gram["my"]
# here are the words that can follow after "my".


FreqDist({'own': 52, 'hand': 19, 'life': 19, 'mind': 19, 'first': 15, 'wife': 14, 'hands': 14, 'eyes': 13, 'father': 13, 'mother': 12, ...})

In [None]:
# We first access the FreqDist associated with "my",

# then the keys in that FreqDist

cfreq_brown_2gram["my"].keys()


In [13]:
# here are the 20 most frequent words to come after "my", with their frequencies

cfreq_brown_2gram["my"].most_common(20)


[('own', 52),
 ('hand', 19),
 ('life', 19),
 ('mind', 19),
 ('first', 15),
 ('wife', 14),
 ('hands', 14),
 ('eyes', 13),
 ('father', 13),
 ('mother', 12),
 ('husband', 12),
 ('way', 12),
 ('head', 11),
 ('left', 8),
 ('heart', 7),
 ('point', 7),
 ('body', 7),
 ('Uncle', 7),
 ('best', 6),
 ('family', 6)]

In [14]:
# an nltk.ConditionalProbDist() maps pairs to probabilities.

# One way in which we can do this is by using Maximum Likelihood Estimation (MLE)

cprob_brown_2gram = nltk.ConditionalProbDist(cfreq_brown_2gram, nltk.MLEProbDist)


In [15]:
# This again has conditions() wihch are like dictionary keys

cprob_brown_2gram.conditions()


['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.',
 'jury',
 'further',
 'in',
 'term-end',
 'presentments',
 'the',
 'City',
 'Executive',
 'Committee',
 ',',
 'which',
 'had',
 'over-all',
 'charge',
 'deserves',
 'praise',
 'and',
 'thanks',
 'Atlanta',
 'for',
 'manner',
 'was',
 'conducted',
 'September-October',
 'term',
 'been',
 'charged',
 'by',
 'Superior',
 'Court',
 'Judge',
 'Durwood',
 'Pye',
 'to',
 'investigate',
 'reports',
 'possible',
 'hard-fought',
 'won',
 'Mayor-nominate',
 'Ivan',
 'Allen',
 'Jr.',
 'Only',
 'a',
 'relative',
 'handful',
 'such',
 'received',
 'considering',
 'widespread',
 'interest',
 'number',
 'voters',
 'size',
 'this',
 'city',
 'it',
 'did',
 'find',
 'many',
 "Georgia's",
 'registration',
 'laws',
 'are',
 'outmoded',
 'or',
 'inadeq

In [16]:
# Here is what we find for "my": a Maximum Likelihood Estimation-based probability distribution,

# as a MLEProbDist object.

cprob_brown_2gram["my"]


<MLEProbDist based on 1161 samples>

In [None]:
# We can find the words that can come after "my" by using the function samples()

cprob_brown_2gram["my"].samples()


In [18]:
# Here is the probability of a particular pair:

cprob_brown_2gram["my"].prob("own")


0.04478897502153316

In [19]:
#####

# We can also compute unigram probabilities (probabilities of individual words)

freq_brown_1gram = nltk.FreqDist(brown.words())

len_brown = len(brown.words())


def unigram_prob(word):

   return freq_brown_1gram[word] / len_brown

#############


In [21]:
# The contents of cprob_brown_2gram, all these probabilities, now form a

# trained bigram language model. The typical use for a language model is

# to ask it for the probabillity of a word sequence

# P(how do you do) = P(how) * P(do|how) * P(you|do) * P(do | you)

prob_sentence = unigram_prob("how") * cprob_brown_2gram["how"].prob("do") * cprob_brown_2gram["do"].prob("you") * \
    cprob_brown_2gram["you"].prob("do")

# result: 1.5639033871961e-09

###############


In [24]:
# We can also use a language model in another way:

# We can let it generate text at random

# This can provide insight into what it is that

# the language model has been learning

cprob_brown_2gram["my"].generate()


'particular'

In [30]:

# We can use this to generate text at random

# based on a given text of bigrams.

# Let's do this for the Sam "corpus"
with open('./seuss_script.txt', 'r') as seuss_script:
    # open txt file and read to string, string to lower
    seussical = seuss_script.read()
corpus = """<s> I am Sam </s>

<s> Sam I am </s>

<s> I do not like green eggs and ham </s>"""

words = corpus.split()

cfreq_sam = nltk.ConditionalFreqDist(nltk.bigrams(words))

cprob_sam = nltk.ConditionalProbDist(cfreq_sam, nltk.MLEProbDist)

word = "<s>"

for index in range(50):

   word = cprob_sam[word].generate()

   print(word, end=" ")

print("\n")


Sam </s> <s> Sam I do not like green eggs and ham </s> <s> I am Sam I do not like green eggs and ham </s> <s> I do not like green eggs and ham </s> <s> I am </s> <s> I do not like green eggs and ham </s> 



In [31]:
# Not a lot of variety. We need a bigger corpus.

# What kind of genres do we have in the Brown corpus?

brown.categories()


['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [32]:
# Let's try Science Fiction.

cfreq_scifi = nltk.ConditionalFreqDist(
    nltk.bigrams(brown.words(categories="science_fiction")))

cprob_scifi = nltk.ConditionalProbDist(cfreq_scifi, nltk.MLEProbDist)

word = "in"

for index in range(50):

   word = cprob_scifi[word].generate()

   print(word, end=" ")

print


an `` worship '' ? ? ? What shall never live and the fuses back to Hesperus' snapping turtle on Yancey-6 138 . Why pick me . `` At first two Earth-weeks each party gives you . His small -- unless one huge '' . Mike ? `` Hesperus is 

<function print>

In [33]:


# try this with other Brown corpus categories.

# Here is how to do this with NLTK books:

import nltk

from nltk.book import *

def generate_text(text, initialword, numwords):

   bigrams = list(nltk.ngrams(text, 2))

   cpd = nltk.ConditionalProbDist(nltk.ConditionalFreqDist(bigrams), nltk.MLEProbDist)

   word = initialword

   for i in range(numwords):

     print(word, end = " ")

     word = cpd[ word].generate()

   print(word)

# Holy Grail

generate_text(text6, "I", 100)

# sense and sensibility

generate_text(text2, "I", 100)

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.


LookupError: 
**********************************************************************
  Resource [93mgutenberg[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('gutenberg')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/gutenberg[0m

  Searched in:
    - '/home/fossa/nltk_data'
    - '/home/fossa/data/dont_worry_about_it/dr_seuss_markov/venv/nltk_data'
    - '/home/fossa/data/dont_worry_about_it/dr_seuss_markov/venv/share/nltk_data'
    - '/home/fossa/data/dont_worry_about_it/dr_seuss_markov/venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [37]:

import string


# Word bigrams are just pairs of words.



with open('./seuss_script.txt', 'r') as seuss_script:
    # open txt file and read to string, string to lower
    seussical = seuss_script.read()




# count bigrams

bigrams = {}

words_punct = seussical.split()

# strip all punctuation at the beginning and end of words, and

# convert all words to lowercase.


words = [w.strip(string.punctuation).lower() for w in words_punct]

# add special START, END tokens

words = ["START"] + words + ["END"]

for index, word in enumerate(words):

    if index < len(words) - 1:

        # we only look at indices up to the

        # next-to-last word, as this is

        # the last one at which a bigram starts

        w1 = words[index]

        w2 = words[index + 1]

        # bigram is a tuple,

        # like a list, but fixed.

        # Tuples can be keys in a dictionary

        bigram = (w1, w2)

        if bigram in bigrams:

            bigrams[bigram] = bigrams[bigram] + 1

        else:

            bigrams[bigram] = 1

        # or, more simply, like this:

        # bigrams[bigram] = bigrams.get(bigram, 0) + 1

# sort bigrams by their counts

sorted_bigrams = sorted(
    bigrams.items(), key=lambda pair: pair[1], reverse=True)

for bigram, count in sorted_bigrams:

    print(bigram, ":", count)


('not', 'like') : 35
('i', 'do') : 34
('do', 'not') : 34
('like', 'them') : 32
('in', 'a') : 29
('eat', 'them') : 21
('with', 'a') : 18
('not', 'in') : 18
('i', 'will') : 15
('i', 'would') : 14
('them', 'in') : 14
('would', 'not') : 13
('green', 'eggs') : 12
('eggs', 'and') : 12
('and', 'ham') : 12
('would', 'you') : 12
('in', 'the') : 11
('ham', 'i') : 10
('them', 'here') : 10
('not', 'eat') : 10
('like', 'green') : 9
('them', 'with') : 9
('here', 'or') : 8
('or', 'there') : 8
('there', 'i') : 8
('them', 'anywhere') : 8
('them', 'sam-i-am') : 8
('a', 'house') : 8
('could', 'not') : 8
('will', 'not') : 8
('anywhere', 'i') : 7
('a', 'mouse') : 7
('a', 'box') : 7
('a', 'fox') : 7
('not', 'with') : 7
('a', 'car') : 7
('not', 'could') : 7
('a', 'train') : 7
('the', 'dark') : 7
('and', 'i') : 7
('will', 'eat') : 7
('could', 'you') : 6
('a', 'tree') : 6
('mouse', 'i') : 5
('on', 'a') : 5
('i', 'am') : 4
('you', 'like') : 4
('house', 'i') : 4
('you', 'could') : 4
('you', 'may') : 4
('let', 'm

In [42]:
import nltk
nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
text = "Natural language processing is an exciting area."




NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------


In [None]:
from nltk.corpus import stopwords
# Stop Words removal:
# When we use the features from a text to model, we will encounter a lot of noise. These are the stop words like the, he, her, etc… which don’t help us and , just be removed before processing for cleaner processing inside the model. With NLTK we can see all the stop words available in the English language.


print(stopwords.words("english"))
# Remove stop words
words = [w for w in words if w not in stopwords.words(“english”)]
print(words)


In [None]:
"""Stemming:
In our text we may find many words like playing, played, playfully, etc… 
which have a root word, play all of these convey the same meaning. So we 
can just extract the root word and remove the rest. Here the root word formed i
s called ‘stem’ and it is not necessarily that stem needs to exist and have a meaning."""
#  Just by committing the suffix and prefix, we generate the stems.
# NLTK provides us with PorterStemmer LancasterStemmer and SnowballStemmer packages
from nltk.stem.porter import PorterStemmer
# Reduce words to their stems
stemmed = [PorterStemmer().stem(w) for w in words]
print(stemmed)


In [None]:
"""Lemmatization:
We want to extract the base form of the word here. The word extracted 
here is called Lemma and it is available in the dictionary. We have the 
WordNet corpus and the lemma generated will be available in this corpus. 
NLTK provides us with the WordNet Lemmatizer that makes use of the WordNet 
Database to lookup lemmas of words."""

from nltk.stem.wordnet import WordNetLemmatizer
# Reduce words to their root form
lemmed = [WordNetLemmatizer().lemmatize(w) for w in words]
print(lemmed)


In [None]:
"""Parse tree or Syntax Tree generation :
We can define grammar and then use NLTK RegexpParser to extract all parts of 
speech from the sentence and draw functions to visualize it."""

# Import required libraries
from nltk import pos_tag, word_tokenize, RegexpParser
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Example text
sample_text = "The quick brown fox jumps over the lazy dog"
# Find all parts of speech in above sentence
tagged = pos_tag(word_tokenize(sample_text))
#Extract all parts of speech from any text
chunker = RegexpParser("""NP: {?*} #To extract Noun Phrases
					    P: {}			 #To extract Prepositions
					    V: {}			 #To extract Verbs
					    PP: {} #To extract Prepositional Phrases
                        VP: { *} #To extract Verb Phrases""")
# Print all parts of speech in above sentence
output = chunker.parse(tagged)
print(“After Extractingn”, output)


In [None]:
"""POS Tagging:
Part of Speech tagging is used in text processing to avoid confusion between 
two same words that have different meanings. With respect to the definition 
and context, we give each word a particular tag and process them. Two Steps 
are used here:"""

# Tokenize text(word_tokenize).
# Apply the pos_tag from NLTK to the above step.
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words('english'))
txt = "Natural language processing is an exciting area."
" Huge budget have been allocated for this."
# sent_tokenize is one of instances of
# PunktSentenceTokenizer from the nltk.tokenize.punkt module
tokenized = sent_tokenize(txt)
for i in tokenized:
  # Word tokenizers is used to find the words
  # and punctuation in a string
  wordsList = nltk.word_tokenize(i)
  # removing stop words from wordList
  wordsList = [w for w in wordsList if not w in stop_words]
  # Using a Tagger. Which is part-of-speech
  # tagger or POS-tagger.
  tagged = nltk.pos_tag(wordsList)
  print(tagged)
