In [1]:
import nltk
from nltk.book import *

import re

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


## Normalization

Questions:

1. Find emojis in the chat corpus.

1. Determine a normalization scheme. (What needs to be normalized, how would you do it?)

1. Count the happy vs sad emojis.

In [2]:
# Bind the chat corpus to a more readable name.
chat = text5

# Cast a wide net first: keep every distinct token that contains one of the
# characters typically used as emoticon "eyes".
potential_emojis = {w for w in chat if any(mark in w for mark in ":;=")}

In [3]:
potential_emojis

{'!=',
 '.:',
 '.;)',
 '//www.wunderground.com/cgi-bin/findweather/getForecast?query=95953#FIR',
 '10:49',
 '2:55',
 '3:45',
 '4:03',
 '6:38',
 '6:41',
 '6:51',
 '6:53',
 '7:45',
 '9:10',
 ':',
 ':(',
 ':)',
 ':):):)',
 ':-(',
 ':-)',
 ':-@',
 ':-o',
 ':.',
 ':/',
 ':@',
 ':D',
 ':O',
 ':P',
 ':]',
 ':beer:',
 ':blush:',
 ':love:',
 ':o *',
 ':p',
 ':tongue:',
 ':|',
 ';',
 '; ..',
 ';)',
 ';-(',
 ';-)',
 ';0',
 ';]',
 ';p',
 '=',
 "='s",
 '=(',
 '=)',
 '=-\\',
 '=/',
 '=D',
 '=O',
 '=[',
 '=]',
 '=p',
 '>:->',
 ']:)',
 'capab;e',
 'd=',
 'http://forums.talkcity.com/tc-adults/start ',
 'http://www.shadowbots.com',
 'n;t',
 'o<|=D'}

Clearly we're catching some non-emojis, but let's assume we're getting most of the list. 

In [4]:
# Every candidate reads left-to-right, so an anchored regex can pick them out.
# Strict pattern: eyes, optional nose, then one mouth from a fixed list.
# It misses '>:->' and ']:)' as well as repeated forms.
emoji = re.compile(r"^[:;=]-?[)(\]PD@op|O]$")
# Loose pattern: eyes, optional nose, then any single character as the mouth.
emoji2 = re.compile(r"^[:;=]-?.$")
emojis = set(filter(emoji2.search, chat))
# Normalization idea: strip the hyphen "nose" and upper-case letter mouths.
sorted(emojis)

[':(',
 ':)',
 ':-(',
 ':-)',
 ':-@',
 ':-o',
 ':.',
 ':/',
 ':@',
 ':D',
 ':O',
 ':P',
 ':]',
 ':p',
 ':|',
 ';)',
 ';-(',
 ';-)',
 ';0',
 ';]',
 ';p',
 '=(',
 '=)',
 '=-\\',
 '=/',
 '=D',
 '=O',
 '=[',
 '=]',
 '=p']

In [5]:
# Tally every occurrence (not just distinct forms) of happy and sad emoticons.
happy_faces = {":-)", ":)", ":D", ";-)", "=)"}
sad_faces = {":-(", ":(", ";-(", "=("}

happy = [w for w in chat if w in happy_faces]
sad = [w for w in chat if w in sad_faces]

for matches in (happy, sad):
    print(len(matches))

159
20


---

## Stemming

Let's go through some stemming examples from the NLTK.

In [7]:
vowels = re.compile(r'[aeiouyAEIOU]')

# Words ending in "ing" whose remaining prefix contains none of the characters
# matched by `vowels`.
vowelless_ing_words = {
    w
    for w in nltk.corpus.words.words()
    if w.endswith("ing") and not vowels.search(w[:-3])
}
len(vowelless_ing_words)

35

In [8]:
# Show a window of the inaugural corpus next to its Porter-stemmed form.
porter = nltk.PorterStemmer()
start = 30000
distance = 100

window = text4[start:start + distance]
stemmed_window = [porter.stem(w) for w in window]

print(" ".join(window))
print("\n\n")
print(" ".join(stemmed_window))



aid of that Almighty Power which has hitherto protected me and enabled me to bring to favorable issues other important but still greatly inferior trusts heretofore confided to me by my country . The broad foundation upon which our Constitution rests being the people -- a breath of theirs having made , as a breath can unmake , change , or modify it -- it can be assigned to none of the great divisions of government but to that of democracy . If such is its theory , those who are called upon to administer it must recognize as its



aid of that almighti power which ha hitherto protect me and enabl me to bring to favor issu other import but still greatli inferior trust heretofor confid to me by my countri . the broad foundat upon which our constitut rest be the peopl -- a breath of their have made , as a breath can unmak , chang , or modifi it -- it can be assign to none of the great divis of govern but to that of democraci . If such is it theori , those who are call upon to administ it mu

In [9]:
# words in inaugural addresses
print(len(set(text4)))

9754


In [10]:
inaug_stemmed = {porter.stem(w.lower()) for w in text4}

print(len(inaug_stemmed))

print(len(set(text4))/len(inaug_stemmed))

5470
1.783180987202925


---

## Language Models
Let's find some common n-grams in *Sense and Sensibility* (`text2`).

In [11]:
fd = FreqDist(text2)

In [12]:
fd.freq('a') # relative frequency: the count of "a" divided by the total number of tokens in "Sense & Sensibility" (a proportion, not a raw count)

0.01443041193422614

In [13]:
nltk.corpus.stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

In [14]:
# Build the stopword lookup once, as a set, so the membership test below is a
# constant-time check instead of reloading and scanning the list per token.
english_stopwords = set(nltk.corpus.stopwords.words("english"))

# Frequency distribution over lower-cased, purely alphabetic, non-stopword tokens.
fd = FreqDist([w.lower() for w in text2
               if w.isalpha() and w.lower() not in english_stopwords])

# Total number of counted tokens; used to turn counts into proportions.
total_words = fd.N()

# Print "word : count : share of counted tokens" for the 20 most frequent words.
for word, count in fd.most_common(20):
    print(" : ".join([word, str(count), str(count / total_words)]))
    
#4063/3861

elinor : 685 : 0.012688474789760307
could : 578 : 0.010706479457637166
marianne : 566 : 0.010484199607305598
mrs : 530 : 0.009817360056310896
would : 515 : 0.009539510243396436
said : 397 : 0.0073537583818026895
every : 377 : 0.0069832919645834105
one : 331 : 0.006131219204979069
much : 290 : 0.005371763049679547
must : 283 : 0.005242099803652799
sister : 282 : 0.005223576482791835
edward : 263 : 0.00487163338643352
mother : 258 : 0.0047790167821287
dashwood : 252 : 0.004667876856962916
well : 240 : 0.004445597006631349
time : 239 : 0.004427073685770385
know : 232 : 0.004297410439743637
jennings : 230 : 0.004260363798021709
though : 216 : 0.004001037305968214
willoughby : 216 : 0.004001037305968214


In [25]:
fd = FreqDist([" ".join(b) for b in nltk.ngrams(text2,3) if b[0] == "I" and b[1] == "am"]) # could use bigram function instead

In [26]:
fd.most_common(10)

[('I am sure', 72),
 ('I am not', 12),
 ('I am afraid', 11),
 ('I am so', 11),
 ('I am sorry', 11),
 ('I am very', 10),
 ('I am glad', 4),
 ('I am now', 4),
 ('I am monstrous', 4),
 ('I am convinced', 3)]

In [27]:
# The keys of this `fd` are space-joined strings such as "I am sure", so
# indexing with [0] would only look at the first character. Match the leading
# words explicitly instead.
for gram, count in fd.items():
    if gram.startswith("I am "):
        print(gram + ": " + str(count))

        

I am sure: 72
I am convinced: 3
I am afraid: 11
I am by: 1
I am very: 10
I am before: 1
I am glad: 4
I am able: 1
I am ready: 1
I am not: 12
I am much: 3
I am with: 1
I am almost: 2
I am particularly: 2
I am the: 2
I am heartily: 1
I am ,": 1
I am now: 4
I am unable: 1
I am persuaded: 1
I am happy: 1
I am perfectly: 3
I am guilty: 1
I am so: 11
I am only: 2
I am going: 1
I am ,: 3
I am confined: 1
I am monstrous: 4
I am flattered: 1
I am .: 1
I am always: 3
I am at: 1
I am ;: 1
I am sorry: 11
I am a: 1
I am determined: 1
I am in: 3
I am alive: 2
I am resolved: 1
I am rather: 1
I am bound: 1
I am capable: 2
I am delighted: 1
I am ever: 1
I am writing: 1
I am NOT: 1
I am commissioned: 1
I am quite: 2
I am miserable: 1
I am to: 2
I am wretched: 1
I am justified: 1
I am right: 1
I am well: 2
I am amazingly: 1
I am come: 1
I am extremely: 2
I am excessively: 1
I am informed: 1
I am charged: 1
I am no: 1
I am talking: 1
I am ruined: 1
I am shut: 1
I am allowed: 1
I am thankful: 1
I am doing:

In [28]:
# Total count of "I am <word>" trigrams. The keys are joined strings, so the
# filter checks the leading words rather than the first character.
total_words = sum(count for gram, count in fd.items() if gram.startswith("I am "))

In [29]:
total_words

223

In [33]:
# Show the 30 most frequent trigrams and their counts on alternating lines.
# The keys are joined strings, so test the leading words, not gram[0]
# (which is just the first character).
for gram, count in fd.most_common(30):
    if gram.startswith("I am "):
        print(gram)
        print(count)

I am sure
72
I am not
12
I am afraid
11
I am so
11
I am sorry
11
I am very
10
I am glad
4
I am now
4
I am monstrous
4
I am convinced
3
I am much
3
I am perfectly
3
I am ,
3
I am always
3
I am in
3
I am almost
2
I am particularly
2
I am the
2
I am only
2
I am alive
2
I am capable
2
I am quite
2
I am to
2
I am well
2
I am extremely
2
I am grown
2
I am by
1
I am before
1
I am able
1
I am ready
1


In [34]:
# Print "trigram : count : share" in descending order of count, where share is
# the count divided by total_words, rounded to 3 places. The keys are joined
# strings, so the filter checks the leading words rather than the first character.
for gram, count in sorted(fd.items(), key=lambda pair: pair[1], reverse=True):
    if gram.startswith("I am "):
        print(" : ".join([str(gram), str(count), str(round(count / total_words, 3))]))

I am sure : 72 : 0.323
I am not : 12 : 0.054
I am afraid : 11 : 0.049
I am so : 11 : 0.049
I am sorry : 11 : 0.049
I am very : 10 : 0.045
I am glad : 4 : 0.018
I am now : 4 : 0.018
I am monstrous : 4 : 0.018
I am convinced : 3 : 0.013
I am much : 3 : 0.013
I am perfectly : 3 : 0.013
I am , : 3 : 0.013
I am always : 3 : 0.013
I am in : 3 : 0.013
I am almost : 2 : 0.009
I am particularly : 2 : 0.009
I am the : 2 : 0.009
I am only : 2 : 0.009
I am alive : 2 : 0.009
I am capable : 2 : 0.009
I am quite : 2 : 0.009
I am to : 2 : 0.009
I am well : 2 : 0.009
I am extremely : 2 : 0.009
I am grown : 2 : 0.009
I am by : 1 : 0.004
I am before : 1 : 0.004
I am able : 1 : 0.004
I am ready : 1 : 0.004
I am with : 1 : 0.004
I am heartily : 1 : 0.004
I am ," : 1 : 0.004
I am unable : 1 : 0.004
I am persuaded : 1 : 0.004
I am happy : 1 : 0.004
I am guilty : 1 : 0.004
I am going : 1 : 0.004
I am confined : 1 : 0.004
I am flattered : 1 : 0.004
I am . : 1 : 0.004
I am at : 1 : 0.004
I am ; : 1 : 0.004
I am

In [35]:
fd.most_common(10)

[('I am sure', 72),
 ('I am not', 12),
 ('I am afraid', 11),
 ('I am so', 11),
 ('I am sorry', 11),
 ('I am very', 10),
 ('I am glad', 4),
 ('I am now', 4),
 ('I am monstrous', 4),
 ('I am convinced', 3)]

In [36]:
fd = FreqDist(nltk.ngrams(text2,3))

In [37]:
# Accumulate the total count of trigrams that begin with ("I", "am") while
# listing them in descending order of count.
total_words = 0

for gram, count in sorted(fd.items(), key=lambda pair: pair[1], reverse=True):
    if gram[:2] == ("I", "am"):
        total_words += count
        print(" : ".join([str(gram), str(count)]))

# Share of "I am ..." continuations taken by "sure" and by "not". The counts
# are read from the distribution rather than typed in by hand, so they stay
# correct if the text or tokenization changes.
print(fd[("I", "am", "sure")] / total_words)
print(fd[("I", "am", "not")] / total_words)

('I', 'am', 'sure') : 72
('I', 'am', 'not') : 12
('I', 'am', 'afraid') : 11
('I', 'am', 'so') : 11
('I', 'am', 'sorry') : 11
('I', 'am', 'very') : 10
('I', 'am', 'glad') : 4
('I', 'am', 'now') : 4
('I', 'am', 'monstrous') : 4
('I', 'am', 'convinced') : 3
('I', 'am', 'much') : 3
('I', 'am', 'perfectly') : 3
('I', 'am', ',') : 3
('I', 'am', 'always') : 3
('I', 'am', 'in') : 3
('I', 'am', 'almost') : 2
('I', 'am', 'particularly') : 2
('I', 'am', 'the') : 2
('I', 'am', 'only') : 2
('I', 'am', 'alive') : 2
('I', 'am', 'capable') : 2
('I', 'am', 'quite') : 2
('I', 'am', 'to') : 2
('I', 'am', 'well') : 2
('I', 'am', 'extremely') : 2
('I', 'am', 'grown') : 2
('I', 'am', 'by') : 1
('I', 'am', 'before') : 1
('I', 'am', 'able') : 1
('I', 'am', 'ready') : 1
('I', 'am', 'with') : 1
('I', 'am', 'heartily') : 1
('I', 'am', ',"') : 1
('I', 'am', 'unable') : 1
('I', 'am', 'persuaded') : 1
('I', 'am', 'happy') : 1
('I', 'am', 'guilty') : 1
('I', 'am', 'going') : 1
('I', 'am', 'confined') : 1
('I', 'am',

In [44]:
text2.concordance("I")

Displaying 25 of 2004 matches:
 to me ," replied her husband , " that I should assist his widow and daughters 
 did not know what he was talking of , I dare say ; ten to one but he was light
ly to myself . He could hardly suppose I should neglect them . But as he requir
hem . But as he required the promise , I could not do less than give it ; at le
ld not do less than give it ; at least I thought so at the time . The promise ,
t you have such a generous spirit !" " I would not wish to do any thing mean ,"
little . No one , at least , can think I have not done enough for them : even t
can afford to do ." " Certainly -- and I think I may afford to give them five h
rd to do ." " Certainly -- and I think I may afford to give them five hundred p
 That is very true , and , therefore , I do not know whether , upon the whole ,
 them -- something of the annuity kind I mean .-- My sisters would feel the goo
 are not aware of what you are doing . I have known a great deal of the trouble
such an a

---

## N-gram models

Let's make functions that take in a text, build a frequency distribution, and generate new text from n-gram models of various orders.

In [45]:
import random

def weighted_choice(freq_dist):
    """Pick one token at random, with probability proportional to its count.

    Parameters
    ----------
    freq_dist : mapping
        Maps each token to a non-negative count, e.g. an nltk ``FreqDist``
        or a plain ``dict``.

    Returns
    -------
    One of the keys of ``freq_dist``.

    Raises
    ------
    ValueError
        If ``freq_dist`` is empty or its counts sum to zero, since there is
        nothing meaningful to draw from.
    """
    tokens = list(freq_dist.keys())
    counts = list(freq_dist.values())

    # Fail with a clear message instead of an unbound-variable error when
    # there is nothing to sample.
    if not tokens or sum(counts) <= 0:
        raise ValueError("weighted_choice needs at least one token with a positive count")

    # random.choices performs the cumulative-weight draw for us.
    return random.choices(tokens, weights=counts, k=1)[0]

In [46]:
weighted_choice(FreqDist(text5))

'deal'

In [47]:
def generate_unigram(text, length=10):
    """Generate text by sampling tokens independently from a unigram model.

    Builds a frequency distribution over ``text``, draws ``length`` tokens
    from it with ``weighted_choice``, and returns them joined by single spaces.
    """
    unigram_counts = FreqDist(text)
    sampled = [weighted_choice(unigram_counts) for _ in range(length)]
    return " ".join(sampled)


In [48]:
generate_unigram(text1)

'In in what God come s and art is headmost'

In [49]:
generate_unigram(text2)

'ever Elinor her their attachment , a ; name him'

In [50]:
generate_unigram(text5)

'I right spell thing back . job strait here brightened'

In [51]:
def weighted_choice_ngram(cur_word, freq_dist):
    """Randomly choose a word to follow ``cur_word`` using bigram counts.

    ``freq_dist`` maps bigrams ``(first, second)`` to counts. The entries
    whose first element equals ``cur_word`` are collected into a
    ``{second: count}`` mapping, and one key of that mapping is drawn with
    ``weighted_choice``.
    """
    followers = {
        bigram[1]: count
        for bigram, count in freq_dist.items()
        if bigram[0] == cur_word
    }
    return weighted_choice(followers)

def generate_bigram(text, length=10, start=None):
    """Generate text by walking a bigram model built from ``text``.

    If ``start`` is falsy, a seed word is drawn from the unigram distribution
    of ``text``. Each following word is then chosen with
    ``weighted_choice_ngram`` based on the previous word. The seed word itself
    is not part of the result; the ``length`` generated words are returned
    joined by single spaces.
    """
    if start:
        this_word = start
    else:
        this_word = weighted_choice(FreqDist(text))

    bigram_counts = FreqDist(nltk.bigrams(text))

    results = []
    for _ in range(length):
        this_word = weighted_choice_ngram(this_word, bigram_counts)
        results.append(this_word)

    return " ".join(results)


In [52]:
generate_bigram(text1)

'. " Turn to show his blood - mill .'

In [53]:
generate_bigram(text2)

'done wondering at present , " Yes , and the'

In [54]:
generate_bigram(text5)

'yes wasup room JOIN . Temp. -6 F / New'