In [1]:
import nltk
from nltk.book import *

import re

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


## Normalization

Questions:

1. Find emojis in the chat corpus.

1. Determine a normalization scheme. (What needs to be normalized, and how would you do it?)

1. Count the happy vs sad emojis.

In [2]:
# Alias the chat corpus under a friendlier name.
chat = text5

# Cast a wide net: keep every distinct token that contains at least one of
# the characters ':', ';' or '='.
potential_emojis = {token for token in chat if any(mark in token for mark in ":;=")}

In [3]:
potential_emojis

{'!=',
 '.:',
 '.;)',
 '//www.wunderground.com/cgi-bin/findweather/getForecast?query=95953#FIR',
 '10:49',
 '2:55',
 '3:45',
 '4:03',
 '6:38',
 '6:41',
 '6:51',
 '6:53',
 '7:45',
 '9:10',
 ':',
 ':(',
 ':)',
 ':):):)',
 ':-(',
 ':-)',
 ':-@',
 ':-o',
 ':.',
 ':/',
 ':@',
 ':D',
 ':O',
 ':P',
 ':]',
 ':beer:',
 ':blush:',
 ':love:',
 ':o *',
 ':p',
 ':tongue:',
 ':|',
 ';',
 '; ..',
 ';)',
 ';-(',
 ';-)',
 ';0',
 ';]',
 ';p',
 '=',
 "='s",
 '=(',
 '=)',
 '=-\\',
 '=/',
 '=D',
 '=O',
 '=[',
 '=]',
 '=p',
 '>:->',
 ']:)',
 'capab;e',
 'd=',
 'http://forums.talkcity.com/tc-adults/start ',
 'http://www.shadowbots.com',
 'n;t',
 'o<|=D'}

Clearly we're catching some non-emojis, but let's assume we're getting most of the list. 

In [4]:
# Regexes come later; for now, this pattern accepts a whole token made of one
# of ':', ';' or '=', an optional '-', and exactly one closing character from
# the bracketed set. It stays simple on purpose, so it misses '>:->', ']:)'
# and repeated emoticons such as ':):):)'. Insert shruggie.
emoji = re.compile(r"^[:;=]-?[)(\]PD@op|O]$")
emojis = set(filter(emoji.search, chat))
sorted(emojis)

[':(',
 ':)',
 ':-(',
 ':-)',
 ':-@',
 ':-o',
 ':@',
 ':D',
 ':O',
 ':P',
 ':]',
 ':p',
 ':|',
 ';)',
 ';-(',
 ';-)',
 ';]',
 ';p',
 '=(',
 '=)',
 '=D',
 '=O',
 '=]',
 '=p']

In [5]:
# Tally happy vs. sad emoticons in the chat corpus.
# Extend both lists with more emoticons to get a fuller count.
happy_emojis = [":-)"]
sad_emojis = []

# Keep every occurrence (not just distinct tokens) so the lengths are counts.
happy, sad = [], []
for token in chat:
    if token in happy_emojis:
        happy.append(token)
    if token in sad_emojis:
        sad.append(token)

print(len(happy), len(sad), sep="\n")

27
0


---

## Stemming

Let's go through some stemming examples from the NLTK.

In [7]:
# How many entries of nltk.corpus.words.words() contain a given two-letter
# combination? Uncomment one of the alternatives below to try another one.

#pattern = "ng"

#pattern = "st"

pattern = "tk"

sum(1 for word in nltk.corpus.words.words() if pattern in word)


40

In [8]:
# List every entry of the word list that contains `pattern`.
list(filter(lambda word: pattern in word, nltk.corpus.words.words()))

['Aitkenite',
 'Atka',
 'boatkeeper',
 'catkin',
 'catkinate',
 'doitkin',
 'dotkin',
 'giantkind',
 'Gitksan',
 'hutkeeper',
 'Jatki',
 'jutka',
 'Kamchatkan',
 'kibitka',
 'Kitkahaxki',
 'Kitkehahki',
 'lightkeeper',
 'matka',
 'Nootka',
 'Notkerian',
 'otkon',
 'outkeeper',
 'outkick',
 'outkill',
 'outking',
 'outkiss',
 'outkitchen',
 'outknave',
 'outknee',
 'petkin',
 'planetkin',
 'pocketknife',
 'Sitka',
 'Sitkan',
 'thoughtkin',
 'vetkousie',
 'whatkin',
 'wicketkeep',
 'wicketkeeper',
 'wicketkeeping']

In [None]:
# Bonus: which two-letter combinations are the least common in English? Which are the most common?

In [9]:
# Show a stretch of text4 next to its Porter-stemmed form.

porter = nltk.PorterStemmer()

# Window of tokens to display: `distance` tokens beginning at index `start`.
start = 30000
distance = 100
excerpt = text4[start:start + distance]

print(" ".join(excerpt))
print("\n\n")
print(" ".join(map(porter.stem, excerpt)))


aid of that Almighty Power which has hitherto protected me and enabled me to bring to favorable issues other important but still greatly inferior trusts heretofore confided to me by my country . The broad foundation upon which our Constitution rests being the people -- a breath of theirs having made , as a breath can unmake , change , or modify it -- it can be assigned to none of the great divisions of government but to that of democracy . If such is its theory , those who are called upon to administer it must recognize as its



aid of that almighti power which ha hitherto protect me and enabl me to bring to favor issu other import but still greatli inferior trust heretofor confid to me by my countri . the broad foundat upon which our constitut rest be the peopl -- a breath of their have made , as a breath can unmak , chang , or modifi it -- it can be assign to none of the great divis of govern but to that of democraci . If such is it theori , those who are call upon to administ it mu

In [None]:
# count the number of unique words in the inaugural address 
# corpus.

# Now cast to lower-case and stem. Count those unique tokens

# what does the ratio tell us?


---

## Language Models
Let's find some common n-grams in *Sense and Sensibility* (`text2`).

In [None]:
fd = FreqDist(text2)

In [None]:
fd.freq('a') # what does this mean? 

In [None]:
# Let's look at the stopwords in English
nltk.corpus.stopwords.words("english")

In [None]:
# Run this next set of code. What does it do?

# Build the stopword collection once, as a set. Calling
# nltk.corpus.stopwords.words("english") inside the comprehension would
# rebuild the list and scan it linearly for every single token of text2.
english_stopwords = set(nltk.corpus.stopwords.words("english"))

# Lower-case the purely alphabetic tokens, then drop the stopwords.
# The cheap isalpha() test runs first so punctuation never reaches the lookup.
lowered_words = [w.lower() for w in text2 if w.isalpha()]
fd = FreqDist([w for w in lowered_words if w not in english_stopwords])

# Total number of counted tokens (the sum of all counts in fd).
total_words = fd.N()

# Print "word : count : relative frequency" for the 20 most common words.
for word, count in fd.most_common(20):
    print(" : ".join([word, str(count), str(count / total_words)]))

In [None]:
# Count every trigram of text2 whose first two tokens are "I" and "am".
# Each key is the three tokens joined by single spaces.
# (could use bigram function instead)
fd = FreqDist(
    " ".join((first, second, third))
    for first, second, third in nltk.ngrams(text2, 3)
    if first == "I" and second == "am"
)

In [None]:
fd.most_common(10)

In [None]:
# Add up the counts of all entries in `fd` whose key begins with "I".
# NOTE(review): if `fd` holds space-joined string keys, `key[0]` is the first
# character rather than the first token; confirm that is what is intended.
total_words = sum(count for key, count in fd.items() if key[0] == "I")

In [None]:
total_words

In [None]:
# Print "key : count : share" for each entry whose key begins with "I",
# most frequent first; share is count / total_words rounded to 3 places.
ranked = sorted(fd.items(), key=lambda item: item[1], reverse=True)
for gram, count in ranked:
    if gram[0] != "I":
        continue
    share = round(count / total_words, 3)
    print(" : ".join(map(str, (gram, count, share))))

In [None]:
fd.most_common(10)

In [None]:
fd = FreqDist(nltk.ngrams(text2,3))

In [None]:
# Walk the entries of `fd` from most to least frequent, print those whose key
# starts with "I", "am", and accumulate their combined count.
total_words = 0

ranked = sorted(fd.items(), key=lambda item: item[1], reverse=True)
for gram, count in ranked:
    if not (gram[0] == "I" and gram[1] == "am"):
        continue
    total_words += count
    print(" : ".join(map(str, (gram, count))))

# NOTE(review): 72 and 12 are hard-coded numerators, presumably counts read
# off the listing printed above; confirm they still match.
print(72/total_words)
print(12/total_words)

In [None]:
text2.concordance("I")

In [None]:
text2.concordance("sure")

---

## N-gram models

Let's make a function that takes in text, builds a frequency distribution (`FreqDist`), and generates text with various n-grams.

In [None]:
import random

# Write a function called weighted_choice that selects 
# a word from a FreqDist based on the probability
# derived from the count of that word compared to the
# count of all words. 


In [None]:
# Try it on the chat corpus. This needs the weighted_choice function from the
# exercise above to be written first.
weighted_choice(FreqDist(text5))

In [None]:
def generate_unigram(text, length=10):
    """Return `length` generated words joined by single spaces.

    A FreqDist is built over `text`, and `weighted_choice` is called on it
    once for each word of the output.
    """
    fd = FreqDist(text)
    return " ".join(weighted_choice(fd) for _ in range(length))


In [None]:
generate_unigram(text1)

In [None]:
generate_unigram(text2)

In [None]:
generate_unigram(text5)

Now repeat the above with bi-grams: write a `generate_bigram` function that works correctly with them. This is a bit trickier.

In [None]:
generate_bigram(text1)

In [None]:
generate_bigram(text2)

In [None]:
generate_bigram(text5)