In [None]:
import nltk
from nltk.book import *

import re

## Normalization

Questions:

1. Find emojis in the chat corpus.

1. Determine a normalization scheme. (What needs to be normalized, how would you do it?)

1. Count the happy vs sad emojis.

In [None]:
# Alias the chat corpus under a more descriptive name.
chat = text5

# Cheap first pass: keep every distinct token that contains at least one of
# the characters ":", ";" or "=" anywhere in it.
potential_emojis = {w for w in chat if any(mark in w for mark in ":;=")}

In [None]:
# Show the candidate set so it can be inspected by eye.
potential_emojis

Clearly we're catching some non-emojis, but let's assume we're getting most of the list. 

In [None]:
# All of these read left-to-right: eyes, an optional "-" nose, then a mouth.
# Strict pattern: the mouth must come from a fixed set of characters.
# It misses '>:->' and ']:)' and repeated characters. (Insert shruggie.)
emoji = re.compile(r"^[:;=]-?[)(\]PD@op|O]$")
# Loose pattern: any single character is accepted as the mouth.
emoji2 = re.compile(r"^[:;=]-?.$")
# Collect the distinct tokens matched by the loose pattern and list them in order.
emojis = set(filter(emoji2.search, chat))
sorted(emojis)
# len(emojis) would give the number of distinct matches.
# Possible normalization: remove hyphens, and upper-case the letters.

In [None]:
# Count happy vs sad.
# Every occurrence is kept (not just distinct tokens), so len() is a frequency.
happy_faces = {":-)",":)",":D",";-)","=)"}
sad_faces = {":-(",":(",";-(","=("}

happy = [token for token in chat if token in happy_faces]
sad = [token for token in chat if token in sad_faces]

print(len(happy), len(sad), sep="\n")

---

## Stemming

Let's go through some stemming examples using NLTK.

In [None]:
# Keep the first 30 items of text4 as a small sample.
x = text4[:30]

In [None]:
# "y" counts as a vowel here; list it in both cases so capitalised words are
# treated the same way as lower-case ones.
vowels = re.compile(r'[aeiouyAEIOUY]')

# Number of distinct entries in the NLTK word list that end in "ing" and whose
# remaining stem (everything before the last three letters) contains no vowel.
# The cheap suffix test runs first so the regex is only tried on "-ing" words.
len({w for w in nltk.corpus.words.words() if w.endswith("ing") and not vowels.search(w[:-3])})

In [None]:
porter = nltk.PorterStemmer() # give it a short name.
start = 30000
distance = 100

# Slice the passage once, then print it raw and stemmed for comparison.
sample = text4[start:(start + distance)]

print(" ".join(sample))
print("\n\n")
print(" ".join(porter.stem(w) for w in sample))



In [None]:
# Number of distinct items in the inaugural addresses (text4).
# set() compares items exactly as they appear, so differently cased forms
# are counted separately.
print(len(set(text4)))

In [None]:
# Lower-case every item of text4, reduce it to its Porter stem, and keep the
# distinct stems.
inaug_stemmed = set(map(porter.stem, map(str.lower, text4)))

print(len(inaug_stemmed))

# Ratio of distinct raw items to distinct stems.
print(len(set(text4))/len(inaug_stemmed))

---

## Language Models
Let's find some common n-grams in *Sense and Sensibility* (S&S, `text2`).

In [None]:
# Frequency distribution over every item of text2.
fd = FreqDist(text2)

In [None]:
# Relative frequency of 'a' in the distribution.
fd.freq('a')

In [None]:
# Display NLTK's English stopword list.
nltk.corpus.stopwords.words("english")

In [None]:
# Load the stopword list once and keep it as a set: calling
# nltk.corpus.stopwords.words() inside the comprehension would rebuild the
# list and scan it linearly for every single token.
english_stopwords = set(nltk.corpus.stopwords.words("english"))

# Distribution over the lower-cased alphabetic tokens of text2 that are not
# stopwords.
fd = FreqDist(w.lower() for w in text2
              if w.isalpha() and w.lower() not in english_stopwords)

# Total number of counted tokens, used as the denominator below.
total_words = sum(fd.values())

# Print "word : count : relative frequency" for the 20 most common words.
for word, count in fd.most_common(20) :
    print(" : ".join([word,str(count),str(count/total_words)]))

In [None]:
# Key the distribution by the space-joined text of every trigram of text2
# whose first two tokens are "I" and "am".
# (could use bigram function instead)
fd = FreqDist(" ".join(tri) for tri in nltk.ngrams(text2,3) if tri[0] == "I" and tri[1] == "am")

In [None]:
# Ten most frequent entries of the current `fd`.
fd.most_common(10)

In [None]:
# Print "<n-gram>: <count>" for every entry whose first token is "I".
for gram, count in fd.items() :
    # The keys may be already-joined strings (e.g. "I am sure") or tuples of
    # tokens. Joining a string with " " would put a space between every
    # character, so split strings back into tokens first.
    tokens = gram.split(" ") if isinstance(gram, str) else gram
    if tokens[0] == "I" :
        print(" ".join(tokens) + ": " + str(count))

        

In [None]:
# Sum the counts of every entry of `fd` whose key starts with "I".
# NOTE(review): when the keys are joined strings, pair[0] is the first
# character rather than the first word — confirm this is what is intended.
total_words = sum([count for pair, count in fd.items() if pair[0] == "I"])

In [None]:
# Display the total computed above.
total_words

In [None]:
# List the entries starting with "I", ordered by count (least frequent first).
# The sort key must be passed as a keyword argument: a bare `key` is an
# undefined name, and sorted() accepts no second positional argument.
for gram, count in sorted(fd.items(), key=lambda pair: pair[1]) :
    if gram[0] == "I" :
        print(gram)
        print(count)

In [None]:
# Most frequent first: print "gram : count : share", where share is the
# count divided by total_words, rounded to three decimals.
for gram,count in sorted(fd.items(), key=lambda pair: pair[1], reverse=True) :
    if gram[0] != "I" :
        continue
    print(f"{gram} : {count} : {round(count/total_words,3)}")

In [None]:
# Ten most frequent entries of the current `fd`.
fd.most_common(10)

In [None]:
# Rebuild `fd` over all trigrams of text2, keyed by the n-grams themselves
# rather than by joined strings.
fd = FreqDist(nltk.ngrams(text2,3))

In [None]:
total_words = 0

# Most frequent first: list every trigram that opens with "I am" and
# accumulate their counts into total_words.
for gram,count in sorted(fd.items(), key=lambda pair: pair[1], reverse=True) :
    if not (gram[0] == "I" and gram[1] == "am") :
        continue
    total_words += count
    print(f"{gram} : {count}")


# NOTE(review): 72 and 12 are hard-coded — presumably counts read off the
# listing above; confirm they still match the data.
print(72/total_words)
print(12/total_words)

In [None]:
# Show the occurrences of "I" in text2 with their surrounding context.
text2.concordance("I")

In [None]:
# need this for phrases
from nltk.app import concordance

In [None]:
# Show the occurrences of "sure" in text2 with their surrounding context.
text2.concordance("sure")

---

## N-gram models

Let's write functions that take a text, build a frequency distribution from it, and generate new text from unigram and bigram models.

In [None]:
import random

def weighted_choice(freq_dist):
    """Pick one key of `freq_dist` at random, weighted by its count.

    Args:
        freq_dist: a mapping from token to non-negative count; anything
            with an ``items()`` method works (e.g. a dict or a FreqDist).

    Returns:
        One of the tokens; a token is chosen with probability proportional
        to its count.

    Raises:
        ValueError: if `freq_dist` has no entries, because there is
            nothing to choose from.
    """
    items = list(freq_dist.items())
    if not items:
        raise ValueError("weighted_choice() needs a non-empty frequency distribution")

    weight_total = sum(count for _, count in items)
    # Draw a point in [0, weight_total] and walk the entries until the
    # running remainder falls inside the current entry's share.
    n = random.uniform(0, weight_total)
    for token, count in items:
        if n < count:
            return token
        n -= count
    # Only reached when the draw lands exactly on the upper bound (or every
    # count is zero); fall back to the last entry.
    return token

In [None]:
# Try it out: draw one item of text5, weighted by its frequency.
weighted_choice(FreqDist(text5))

In [None]:
def generate_unigram(text,length=10) :
    """Generate text by sampling tokens independently from `text`.

    Builds a frequency distribution over `text`, draws `length` tokens from
    it with `weighted_choice`, and returns them joined by single spaces.
    """
    unigram_fd = FreqDist(text)
    sampled = [weighted_choice(unigram_fd) for _ in range(length)]
    return " ".join(sampled)


In [None]:
# Ten tokens (the default length) sampled from text1's unigram distribution.
generate_unigram(text1)

In [None]:
# Ten tokens (the default length) sampled from text2's unigram distribution.
generate_unigram(text2)

In [None]:
# Ten tokens (the default length) sampled from text5's unigram distribution.
generate_unigram(text5)

In [None]:
def weighted_choice_ngram(cur_word,freq_dist) :
    ''' Randomly choose a word to follow `cur_word`.

        `freq_dist` maps n-grams to counts. Every entry whose first
        element equals `cur_word` contributes its second element as a
        candidate, weighted by that entry's count; the pick itself is
        delegated to weighted_choice. '''

    # Map each candidate following word to the count of its n-gram.
    followers = {
        ngram[1]: count
        for ngram, count in freq_dist.items()
        if ngram[0] == cur_word
    }

    return weighted_choice(followers)

def generate_bigram(text,length=10,start=None) :
    """Generate text with a bigram model built from `text`.

    Args:
        text: the sequence of tokens to learn from; it is iterated more
            than once, so it must be re-iterable (a list or an NLTK Text).
        length: number of tokens to generate.
        start: the word the first generated token should follow. When it
            is falsy, a start word is drawn from the unigram distribution.
            The start word itself is not part of the returned text.

    Returns:
        The `length` generated tokens joined by single spaces.
    """
    uni_fd = None
    if not start :
        uni_fd = FreqDist(text)
        start = weighted_choice(uni_fd)

    # Index the successors of every word once, instead of rescanning all
    # bigrams on each step.
    successors = nltk.ConditionalFreqDist(nltk.bigrams(text))

    results = []
    this_word = start
    for _ in range(length) :
        if this_word in successors :
            this_word = weighted_choice(successors[this_word])
        else :
            # Dead end: the word never occurs with a successor (e.g. it is
            # only the final token, or an unseen start word). Back off to
            # a unigram draw rather than failing.
            if uni_fd is None :
                uni_fd = FreqDist(text)
            this_word = weighted_choice(uni_fd)
        results.append(this_word)

    return " ".join(results)


In [None]:
# Ten tokens (the default length) generated from text1's bigram model.
generate_bigram(text1)

In [None]:
# Ten tokens (the default length) generated from text2's bigram model.
generate_bigram(text2)

In [None]:
# Ten tokens (the default length) generated from text5's bigram model.
generate_bigram(text5)