## <center> <h1> CS 401: Natural Language Processing</h1></center>
### <center> <h1> Project 2 </h1></center>

### Question 1

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize # implicitly calls punkt
import string

In [2]:
def wordCount(words):
    """
        Helper function that takes the words from 
        the data and returns frequency counters for 
        each word as a dictionary.
        Parameters:
             words: list of all words in data
        Return value:
            dictionary of frequencies
    """
    worder = {}
    for word in words:
        if word not in worder:
            worder[word] = 0
        worder[word] += 1
    return worder

In [3]:
def getBigrams(filename):
    """
        Takes the filename and returns a tuple
        of bigram and word frequencies.
        Parameters:
        filename: textfile
        Return Value:
            Tuple of bigram and word freqs. 
    """
    text1 = open(filename, "r")
    data1 = text1.read()
    bigram_freq = {}
    wordlist = []
    sent_list = sent_tokenize(data1.lower()) # normalizes text to all lowercase
    for sent in sent_list:
        words = word_tokenize(sent.strip(),preserve_line=False)
        words.append('</s>')
        wprev = '<s>'
        for w in words:
            wordlist.append(w)
            bigram = (wprev, w)
            if bigram not in bigram_freq:
                bigram_freq[bigram] = 0
            bigram_freq[bigram] += 1
            wprev = w
    wordlist.append('<s>')
    freqlist = wordCount(wordlist)
    return (bigram_freq,freqlist)

In [4]:
Bigrams,worder = getBigrams("TheUsualSuspects.txt")

In [5]:
def BigramDict(grams,diction,word,worder):
    """
        Function to create bigram matrices
        for every unique word in data. Includes
        zero probability cases.
        Parameters:
            grams: bigrams
            diction: dictionary to store bigram frequencies
            word: unique word in data
            worder: word dictionary
        Return value:
            Updated `diction`
    """
    dictionary = {}
    for each in grams:
        if each[0] == word:
            dictionary[each[1]] = (grams[each]/worder[word])
        if each[0] not in dictionary:
            dictionary[each[0]] = (0/worder[word])
        diction[word] = dictionary
    return diction

In [6]:
diction = {}
for word in worder:
    BigramDict(Bigrams,diction,word,worder) # will take a min or so to run. Runtime (O(n^2)) over all words (3874) in data.

In [None]:
#diction['the'] # includes zero probability cases too.

### Question 2

In [7]:
import operator
def sorter(diction,word):
    """
        Function to sort probabilities stored in 
        dictionary.
        Parameters:
            diction: unsorted dictionary
            word: word in dictionary
        Return Value:
            sorted dictionary
    """
    x = diction[word]
    sorted_x = list(sorted(x.items(), key=operator.itemgetter(1),reverse=True))
    for each in sorted_x:
        if each[0] in string.punctuation: # not sure why this doesn't work
            sorted_x.remove(each)
    return sorted_x

In [8]:
Finaldict = {}
for word in diction:
    Finaldict[word] = sorter(diction,word)

In [None]:
#Finaldict['the'] # sorted dictionary for every unique word

In [9]:
# would recommend starting with 'my'
import time
stringer = ""
start_word = input('First word (only lowercase): ')
if start_word in diction:
    word = start_word
    index = 1
    while index != 0:
        values = Finaldict[word]
        for each in values:
            if each[0] in string.punctuation:
                values.remove(each)
        stringer += word + " "
        print(stringer)
        index = int(input("Choose: 1 %s, 2 %s, 3 %s: "%(values[0][0],values[1][0],values[2][0])))
        if index > 3:
            print("Choice not valid. Try again.")
            break
        word = values[index-1][0]
else:
    print("Word not in corpus. Try again.")
    pass
print('Ok, ending')

First word (only lowercase): 0
Word not in corpus. Try again.
Ok, ending


### Question 3

In [12]:
import re
def getBigramsSmoothed(filename,cutoff):
    """
        Function to create bigram probabilities
        with unknown words.
        Parameters:
            filename: text file
            cutoff: frequency cutoff for UNK
        Return Value:
            Tuple of bigram probs. and word freqs.
    """
    text1 = open(filename, "r")
    data1 = text1.read()
    data1 = data1.lower()
    bigram_freq = {}
    wordlist = []
    p,freqlist = getBigrams(filename) #calls getBigrams to get word frequencies
    for word in freqlist:
        if freqlist[word] <= cutoff and len(word)>=2: 
            strings = r"\b"+word+r"\b"
            data1 = re.sub(strings,"UNK",data1) # makes sure only whole words are replaced
    sent_list = sent_tokenize(data1)
    for sent in sent_list:
        words = word_tokenize(sent.strip(),preserve_line=False)
        words.append('</s>')
        wprev = '<s>'
        for w in words:
            wordlist.append(w)
            bigram = (wprev, w)
            if bigram not in bigram_freq:
                bigram_freq[bigram] = 0
            bigram_freq[bigram] += 1
            wprev = w
    wordlist.append('<s>')
    freqlist2 = wordCount(wordlist)
    return (bigram_freq,freqlist2)

In [13]:
Bigrams2,worder2 = getBigramsSmoothed("TheUsualSuspects.txt",2)

In [14]:
def BigramDictSmoother(See,diction,word,worder):
    """
        Function to implement Add-one smoothing
        to bigram probabilities.
        Parameters:
            See: Bigrams dictionary
            diction: word probabilities
            word: unique word
            worder: word dictionary
        Return Value:
            Updated `diction2`
    """
    dictionary = {}
    for each in See:
        if each[0] == word:
            dictionary[each[1]] = ((See[each]+1)/(worder[word]+len(worder)))
        if each[0] not in dictionary:
            dictionary[each[0]] = ((0+1)/(worder[word]+len(worder)))
        diction[word] = dictionary
    return diction

In [15]:
diction2 = {}
for word in worder2:
    BigramDictSmoother(Bigrams2,diction2,word,worder2)

In [16]:
#diction2['UNK']

0.0001806684733514002

### Question 4

In [17]:
import math
def GenPerplexity(filename,worder):
    """
        Function to return bigram
        frequency dict. of test data.
        Parameters:
            filename: text file
            worder: word dict. of train data
        Return Value:
            bigrams
    """
    text2 = open(filename, "r")
    data2 = text2.read()
    data2 = data2.lower()
    bigram_freq = {}
    wordlist2 = []
    
    # tokenizing words in new text
    sent_list = sent_tokenize(data2)
    for sent in sent_list:
        words = word_tokenize(sent.strip(),preserve_line=False)
        for w in words:
            wordlist2.append(w)
            
    # replacing unseen words in new text with 'UNK'
    FullList = wordCount(wordlist2)
    for word in FullList:
        if word not in worder:
            strings = r"\b"+word+r"\b"
            data2 = re.sub(strings,"UNK",data2)
            
    # retokenizing new text to compare bigrams
    wordlist2 = []
    sent_list = sent_tokenize(data2)
    for sent in sent_list:
        words = word_tokenize(sent.strip(),preserve_line=False)
        words.append('</s>')
        wprev = '<s>'
        for w in words:
            wordlist2.append(w)
            bigram = (wprev, w)
            if bigram not in bigram_freq:
                bigram_freq[bigram] = 0
            bigram_freq[bigram] += 1
            wprev = w
    wordlist2.append('<s>')
    FullList = wordCount(wordlist2)
    return(bigram_freq)

In [18]:
bigram_freq = GenPerplexity("Script_ShiningThe.txt",worder) #10-15 sec runtime. (O(n^2))
Commons = dict(set(Bigrams2.keys()).intersection(set(bigram_freq.keys())))
Endbigram = {}
for each in Commons:
    if each in diction2:
        if Commons[each] in diction2[each]:
            bi = (each,Commons[each])
            Endbigram[bi] = diction2[each][Commons[each]]

In [19]:
Logsum = 0
for each in Endbigram:
    Logsum = Logsum + math.log(Endbigram[each], 2)
Perplexity = -Logsum/len(Commons)
Perplexity

8.850814479065267

In [None]:
# The Shining script - 8.850814479065267
# The Heist script - 8.932796821044693
# Ranger Boys book - 9.11441064829222