In [1]:
from google.colab import drive
drive.mount('/content/gdrive') 

Mounted at /content/gdrive


In [2]:
import os
code_path = "/content/gdrive/MyDrive/cse512hw2Challenge/"
os.chdir(code_path)

In [3]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import copy

import random


In [4]:
corpus = []
f = open('alice_in_wonderland.txt','r')
while(1):
    line =  f.readline()
    if len(line) == 0: break
    corpus.extend(line.split())
        
f.close()
corpus = ' '.join(corpus)

def clean_word(word):
    word = word.lower()
    for punctuation in ['"',"'",'.',',','-','?','!',';',':','—','(',')','[',']']:
        word = word.split(punctuation)[0]
    return word



corpus = [clean_word(word) for word in corpus.split()]
corpus = [word for word in corpus if len(word) > 0]
print(corpus[:25])
D = len(corpus)
print('corpus len: ',D)

['alice', 'adventures', 'in', 'wonderland', 'by', 'lewis', 'carroll', 'the', 'millennium', 'fulcrum', 'edition', '3', 'contents', 'chapter', 'i', 'down', 'the', 'rabbit', 'chapter', 'ii', 'the', 'pool', 'of', 'tears', 'chapter']
corpus len:  25320


In [5]:
tokenize = {}
dictionary = []
token = 0
for word in corpus:
    if word not in tokenize.keys():
        tokenize[word] = token
        dictionary.append(word)
        token += 1
    
V = len(dictionary)
print('dictionary size (number of distinct words): ', V)



dictionary size (number of distinct words):  2501


In [6]:
#past word as feature

posterior_1word = np.zeros((V, V))
prior = np.zeros(V)
# Calculate the prior of the words.
for i in range(len(corpus)):
    prior[tokenize[corpus[i]]] += 1
    if i > 0:
       posterior_1word[tokenize[corpus[i-1]]][tokenize[corpus[i]]] += 1

posterior_1word = posterior_1word / prior
prior = prior / len(corpus)

def get_likelihood_2gram(word):
    likelihood = posterior_1word[tokenize[word], :] * prior
    return(likelihood)
def pred_2gram(word):
    likelihood = get_likelihood_2gram(word)
    i = np.argmax(likelihood)
    return(dictionary[i], likelihood[i])
print(pred_2gram('alice'))
print(pred_2gram('the'))
print(pred_2gram('cheshire'))
print(pred_2gram('mock'))
    

('was', 0.0007109004739336493)
('queen', 0.0027646129541864135)
('cat', 0.00019747235387045816)
('turtle', 0.0022511848341232226)


In [7]:
# Using the likelihoods computed from the bigram classiffer, and starting with a seed word "alice", 
# generate the next 25 words by always picking the most likely next word.
word = "alice"
article = word
for i in range(25):
    word, _ = pred_2gram(word)
    article = article + " " + word
print(article)


alice was a little thing i can remember ever saw in a little thing i can remember ever saw in a little thing i can remember


In [8]:
# Using random choices method
word = "alice"
article = word
for i in range(25):
    likelihood = get_likelihood_2gram(word)
    word = random.choices(dictionary, weights = likelihood)[0]
    article = article + " " + word
print(article)

alice hastily your head uncomfortable the mouse sharply advise you couldn have grown woman and she could not notice this elegant thimble looking at each side


In [9]:
# Calculate the acc of 1 word.
positive = 0
for i in range(len(corpus) - 1):
    if pred_2gram(corpus[i])[0] == corpus[i+1]:
        positive += 1
print("The accuracy:", positive / (len(corpus)-1))

The accuracy: 0.2453493423910897


In [10]:
#past 2 words as features

posterior_2words = np.zeros((V, V))
for i in range(0, len(corpus) - 2):
    posterior_2words[tokenize[corpus[i]]][tokenize[corpus[i + 2]]] += 1 
posterior_2words /= prior

posterior_2gram = np.vstack([posterior_1word,posterior_2words])



def get_likelihood_3gram(word2ago,word1ago):
    likelihood = posterior_1word[tokenize[word1ago], :] * posterior_2words[tokenize[word2ago], :] * prior
    return likelihood
def pred_3gram(word2ago,word1ago):
    likelihood = get_likelihood_3gram(word2ago,word1ago)
    i = np.argmax(likelihood)
    return dictionary[i], likelihood[i]
print(pred_3gram('pack','of'))
print(pred_3gram('the','mad'))
print(pred_3gram('she','jumped'))

    

('cards', 3.0)
('you', 0.14447592067988668)
('up', 0.5416666666666666)


In [11]:
first_word = "alice"
second_word = "was"
article = first_word + " " + second_word
for i in range(25):
    new, _ = pred_3gram(first_word, second_word)
    article = article + " " + new
    first_word = second_word
    second_word = new
print(article)


alice was not easy to take this young lady tells us a story afraid i am i ah that the queen who was peeping anxiously into its


In [12]:
# Using random choices method
first_word = "alice"
second_word = "was"
article = first_word + " " + second_word
for i in range(25):
    likelihood = get_likelihood_3gram(first_word, second_word)
    new = random.choices(dictionary, weights = likelihood)[0]
    article = article + " " + new
    first_word = second_word
    second_word = new
print(article)

alice was not like mad tea chapter viii the queen smiled and passed by his garden and she had caught the baby joined wow wow while the


In [13]:
# Calculate the acc of 2 word.
positive = 0
for i in range(len(corpus) - 2):
    if pred_3gram(corpus[i], corpus[i + 1])[0] == corpus[i+2]:
        positive += 1
print("The accuracy:", positive / (len(corpus)-2))

The accuracy: 0.5047397108776365


# Challenge

In [14]:
test_corpus = []
f = open('through_the_looking_glass.txt','r')
while(1):
    line =  f.readline()
    if len(line) == 0: break
    test_corpus.extend(line.split())
        
f.close()
test_corpus = ' '.join(test_corpus)

test_corpus = [clean_word(word) for word in test_corpus.split()]
new_corpus = []
for word in test_corpus:
    if len(word) > 0 and word in dictionary:
        new_corpus.append(word)
test_corpus = new_corpus
print(test_corpus[:25])
test_D = len(test_corpus)
print('test corpus len: ',test_D)

['through', 'the', 'looking', 'and', 'what', 'alice', 'found', 'there', 'by', 'lewis', 'carroll', 'child', 'of', 'the', 'and', 'dreaming', 'eyes', 'of', 'wonder', 'though', 'time', 'be', 'and', 'i', 'and']
test corpus len:  24823


In [15]:
test_tokenize = {}
test_dictionary = []
test_token = 0
for word in test_corpus:
    if word not in test_tokenize.keys():
        test_tokenize[word] = token
        test_dictionary.append(word)
        test_token += 1

test_V = len(test_dictionary)
print('test dictionary size (number of distinct words): ', test_V)

test dictionary size (number of distinct words):  1470


In [16]:
posterior_n_words = []
for i in range(1, 101):
    posterior_i_words = np.zeros((V, V))
    for j in range(0, len(corpus) - i):
        posterior_i_words[tokenize[corpus[j]]][tokenize[corpus[j + i]]] += 1 
    posterior_i_words /= prior
    posterior_n_words.append(posterior_i_words)

In [17]:
def get_likelihood_n_gram(word_n_agos):
    n = len(word_n_agos)
    likelihood = 1
    for i in range(n):
        likelihood *= posterior_n_words[i][tokenize[word_n_agos[i]], :]
    likelihood *= prior
    return likelihood
def pred_n_gram(word_n_agos):
    likelihood = get_likelihood_n_gram(word_n_agos)
    i = np.argmax(likelihood)
    return dictionary[i], likelihood[i]

In [18]:
# Calculate the acc of n word.
n_gram = 100
positive = 0
for i in range(len(corpus) - n_gram):
    if pred_n_gram(corpus[i:i+n_gram][::-1])[0] == corpus[i+n_gram]:
        positive += 1
print("The accuracy:", positive / (len(corpus) - n_gram))

  """


The accuracy: 1.0


Training accuracy

In [19]:
for n_gram in range(1, 101):
    positive = 0
    for i in range(len(corpus) - n_gram):
        if pred_n_gram(corpus[i:i+n_gram][::-1])[0] == corpus[i+n_gram]:
            positive += 1
    print("n_gram = ", n_gram, " The accuracy:", positive / (len(corpus) - n_gram))

n_gram =  1  The accuracy: 0.2453493423910897
n_gram =  2  The accuracy: 0.5047002132869894
n_gram =  3  The accuracy: 0.7499703756369238
n_gram =  4  The accuracy: 0.8784563122136199
n_gram =  5  The accuracy: 0.9401935611297649
n_gram =  6  The accuracy: 0.9667772773959074
n_gram =  7  The accuracy: 0.9831311974084462
n_gram =  8  The accuracy: 0.9897281921618205
n_gram =  9  The accuracy: 0.9937971632886887
n_gram =  10  The accuracy: 0.9960489924930858
n_gram =  11  The accuracy: 0.997313208739974
n_gram =  12  The accuracy: 0.9983404457088667
n_gram =  13  The accuracy: 0.9988935867546529
n_gram =  14  The accuracy: 0.9991701572749545
n_gram =  15  The accuracy: 0.999407231772377
n_gram =  16  The accuracy: 0.9995652861207714
n_gram =  17  The accuracy: 0.9996443109512706
n_gram =  18  The accuracy: 0.9996442968935262
n_gram =  19  The accuracy: 0.9998023793525948
n_gram =  20  The accuracy: 0.9998418972332016
n_gram =  21  The accuracy: 0.9999209454919167
n_gram =  22  The accura

  """


n_gram =  71  The accuracy: 1.0
n_gram =  72  The accuracy: 1.0
n_gram =  73  The accuracy: 1.0
n_gram =  74  The accuracy: 1.0
n_gram =  75  The accuracy: 1.0
n_gram =  76  The accuracy: 1.0
n_gram =  77  The accuracy: 1.0
n_gram =  78  The accuracy: 1.0
n_gram =  79  The accuracy: 1.0
n_gram =  80  The accuracy: 1.0
n_gram =  81  The accuracy: 1.0
n_gram =  82  The accuracy: 1.0
n_gram =  83  The accuracy: 1.0
n_gram =  84  The accuracy: 1.0
n_gram =  85  The accuracy: 1.0
n_gram =  86  The accuracy: 1.0
n_gram =  87  The accuracy: 1.0
n_gram =  88  The accuracy: 1.0
n_gram =  89  The accuracy: 1.0
n_gram =  90  The accuracy: 1.0
n_gram =  91  The accuracy: 1.0
n_gram =  92  The accuracy: 1.0
n_gram =  93  The accuracy: 1.0
n_gram =  94  The accuracy: 1.0
n_gram =  95  The accuracy: 1.0
n_gram =  96  The accuracy: 1.0
n_gram =  97  The accuracy: 1.0
n_gram =  98  The accuracy: 1.0
n_gram =  99  The accuracy: 1.0
n_gram =  100  The accuracy: 1.0


Test accuracy

In [20]:
for n_gram in range(1, 101):
    positive = 0
    for i in range(len(test_corpus) - n_gram):
        if pred_n_gram(test_corpus[i:i+n_gram][::-1])[0] == test_corpus[i+n_gram]:
            positive += 1
    print("n_gram = ", n_gram, " The test accuracy:", positive / (len(test_corpus) - n_gram))

n_gram =  1  The test accuracy: 0.12730642172266537
n_gram =  2  The test accuracy: 0.09085048950485476
n_gram =  3  The test accuracy: 0.07626913779210315
n_gram =  4  The test accuracy: 0.06269390386397518
n_gram =  5  The test accuracy: 0.05230074945603997
n_gram =  6  The test accuracy: 0.043921505419672
n_gram =  7  The test accuracy: 0.037757898130238554
n_gram =  8  The test accuracy: 0.03256095103767882
n_gram =  9  The test accuracy: 0.029580075763681792
n_gram =  10  The test accuracy: 0.02684076895175916
n_gram =  11  The test accuracy: 0.025270030630340157
n_gram =  12  The test accuracy: 0.023981298617548667
n_gram =  13  The test accuracy: 0.022934300685207576
n_gram =  14  The test accuracy: 0.022048450159216415
n_gram =  15  The test accuracy: 0.021122218639148663
n_gram =  16  The test accuracy: 0.020316846051517717
n_gram =  17  The test accuracy: 0.019592034185277756
n_gram =  18  The test accuracy: 0.019068736141906874
n_gram =  19  The test accuracy: 0.018505079825