In [1]:
import numpy as np
import matplotlib as plt
import nltk               # NLP toolkit
import re

nltk.download('punkt')    # Download the Punkt sentence tokenizer 

[nltk_data] Error loading punkt: <urlopen error [Errno 8] nodename nor
[nltk_data]     servname provided, or not known>


False

In [2]:
def read_corpus(filepath):
    f = open(filepath, "r")
    return (f.read())

In [3]:
corpus = read_corpus("Data/catinthehat.txt")

In [4]:
def preprocess(corpus, n):
    corpus = corpus.lower()
    #print(corpus)
    corpus = re.sub(r"[^a-zA-Z0-9.?! \n]+", "", corpus)
    #print(corpus)
    sentences = corpus.split("\n")
    #print(sentences)
    tokens = []
    for sentence in sentences:
        tokenized_sentence = nltk.word_tokenize(sentence)
        tokens.append(tokenized_sentence)
    return tokens


In [5]:
def count_prob(tokens):
    count = {}
    for sentence in tokens:
        for token in sentence:
            count[token] = count.get(token,0)+1
    return count

counts = count_prob(corpus)

In [6]:
train_set = preprocess(corpus, 2)
counts = count_prob(train_set)

In [7]:
def  create_closed_vocab(counts, threshold=2):
    vocab = []
    for k,v in counts.items():
        if v >=threshold:
            vocab.append(k)
    return vocab

In [8]:
vocab = create_closed_vocab(counts)
print(vocab)

['o', 'for', 'a', 'muse', 'of', 'fire', 'that', 'would', 'the', 'brightest', 'heaven', 'invention', 'kingdom', 'stage', 'princes', 'to', 'act', 'and', 'monarchs', 'behold', 'scene', '!', 'then', 'should', 'warlike', 'like', 'himself', 'port', 'mars', 'at', 'his', 'heels', 'in', 'hounds', 'famine', 'sword', 'employment', '.', 'but', 'pardon', 'all', 'spirits', 'have', 'on', 'this', 'unworthy', 'bring', 'forth', 'so', 'great', 'an', 'object', 'can', 'hold', 'fields', 'france', '?', 'or', 'may', 'we', 'within', 'very', 'did', 'air', 'since', 'crooked', 'figure', 'little', 'place', 'million', 'let', 'us', 'your', 'imaginary', 'forces', 'work', 'suppose', 'these', 'walls', 'are', 'now', 'confined', 'two', 'mighty', 'whose', 'high', 'narrow', 'ocean', 'parts', 'piece', 'out', 'our', 'with', 'thoughts', 'into', 'thousand', 'divide', 'man', 'make', 'think', 'when', 'talk', 'horses', 'you', 'see', 'them', 'their', 'proud', 'i', 'receiving', 'earth', 'tis', 'must', 'kings', 'carry', 'here', 'the

In [9]:
print(train_set)



In [10]:
def replace_oov_words(train_set, closed_vocab, unkown_token="<UNK>"):
    processed_train_set = train_set.copy()
    for sentence in processed_train_set:
        for i in range(len(sentence)):
            if sentence[i] not in closed_vocab:
                sentence[i] = unkown_token 
    return processed_train_set

In [11]:
processed_train_set = replace_oov_words(train_set, vocab)
print(processed_train_set)

[['o', 'for', 'a', 'muse', 'of', 'fire', 'that', 'would', '<UNK>'], ['the', 'brightest', 'heaven', 'of', 'invention'], ['a', 'kingdom', 'for', 'a', 'stage', 'princes', 'to', 'act'], ['and', 'monarchs', 'to', 'behold', 'the', '<UNK>', 'scene', '!'], ['then', 'should', 'the', 'warlike', '<UNK>', 'like', 'himself'], ['<UNK>', 'the', 'port', 'of', 'mars', 'and', 'at', 'his', 'heels'], ['<UNK>', 'in', 'like', 'hounds', 'should', 'famine', 'sword', 'and', 'fire'], ['<UNK>', 'for', 'employment', '.', 'but', 'pardon', 'and', '<UNK>', 'all'], ['the', '<UNK>', '<UNK>', 'spirits', 'that', 'have', '<UNK>'], ['on', 'this', 'unworthy', '<UNK>', 'to', 'bring', 'forth'], ['so', 'great', 'an', 'object', 'can', 'this', '<UNK>', 'hold'], ['the', '<UNK>', 'fields', 'of', 'france', '?', 'or', 'may', 'we', '<UNK>'], ['within', 'this', '<UNK>', 'o', 'the', 'very', '<UNK>'], ['that', 'did', '<UNK>', 'the', 'air', 'at', '<UNK>', '?'], ['o', 'pardon', '!', 'since', 'a', 'crooked', 'figure', 'may'], ['<UNK>', 'i

In [12]:
def count_n_grams(processed_train_set, n=3, start_token='<s>', end_token = '<e>'):
    n_grams = {}
    for sentence in processed_train_set:
        sentence = [start_token] * (n-1) + sentence + [end_token]
        for i in range(len(sentence)-(n-1)): # complete this line

            # Get the n-gram from i to i+n
            n_gram = ([sentence[i] for i in range(i,i+n)])
            n_gram = tuple(n_gram)
            
            n_grams[n_gram] = n_grams.get(n_gram,0)+1
    return n_grams
    
    


In [13]:
n_grams = count_n_grams(processed_train_set)
print(n_grams)

{('<s>', '<s>', 'o'): 57, ('<s>', 'o', 'for'): 2, ('o', 'for', 'a'): 1, ('for', 'a', 'muse'): 1, ('a', 'muse', 'of'): 1, ('muse', 'of', 'fire'): 1, ('of', 'fire', 'that'): 1, ('fire', 'that', 'would'): 1, ('that', 'would', '<UNK>'): 1, ('would', '<UNK>', '<e>'): 1, ('<s>', '<s>', 'the'): 222, ('<s>', 'the', 'brightest'): 1, ('the', 'brightest', 'heaven'): 1, ('brightest', 'heaven', 'of'): 1, ('heaven', 'of', 'invention'): 1, ('of', 'invention', '<e>'): 1, ('<s>', '<s>', 'a'): 69, ('<s>', 'a', 'kingdom'): 1, ('a', 'kingdom', 'for'): 2, ('kingdom', 'for', 'a'): 2, ('for', 'a', 'stage'): 1, ('a', 'stage', 'princes'): 1, ('stage', 'princes', 'to'): 1, ('princes', 'to', 'act'): 1, ('to', 'act', '<e>'): 1, ('<s>', '<s>', 'and'): 405, ('<s>', 'and', 'monarchs'): 1, ('and', 'monarchs', 'to'): 1, ('monarchs', 'to', 'behold'): 1, ('to', 'behold', 'the'): 1, ('behold', 'the', '<UNK>'): 1, ('the', '<UNK>', 'scene'): 1, ('<UNK>', 'scene', '!'): 1, ('scene', '!', '<e>'): 1, ('<s>', '<s>', 'then'): 3

In [14]:
def estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts.get(previous_n_gram,0)
            
    denominator = previous_n_gram_count + k*vocabulary_size

    n_plus1_gram = previous_n_gram + (word,)
 
    n_plus1_gram_count = n_plus1_gram_counts.get(n_plus1_gram,0)
            
    numerator = n_plus1_gram_count+k
        
    probability = numerator/denominator
    
    
    return probability    


In [15]:
bigrams = count_n_grams(processed_train_set, 3)
trigrams = count_n_grams(processed_train_set, 4)
print(bigrams)

{('<s>', '<s>', 'o'): 57, ('<s>', 'o', 'for'): 2, ('o', 'for', 'a'): 1, ('for', 'a', 'muse'): 1, ('a', 'muse', 'of'): 1, ('muse', 'of', 'fire'): 1, ('of', 'fire', 'that'): 1, ('fire', 'that', 'would'): 1, ('that', 'would', '<UNK>'): 1, ('would', '<UNK>', '<e>'): 1, ('<s>', '<s>', 'the'): 222, ('<s>', 'the', 'brightest'): 1, ('the', 'brightest', 'heaven'): 1, ('brightest', 'heaven', 'of'): 1, ('heaven', 'of', 'invention'): 1, ('of', 'invention', '<e>'): 1, ('<s>', '<s>', 'a'): 69, ('<s>', 'a', 'kingdom'): 1, ('a', 'kingdom', 'for'): 2, ('kingdom', 'for', 'a'): 2, ('for', 'a', 'stage'): 1, ('a', 'stage', 'princes'): 1, ('stage', 'princes', 'to'): 1, ('princes', 'to', 'act'): 1, ('to', 'act', '<e>'): 1, ('<s>', '<s>', 'and'): 405, ('<s>', 'and', 'monarchs'): 1, ('and', 'monarchs', 'to'): 1, ('monarchs', 'to', 'behold'): 1, ('to', 'behold', 'the'): 1, ('behold', 'the', '<UNK>'): 1, ('the', '<UNK>', 'scene'): 1, ('<UNK>', 'scene', '!'): 1, ('scene', '!', '<e>'): 1, ('<s>', '<s>', 'then'): 3

In [16]:
x = estimate_probability("chocolate", ("lyn", "drinks"), bigrams, trigrams, len(vocab))
print(x)

0.00034494653328734045


In [17]:
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, end_token='<e>', unknown_token="<unk>",  k=1.0):

    previous_n_gram = tuple(previous_n_gram)    
    vocabulary = vocabulary + [end_token, unknown_token]    
    vocabulary_size = len(vocabulary)    
    probabilities = {}
    for word in vocabulary:
        probability = estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=k)  
        probabilities[word] = probability

    return probabilities

In [18]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None):
    n = len(list(n_gram_counts.keys())[0])
    previous_tokens = ['<s>'] * (n-1) + previous_tokens
    previous_n_gram = previous_tokens[-n:]
    probabilities = estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=k)
    suggestion = None
    max_prob = 0
    
    for word, prob in probabilities.items():
        if start_with and not word.startswith(start_with):
            continue
        if prob > max_prob:
            suggestion = word
            max_prob = prob
    
    return suggestion, max_prob


In [19]:
def generate_sentence(tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, end_token='<e>', unknown_token="<unk>", k=1.0):
    prev_word = ""
    while True:
        next_word, _ = suggest_a_word(tokens, n_gram_counts, n_plus1_gram_counts, vocabulary)
        if next_word == end_token:
            break
        if prev_word == next_word:
            break
        prev_word = next_word
        tokens.append(next_word)
        print(tokens)
    return tokens

In [30]:
sent = generate_sentence(["i"], bigrams, trigrams, vocab)
print(sent)

['i', 'have']
['i', 'have', 'no']
['i', 'have', 'no', 'wife']
['i', 'have', 'no', 'wife', 'i']
['i', 'have', 'no', 'wife', 'i', 'have']
['i', 'have', 'no', 'wife', 'i', 'have', 'nothing']
['i', 'have', 'no', 'wife', 'i', 'have', 'nothing', 'in']
['i', 'have', 'no', 'wife', 'i', 'have', 'nothing', 'in', 'france']
['i', 'have', 'no', 'wife', 'i', 'have', 'nothing', 'in', 'france', '.']
['i', 'have', 'no', 'wife', 'i', 'have', 'nothing', 'in', 'france', '.']
