In [3]:
import numpy as np
import matplotlib as plt
import nltk               # NLP toolkit
import re

nltk.download('punkt')    # Download the Punkt sentence tokenizer 

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adithyashanker/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
def read_corpus(filepath):
    f = open(filepath, "r")
    return (f.read())

In [5]:
corpus = read_corpus("Data/catinthehat.txt")

In [6]:
def preprocess(corpus, n):
    corpus = corpus.lower()
    #print(corpus)
    corpus = re.sub(r"[^a-zA-Z0-9.?! \n]+", "", corpus)
    #print(corpus)
    sentences = corpus.split("\n")
    #print(sentences)
    tokens = []
    for sentence in sentences:
        tokenized_sentence = nltk.word_tokenize(sentence)
        tokens.append(tokenized_sentence)
    return tokens


In [7]:
def count_prob(tokens):
    count = {}
    for sentence in tokens:
        for token in sentence:
            count[token] = count.get(token,0)+1
    return count

counts = count_prob(corpus)

In [8]:
train_set = preprocess(corpus, 2)
counts = count_prob(train_set)

In [9]:
def  create_closed_vocab(counts, threshold=2):
    vocab = []
    for k,v in counts.items():
        if v >=threshold:
            vocab.append(k)
    return vocab

In [10]:
vocab = create_closed_vocab(counts)
print(vocab)

['the', 'cat', 'in', 'hat', 'sun', 'did', 'not', '.', 'it', 'was', 'too', 'wet', 'to', 'play', 'so', 'we', 'sat', 'house', 'all', 'that', 'cold', 'day', 'i', 'there', 'with', 'sally', 'two', 'and', 'said', 'how', 'wish', 'had', 'something', 'do', '!', 'go', 'out', 'ball', 'at', 'could', 'sit', 'like', 'one', 'little', 'bit', 'bump', 'then', 'went', 'us', 'looked', 'saw', 'him', 'on', 'he', 'why', 'you', '?', 'know', 'is', 'but', 'can', 'have', 'lots', 'of', 'good', 'fun', 'some', 'new', 'tricks', 'a', 'lot', 'will', 'show', 'them', 'your', 'mother', 'if', 'what', 'say', 'our', 'fish', 'no', 'make', 'away', 'tell', 'want', 'should', 'be', 'here', 'about', 'when', 'now', 'fear', 'my', 'are', 'bad', 'game', 'call', 'put', 'me', 'down', 'this', 'fall', 'let', 'hold', 'up', 'as', 'cup', '...', 'look', 'cake', 'top', 'books', 'toy', 'ship', 'milk', 'dish', 'hop', 'oh', 'these', 'rake', 'red', 'fan', 'fell', 'his', 'head', 'came', 'things', 'pot', 'sank', 'deep', 'shook', 'get', 'another', 'r

In [11]:
print(train_set)

[['the', 'cat', 'in', 'the', 'hat'], [], ['by', 'dr.', 'seuss'], [], ['the', 'sun', 'did', 'not', 'shine', '.'], ['it', 'was', 'too', 'wet', 'to', 'play', '.'], ['so', 'we', 'sat', 'in', 'the', 'house'], ['all', 'that', 'cold', 'cold', 'wet', 'day', '.'], [], ['i', 'sat', 'there', 'with', 'sally', '.'], ['we', 'sat', 'there', 'we', 'two', '.'], ['and', 'i', 'said', 'how', 'i', 'wish'], ['we', 'had', 'something', 'to', 'do', '!'], [], ['too', 'wet', 'to', 'go', 'out'], ['and', 'too', 'cold', 'to', 'play', 'ball', '.'], ['so', 'we', 'sat', 'in', 'the', 'house', '.'], ['we', 'did', 'nothing', 'at', 'all', '.'], [], ['so', 'all', 'we', 'could', 'do', 'was', 'to'], [], ['sit', '!'], ['sit', '!'], ['sit', '!'], ['sit', '!'], [], ['and', 'we', 'did', 'not', 'like', 'it', '.'], ['not', 'one', 'little', 'bit', '.'], [], ['bump', '!'], [], ['and', 'then'], ['something', 'went', 'bump', '!'], ['how', 'that', 'bump', 'made', 'us', 'jump', '!'], [], ['we', 'looked', '!'], ['then', 'we', 'saw', 'him

In [12]:
def replace_oov_words(train_set, closed_vocab, unkown_token="<UNK>"):
    processed_train_set = train_set.copy()
    for sentence in processed_train_set:
        for i in range(len(sentence)):
            if sentence[i] not in closed_vocab:
                sentence[i] = unkown_token 
    return processed_train_set

In [13]:
processed_train_set = replace_oov_words(train_set, vocab)
print(processed_train_set)

[['the', 'cat', 'in', 'the', 'hat'], [], ['<UNK>', '<UNK>', '<UNK>'], [], ['the', 'sun', 'did', 'not', '<UNK>', '.'], ['it', 'was', 'too', 'wet', 'to', 'play', '.'], ['so', 'we', 'sat', 'in', 'the', 'house'], ['all', 'that', 'cold', 'cold', 'wet', 'day', '.'], [], ['i', 'sat', 'there', 'with', 'sally', '.'], ['we', 'sat', 'there', 'we', 'two', '.'], ['and', 'i', 'said', 'how', 'i', 'wish'], ['we', 'had', 'something', 'to', 'do', '!'], [], ['too', 'wet', 'to', 'go', 'out'], ['and', 'too', 'cold', 'to', 'play', 'ball', '.'], ['so', 'we', 'sat', 'in', 'the', 'house', '.'], ['we', 'did', '<UNK>', 'at', 'all', '.'], [], ['so', 'all', 'we', 'could', 'do', 'was', 'to'], [], ['sit', '!'], ['sit', '!'], ['sit', '!'], ['sit', '!'], [], ['and', 'we', 'did', 'not', 'like', 'it', '.'], ['not', 'one', 'little', 'bit', '.'], [], ['bump', '!'], [], ['and', 'then'], ['something', 'went', 'bump', '!'], ['how', 'that', 'bump', '<UNK>', 'us', '<UNK>', '!'], [], ['we', 'looked', '!'], ['then', 'we', 'saw',

In [14]:
def count_n_grams(processed_train_set, n=3, start_token='<s>', end_token = '<e>'):
    n_grams = {}
    for sentence in processed_train_set:
        sentence = [start_token] * (n-1) + sentence + [end_token]
        for i in range(len(sentence)-(n-1)): # complete this line

            # Get the n-gram from i to i+n
            n_gram = ([sentence[i] for i in range(i,i+n)])
            n_gram = tuple(n_gram)
            
            n_grams[n_gram] = n_grams.get(n_gram,0)+1
    return n_grams
    
    


In [15]:
n_grams = count_n_grams(processed_train_set)
print(n_grams)

{('<s>', '<s>', 'the'): 5, ('<s>', 'the', 'cat'): 3, ('the', 'cat', 'in'): 10, ('cat', 'in', 'the'): 11, ('in', 'the', 'hat'): 11, ('the', 'hat', '<e>'): 6, ('<s>', '<s>', '<e>'): 46, ('<s>', '<s>', '<UNK>'): 8, ('<s>', '<UNK>', '<UNK>'): 1, ('<UNK>', '<UNK>', '<UNK>'): 1, ('<UNK>', '<UNK>', '<e>'): 1, ('<s>', 'the', 'sun'): 1, ('the', 'sun', 'did'): 1, ('sun', 'did', 'not'): 1, ('did', 'not', '<UNK>'): 1, ('not', '<UNK>', '.'): 2, ('<UNK>', '.', '<e>'): 7, ('<s>', '<s>', 'it'): 4, ('<s>', 'it', 'was'): 2, ('it', 'was', 'too'): 1, ('was', 'too', 'wet'): 1, ('too', 'wet', 'to'): 2, ('wet', 'to', 'play'): 1, ('to', 'play', '.'): 3, ('play', '.', '<e>'): 3, ('<s>', '<s>', 'so'): 10, ('<s>', 'so', 'we'): 3, ('so', 'we', 'sat'): 2, ('we', 'sat', 'in'): 2, ('sat', 'in', 'the'): 2, ('in', 'the', 'house'): 4, ('the', 'house', '<e>'): 2, ('<s>', '<s>', 'all'): 2, ('<s>', 'all', 'that'): 1, ('all', 'that', 'cold'): 1, ('that', 'cold', 'cold'): 1, ('cold', 'cold', 'wet'): 1, ('cold', 'wet', 'day'

In [16]:
def estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts.get(previous_n_gram,0)
            
    denominator = previous_n_gram_count + k*vocabulary_size

    n_plus1_gram = previous_n_gram + (word,)
 
    n_plus1_gram_count = n_plus1_gram_counts.get(n_plus1_gram,0)
            
    numerator = n_plus1_gram_count+k
        
    probability = numerator/denominator
    
    
    return probability    


In [17]:
bigrams = count_n_grams(processed_train_set, 3)
trigrams = count_n_grams(processed_train_set, 4)
print(bigrams)

{('<s>', '<s>', 'the'): 5, ('<s>', 'the', 'cat'): 3, ('the', 'cat', 'in'): 10, ('cat', 'in', 'the'): 11, ('in', 'the', 'hat'): 11, ('the', 'hat', '<e>'): 6, ('<s>', '<s>', '<e>'): 46, ('<s>', '<s>', '<UNK>'): 8, ('<s>', '<UNK>', '<UNK>'): 1, ('<UNK>', '<UNK>', '<UNK>'): 1, ('<UNK>', '<UNK>', '<e>'): 1, ('<s>', 'the', 'sun'): 1, ('the', 'sun', 'did'): 1, ('sun', 'did', 'not'): 1, ('did', 'not', '<UNK>'): 1, ('not', '<UNK>', '.'): 2, ('<UNK>', '.', '<e>'): 7, ('<s>', '<s>', 'it'): 4, ('<s>', 'it', 'was'): 2, ('it', 'was', 'too'): 1, ('was', 'too', 'wet'): 1, ('too', 'wet', 'to'): 2, ('wet', 'to', 'play'): 1, ('to', 'play', '.'): 3, ('play', '.', '<e>'): 3, ('<s>', '<s>', 'so'): 10, ('<s>', 'so', 'we'): 3, ('so', 'we', 'sat'): 2, ('we', 'sat', 'in'): 2, ('sat', 'in', 'the'): 2, ('in', 'the', 'house'): 4, ('the', 'house', '<e>'): 2, ('<s>', '<s>', 'all'): 2, ('<s>', 'all', 'that'): 1, ('all', 'that', 'cold'): 1, ('that', 'cold', 'cold'): 1, ('cold', 'cold', 'wet'): 1, ('cold', 'wet', 'day'

In [18]:
x = estimate_probability("chocolate", ("lyn", "drinks"), bigrams, trigrams, len(vocab))
print(x)

0.005780346820809248


In [19]:
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, end_token='<e>', unknown_token="<unk>",  k=1.0):

    previous_n_gram = tuple(previous_n_gram)    
    vocabulary = vocabulary + [end_token, unknown_token]    
    vocabulary_size = len(vocabulary)    
    probabilities = {}
    for word in vocabulary:
        probability = estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=k)  
        probabilities[word] = probability

    return probabilities

In [20]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None):
    n = len(list(n_gram_counts.keys())[0])
    previous_tokens = ['<s>'] * (n-1) + previous_tokens
    previous_n_gram = previous_tokens[-n:]
    probabilities = estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=k)
    suggestion = None
    max_prob = 0
    
    for word, prob in probabilities.items():
        if start_with and not word.startswith(start_with):
            continue
        if prob > max_prob:
            suggestion = word
            max_prob = prob
    
    return suggestion, max_prob


In [26]:
def generate_sentence(tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, end_token='<e>', unknown_token="<unk>", k=1.0):
    while True:
        next_word, _ = suggest_a_word(tokens, n_gram_counts, n_plus1_gram_counts, vocabulary)
        if next_word == end_token:
            break
        tokens.append(next_word)
        print(tokens)
    return tokens

In [None]:
sent = generate_sentence(["as", "i", "stand"], bigrams, trigrams, vocab)
print(sent)

TypeError: suggest_a_word() takes from 4 to 6 positional arguments but 7 were given