In [223]:
import numpy as np
import matplotlib as plt
import nltk               # NLP toolkit
import re

nltk.download('punkt')    # Download the Punkt sentence tokenizer 

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adithyashanker/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [224]:
# Toy corpus: three sentences separated by newline characters.
corpus = "Lyn drinks chocolate\nJohn drinks tea\nlyn eats chocolate"

In [225]:
def preprocess(corpus, n):
    """Normalize a raw text corpus and tokenize it line by line.

    The text is lower-cased, every character that is not a letter, digit,
    '.', '?', '!', space or newline is deleted, and the result is split on
    newline characters. Each resulting line is tokenized with
    ``nltk.word_tokenize``.

    Args:
        corpus: Raw text; lines are separated by "\\n".
        n: Accepted for interface compatibility; not used by this function.

    Returns:
        A list with one token list per line of the cleaned corpus.
    """
    lowered = corpus.lower()
    cleaned = re.sub(r"[^a-zA-Z0-9.?! \n]+", "", lowered)
    return [nltk.word_tokenize(line) for line in cleaned.split("\n")]


In [226]:
def count_prob(tokens):
    """Tally how many times each token occurs across all sentences.

    Despite the name, the result holds raw occurrence counts, not
    probabilities.

    Args:
        tokens: Iterable of sentences, each itself an iterable of tokens.

    Returns:
        A dict mapping each token to its number of occurrences, with keys
        in order of first appearance.
    """
    frequencies = {}
    for word in (tok for sent in tokens for tok in sent):
        if word in frequencies:
            frequencies[word] += 1
        else:
            frequencies[word] = 1
    return frequencies

# Count word frequencies over the tokenized corpus. Passing the raw string
# here would make count_prob tally individual characters instead of words.
counts = count_prob(preprocess(corpus, 2))

In [227]:
# Tokenize the corpus into one token list per line, then count how often
# each token occurs across all sentences.
train_set = preprocess(corpus, 2)
counts = count_prob(train_set)

In [228]:
def create_closed_vocab(counts, threshold=2):
    """Select the words frequent enough to belong to the closed vocabulary.

    Args:
        counts: Dict mapping each word to its occurrence count.
        threshold: Minimum count (inclusive) a word needs to be kept.

    Returns:
        A list of the words whose count is at least ``threshold``, in the
        iteration order of ``counts``.
    """
    return [word for word, freq in counts.items() if freq >= threshold]

In [229]:
# Build the closed vocabulary with the default threshold (count >= 2).
vocab = create_closed_vocab(counts)
print(vocab)

['lyn', 'drinks', 'chocolate']


In [230]:
# Show the tokenized sentences as they are before out-of-vocabulary replacement.
print(train_set)

[['lyn', 'drinks', 'chocolate'], ['john', 'drinks', 'tea'], ['lyn', 'eats', 'chocolate']]


In [231]:
def replace_oov_words(train_set, closed_vocab, unkown_token="<UNK>"):
    """Return a copy of the sentences with out-of-vocabulary words replaced.

    New sentence lists are built, so ``train_set`` and its inner lists are
    left untouched. (A shallow ``list.copy()`` would share the inner lists
    and overwrite the caller's data.)

    Args:
        train_set: List of sentences, each a list of hashable tokens.
        closed_vocab: Collection of tokens that are kept as-is.
        unkown_token: Replacement for every token not in ``closed_vocab``.

    Returns:
        A new list of new token lists, the same shape as ``train_set``.
    """
    # Set membership is O(1); the vocabulary is scanned once instead of per token.
    known_words = set(closed_vocab)
    return [
        [token if token in known_words else unkown_token for token in sentence]
        for sentence in train_set
    ]

In [232]:
# Replace every token that is not in the closed vocabulary with the default "<UNK>" marker.
processed_train_set = replace_oov_words(train_set, vocab)
print(processed_train_set)

[['lyn', 'drinks', 'chocolate'], ['<UNK>', 'drinks', '<UNK>'], ['lyn', '<UNK>', 'chocolate']]


In [243]:
def count_n_grams(processed_train_set, n=3, start_token='<s>', end_token = '<e>'):
    """Count every n-gram occurring in the given sentences.

    Each sentence is padded with ``n - 1`` start tokens in front and one end
    token at the back before its n-grams are read off. The input sentences
    are not modified.

    Args:
        processed_train_set: Iterable of sentences, each a list of tokens.
        n: Number of tokens per n-gram.
        start_token: Padding token prepended ``n - 1`` times.
        end_token: Token appended once to every sentence.

    Returns:
        A dict mapping each n-gram (a tuple of tokens) to its count.
    """
    tally = {}
    prefix = [start_token] * (n - 1)
    suffix = [end_token]
    for tokens in processed_train_set:
        padded = prefix + tokens + suffix
        window_count = len(padded) - (n - 1)
        for start in range(window_count):
            # Window of n consecutive tokens beginning at position `start`.
            window = tuple(padded[start + offset] for offset in range(n))
            if window in tally:
                tally[window] += 1
            else:
                tally[window] = 1
    return tally
    
    


In [242]:
# Count n-grams with the default n=3 (trigrams) over the <UNK>-replaced sentences.
n_grams = count_n_grams(processed_train_set)
print(n_grams)

{('<s>', '<s>', 'lyn'): 2, ('<s>', 'lyn', 'drinks'): 1, ('lyn', 'drinks', 'chocolate'): 1, ('drinks', 'chocolate', '<e>'): 1, ('<s>', '<s>', '<UNK>'): 1, ('<s>', '<UNK>', 'drinks'): 1, ('<UNK>', 'drinks', '<UNK>'): 1, ('drinks', '<UNK>', '<e>'): 1, ('<s>', 'lyn', '<UNK>'): 1, ('lyn', '<UNK>', 'chocolate'): 1, ('<UNK>', 'chocolate', '<e>'): 1}


In [246]:
def estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """Estimate the probability of ``word`` following ``previous_n_gram`` with add-k smoothing.

    Computes ``(count(previous_n_gram + word) + k) /
    (count(previous_n_gram) + k * vocabulary_size)``, where counts missing
    from the dicts are treated as 0.

    Args:
        word: Candidate next token.
        previous_n_gram: Sequence of preceding tokens (converted to a tuple).
        n_gram_counts: Dict mapping n-gram tuples to counts.
        n_plus1_gram_counts: Dict mapping (n+1)-gram tuples to counts.
        vocabulary_size: Vocabulary size used in the smoothed denominator.
        k: Smoothing constant added to the numerator.

    Returns:
        The smoothed probability estimate.

    Raises:
        ZeroDivisionError: If the smoothed denominator evaluates to zero.
    """
    context = tuple(previous_n_gram)
    smoothed_total = n_gram_counts.get(context, 0) + k*vocabulary_size
    smoothed_hits = n_plus1_gram_counts.get(context + (word,), 0) + k
    return smoothed_hits / smoothed_total


In [249]:
# Bigram counts supply the context frequencies and trigram counts the
# context-plus-word frequencies passed to estimate_probability.
bigrams = count_n_grams(processed_train_set, 2)
trigrams = count_n_grams(processed_train_set, 3)
print(bigrams)

{('<s>', 'lyn'): 2, ('lyn', 'drinks'): 1, ('drinks', 'chocolate'): 1, ('chocolate', '<e>'): 2, ('<s>', '<UNK>'): 1, ('<UNK>', 'drinks'): 1, ('drinks', '<UNK>'): 1, ('<UNK>', '<e>'): 1, ('lyn', '<UNK>'): 1, ('<UNK>', 'chocolate'): 1}


In [250]:
# Smoothed estimate (default k=1.0) of "chocolate" following ("lyn", "drinks").
# NOTE(review): len(vocab) does not count "<UNK>" or the "<e>" end token, both of
# which appear as final tokens in the trigram counts — confirm the intended vocabulary size.
x = estimate_probability("chocolate", ("lyn", "drinks"), bigrams, trigrams, len(vocab))
print(x)

0.5
