In [93]:
import numpy as np
import pandas as pd
import nltk
import random
import math
from collections import deque
nltk.data.path.append('.')

In [32]:
# Loading and reading data
# data_path = "en_US_twitter.txt"
data_path = "test_text.txt"
with open(data_path, "r") as f:
    data = f.read()

# Exploring data
print("------ Data Type ------")
print(f"data: {type(data)}")
print("------ ------")
print("------ # Letter ------")
print(f"# letters: {len(data)}")
print("------ ------")
print()
print("------ ------")
print(display(data[:300]))
print("------ ------")

------ Data Type ------
data: <class 'str'>
------ ------
------ # Letter ------
# letters: 1848
------ ------

------ ------


'Reading Eliot\'s "Four Quartets" in memory of Theo Angelopoulos. "In my end is my beginning."\nScrew the law we just some mother fukkin kids.\n#YouWannaImpressMe respect Justin Bieber and I\'ll be impressed alright ;)\nJust because it\'s 4:20 somewhere\nHappy birthday\nSetting goals includes determining you'

None
------ ------


In [20]:
# data preprocessing
# 1- splitting data into sentences
def split_data_to_sentences(data):
    sents = [sent.strip() for sent in data.split("\n") if len(sent) > 0]
    return sents

In [25]:
x = split_data_to_sentences("Hello there\n welcome to the assignment.\n of this \n week. \n")
x

['Hello there', 'welcome to the assignment.', 'of this', 'week.']

In [26]:
# 2- splitting sentences into tokens
def tokenize_sents(sentences):
    tokens_l = []
    for sent in sentences:
        sent = sent.lower()
        tokens = nltk.word_tokenize(sent)
        tokens_l.append(tokens)
        
    return tokens_l

In [27]:
tokens_l = tokenize_sents(x)
tokens_l

[['hello', 'there'],
 ['welcome', 'to', 'the', 'assignment', '.'],
 ['of', 'this'],
 ['week', '.']]

In [81]:
# 3- applying preprocessing functions to the original data
def tokenize_data(data):
    sents = split_data_to_sentences(data)
    tokens_l = tokenize_sents(sents)
    return tokens_l

In [82]:
tokenize_data("Hello there\n welcome to the assignment.\n of this \n week. \n")

[['hello', 'there'],
 ['welcome', 'to', 'the', 'assignment', '.'],
 ['of', 'this'],
 ['week', '.']]

In [83]:
# 4- splitting data into train and test sets
tokenized_data = tokenize_data(data)
# setting seed for consistincy
random.seed(321)

# train size = 80%
train_size = int(len(tokenized_data) * 0.8)
# splitting data into train and test
train_data = tokenized_data[0:train_size]
test_data = tokenized_data[train_size:]

In [84]:
print(f"length of training data {len(train_data)}")
print(f"length of test data {len(test_data)}")
print()
print(f"Train: {train_data[0]}")
print(f"Test: {test_data[0]}")

length of training data 22
length of test data 6

Train: ['reading', 'eliot', "'s", '``', 'four', 'quartets', "''", 'in', 'memory', 'of', 'theo', 'angelopoulos', '.', '``', 'in', 'my', 'end', 'is', 'my', 'beginning', '.', "''"]
Test: ['not', 'gon', 'na', 'lie', 'i', "'m", 'lovin', 'this', 'rental', 'ford', 'focus', 'hatchback']


In [85]:
# 5- get words frequency
def count_words(tokenized_l):
    # initialize counts dict
    counts = {token: 0 for sent in tokenized_l for token in sent}
    # loop through each sentence withing the list
    for sent in tokenized_l:
        # lopp through each token withing the sentence
        for token in sent:
            if token in counts:
                counts[token] +=1
    
    return counts

In [86]:
s = [["hi", "i", "am"], ["ali", "am", "i"], ["welcome", "to", "my", "channel", "ali", "channel"], ["i", "love", "python", "."], ["python", "coding", "is", "amazing", "."]]
test_counts = count_words(s)
test_counts

{'hi': 1,
 'i': 3,
 'am': 2,
 'ali': 2,
 'welcome': 1,
 'to': 1,
 'my': 1,
 'channel': 2,
 'love': 1,
 'python': 2,
 '.': 2,
 'coding': 1,
 'is': 1,
 'amazing': 1}

In [87]:
# 6- Handling OOV by keeping words that satisify threshold and others will be set to <unk> token
def most_freq_words_to_keep(counts_dict, threshold):
    # store words to keep
    kept_words = [w for w, v in counts_dict.items() if v >= threshold]
            
    return kept_words

In [88]:
close_v = most_freq_words_to_keep(test_counts, 2)
close_v

['i', 'am', 'ali', 'channel', 'python', '.']

In [89]:
# 7- Replace non frequent words in each sentence with <unk> token
def replace_words_with_tokens(close_v, tokenized_l):
    # set unk token
    unk_token = "<unk>"
    # loop through indices of outer list
    for i in range(len(tokenized_l)):
        # loop through inner list indices
        for j in range(len(tokenized_l[i])):
            # if token in close vocab keep it unchanged
            if tokenized_l[i][j] in close_v:
                continue
            # else replace it with <unk>
            else:
                tokenized_l[i][j] = unk_token
                
    return tokenized_l

In [90]:
tokenized_l_oov = replace_words_with_tokens(close_v, s)
tokenized_l_oov

[['<unk>', 'i', 'am'],
 ['ali', 'am', 'i'],
 ['<unk>', '<unk>', '<unk>', 'channel', 'ali', 'channel'],
 ['i', '<unk>', 'python', '.'],
 ['python', '<unk>', '<unk>', '<unk>', '.']]

In [91]:
# 8- preprocessing entire data by storing most frequent words from training set and replace words 
# in both training and test set by <unk> token
def preprocess_tokenized_data(train_tokenized, test_tokenized, threshold):
    # first build the frequency matrix from train data
    counts_dict = count_words(train_tokenized)
    # make close_vocab using frequency matrix
    close_vocab = most_freq_words_to_keep(counts_dict, threshold)
    # preprocess both train and test tokenized sentences to handle OOV
    train_tokenized_pre = replace_words_with_tokens(close_vocab, train_tokenized)
    test_tokenized_pre = replace_words_with_tokens(close_vocab, test_tokenized)
    # return both preprocessed sentences and the close vocab
    return train_tokenized_pre, test_tokenized_pre, close_vocab

In [92]:
tmp_train = [['sky', 'is', 'blue', '.'],
     ['leaves', 'are', 'green']]
tmp_test = [['roses', 'are', 'red', '.']]

tmp_train_repl, tmp_test_repl, tmp_vocab = preprocess_tokenized_data(tmp_train, tmp_test, 1)

print("tmp_train_repl")
print(tmp_train_repl)
print()
print("tmp_test_repl")
print(tmp_test_repl)
print()
print("tmp_vocab")
print(tmp_vocab)

tmp_train_repl
[['sky', 'is', 'blue', '.'], ['leaves', 'are', 'green']]

tmp_test_repl
[['<unk>', 'are', '<unk>', '.']]

tmp_vocab
['sky', 'is', 'blue', '.', 'leaves', 'are', 'green']


In [265]:
# 9- n-grams frequency dictionary
def build_n_gram_word_frequency(list_of_tokenized_sentences, n):
    sos_token = "<s>"
    eos_token = "<e>"
    
    # for each sentence 
    # for i in range(len(list_of_tokenized_sentences)):
    #     list_of_tokenized_sentences[i] = deque(list_of_tokenized_sentences[i])
    #     if n > 1:
    #         # add n-1 sos tokens to the start of the sentence
    #         for j in range(n - 1):
    #             list_of_tokenized_sentences[i].appendleft(sos_token)
    #     else:
    #         list_of_tokenized_sentences[i].appendleft(sos_token)

    #     list_of_tokenized_sentences[i] = list(list_of_tokenized_sentences[i])
         
    
    bag_counter = {}

    # loop through the entire list
    for sentence in list_of_tokenized_sentences:
        # adding eos token to the sentence
        sentence = [sos_token] * n + sentence + [eos_token]
        # convert sentence to tuple
        sentence = tuple(sentence)
        # looping through length of the sentence-n+1  
        for i in range(len(sentence)-n+1):
            # n-gram sequence starts from i to i+n
            n_gram = sentence[i: i + n]
            # check if the n-gram is in the dictionary
            if n_gram not in bag_counter:
                # initialize it with 1 if it is the first time occuring
                bag_counter[n_gram] = 1
            else:
                # increment the frequency by 1
                bag_counter[n_gram] += 1
                
    return bag_counter

In [266]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
print("Uni-gram:")
print(build_n_gram_word_frequency(sentences, 1))
print("Bi-gram:")
print(build_n_gram_word_frequency(sentences, 2))

print(sentences[0])

Uni-gram:
{('<s>',): 2, ('i',): 1, ('like',): 2, ('a',): 2, ('cat',): 2, ('<e>',): 2, ('this',): 1, ('dog',): 1, ('is',): 1}
Bi-gram:
{('<s>', '<s>'): 2, ('<s>', 'i'): 1, ('i', 'like'): 1, ('like', 'a'): 2, ('a', 'cat'): 2, ('cat', '<e>'): 2, ('<s>', 'this'): 1, ('this', 'dog'): 1, ('dog', 'is'): 1, ('is', 'like'): 1}
['i', 'like', 'a', 'cat']


In [267]:
tmp_train = [['sky', 'is', 'blue', '.'],
     ['leaves', 'are', 'green'],
    ['leaves', 'are', 'red'],
    ['sky', 'is', 'green']]
# unigram
unigram = build_n_gram_word_frequency(tmp_train, 1)
print(f"Unigram frequency: {unigram}")

bigram = build_n_gram_word_frequency(tmp_train, 2)
print(f"Bigram frequency: {bigram}")

trigram = build_n_gram_word_frequency(tmp_train, 3)
print(f"Trigram frequency: {trigram}")

Unigram frequency: {('<s>',): 4, ('sky',): 2, ('is',): 2, ('blue',): 1, ('.',): 1, ('<e>',): 4, ('leaves',): 2, ('are',): 2, ('green',): 2, ('red',): 1}
Bigram frequency: {('<s>', '<s>'): 4, ('<s>', 'sky'): 2, ('sky', 'is'): 2, ('is', 'blue'): 1, ('blue', '.'): 1, ('.', '<e>'): 1, ('<s>', 'leaves'): 2, ('leaves', 'are'): 2, ('are', 'green'): 1, ('green', '<e>'): 2, ('are', 'red'): 1, ('red', '<e>'): 1, ('is', 'green'): 1}
Trigram frequency: {('<s>', '<s>', '<s>'): 4, ('<s>', '<s>', 'sky'): 2, ('<s>', 'sky', 'is'): 2, ('sky', 'is', 'blue'): 1, ('is', 'blue', '.'): 1, ('blue', '.', '<e>'): 1, ('<s>', '<s>', 'leaves'): 2, ('<s>', 'leaves', 'are'): 2, ('leaves', 'are', 'green'): 1, ('are', 'green', '<e>'): 1, ('leaves', 'are', 'red'): 1, ('are', 'red', '<e>'): 1, ('sky', 'is', 'green'): 1, ('is', 'green', '<e>'): 1}


In [268]:
# 10- Building n-gram probaility matrix using k-smoothing (C(prev n-gram + word) / C(prev n-gram))
# to handle non-occuring n-grams
def build_n_gram_probability_with_k_smoothing(word, previous_n_gram, n_gram_counts, n_plus_one_gram_counts, vocab, k):
    # length of vocabulary
    v = len(vocab)
    # converting previous_n_gram to tuple
    previous_n_gram = tuple(previous_n_gram)
    # now we check if the previous n-gram in the n_gram_counts to take its value (denominator)
    previous_n_gram_count = n_gram_counts.get(previous_n_gram, 0)
    # add the word to the previous n-gram
    prev_n_gram_plus_word = previous_n_gram + (word, )
    # check if the n-gram plus word is in the n_gram_plus_word_counts dictionary
    prev_n_gram_plus_word_count = n_plus_one_gram_counts.get(prev_n_gram_plus_word, 0)
    # Now the formula is dividing the count of previous n-gram + word + k by the previous_n_gram_count + kXv
    numerator = prev_n_gram_plus_word_count + k
    denominator = previous_n_gram_count + (k * v)
    # getting the conditional probability
    cond_prob = numerator / denominator
    # return probability
    return cond_prob

In [269]:
# test case
sentences = [["i", "love", "cats", "and", "dogs", "."], ["cats", "are", "cute", "animals", "."]]
vocab = list(set(sentences[0] + sentences[1]))

unigram_counts = build_n_gram_word_frequency(sentences, 1)
bigram_counts = build_n_gram_word_frequency(sentences, 2)

build_n_gram_probability_with_k_smoothing("cats", "are", unigram_counts, bigram_counts, vocab, 1.0)

0.1111111111111111

In [270]:
# 11- now we will use the previous function to estimate the probabilities of all words in the vocabulary
def estimate_probabilities(prev_n_gram, n_gram_counts, n_plus_one_gram_counts, vocab, k):
    # add end of sentence token and unknown token to the vocab.
    # execlude start of the sentence token as it shouldn't be the next word
    vocab = vocab + ["<unk>", "<e>"]
    # estimate probability for each word in the vocab
    n_gram_probs = {word: build_n_gram_probability_with_k_smoothing(word, prev_n_gram, n_gram_counts, n_plus_one_gram_counts, vocab, k)
                    for word in vocab}
    
    return n_gram_probs

In [271]:
# test case
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
unigram_counts = build_n_gram_word_frequency(sentences, 1)
bigram_counts = build_n_gram_word_frequency(sentences, 2)
estimate_probabilities("a", unigram_counts, bigram_counts, unique_words, k=1)

{'a': 0.09090909090909091,
 'like': 0.09090909090909091,
 'is': 0.09090909090909091,
 'i': 0.09090909090909091,
 'this': 0.09090909090909091,
 'cat': 0.2727272727272727,
 'dog': 0.09090909090909091,
 '<unk>': 0.09090909090909091,
 '<e>': 0.09090909090909091}

In [272]:
# Additional test
trigram_counts = build_n_gram_word_frequency(sentences, 3)
estimate_probabilities(["<s>", "<s>"], bigram_counts, trigram_counts, unique_words, k=1)

{'a': 0.09090909090909091,
 'like': 0.09090909090909091,
 'is': 0.09090909090909091,
 'i': 0.18181818181818182,
 'this': 0.18181818181818182,
 'cat': 0.09090909090909091,
 'dog': 0.09090909090909091,
 '<unk>': 0.09090909090909091,
 '<e>': 0.09090909090909091}

In [273]:
def make_count_matrix(n_plus1_gram_counts, vocabulary):
    # add <e> <unk> to the vocabulary
    # <s> is omitted since it should not appear as the next word
    vocabulary = vocabulary + ["<e>", "<unk>"]
    
    # obtain unique n-grams
    n_grams = []
    for n_plus1_gram in n_plus1_gram_counts.keys():
        n_gram = n_plus1_gram[0:-1]
        n_grams.append(n_gram)
    n_grams = list(set(n_grams))
    
    # mapping from n-gram to row
    row_index = {n_gram:i for i, n_gram in enumerate(n_grams)}
    # mapping from next word to column
    col_index = {word:j for j, word in enumerate(vocabulary)}
    
    nrow = len(n_grams)
    ncol = len(vocabulary)
    count_matrix = np.zeros((nrow, ncol))
    for n_plus1_gram, count in n_plus1_gram_counts.items():
        n_gram = n_plus1_gram[0:-1]
        word = n_plus1_gram[-1]
        if word not in vocabulary:
            continue
        i = row_index[n_gram]
        j = col_index[word]
        count_matrix[i, j] = count
    
    count_matrix = pd.DataFrame(count_matrix, index=n_grams, columns=vocabulary)
    return count_matrix

In [274]:
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = build_n_gram_word_frequency(sentences, 2)

print('bigram counts')
display(make_count_matrix(bigram_counts, unique_words))

bigram counts


Unnamed: 0,a,like,is,i,this,cat,dog,<e>,<unk>
"(like,)",2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(i,)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>,)",0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
"(this,)",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
"(is,)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(cat,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
"(a,)",0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
"(dog,)",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [275]:
trigram_counts = build_n_gram_word_frequency(sentences, 3)

print('trigram counts')
display(make_count_matrix(trigram_counts, unique_words))

trigram counts


Unnamed: 0,a,like,is,i,this,cat,dog,<e>,<unk>
"(i, like)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>, i)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(this, dog)",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>, <s>)",0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
"(dog, is)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>, this)",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
"(is, like)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(a, cat)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
"(like, a)",0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0


In [276]:
def make_probability_matrix(n_plus1_gram_counts, vocabulary, k):
    count_matrix = make_count_matrix(n_plus1_gram_counts, unique_words)
    count_matrix += k
    prob_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0)
    return prob_matrix

In [277]:
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = build_n_gram_word_frequency(sentences, 2)
print("bigram probabilities")
display(make_probability_matrix(bigram_counts, unique_words, k=1))

bigram probabilities


Unnamed: 0,a,like,is,i,this,cat,dog,<e>,<unk>
"(like,)",0.272727,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(i,)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(<s>,)",0.090909,0.090909,0.090909,0.181818,0.181818,0.090909,0.090909,0.090909,0.090909
"(this,)",0.1,0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1
"(is,)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(cat,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909
"(a,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909,0.090909
"(dog,)",0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1


In [278]:
trigram_counts = build_n_gram_word_frequency(sentences, 3)
print("trigram probabilities")
display(make_probability_matrix(trigram_counts, unique_words, k=1))

trigram probabilities


Unnamed: 0,a,like,is,i,this,cat,dog,<e>,<unk>
"(i, like)",0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(<s>, i)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(this, dog)",0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1
"(<s>, <s>)",0.090909,0.090909,0.090909,0.181818,0.181818,0.090909,0.090909,0.090909,0.090909
"(dog, is)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(<s>, this)",0.1,0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1
"(is, like)",0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(a, cat)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909
"(like, a)",0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909,0.090909


In [287]:
# 12- now we want to evaluate the n-gram language model, Perplexity is used to evaluate the model
# performance on test set
def perplexity_score(sentence, n_gram_counts, n_plus_one_gram_counts, vocab, k = 1.0):
    # length of previous words
    n = len(list(n_gram_counts.keys())[0])
    # appending start and end token to the sentence
    sentence = ["<s>"] * n + sentence + ["<e>"]
    # convert sentence from list to tuple
    sentence = tuple(sentence)
    # length of sentence
    N = len(sentence)
    # initialize p_prod to be the product of n-gram probabilities
    p_prod = 1.0
    
    # optional: 
    # log_probs = 0
    
    for t in range(n, N):
        # getting prev n-gram
        prev_n_gram = sentence[t - n : t]
        # getting the current word
        curr_word = sentence[t]
        # probability of the curr word given n-gram
        prob = build_n_gram_probability_with_k_smoothing(curr_word, prev_n_gram, n_gram_counts, n_plus_one_gram_counts, vocab, k)
        # updating the p_prod
        p_prod *= (1 / prob)
        # updating log_probs
        # log_probs += math.log(prob)
        
    # perplexity score is the N-th root of the comulative product of probability of the word given n-gram
    perplexity_score = p_prod ** (1 / N)
    # perplexity_score = log_probs * (1 / N)
    # return perplexity score
    return perplexity_score

In [288]:
# test your code

sentences_test = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]

unique_words = list(set(sentences_test[0] + sentences_test[1]))

unigram_counts = build_n_gram_word_frequency(sentences_test, 1)
bigram_counts = build_n_gram_word_frequency(sentences_test, 2)

perplexity_train1 = perplexity_score(sentences[0],
                                         unigram_counts, bigram_counts,
                                         unique_words, k=1.0)
print(f"Perplexity for first train sample: {perplexity_train1:.4f}")

test_sentence = ['i', 'like', 'a', 'dog']
perplexity_test = perplexity_score(test_sentence,
                                       unigram_counts, bigram_counts,
                                       unique_words, k=1.0)
print(f"Perplexity for test sample: {perplexity_test:.4f}")

Perplexity for first train sample: 2.8040
Perplexity for test sample: 3.9654


In [291]:
# 12- Building autocomplete model
def suggest_word(previous_tokens, n_gram_counts, n_plus_one_gram_counts, vocab, k = 1.0, start_with = None):
    # length of previous words
    n = len(list(n_gram_counts.keys())[0])
    # get the words already inputted
    prev_n_gram = previous_tokens[-n:]
    # getting probability of the next words
    probs = estimate_probabilities(prev_n_gram, n_gram_counts, n_plus_one_gram_counts, vocab, k)
    # initilize suggested word to None
    suggestion = None
    # initialize max_prob of the suggested word to 0
    max_prob = 0
    
    for word, prob in probs.items():
        if start_with is not None:
            # execlude words that do not match with the start_with token
            if not word.startswith(start_with):
                continue
        
        # check if the current word is the most probable next word
        if prob > max_prob:
            suggestion = word
            max_prob = prob

    return suggestion, max_prob

In [293]:
# test case
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = build_n_gram_word_frequency(sentences, 1)
bigram_counts = build_n_gram_word_frequency(sentences, 2)

# inputted tokens
input = ["i", "like"]

suggestion, max_prob = suggest_word(input, unigram_counts, bigram_counts, unique_words, k = 1.0, start_with = None)
print(f"Most probable word for the sentence\n{input}\nis\n({suggestion}) with probability {max_prob}")

Most probable word for the sentence
['i', 'like']
is
(a) with probability 0.2727272727272727


In [294]:
# 13- define function to make multible suggestion based on n-gram
def get_multible_suggestion(previous_tokens, n_gram_count_list, vocab, k = 1.0, start_with = None):
    model_counts = len(n_gram_count_list)
    # initialize variable to hold suggested words and their probbility
    suggestions = []
    probs = []
    for i in range(model_counts - 1):
        # get n_gram_counts
        n_gram_counts = n_gram_count_list[i]
        # get n_plus_one_gram_counts
        n_plus_one_gram_counts = n_gram_count_list[i + 1]
        # get suggested word
        suggestion, prob = suggest_word(previous_tokens, n_gram_counts, n_plus_one_gram_counts, vocab, k = k, start_with = start_with)
        # append suggesed word to suggestions
        suggestions.append(suggestion)
        probs.append(prob)
    return suggestions, probs

In [297]:
# test case
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = build_n_gram_word_frequency(sentences, 1)
bigram_counts = build_n_gram_word_frequency(sentences, 2)
trigram_counts = build_n_gram_word_frequency(sentences, 3)
quadgram_counts = build_n_gram_word_frequency(sentences, 4)
quintgram_counts = build_n_gram_word_frequency(sentences, 5)

n_gram_count_list = [unigram_counts, bigram_counts, trigram_counts, quadgram_counts, quintgram_counts]
input = ["i", "like"]

suggestions, probs = get_multible_suggestion(input, n_gram_count_list, unique_words, k = 1.0, start_with = None)
print(f"Suggestions for '{input}'")
print()
print(suggestions)
print(probs)

Suggestions for '['i', 'like']'

['a', 'a', 'a', 'a']
[0.2727272727272727, 0.2, 0.1111111111111111, 0.1111111111111111]
