In [1]:
import numpy as np
import pandas as pd
import nltk
from collections import Counter

In [2]:
with open('en_US.twitter.txt','r') as f:
    data = f.read()

In [3]:
print(len(data)) # letters count

3335477


In [4]:
# preprocess steps

# split into sentences
# split into tokens
# train test
# build vocab with N times appears in training data
# handling the unknowing words

In [5]:
print(data[500:1000])

t another day off from skool due to the wonderful snow (: and THIS wakes me up...damn thing
I'm coo... Jus at work hella tired r u ever in cali
The new sundrop commercial ...hehe love at first sight
we need to reconnect THIS WEEK
I always wonder how the guys on the auctions shows learned to talk so fast!? all I hear is djsosnekspqnslanskam.
Dammnnnnn what a catch
such a great picture! The green shirt totally brings out your eyes!
Desk put together, room all set up. Oh boy, oh boy
I'm doing it!👦



In [6]:
def split_to_sentences(data):

    sentences = data.split('\n')

    #remove any space in the start or end the sentence
    sentences = [s.strip() for s in sentences if len(s) > 0]

    return sentences

In [7]:
x = """
I have a pen.
I have an apple.
"""

print(split_to_sentences(x))

['I have a pen.', 'I have an apple.']


In [8]:
nltk.download('punkt_tab')

def tokenize_sentences(sentences):

    tokens = [nltk.word_tokenize(sent.lower()) for sent in sentences]

    return tokens

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [9]:
sentences = ["Sky is blue.", "Leaves are green.", "Roses are red."]
print(tokenize_sentences(sentences))

[['sky', 'is', 'blue', '.'], ['leaves', 'are', 'green', '.'], ['roses', 'are', 'red', '.']]


In [10]:
def get_tokenized_data(data):

    sents = split_to_sentences(data)
    tokens = tokenize_sentences(sents)

    return tokens

In [11]:
def train_test_split(data,train_size):

    train_percent =int(len(data) * train_size)
    np.random.seed(2)
    np.random.shuffle(data)

    train = data[ : train_percent]
    test = data[train_percent : ]

    return train, test

In [12]:
tokenized_data = get_tokenized_data(data)

train, test = train_test_split(tokenized_data, 0.8)

In [13]:
print("train data percent:",len(train) / len(tokenized_data))

train data percent: 0.7999833197806551


In [14]:
print(train[0])

['do', 'you', 'happen', 'to', 'have', 'a', 'extra', 'pair', 'of', 'tickets', 'for', 'tommarow', '?', '?', '?', '...']


In [15]:
def count_words(tokenized_sents):

    count_dict = Counter()

    for sent in tokenized_sents:
        count_dict.update(sent)

    return count_dict

In [16]:
tokenized_sentences = [['sky', 'is', 'blue', '.'],
                       ['leaves', 'are', 'green', '.'],
                       ['roses', 'are', 'red', '.']]

count_words(tokenized_sentences)

Counter({'sky': 1,
         'is': 1,
         'blue': 1,
         '.': 3,
         'leaves': 1,
         'are': 2,
         'green': 1,
         'roses': 1,
         'red': 1})

In [17]:
def get_words_with_nplus_frequency(tokenized_sents, count_threshold):

    # build vocab
    vocab = []
    count_dict = count_words(tokenized_sents)

    for word, count in count_dict.items():
        if count >= count_threshold:
            vocab.append(word)

    return vocab

In [18]:
print(get_words_with_nplus_frequency(tokenized_sentences, 2))

['.', 'are']


In [19]:
def replace_oov_words_by_unk(tokenized_sentences, vocab):

    # any word doesn't exist in vocab, use <UNK>

    sentences_with_unk = []

    # iterate over sentences
    for sent in tokenized_sentences:
        single_sent_with_unk = []

        # iterate over tokens
        for token in sent:

            if token in vocab:
                single_sent_with_unk.append(token)
            else:
                single_sent_with_unk.append('<UNK>')

        sentences_with_unk.append(single_sent_with_unk)

    return sentences_with_unk

In [20]:
tokenized_sentences = [["dogs", "run"], ["cats", "sleep"]]
vocabulary = ["dogs", "sleep"]
print(replace_oov_words_by_unk(tokenized_sentences,vocabulary))

[['dogs', '<UNK>'], ['<UNK>', 'sleep']]


In [21]:
def preprocess_data(train, test, count_threshold):

    # build vocab
    vocab = get_words_with_nplus_frequency(train, count_threshold)

    # get train and test data with handling OOV words
    train = replace_oov_words_by_unk(train, vocab)
    test = replace_oov_words_by_unk(test, vocab)

    return train, test, vocab

In [22]:
tmp_train = [['sky', 'is', 'blue', '.'],
     ['leaves', 'are', 'green']]
tmp_test = [['roses', 'are', 'red', '.']]

tmp_train_repl, tmp_test_repl, tmp_vocab = preprocess_data(tmp_train,
                                                           tmp_test,
                                                           count_threshold = 1)

In [23]:
print("tmp_train_repl")
print(tmp_train_repl)
print()
print("tmp_test_repl")
print(tmp_test_repl)
print()
print("tmp_vocab")
print(tmp_vocab)

tmp_train_repl
[['sky', 'is', 'blue', '.'], ['leaves', 'are', 'green']]

tmp_test_repl
[['<UNK>', 'are', '<UNK>', '.']]

tmp_vocab
['sky', 'is', 'blue', '.', 'leaves', 'are', 'green']


In [24]:
minimum_freq = 2
# apply to the real train test data
train_data, test_data, vocab = preprocess_data(train, test, minimum_freq)

In [25]:
print("First preprocessed training sample:")
print(train_data[0])
print()
print("First preprocessed test sample:")
print(test_data[0])
print()
print("First 10 vocabulary:")
print(vocab[0:10])
print()
print("Size of vocabulary:", len(vocab))

First preprocessed training sample:
['do', 'you', 'happen', 'to', 'have', 'a', 'extra', 'pair', 'of', 'tickets', 'for', '<UNK>', '?', '?', '?', '...']

First preprocessed test sample:
['miley', 'is', 'not', 'a', 'bad', 'girl', ':', 'd']

First 10 vocabulary:
['do', 'you', 'happen', 'to', 'have', 'a', 'extra', 'pair', 'of', 'tickets']

Size of vocabulary: 14764


In [26]:
def count_n_grams(data, n):

    n_gram_dict = {}

    for sent in data:

        # each sent should start with number of grams -1 of '<s>' and ends with 1 '<e>'
        sent = ['<s>'] * (n-1) + sent + ['<e>']

        for i in range(len(sent)-n):

            ngram = tuple(sent[i : i+n])

            if ngram in n_gram_dict:
                n_gram_dict[ngram] += 1
            else:
                n_gram_dict[ngram] = 1

    return n_gram_dict

In [27]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
print("Uni-gram:")
print(count_n_grams(sentences, 1))
print("Bi-gram:")
print(count_n_grams(sentences, 2))
print("TRi-gram:")
print(count_n_grams(sentences, 3))

Uni-gram:
{('i',): 1, ('like',): 2, ('a',): 2, ('cat',): 2, ('this',): 1, ('dog',): 1, ('is',): 1}
Bi-gram:
{('<s>', 'i'): 1, ('i', 'like'): 1, ('like', 'a'): 2, ('a', 'cat'): 2, ('<s>', 'this'): 1, ('this', 'dog'): 1, ('dog', 'is'): 1, ('is', 'like'): 1}
TRi-gram:
{('<s>', '<s>', 'i'): 1, ('<s>', 'i', 'like'): 1, ('i', 'like', 'a'): 1, ('like', 'a', 'cat'): 2, ('<s>', '<s>', 'this'): 1, ('<s>', 'this', 'dog'): 1, ('this', 'dog', 'is'): 1, ('dog', 'is', 'like'): 1, ('is', 'like', 'a'): 1}


In [28]:
def estimate_probability(word, prev_words, ngram_counts, prev_ngram_counts, k, vocab_size):

    # the reason for the list inside tuple is to concatine words not chracters
    # without list the tuple will deal with the chracters of word
    conca = tuple(prev_words) + tuple([word])

    # get n_garm count
    c_ngram = ngram_counts.get(conca,0)
    # get previous n_garm count
    c_prev_ngram = prev_ngram_counts.get(tuple(prev_words,),0)

    # get proba, k is the smoothing number to avoid zeros
    proba = (c_ngram + k) / (c_prev_ngram + k * vocab_size)

    return proba

In [29]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
vocab_size =len(set(sentences[0]+sentences[1]))
ngram_counts = count_n_grams(sentences,2)
prev_ngram_counts = count_n_grams(sentences,1)

print("proba:",estimate_probability('cat','a' ,ngram_counts,prev_ngram_counts,1,vocab_size))

proba: 0.3333333333333333


In [30]:
def estimate_probabilities(prev_words, ngram_counts, prev_ngram_counts,vocab , k):

    probas = {}
    vocab = vocab + ['<e>' , '<UNK>']

    for word in vocab:
        proba = estimate_probability(word, prev_words,ngram_counts, prev_ngram_counts, k, len(vocab))
        probas[word] = proba

    return probas

In [31]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]

unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)

bigram_counts = count_n_grams(sentences, 2)

estimate_probabilities('a', bigram_counts, unigram_counts,unique_words, k=1)

{'is': 0.09090909090909091,
 'dog': 0.09090909090909091,
 'this': 0.09090909090909091,
 'like': 0.09090909090909091,
 'cat': 0.2727272727272727,
 'i': 0.09090909090909091,
 'a': 0.09090909090909091,
 '<e>': 0.09090909090909091,
 '<UNK>': 0.09090909090909091}

In [32]:
trigram_counts = count_n_grams(sentences, 3)
estimate_probabilities(['i', 'like'], trigram_counts, bigram_counts,unique_words, k=1)

{'is': 0.1,
 'dog': 0.1,
 'this': 0.1,
 'like': 0.1,
 'cat': 0.1,
 'i': 0.1,
 'a': 0.2,
 '<e>': 0.1,
 '<UNK>': 0.1}

In [39]:
def make_count_matrix(vocab, ngram):
    # ngram will be like (i, need) so i want to take the first word
    # and see it with the rest of vocab
    # because in dict the keys may repeated like (i, want), (i, need)
    # but in matrix each key will  be in a row and you can see the count
    # with each word in vocab

    vocab = vocab + ['<e>','<UNK>']

    n_gram_rows_l = []

    for gram in ngram.keys():
        prev_ngram = gram[:-1]
        n_gram_rows_l.append(prev_ngram)

    # Delete any duplicates
    n_gram_rows_l = list(set(n_gram_rows_l))

    row_index = { gram : i for i, gram in enumerate(n_gram_rows_l) }
    col_index = { word : i for i, word in enumerate(vocab) }

    count_matrix = np.zeros( (len(row_index), len(col_index)) )

    for gram, count in ngram.items():
        prev_ngram = gram[:-1]
        word = gram[-1]

        if word not in vocab:
            continue

        count_matrix[row_index[prev_ngram], col_index[word]] = count

    count_matrix_df = pd.DataFrame(count_matrix, index = n_gram_rows_l, columns = vocab)

    return count_matrix_df

In [40]:
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]

unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = count_n_grams(sentences, 2)
print('bigram counts')
display(make_count_matrix(unique_words, bigram_counts))

bigram counts


Unnamed: 0,is,dog,this,like,cat,i,a,<e>,<UNK>
"(<s>,)",0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
"(like,)",0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
"(dog,)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(is,)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"(this,)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(a,)",0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
"(i,)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [41]:
print('\ntrigram counts')
trigram_counts = count_n_grams(sentences, 3)
display(make_count_matrix(unique_words,trigram_counts))


trigram counts


Unnamed: 0,is,dog,this,like,cat,i,a,<e>,<UNK>
"(<s>, this)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>, i)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"(dog, is)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"(this, dog)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(is, like)",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
"(<s>, <s>)",0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
"(like, a)",0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
"(i, like)",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [42]:
def make_probability_matrix(vocab, ngram, k=1):

    count_matrix = make_count_matrix(vocab, ngram) + k
    proba_matrix = count_matrix.div(count_matrix.sum(axis = 1), axis=0)

    return proba_matrix

In [43]:

sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = count_n_grams(sentences, 2)
print("bigram probabilities")
display(make_probability_matrix(unique_words,bigram_counts))

bigram probabilities


Unnamed: 0,is,dog,this,like,cat,i,a,<e>,<UNK>
"(<s>,)",0.090909,0.090909,0.181818,0.090909,0.090909,0.181818,0.090909,0.090909,0.090909
"(like,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909
"(dog,)",0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(is,)",0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1
"(this,)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(a,)",0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909,0.090909,0.090909
"(i,)",0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1


In [44]:
print("trigram probabilities")
trigram_counts = count_n_grams(sentences, 3)
display(make_probability_matrix(unique_words,trigram_counts,  k=1))

trigram probabilities


Unnamed: 0,is,dog,this,like,cat,i,a,<e>,<UNK>
"(<s>, this)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(<s>, i)",0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1
"(dog, is)",0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1
"(this, dog)",0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(is, like)",0.1,0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1
"(<s>, <s>)",0.090909,0.090909,0.181818,0.090909,0.090909,0.181818,0.090909,0.090909,0.090909
"(like, a)",0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909,0.090909,0.090909
"(i, like)",0.1,0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1


In [45]:
def calculate_perplexity(sentence, ngram_counts, prev_ngram_counts,vocab_size,k):

    # number of grams
    n = len(list(set(ngram_counts.keys()))[0])
    sentence = ['<s>'] * (n-1) + sentence + ['<e>']
    N = len(sentence)

    comulative_proba = 1

    for i in range(N-n+1):

        prev_ngram = sentence[i:i+n-1]
        word = sentence[i+1]
        proba = 1/estimate_probability(word,prev_ngram,ngram_counts,prev_ngram_counts,k,vocab_size)
        comulative_proba *= proba

    perplexity = comulative_proba ** (1 / N)

    return perplexity

In [46]:
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)


perplexity_train1 = calculate_perplexity(sentences[0],
                                        bigram_counts,
                                         unigram_counts,
                                         len(unique_words), k=1.0)
print(f"Perplexity for first train sample: {perplexity_train1:.4f}")

test_sentence = ['i', 'like', 'a', 'dog']
perplexity_test = calculate_perplexity(test_sentence,
                                       bigram_counts,
                                       unigram_counts,
                                       len(unique_words), k=1.0)
print(f"Perplexity for test sample: {perplexity_test:.4f}")

Perplexity for first train sample: 3.2293
Perplexity for test sample: 3.8027


In [60]:
def suggest_a_word(prev_ngram, ngram_counts, prev_ngram_counts, vocab, k=1):

    probas = estimate_probabilities(prev_ngram, ngram_counts, prev_ngram_counts, vocab, k)

    # get the word for the max proba
    most_likely_element = max(probas, key= probas.get)

    proba = probas[most_likely_element]

    return most_likely_element, proba

In [61]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 2)
bigram_counts = count_n_grams(sentences, 3)

previous_tokens = ["i", "like"]
tmp_suggest1 = suggest_a_word(previous_tokens, bigram_counts, unigram_counts,unique_words,1.0)
print(f"The previous words are 'i like',\n\tand the suggested word is `{tmp_suggest1[0]}` with a probability of {tmp_suggest1[1]:.4f}")


The previous words are 'i like',
	and the suggested word is `a` with a probability of 0.2000


In [62]:
def get_suggestions(prev_ngram, ngram_counts_list, vocab, k=1):

    suggestions = []

    for i in range(1, len(ngram_counts_list)):

        cur_ngram_counts = ngram_counts_list[i]
        prev_ngram_counts = ngram_counts_list[i-1]

        suggested_word = suggest_a_word(prev_ngram, cur_ngram_counts, prev_ngram_counts, vocab, 1)
        suggestions.append(suggested_word)

    return suggestions

In [63]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)
trigram_counts = count_n_grams(sentences, 3)
quadgram_counts = count_n_grams(sentences, 4)
qintgram_counts = count_n_grams(sentences, 5)

n_gram_counts_list = [unigram_counts, bigram_counts, trigram_counts, quadgram_counts, qintgram_counts]
previous_tokens = ["i", "like"]
tmp_suggest3 = get_suggestions(previous_tokens, n_gram_counts_list, unique_words, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest3)

The previous words are ['i', 'like'], the suggestions are:


[('is', 0.1111111111111111),
 ('a', 0.2),
 ('is', 0.1111111111111111),
 ('is', 0.1111111111111111)]

In [64]:
def get_multiple_suggestions(prev_ngram,  ngram_counts, prev_ngram_counts, vocab, k=1, n_sug=5):

    suggestions_l = []

    #n = len(list(ngram_counts.keys())[0])
    #suggestions_l = ['<s>'] * (n-1) + suggestions_l

    n = len(prev_ngram)

    for w in prev_ngram:
        suggestions_l.append(w)

    for i in range(n_sug):

        prev = suggestions_l[-n:]
        sug,_ = suggest_a_word(prev, ngram_counts, prev_ngram_counts, vocab, k=1)
        suggestions_l.append(sug)

    return ' '.join(suggestions_l)

In [67]:
prev_text = 'how are you'
prev_ngram = prev_text.split()
bigram = count_n_grams(train_data,4)
uni_gram = count_n_grams(train_data,3)

get_multiple_suggestions(prev_ngram,bigram,uni_gram,vocab,1,30)

"how are you ? i 'm in the hospital at the moment . until then ! : ) rt : you can always be more fashion ! : ) rt : you can"