In [175]:
# Dependencies for the bigram language-model cells below.
import numpy as np
# NOTE(review): this binds the top-level `matplotlib` package to the name `plt`;
# the conventional plotting alias is `import matplotlib.pyplot as plt`.
# No use of `plt` is visible in these cells -- confirm intent.
import matplotlib as plt
import nltk               # NLP toolkit
import re

nltk.download('punkt')    # Download the Punkt sentence tokenizer 

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adithyashanker/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [176]:
# Toy training corpus: three sentences, one per line (separated by "\n").
corpus = "Lyn drinks chocolate\nJohn drinks tea\nlyn eats chocolate"

In [177]:
def preprocess(corpus, n):
    """Lower-case, clean and tokenize a newline-delimited corpus for an n-gram model.

    Parameters
    ----------
    corpus : str
        Raw text with one sentence per line.
    n : int
        Order of the n-gram model. Every sentence is prefixed with ``n - 1``
        ``"<s>"`` start markers.

    Returns
    -------
    list[list[str]]
        One token list per non-blank line, each ending with ``"</s>"``.
        Lines that contain no tokens after cleaning are skipped.
    """
    corpus = corpus.lower()
    # Drop every character that is not a letter, digit, '.', '?', '!', space or newline.
    corpus = re.sub(r"[^a-zA-Z0-9.?! \n]+", "", corpus)

    # The padding is the same for every sentence, so build it once.
    start_padding = ["<s>"] * (n - 1)

    tokens = []
    for sentence in corpus.split("\n"):
        tokenized_sentence = nltk.word_tokenize(sentence)
        if not tokenized_sentence:
            # Blank lines (e.g. a trailing newline) would otherwise become a bare
            # ["<s>", "</s>"] sentence and distort the boundary-token statistics.
            continue
        tokens.append(start_padding + tokenized_sentence + ["</s>"])
    return tokens


In [178]:
def count_prob(tokens):
    """Tally how many times each token occurs across all sentences.

    Parameters
    ----------
    tokens : iterable of iterables
        Sentences, each an iterable of hashable tokens.

    Returns
    -------
    dict
        Maps each token to its number of occurrences, keyed in order of
        first appearance.
    """
    frequencies = {}
    # Walk every token of every sentence as one flat stream.
    for word in (tok for sent in tokens for tok in sent):
        if word in frequencies:
            frequencies[word] += 1
        else:
            frequencies[word] = 1
    return frequencies

# count_prob expects tokenized sentences; passing the raw string would count
# individual characters instead of words, so tokenize (bigram padding) first.
counts = count_prob(preprocess(corpus, 2))

In [179]:
# Tokenize with n=2, so each sentence gets one "<s>" start marker and a "</s>" end marker.
tokenized = preprocess(corpus, 2)
print(tokenized)
# Per-token occurrence counts over the padded sentences.
counts = count_prob(tokenized)

[['<s>', 'lyn', 'drinks', 'chocolate', '</s>'], ['<s>', 'john', 'drinks', 'tea', '</s>'], ['<s>', 'lyn', 'eats', 'chocolate', '</s>']]


In [180]:
def generate_matrix(counts, tokens):
    """Count adjacent-token (bigram) occurrences in a tokenized corpus.

    Parameters
    ----------
    counts : dict
        Mapping whose keys form the vocabulary (values are not used here).
    tokens : list[list[str]]
        Tokenized sentences.

    Returns
    -------
    matrix : numpy.ndarray
        Square float array; ``matrix[i, j]`` is the number of times vocabulary
        item ``j`` immediately follows vocabulary item ``i``.
    counts_sorted : list
        The sorted vocabulary, giving the row/column order of ``matrix``.

    Raises
    ------
    ValueError
        If a token in ``tokens`` is not a key of ``counts``.
    """
    counts_sorted = sorted(counts.keys())
    # Resolve each token's row/column with one dict lookup instead of a
    # linear ``list.index`` scan for every bigram.
    position = {token: idx for idx, token in enumerate(counts_sorted)}
    dim = len(counts_sorted)
    matrix = np.zeros((dim, dim))
    for sentence in tokens:
        for current, following in zip(sentence, sentence[1:]):
            try:
                matrix[position[current], position[following]] += 1
            except KeyError as missing:
                # Keep the exception type that ``list.index`` used to raise.
                raise ValueError(
                    f"{missing.args[0]!r} is not in the vocabulary"
                ) from None
    return matrix, counts_sorted


In [181]:
# Build the bigram count matrix; rows (first token) and columns (following token)
# are ordered by the sorted vocabulary returned as counts_sorted.
bigrams,counts_sorted = generate_matrix(counts, tokenized)
print(bigrams)

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 2. 0.]
 [2. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]]


In [182]:
def generate_probablities(grams):
    """Row-normalize a 2-D count matrix into conditional probabilities.

    Parameters
    ----------
    grams : array-like, shape (rows, cols)
        Count matrix; it is not modified.

    Returns
    -------
    numpy.ndarray
        New float array of the same shape in which every row with a non-zero
        total is divided by that total. Rows that sum to zero stay all zeros.
    """
    # Work in float so integer count matrices are not truncated to 0 on division.
    counts = np.asarray(grams, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)
    # Divide only where the row total is non-zero; the remaining entries keep
    # the zeros of the pre-filled output buffer.
    return np.divide(
        counts,
        row_totals,
        out=np.zeros_like(counts),
        where=row_totals != 0,
    )

In [183]:
# Normalize each row of the bigram counts by its row total.
probabilities = generate_probablities(bigrams)

In [187]:
def calculate_prob_sentence(sentence, probabilities, counts_sorted):
    """Multiply the transition probabilities of each adjacent token pair.

    Parameters
    ----------
    sentence : sequence
        Tokens of the sentence, in order.
    probabilities : 2-D indexable
        ``probabilities[i][j]`` is the probability of vocabulary item ``j``
        following vocabulary item ``i``.
    counts_sorted : list
        Vocabulary giving the row/column position of each token.

    Returns
    -------
    The product over all adjacent pairs; ``1`` when the sentence has fewer
    than two tokens.

    Raises
    ------
    ValueError
        If a token is not present in ``counts_sorted``.
    """
    likelihood = 1
    # Pair every token with its successor.
    for prev_word, next_word in zip(sentence, sentence[1:]):
        row = counts_sorted.index(prev_word)
        col = counts_sorted.index(next_word)
        likelihood = likelihood * probabilities[row][col]
    return likelihood
            
        

In [188]:
# NOTE(review): the sentence is passed without the "<s>"/"</s>" markers that
# preprocess() adds, so the start-of-sentence and end-of-sentence transitions
# are not part of this product -- confirm whether that is intended.
print(calculate_prob_sentence("john drinks chocolate".split(), probabilities, counts_sorted))

0.5
