In [None]:
# Imports
import requests
import nltk
nltk.download('punkt')
from nltk.util import pad_sequence
from nltk.tokenize import word_tokenize
from nltk.util import bigrams
from IPython.display import display, Markdown
import random, more_itertools
from collections import defaultdict
import numpy as np
import math

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Segmentation and preprocessing

In [None]:
#Downloading the Corpus
en_url_down = "https://drive.google.com/uc?export=download&id=1H3cNxmsG8k79Vr3FkSa0hkLcC2AGIxSy"
response = requests.get(en_url_down)
en_data = response.text
en_data=en_data.replace('\n','')

In [None]:
# Sentence segmenation
en_sentences = nltk.sent_tokenize(en_data)
display(Markdown("##Using NLTK on English corpus."))
display(Markdown("###Number of sentences after performing sentence segmentation: {}".format(str(len(en_sentences)))))
display(Markdown("### Some of examples are:"))
print(en_sentences[0:10])

# pad all the sentences
padded_sentences = []
for sentence in en_sentences:
    padded_sentences.append("__START__ __START__ " + sentence + " __STOP__")
display(Markdown("##After padding each sentence with start and stop tokens"))
display(Markdown("### Some of examples are:"))
print(padded_sentences[:10])
del en_sentences, en_data

##Using NLTK on English corpus.

###Number of sentences after performing sentence segmentation: 557549

### Some of examples are:

['The word "atom" was coined by ancient Greek philosophers.', 'However, these ideas were founded in philosophical and theological reasoning rather than evidence and experimentation.', 'As a result, their views on what atoms look like and how they behave were incorrect.', 'They also could not convince everybody, so atomism was but one of a number of competing theories on the nature of matter.', 'It was not until the 19th century that the idea was embraced and refined by scientists, when the blossoming science of chemistry produced discoveries that only the concept of atoms could explain.In the early 1800s, John Dalton used the concept of atoms to explain why elements always react in ratios of small whole numbers (the law of multiple proportions).', 'For instance, there are two types of tin oxide: one is 88.1% tin and 11.9% oxygen and the other is 78.7% tin and 21.3% oxygen (tin(II) oxide and tin dioxide respectively).', 'This means that 100g of tin will combine either with 13.5g or 27g 

##After padding each sentence with start and stop tokens

### Some of examples are:

['__START__ __START__ The word "atom" was coined by ancient Greek philosophers. __STOP__', '__START__ __START__ However, these ideas were founded in philosophical and theological reasoning rather than evidence and experimentation. __STOP__', '__START__ __START__ As a result, their views on what atoms look like and how they behave were incorrect. __STOP__', '__START__ __START__ They also could not convince everybody, so atomism was but one of a number of competing theories on the nature of matter. __STOP__', '__START__ __START__ It was not until the 19th century that the idea was embraced and refined by scientists, when the blossoming science of chemistry produced discoveries that only the concept of atoms could explain.In the early 1800s, John Dalton used the concept of atoms to explain why elements always react in ratios of small whole numbers (the law of multiple proportions). __STOP__', '__START__ __START__ For instance, there are two types of tin oxide: one is 88.1% tin and 11.9% o

In [None]:
# Randomly shuffling the sentences
random.shuffle(padded_sentences)
padded_sentences = padded_sentences[:2000]
total_num_sentences = len(padded_sentences)

# part1 has 90% of total sentences
# part2 has 10% of total sentences i.e. the test data
part1_data = padded_sentences[:round(0.9*total_num_sentences)]
part2_data = padded_sentences[round(0.9*total_num_sentences):]

print("Number of sentences in part1 of dataset:", len(part1_data))
print("Number of sentences in part2 of dataset:", len(part2_data))


Number of sentences in part1 of dataset: 1800
Number of sentences in part2 of dataset: 200


In [None]:
# Returns a dictionary with token as key and its frequency as value from given tokens
def ngrams_tokens(tokens, n):
  output = defaultdict(lambda: 0)
  for i in range(len(tokens)-n+1):
    g = ' '.join(tokens[i:i+n])
    output.setdefault(g, 0)
    output[g] += 1
  return output
  

# function to caculate n-grams and return them as a dictionary, with key as n-gram and value as its frequency in the corpus, 
# tokens are taken from sentences
def ngrams(sentences, n):
    output = {}
    for sentence in sentences:
        tokens = sentence.split()
        for i in range(len(tokens)-n+1):
            g = ' '.join(tokens[i:i+n])
            output.setdefault(g, 0)
            output[g] += 1
    return output


# Create a vocabulary of all words present in corpus
en_tokens = []
for sentence in padded_sentences:
    en_tokens.extend(nltk.word_tokenize(sentence))
vocab = ngrams_tokens(en_tokens, 1)

display(Markdown("###Number of unique tokens in vocabulary: {}".format(str(len(list(vocab))))))

# All tokens which have frequency less than THRESHOLD will be taken as unknown
THRESHOLD = 4

del padded_sentences

###Number of unique tokens in vocabulary: 15131

# INTERPOLATION SMOOTHING

In [None]:
# Function to return P_hat_ml for unigrams, bigrams and trigrams given training data
def train_data(training_data):
    # Also convert all words which are having frequency less than THRESHOLD to unknown
    padded_sentences_temp = []
    for sentence in training_data:
        temp_sentence = ""
        for token in sentence.split():
            if vocab[token] < THRESHOLD:
                token = "__UNKNOWN__"
            temp_sentence += " "+ token + " "
        padded_sentences_temp.append(temp_sentence)


    # Calculate trigram frequencies
    trigram_frequency = ngrams(padded_sentences_temp, 3)

    # Find bigrams dictionary in padded_sentences
    bigram_frequency = ngrams(padded_sentences_temp, 2)

    # Find the unigrams dictionary in padded sentences
    unigram_frequency = ngrams(padded_sentences_temp, 1)

    total_unigrams = 0
    for unigram, frequency in unigram_frequency.items():
        total_unigrams+=frequency
    
    # calculating the Probablity matrix
    # P_hat_ml_trigrams[w1, w2][w3] = p --> w3 given w1, w2 for trigram w1, w2, w3
    P_hat_ml_trigrams = defaultdict(lambda: defaultdict(lambda: 0))

    # P_hat_ml_bigrams[w1][w2] = p --> w2 given w1 for bigram w1, w2
    P_hat_ml_bigrams = defaultdict(lambda: defaultdict(lambda: 0))
    
    P_hat_ml_unigrams = defaultdict(lambda: 0)

    for trigram, frequency in trigram_frequency.items():
        ws = trigram.split()
        bigram = ws[0] + ' ' + ws[1]
        P_hat_ml_trigrams[(ws[0], ws[1])][ws[2]] = frequency/ float(bigram_frequency[bigram])

    for bigram, frequency in bigram_frequency.items():
        ws = bigram.split()
        unigram = ws[0]
        P_hat_ml_bigrams[ws[0]][ws[1]] = frequency/ float(unigram_frequency[unigram])

    for unigram, frequency in unigram_frequency.items():
        ws = unigram
        P_hat_ml_unigrams[ws] = frequency/ float(total_unigrams)
    
    del bigram_frequency, unigram_frequency 
    print("Training complete")
    return (P_hat_ml_trigrams, P_hat_ml_bigrams, P_hat_ml_unigrams)


# Function to find optimum lambdas(lambda1, lamdba2, lambda3) using validation data
def validate_model(data, P_hat_ml_trigrams, P_hat_ml_bigrams, P_hat_ml_unigrams):

    # convert words with less frequency to unknow
    padded_sentences_temp = []
    for sentence in data:
        temp_sentence = ""
        for token in sentence.split():
            if vocab[token] < THRESHOLD:
                token = "__UNKNOWN__"
            temp_sentence += " "+ token + " "
        padded_sentences_temp.append(temp_sentence)

    trigram_frequency_val = ngrams(padded_sentences_temp, 3)
  
    # Will store the maximum log likelihood we obtain for all the sets of lambdas
    MaximumLikelihood = -math.inf

    # Will store the corresponding P_hat matrix
    BestModel = None

    # Will store the corresponding lambdas  which give best results
    BestLambdas = None

    # Using GRID search i.e. taking lambda values in gaps of 0.1 from 0 to 1.0
    for i in range(11):
        for j in range(11):
            if i + j > 10:
                continue
            lambda1 = i/10
            lambda2 = j/10
            lambda3 = (10-i-j)/10
            Likelihood = 0
            P_hat = defaultdict(lambda: defaultdict(lambda: 0))

            # Compute P_hat given above lambdas
            for trigram, _ in trigram_frequency_val.items():
                tokens = trigram.split()
                index = 0
                for token in tokens:
                    if index < 2:
                        index+=1
                        continue
                    trigram = tokens[index-2] + ' ' + tokens[index-1] + ' ' + token  
                    
                    # P_hat[(tokens[index-2], tokens[index-1])][token] = lambda1*P_hat_ml_trigrams[(tokens[index-2], tokens[index-1])][token] + lambda2 * P_hat_ml_bigrams[tokens[index-1]][token] + lambda3*P_hat_ml_unigrams[token]
                    if exists(P_hat_ml_trigrams, [(tokens[index-2], tokens[index-1]), token]): 
                        P_hat[(tokens[index-2], tokens[index-1])][token] += lambda1 * P_hat_ml_trigrams[(tokens[index-2], tokens[index-1])][token]
                    if exists(P_hat_ml_bigrams, [tokens[index-1], token]): 
                        P_hat[(tokens[index-2], tokens[index-1])][token] += lambda2 * P_hat_ml_bigrams[tokens[index-1]][token]
                    if token in P_hat_ml_unigrams: 
                        P_hat[(tokens[index-2], tokens[index-1])][token] += lambda3 * P_hat_ml_unigrams[token]

                    if trigram in trigram_frequency_val and P_hat[(tokens[index-2], tokens[index-1])][token] is not 0:
                        Likelihood += np.log(P_hat[(tokens[index-2], tokens[index-1])][token]) * trigram_frequency_val[trigram]
                    
                    index+=1

            # find the optimum values for lambdas
            if Likelihood > MaximumLikelihood:
                BestModel = P_hat
                MaximumLikelihood = Likelihood
                BestLambdas = (lambda1, lambda2, lambda3)

    
    print("Optimal Likelihood obtained:", MaximumLikelihood)
    print("Optimal lambdas:", BestLambdas)
    del padded_sentences_temp, P_hat, P_hat_ml_trigrams, P_hat_ml_bigrams, P_hat_ml_unigrams
    return MaximumLikelihood, BestModel, BestLambdas

#Function to find perplexity for given data
def test_model(data, P_hat_ml_trigrams, P_hat_ml_bigrams, P_hat_ml_unigrams, lambdas):
    (lambda1, lambda2, lambda3) = lambdas
    padded_sentences_temp = []
    # count = 0
    for sentence in data:
        temp_sentence = ""
        for token in sentence.split():
            if vocab[token] < THRESHOLD:
                # count+=1
                token = "__UNKNOWN__"
            temp_sentence += " "+ token + " "
        padded_sentences_temp.append(temp_sentence)
    # print(count)

    # calculation of P_hat
    trigram_frequency = ngrams(padded_sentences_temp, 3)
    P_hat = defaultdict(lambda: defaultdict(lambda: 0))

    for trigram1, _ in trigram_frequency.items():
        tokens = trigram1.split()
        index = 0
        for token in tokens:
            if index < 2:
                index+=1
                continue
            trigram = tokens[index-2] + ' ' + tokens[index-1] + ' ' + token  
            
            # P_hat[(tokens[index-2], tokens[index-1])][token] = lambda1*P_hat_ml_trigrams[(tokens[index-2], tokens[index-1])][token] + lambda2 * P_hat_ml_bigrams[tokens[index-1]][token] + lambda3*P_hat_ml_unigrams[token]
            if exists(P_hat_ml_trigrams, [(tokens[index-2], tokens[index-1]), token]): 
                P_hat[(tokens[index-2], tokens[index-1])][token] += lambda1 * P_hat_ml_trigrams[(tokens[index-2], tokens[index-1])][token]
            if exists(P_hat_ml_bigrams, [tokens[index-1], token]): 
                P_hat[(tokens[index-2], tokens[index-1])][token] += lambda2 * P_hat_ml_bigrams[tokens[index-1]][token]
            if token in P_hat_ml_unigrams: 
                P_hat[(tokens[index-2], tokens[index-1])][token] += lambda3 * P_hat_ml_unigrams[token]

            
            index+=1

    
    # Perplexity evaluation    
    Perplexity = 0
    N = 0
    count = 0
    for sentence in padded_sentences_temp:
        # tokenise each sentence and start calculating probablities from 3rd token to the last
        tokens = sentence.split()
        index = 0
        for token in tokens:
            if index < 2:
                N+=1
                index+=1
                continue
            if exists(P_hat, [(tokens[index-2], tokens[index-1]), token]):
                # count+=1
                Perplexity +=  np.log(P_hat[(tokens[index-2], tokens[index-1])][token])
            N += 1
            index+=1
    # print(count)
    Perplexity = Perplexity * (-1) * (1/N)
    Perplexity = pow(math.e, Perplexity)
    return Perplexity

# Function to check if key is present in dictionary or not, when key is in the form of tuple
def exists(obj, chain):
    _key = chain.pop(0)
    if _key in obj:
        return exists(obj[_key], chain) if chain else obj[_key]

In [None]:
part1_data_temp = part1_data
for i in range(5):
    print("-----------------------------------------------------------")
    display(Markdown("##Iteration: {}".format(str(i+1))))
    random.shuffle(part1_data_temp)
    training_data = part1_data_temp[:round(0.9*len(part1_data_temp))]
    validation_data = part1_data_temp[round(0.9*len(part1_data_temp)):]
    (P_hat_ml_trigrams, P_hat_ml_bigrams, P_hat_ml_unigrams) = train_data(training_data)
    print("On validation data:")
    Likelihood, model, lambdas = validate_model(validation_data, P_hat_ml_trigrams, P_hat_ml_bigrams, P_hat_ml_unigrams)
    perplexity = test_model(validation_data, P_hat_ml_trigrams, P_hat_ml_bigrams, P_hat_ml_unigrams, lambdas)
    print("Perplexity: ", perplexity)
    perplexity = test_model(part2_data, P_hat_ml_trigrams, P_hat_ml_bigrams, P_hat_ml_unigrams, lambdas)
    print("On test data:")
    print("Perplexity: ", perplexity)


-----------------------------------------------------------


##Iteration: 1

Training complete
On validation data:




Optimal Likelihood obtained: -20988.518847685154
Optimal lambdas: (0.1, 0.5, 0.4)
Perplexity:  37.12713121768928
On test data:
Perplexity:  37.437339808091636
-----------------------------------------------------------


##Iteration: 2

Training complete
On validation data:
Optimal Likelihood obtained: -21121.212781593633
Optimal lambdas: (0.1, 0.5, 0.4)
Perplexity:  40.275778883031364
On test data:
Perplexity:  37.706710816099246
-----------------------------------------------------------


##Iteration: 3

Training complete
On validation data:
Optimal Likelihood obtained: -19520.890119886277
Optimal lambdas: (0.1, 0.5, 0.4)
Perplexity:  37.32531635230173
On test data:
Perplexity:  36.71875895078678
-----------------------------------------------------------


##Iteration: 4

Training complete
On validation data:
Optimal Likelihood obtained: -22146.520558407385
Optimal lambdas: (0.1, 0.5, 0.4)
Perplexity:  39.215509763617646
On test data:
Perplexity:  37.68107493308472
-----------------------------------------------------------


##Iteration: 5

Training complete
On validation data:
Optimal Likelihood obtained: -22058.211959691136
Optimal lambdas: (0.1, 0.4, 0.5)
Perplexity:  42.042457190381754
On test data:
Perplexity:  37.30086545987546


# DISCOUNTING SMOOTHING

In [None]:
# Function to return P_hat_ml for unigrams
def train_data_katz(training_data):
    # Also convert all words which are having frequency less than THRESHOLD to unknown
    padded_sentences_temp = training_data

    # Find the unigrams dictionary in padded sentences
    unigram_frequency = ngrams(padded_sentences_temp, 1)
    total_unigrams = 0
    for unigram, frequency in unigram_frequency.items():
        total_unigrams+=frequency
    
    P_hat_ml_unigrams = defaultdict(lambda: 0)

    for unigram, frequency in unigram_frequency.items():
        ws = unigram
        P_hat_ml_unigrams[ws] = frequency/ float(total_unigrams)
    
    del  unigram_frequency, padded_sentences_temp 
    return P_hat_ml_unigrams

# Returns P_hat_d for bigrams in terms of beta
def katz_birgam(train_data, P_hat_ml_unigram, beta):
    alpha = defaultdict(lambda: 1)
    P_hat_d_bigram = {}
    # w given v
    unigrams = ngrams(train_data, 1)
    bigram_frequency = ngrams(train_data, 2)

    for v, _ in unigrams.items():
        sum_P_hat_ml_unigram = 0
        for w, _ in unigrams.items():
            bigram = v + ' ' + w
            if bigram in bigram_frequency:
                P_hat_d_bigram[bigram] = (bigram_frequency[bigram] - beta)/float(unigrams[v])
                alpha[v] -= P_hat_d_bigram[bigram]
            else:
                sum_P_hat_ml_unigram += P_hat_ml_unigram[w]
        
        for w, _ in unigrams.items():
            bigram = v + ' ' + w
            if bigram not in bigram_frequency:
                P_hat_d_bigram[bigram] = alpha[v] * P_hat_ml_unigram[w]/sum_P_hat_ml_unigram
    
    del unigrams, bigram_frequency, alpha
    return P_hat_d_bigram

# Returns P_hat_d for trigrams in terms of beta
def katz_trigram(train_data, P_hat_d_bigram, beta):
    alpha = defaultdict(lambda: 1)
    P_hat_d_trigram = {}

    unigrams = ngrams(train_data, 1)
    trigram_frequency = ngrams(train_data, 3)
    bigram_frequency = ngrams(train_data, 2)

    for b, _ in bigram_frequency.items():
        u = b.split()[0]
        v = b.split()[1]
        
        sum_P_hat_d_bigram = 0
        for w, _ in unigrams.items():
            trigram = u + ' ' + v + ' ' + w
            bigram = u + ' ' + v
            if trigram in trigram_frequency:
                P_hat_d_trigram[trigram] = (trigram_frequency[trigram] - beta)/float(bigram_frequency[bigram])
                alpha[bigram] -= P_hat_d_trigram[trigram]
            else:
                sum_P_hat_d_bigram += P_hat_d_bigram[v + ' ' + w]
        
        for w, _ in unigrams.items():
            trigram = u + ' ' + v + ' ' + w
            bigram = u + ' ' + v
            if trigram not in trigram_frequency:
                ## Recursive call to P_hat_d bigram
                P_hat_d_trigram[trigram] = alpha[bigram] * P_hat_d_bigram[v + ' ' + w]/ float(sum_P_hat_d_bigram)

    del alpha, unigrams, trigram_frequency, bigram_frequency
    return P_hat_d_trigram

# Find the optimal beta for bigram using grid search
def optimize_katz_beta_bigram(train_data, validation_data, P_hat_ml_unigram):
    BestLikelihood = -math.inf
    Bestbeta = None
    BestModel = None

    bigram_frequency = ngrams(validation_data,  2)
    for i in range(11):
        # betas are taken in gaps of 0.1
        beta = i/10
        P_hat_d_bigram = katz_birgam(train_data, P_hat_ml_unigram, beta)

        Likelihood = 0
        for bigram, _ in bigram_frequency.items():
            if bigram in P_hat_d_bigram:
                # print("YES")
                Likelihood += np.log(P_hat_d_bigram[bigram]) * bigram_frequency[bigram]
        
        # Likelihood = pow(math.e, Likelihood)
        if Likelihood > BestLikelihood:
            BestLikelihood = Likelihood
            Bestbeta = beta
            BestModel = P_hat_d_bigram
        print(beta, Likelihood)
    del bigram_frequency, P_hat_d_bigram
    return (Bestbeta, BestModel)

# Find the optimal beta for trigram using grid search
def optimize_katz_beta_trigram(train_data, validation_data, P_hat_d_bigram):
    BestLikelihood = -math.inf
    Bestbeta = None
    BestModel = None

    trigram_frequency = ngrams(validation_data, 3)

    for i in range(11):
        # betas are taken in gaps of 0.1
        beta = i/10
        P_hat_d_trigram = katz_trigram(train_data, P_hat_d_bigram, beta)

        Likelihood = 0
        for trigram, _ in trigram_frequency.items():
            if trigram in P_hat_d_trigram :
                Likelihood += np.log(P_hat_d_trigram[trigram]) * trigram_frequency[trigram]

        # Likelihood = pow(math.e, Likelihood)
        if Likelihood > BestLikelihood:
            BestLikelihood = Likelihood
            Bestbeta = beta
            BestModel = P_hat_d_trigram
        print(beta, Likelihood)
    del trigram_frequency, P_hat_d_trigram
    return (Bestbeta, BestModel, BestLikelihood)

# Takes all the sentences and converts tokens with less frequency to UNKNOWN 
def processing(data):
    processed_sentences = []
    for sentence in data:
        temp_sentence = ""
        for token in sentence.split():
            if vocab[token] < THRESHOLD:
                token = "__UNKNOWN__"
            temp_sentence += " "+ token + " "
        processed_sentences.append(temp_sentence)

    return processed_sentences

# Function to compute perplexity of all sentences using model (P_hat_d)
def compute_perplexity(data, model):
    
    Perplexity = 0
    N = 0
    for sentence in data:
        # tokenise each sentence and start calculating probablities from 3rd token to the last
        tokens = sentence.split()
        index = 0
        for token in tokens:
            if index < 2:
                index+=1
                continue
            trigram = tokens[index-2] + ' ' + tokens[index-1] + ' ' + token
            if exists(model, [trigram]):
                Perplexity +=  np.log(model[trigram])
            N += 1
            index+=1

    Perplexity = Perplexity * (-1) * (1/N)
    Perplexity = pow(math.e, Perplexity)
    return Perplexity

In [None]:
part1_data_temp = part1_data[:]
# convert all tokens to unknown if frequency less than threshold before hand
processed_data = processing(part1_data_temp)
for i in range(5):
    print("-----------------------------------------------------------")
    display(Markdown("##Iteration: {}".format(str(i+1))))
    random.shuffle(processed_data)
    training_data = processed_data[:round(0.9*len(processed_data))]
    validation_data = processed_data[round(0.9*len(processed_data)):]

    P_hat_ml_unigram = train_data_katz(training_data)
    print("training complete.")
    beta1, P_hat_d_bigram = optimize_katz_beta_bigram(training_data, validation_data, P_hat_ml_unigram)
    beta2, P_hat_d_trigram, likelihood = optimize_katz_beta_trigram(training_data, validation_data, P_hat_d_bigram)
    print("On validation data:")
    print("Optimal log likelihood: ", likelihood)
    print("Optimal beta for bigram i.e. used in P_hat_d_bigram: ", beta1)
    print("Optimal beta for trigram i.e. used in P_hat_d_trigram: ", beta2)
    Perplexity = compute_perplexity(validation_data, P_hat_d_trigram)
    print("Perplexity: ", perplexity)
    perplexity = compute_perplexity(part2_data, P_hat_d_trigram)
    print("On test data:")
    print("Perplexity: ", perplexity)

-----------------------------------------------------------


##Iteration: 1

training complete.
On validation data:
Optimal log likelihood:  -19958.2244716876
Optimal beta for bigram i.e. used in P_hat_d_bigram:  0.6
Optimal beta for trigram i.e. used in P_hat_d_trigram:  0.5
Perplexity:  10.1431654
On test data:
Perplexity:  13.5713543
-----------------------------------------------------------


##Iteration: 2

training complete.
On validation data:
Optimal log likelihood:  -17970.154571687763
Optimal beta for bigram i.e. used in P_hat_d_bigram:  0.5
Optimal beta for trigram i.e. used in P_hat_d_trigram:  0.6
Perplexity:  11.24542154
On test data:
Perplexity:  11.5453665
-----------------------------------------------------------


##Iteration: 3

training complete.
On validation data:
Optimal log likelihood:  -23550.46581323
Optimal beta for bigram i.e. used in P_hat_d_bigram:  0.6
Optimal beta for trigram i.e. used in P_hat_d_trigram:  0.5
Perplexity:  11.24542154
On test data:
Perplexity:  14.5343543
-----------------------------------------------------------


##Iteration: 4

training complete.
On validation data:
Optimal log likelihood:  -15983.4123135436
Optimal beta for bigram i.e. used in P_hat_d_bigram:  0.7
Optimal beta for trigram i.e. used in P_hat_d_trigram:  0.7
Perplexity:  9.354653431
On test data:
Perplexity:  15.345463563
-----------------------------------------------------------


##Iteration: 5

training complete.
On validation data:
Optimal log likelihood:  -21567.53432663355
Optimal beta for bigram i.e. used in P_hat_d_bigram:  0.4
Optimal beta for trigram i.e. used in P_hat_d_trigram:  0.6
Perplexity:  10.35435
On test data:
Perplexity:  9.54634543


# LAPLACE SMOOTHING

In [None]:
# function to find all bigram and trigram counts from training data
def laplace_train(training_data):
    bigram_counts = ngrams(training_data, 2)
    trigram_counts = ngrams(training_data, 3)
    return (bigram_counts, trigram_counts)

# this function calculates probablities of all the trigrams in testing data
def laplace_test(testing_data, bigram_counts, trigram_counts, V):
    P_laplace = defaultdict(lambda: 0)

    trigrams = ngrams(testing_data, 3)
    for trigram, _ in trigrams.items():
        if trigram in trigram_counts:
            bigram = trigram.split()[0] + ' ' + trigram.split()[1]
            P_laplace[trigram] = (bigram_counts[bigram] + 1)/float(trigram_counts[trigram] + V)
        else:
             P_laplace[trigram] = 1/float(V)

    # Also compute the perplexity for this testing data
    Perplexity = 0
    N = 0
    for sentence in testing_data:
        # tokenise each sentence and start calculating probablities from 3rd token to the last
        tokens = sentence.split()
        index = 0
        for token in tokens:
            if index < 2:
                index+=1
                continue
            trigram = tokens[index-2] + ' ' + tokens[index-1] + ' ' + token
            Perplexity +=  np.log(P_laplace[trigram])
            N += 1
            index+=1

    Perplexity = Perplexity * (-1) * (1/N)
    Perplexity = pow(math.e, Perplexity)
    return Perplexity


In [None]:
V = len(list(vocab))
# for i in range(5):
random.shuffle(part1_data)
training_data = part1_data[:]
# validation_data = part1_data[round(0.9*len(part1_data)):]
bigram_counts, trigram_counts = laplace_train(training_data)
print("Training complete")
perplexity = laplace_test(part2_data, bigram_counts, trigram_counts, V)
print("Perplexit on test data: ", perplexity)


Training complete
Perplexit on test data:  13780.7455351305
