## Necessary Imports

In [1]:
import pandas as pd
import numpy as np
import spacy
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from tqdm import tqdm
import re
import os
import json
from collections import defaultdict

## A. Dataset and Preprocessing:


In [7]:
train_path = 'train.csv'
test_path = 'test.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

training_data = []
test_data = []
validation_data = []

In [None]:
def create_datasets(train_df, test_df):
    shuffled = np.random.permutation(len(train_df)) #to randomly get 100 indices for validation dataset creation
    validation_indices = shuffled[:100]
    
    training_data = []
    validation_data = []
    test_data = []
    
    for index, row in train_df.iterrows():
        if index not in validation_indices:
            training_data.append(row['text'])
        else:
            validation_data.append(row['text'])
            
    for index, row in test_df.iterrows():
        test_data.append(row['text'])
                    
    return [training_data, test_data, validation_data]

training_data, test_data, validation_data = create_datasets(train_df=train_df, test_df=test_df)

In [None]:
def pre_process_data(spacy_model, data):
    
    processed_list = []
    stop_words = set(stopwords.words("english"))
    
    for text in tqdm(data, desc = 'preprocessing dataset...'):
        
        text = text.lower()         
        text = re.sub(r'[^a-zA-Z\s]', '', text)         # subsitituting any non english alphabet with ''
        text = ' '.join(text.split())               #joining all the words to form a sentence

        tokens = spacy_model(text)              #tokenization part, i.e lemmatization is done and sentence split into words
        
        processed_tokens = [token.lemma_ for token in tokens if token.text.isalpha() and token.text not in stop_words and token.text.strip() != ""]                 # only adding those tokens that have text as alphabets and no punctuation or stopwords in them
        
        processed_list.append(processed_tokens)
        
    return processed_list

def clean_data(training_data, test_data, validation_data):
    
    spacy_model = spacy.load("en_core_web_sm")
        
    
    curr_dirr = os.getcwd()
    data_path = os.path.join(curr_dirr, 'processed_data_with_lemmatizer.json')
    
    print(f'saving processed_data to {data_path}')
    
    training_data = pre_process_data(spacy_model=spacy_model, data = training_data)
    test_data = pre_process_data(spacy_model=spacy_model, data = test_data)
    validation_data = pre_process_data(spacy_model=spacy_model, data = validation_data)
    
    data_dict = {'training_data' : training_data, 'test_data' : test_data, 'validation_data' : validation_data  }
    
    data_list = [data_dict]
    
    with open(data_path, 'w') as file:
        json.dump(data_list, file, indent = 4)
        
    return

clean_data(training_data=training_data, test_data=test_data, validation_data=validation_data)

preprocessing dataset...: 100%|██████████| 13779/13779 [1:22:26<00:00,  2.79it/s]  
preprocessing dataset...: 100%|██████████| 100/100 [00:37<00:00,  2.68it/s]
preprocessing dataset...: 100%|██████████| 100/100 [00:32<00:00,  3.10it/s]


saving processed_data to /Users/aryan/Desktop/KGP_MnC/6thSEM/NLP/processed_data_with_lemmatizer.json


## B. Estimation Using Maximum Likelihood:


In [None]:
def create_ngram_model(data, n=1):
    total_ngrams = 0
    n_gram_counts = defaultdict(int)
    total_articles = len(data)
    n_1gram_counts = defaultdict(int)
    ngram_unique_article_counts = defaultdict(int)
    
    for article in tqdm(data, desc='processing articles..'):
        ngram_seen_in_article_set = set()  
        for i in range(len(article) - n + 1):
            ngram = tuple(article[i:i + n])  
            n_gram_counts[ngram] += 1
            total_ngrams += 1
            if n > 1:  # (n-1)-gram for bigrams, trigrams, etc.
                n_1gram_counts[ngram[:-1]] += 1  # Count the (n-1)-gram (prefix of the n-gram)
            
            if ngram not in ngram_seen_in_article_set:
                ngram_unique_article_counts[ngram] += 1  # count the occurrence of an ngram only once per article
                ngram_seen_in_article_set.add(ngram)  # to mark the ngram as seen in the article
        
    # to filter the articles that appear in atleast 1% of all the articles
    vocabulary = {ngram: n_gram_counts[ngram] for ngram, count in ngram_unique_article_counts.items() if count >= 0.01*total_articles}
    V = len(vocabulary)            
    
    # MLE Estimation and laplace smoothing for all ngrams
    ngram_probabilities = {}
    for ngram, count in tqdm(n_gram_counts.items(), desc = 'calculating probabilities....'):
        if n == 1: 
            ngram_probabilities[ngram] = (count + 1)/ (total_ngrams + V)  
        else:  # For bigrams, trigrams, etc.
            n_1gram = ngram[:-1]  # the n-1 gram for the bigram, trigram etc
            n_1gram_count = n_1gram_counts[n_1gram]  # no of time the n-1 gram appears
            ngram_probabilities[ngram] = (count+1) / (n_1gram_count + V)  # laplace smoothing with MLE
    
    return (ngram_probabilities, n_1gram_counts, V)

In [5]:
data_path = 'processed_data_with_lemmatizer.json'

with open(data_path, 'r', encoding="utf-8") as file:
    data = json.load(file)
    training_data = data[0]['training_data']
    validation_data = data[0]['validation_data']
    test_data = data[0]['test_data']
    
trigram_model = create_ngram_model(data=training_data, n = 3)
bigram_model =  create_ngram_model(data=training_data, n = 2)   
unigram_model = create_ngram_model(data=training_data, n = 1)


processing articles..: 100%|██████████| 13779/13779 [01:25<00:00, 160.59it/s]
calculating probabilities....: 100%|██████████| 15602595/15602595 [00:35<00:00, 443647.73it/s]
processing articles..: 100%|██████████| 13779/13779 [00:33<00:00, 416.92it/s]
calculating probabilities....: 100%|██████████| 7613173/7613173 [00:05<00:00, 1479540.34it/s]
processing articles..: 100%|██████████| 13779/13779 [00:08<00:00, 1600.56it/s]
calculating probabilities....: 100%|██████████| 449270/449270 [00:00<00:00, 3089270.95it/s]


In [6]:
unigram_probabilities, unigram_n_1_counts, unigram_vocab_size = unigram_model
bigram_probabilities, bigram_n_1_counts, bigram_vocab_size = bigram_model
trigram_probabilities, trigram_n_1_counts, trigram_vocab_size = trigram_model

## C. Evaluating an n-Gram Model using Perplexity: 


In [None]:
def compute_perplexity(test_data, ngram_probabilities, n_1gram_counts, n=1, K=1, V=1):
    mean_perplexity = 0

    for article in tqdm(test_data, desc = 'calculating perplexity for articles....'):
        log_sum_of_prob = 0
        T = len(article)
        
        for i in range(len(article) - n + 1):
            ngram = tuple(article[i:i + n])           #get ngram
            n_1gram = ngram[:-1] if n > 1 else None  # get the n-1 gram prefix of ngram

            # get probability correspoinding to the ngram or apply laplace smoothing if missing
            prob = ngram_probabilities.get(ngram, K / (n_1gram_counts.get(n_1gram, 0) + K * V))
                
            log_sum_of_prob += np.log(prob)

        # Cacluate perplexity for the article
        perplexity = np.exp(-log_sum_of_prob / T)
        mean_perplexity+=perplexity

    # Caclulate overall perplexity as the average across all articles
    mean_perplexity /= len(test_data)
    return mean_perplexity

perplexity_unigram = compute_perplexity(test_data, unigram_probabilities, unigram_n_1_counts, n = 1, K = 1, V = unigram_vocab_size)
perplexity_bigram = compute_perplexity(test_data, bigram_probabilities, bigram_n_1_counts, n = 2, K = 1, V = bigram_vocab_size)
perplexity_trigram = compute_perplexity(test_data, trigram_probabilities, trigram_n_1_counts, n = 3, K = 1, V = trigram_vocab_size)

print(f'perplexity for unigram = {perplexity_unigram}')
print()
print(f'perplexity for bigram = {perplexity_bigram}')
print()
print(f'perplexity for trigram = {perplexity_trigram}')

calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 304.94it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 101.65it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:02<00:00, 47.03it/s]

perplexity for unigram = 6439.512962763581

perplexity for bigram = 1237.20352695804

perplexity for trigram = 552.9354736657151





## D. Interpolation Model: 


In [None]:
def compute_interpolated_probabilities(trigram_probabilities, bigram_probabilities, unigram_probabilities, lambda1, lambda2, lambda3):
    
    interpolated_probabilities = dict.fromkeys(tqdm(trigram_probabilities, desc='Creating dict'), 0)

    for trigram, trigram_probability in tqdm(trigram_probabilities.items(), desc = 'interpolating probabilities....'):
        
        interpolated_probabilities[trigram] = lambda1*trigram_probability + lambda2*bigram_probabilities[trigram[1:]] + lambda3*unigram_probabilities[trigram[2:]]
        
    return interpolated_probabilities

In [None]:
interpolated_probabilities = compute_interpolated_probabilities(
        trigram_probabilities=trigram_probabilities,
    bigram_probabilities = bigram_probabilities, 
unigram_probabilities=unigram_probabilities, 
    lambda1=0.7,
        lambda2=0.3,
            lambda3= 0
        )
print(f'best perplexity using interpolation model : {compute_perplexity(test_data, interpolated_probabilities, trigram_n_1_counts, n = 3, K = 1, V = trigram_vocab_size)}\nlambda1 : 0.7, lambda2  : 0.3, lambda3 : 0')

Creating dict: 100%|██████████| 15602595/15602595 [00:05<00:00, 2625999.29it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [00:32<00:00, 484835.60it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 72.97it/s] 

best perplexity using interpolation model : 531.9069643347829
lambda1 : 0.7, lambda2  : 0.3, lambda3 : 0





In [None]:
grid_space = np.linspace(0, 1, 11)

def grid_search(grid_space):
    
    best_lambdas = [1, 0, 0]
    
    best_perplexity = 1e8

    for lambda1 in grid_space:
        for lambda2 in grid_space:
            
            if (lambda2 + lambda1 > 1):
                break
            else:
                
                lambda3 = 1-lambda1-lambda2
                print(f'checking lambda1 = {lambda1}, lambda2 = {lambda2}, lambda3 = {lambda3}')
                
                interpolated_probabilities = compute_interpolated_probabilities(trigram_probabilities=trigram_probabilities, unigram_probabilities = unigram_probabilities, bigram_probabilities=bigram_probabilities, lambda1=lambda1,lambda2=lambda2,lambda3= lambda3)
                
                current_perplexity = compute_perplexity(validation_data, interpolated_probabilities, trigram_n_1_counts, n = 3, K = 1, V = trigram_vocab_size)
                
                print("current perplexity : ", current_perplexity, [lambda1, lambda2, lambda3])
                print()
                
                if current_perplexity < best_perplexity:
                    best_perplexity = current_perplexity
                    best_lambdas = [lambda1, lambda2, lambda3]
                    print("best perplexity : ",best_perplexity, best_lambdas)
                    print()
                    
    return (best_perplexity, best_lambdas)
    
best_perplexity, best_lambdas = grid_search(grid_space)
print(best_perplexity, best_lambdas)

checking lambda1 = 0.0, lambda2 = 0.0, lambda3 = 1.0


Creating dict: 100%|██████████| 15602595/15602595 [00:05<00:00, 2613355.44it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:15<00:00, 207966.32it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 101.73it/s]


current perplexity :  1278.715756638718 [0.0, 0.0, 1.0]

best perplexity :  1278.715756638718 [0.0, 0.0, 1.0]

checking lambda1 = 0.0, lambda2 = 0.1, lambda3 = 0.9


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2371707.74it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:31<00:00, 170760.86it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 67.94it/s]


current perplexity :  851.2818698324786 [0.0, 0.1, 0.9]

best perplexity :  851.2818698324786 [0.0, 0.1, 0.9]

checking lambda1 = 0.0, lambda2 = 0.2, lambda3 = 0.8


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2237465.58it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [02:27<00:00, 106095.22it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:10<00:00,  9.40it/s]


current perplexity :  767.5663372548344 [0.0, 0.2, 0.8]

best perplexity :  767.5663372548344 [0.0, 0.2, 0.8]

checking lambda1 = 0.0, lambda2 = 0.30000000000000004, lambda3 = 0.7


Creating dict: 100%|██████████| 15602595/15602595 [00:12<00:00, 1256497.87it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:33<00:00, 167426.79it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 71.14it/s]


current perplexity :  719.3959276888703 [0.0, 0.30000000000000004, 0.7]

best perplexity :  719.3959276888703 [0.0, 0.30000000000000004, 0.7]

checking lambda1 = 0.0, lambda2 = 0.4, lambda3 = 0.6


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2572794.24it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:07<00:00, 230282.03it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 109.84it/s]


current perplexity :  686.1434067318518 [0.0, 0.4, 0.6]

best perplexity :  686.1434067318518 [0.0, 0.4, 0.6]

checking lambda1 = 0.0, lambda2 = 0.5, lambda3 = 0.5


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2552023.62it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:13<00:00, 212338.63it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 98.52it/s]


current perplexity :  661.1224468148836 [0.0, 0.5, 0.5]

best perplexity :  661.1224468148836 [0.0, 0.5, 0.5]

checking lambda1 = 0.0, lambda2 = 0.6000000000000001, lambda3 = 0.3999999999999999


Creating dict: 100%|██████████| 15602595/15602595 [00:05<00:00, 2627876.81it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:11<00:00, 219719.14it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 102.41it/s]


current perplexity :  641.3319523246311 [0.0, 0.6000000000000001, 0.3999999999999999]

best perplexity :  641.3319523246311 [0.0, 0.6000000000000001, 0.3999999999999999]

checking lambda1 = 0.0, lambda2 = 0.7000000000000001, lambda3 = 0.29999999999999993


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2543049.38it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:15<00:00, 207045.08it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 99.89it/s]


current perplexity :  625.1852228177879 [0.0, 0.7000000000000001, 0.29999999999999993]

best perplexity :  625.1852228177879 [0.0, 0.7000000000000001, 0.29999999999999993]

checking lambda1 = 0.0, lambda2 = 0.8, lambda3 = 0.19999999999999996


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2497549.34it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:16<00:00, 204664.73it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 100.18it/s]


current perplexity :  611.7689569948046 [0.0, 0.8, 0.19999999999999996]

best perplexity :  611.7689569948046 [0.0, 0.8, 0.19999999999999996]

checking lambda1 = 0.0, lambda2 = 0.9, lambda3 = 0.09999999999999998


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2578238.96it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:12<00:00, 216589.04it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 110.38it/s]


current perplexity :  600.5638124564389 [0.0, 0.9, 0.09999999999999998]

best perplexity :  600.5638124564389 [0.0, 0.9, 0.09999999999999998]

checking lambda1 = 0.0, lambda2 = 1.0, lambda3 = 0.0


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2497425.34it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:23<00:00, 187891.60it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 66.70it/s]


current perplexity :  591.4285611198426 [0.0, 1.0, 0.0]

best perplexity :  591.4285611198426 [0.0, 1.0, 0.0]

checking lambda1 = 0.1, lambda2 = 0.0, lambda3 = 0.9


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2373767.44it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:35<00:00, 163999.45it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:02<00:00, 43.20it/s]


current perplexity :  799.5956276010277 [0.1, 0.0, 0.9]

checking lambda1 = 0.1, lambda2 = 0.1, lambda3 = 0.8


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2231362.81it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:08<00:00, 227392.77it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 55.47it/s]


current perplexity :  710.8080891403257 [0.1, 0.1, 0.8]

checking lambda1 = 0.1, lambda2 = 0.2, lambda3 = 0.7


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2481872.98it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:54<00:00, 135802.98it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:05<00:00, 19.81it/s]


current perplexity :  671.7714930170914 [0.1, 0.2, 0.7]

checking lambda1 = 0.1, lambda2 = 0.30000000000000004, lambda3 = 0.6


Creating dict: 100%|██████████| 15602595/15602595 [00:09<00:00, 1570246.39it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:58<00:00, 132176.17it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:02<00:00, 36.84it/s]


current perplexity :  645.2397226622596 [0.1, 0.30000000000000004, 0.6]

checking lambda1 = 0.1, lambda2 = 0.4, lambda3 = 0.5


Creating dict: 100%|██████████| 15602595/15602595 [00:08<00:00, 1839987.94it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:33<00:00, 166856.94it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 69.48it/s]


current perplexity :  625.1358639871286 [0.1, 0.4, 0.5]

checking lambda1 = 0.1, lambda2 = 0.5, lambda3 = 0.4


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2405956.24it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:20<00:00, 192864.72it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 69.76it/s]


current perplexity :  609.0464300658489 [0.1, 0.5, 0.4]

checking lambda1 = 0.1, lambda2 = 0.6000000000000001, lambda3 = 0.29999999999999993


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2483350.01it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:20<00:00, 193904.71it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 76.87it/s]


current perplexity :  595.7483128860662 [0.1, 0.6000000000000001, 0.29999999999999993]

checking lambda1 = 0.1, lambda2 = 0.7000000000000001, lambda3 = 0.19999999999999996


Creating dict: 100%|██████████| 15602595/15602595 [00:05<00:00, 2630755.39it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:10<00:00, 221856.97it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 108.57it/s]


current perplexity :  584.5451623728121 [0.1, 0.7000000000000001, 0.19999999999999996]

best perplexity :  584.5451623728121 [0.1, 0.7000000000000001, 0.19999999999999996]

checking lambda1 = 0.1, lambda2 = 0.8, lambda3 = 0.09999999999999998


Creating dict: 100%|██████████| 15602595/15602595 [00:05<00:00, 2678293.64it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:11<00:00, 218926.99it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 68.90it/s]


current perplexity :  575.03202596717 [0.1, 0.8, 0.09999999999999998]

best perplexity :  575.03202596717 [0.1, 0.8, 0.09999999999999998]

checking lambda1 = 0.1, lambda2 = 0.9, lambda3 = 0.0


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2591491.45it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:28<00:00, 175781.86it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 67.69it/s]


current perplexity :  567.0450063508446 [0.1, 0.9, 0.0]

best perplexity :  567.0450063508446 [0.1, 0.9, 0.0]

checking lambda1 = 0.2, lambda2 = 0.0, lambda3 = 0.8


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2399183.40it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:26<00:00, 180767.79it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 77.39it/s]


current perplexity :  720.6479360913926 [0.2, 0.0, 0.8]

checking lambda1 = 0.2, lambda2 = 0.1, lambda3 = 0.7000000000000001


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2542515.66it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:17<00:00, 200681.68it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 91.41it/s]


current perplexity :  659.0390709948365 [0.2, 0.1, 0.7000000000000001]

checking lambda1 = 0.2, lambda2 = 0.2, lambda3 = 0.6000000000000001


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2509305.47it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:21<00:00, 192055.64it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 91.91it/s] 


current perplexity :  629.8595350003598 [0.2, 0.2, 0.6000000000000001]

checking lambda1 = 0.2, lambda2 = 0.30000000000000004, lambda3 = 0.5


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2525641.92it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:23<00:00, 186145.36it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 104.38it/s]


current perplexity :  609.4165357998431 [0.2, 0.30000000000000004, 0.5]

checking lambda1 = 0.2, lambda2 = 0.4, lambda3 = 0.4


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2532429.26it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:20<00:00, 193434.63it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 107.14it/s]


current perplexity :  593.6007761590052 [0.2, 0.4, 0.4]

checking lambda1 = 0.2, lambda2 = 0.5, lambda3 = 0.30000000000000004


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2579397.95it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:10<00:00, 220317.75it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 90.44it/s]


current perplexity :  580.7479145048086 [0.2, 0.5, 0.30000000000000004]

checking lambda1 = 0.2, lambda2 = 0.6000000000000001, lambda3 = 0.19999999999999996


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2575368.99it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:09<00:00, 225201.21it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 104.71it/s]


current perplexity :  570.0068439672536 [0.2, 0.6000000000000001, 0.19999999999999996]

checking lambda1 = 0.2, lambda2 = 0.7000000000000001, lambda3 = 0.09999999999999998


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2485886.52it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:09<00:00, 223357.69it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 110.60it/s]


current perplexity :  560.9002776749513 [0.2, 0.7000000000000001, 0.09999999999999998]

best perplexity :  560.9002776749513 [0.2, 0.7000000000000001, 0.09999999999999998]

checking lambda1 = 0.2, lambda2 = 0.8, lambda3 = 0.0


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2577743.16it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:07<00:00, 231088.32it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 108.45it/s]


current perplexity :  553.1924432231955 [0.2, 0.8, 0.0]

best perplexity :  553.1924432231955 [0.2, 0.8, 0.0]

checking lambda1 = 0.30000000000000004, lambda2 = 0.0, lambda3 = 0.7


Creating dict: 100%|██████████| 15602595/15602595 [00:05<00:00, 2621108.74it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:17<00:00, 200300.04it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 93.97it/s]


current perplexity :  674.990907440057 [0.30000000000000004, 0.0, 0.7]

checking lambda1 = 0.30000000000000004, lambda2 = 0.1, lambda3 = 0.6


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2550494.31it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:13<00:00, 213656.29it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 105.75it/s]


current perplexity :  626.2277295686731 [0.30000000000000004, 0.1, 0.6]

checking lambda1 = 0.30000000000000004, lambda2 = 0.2, lambda3 = 0.49999999999999994


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2561581.11it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:07<00:00, 229504.04it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 99.39it/s]


current perplexity :  602.1965388379131 [0.30000000000000004, 0.2, 0.49999999999999994]

checking lambda1 = 0.30000000000000004, lambda2 = 0.30000000000000004, lambda3 = 0.3999999999999999


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2547453.08it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:16<00:00, 204837.49it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 91.79it/s]


current perplexity :  585.0938686240243 [0.30000000000000004, 0.30000000000000004, 0.3999999999999999]

checking lambda1 = 0.30000000000000004, lambda2 = 0.4, lambda3 = 0.29999999999999993


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2547662.34it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:13<00:00, 212344.24it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 97.51it/s]


current perplexity :  571.723955346997 [0.30000000000000004, 0.4, 0.29999999999999993]

checking lambda1 = 0.30000000000000004, lambda2 = 0.5, lambda3 = 0.19999999999999996


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2544186.54it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:18<00:00, 197898.20it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 101.07it/s]


current perplexity :  560.7821692636052 [0.30000000000000004, 0.5, 0.19999999999999996]

checking lambda1 = 0.30000000000000004, lambda2 = 0.6000000000000001, lambda3 = 0.09999999999999987


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2505737.73it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:04<00:00, 241290.19it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 104.23it/s]


current perplexity :  551.6060337484548 [0.30000000000000004, 0.6000000000000001, 0.09999999999999987]

best perplexity :  551.6060337484548 [0.30000000000000004, 0.6000000000000001, 0.09999999999999987]

checking lambda1 = 0.30000000000000004, lambda2 = 0.7000000000000001, lambda3 = -1.1102230246251565e-16


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2548378.42it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:05<00:00, 237793.60it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 97.81it/s]


current perplexity :  543.8526464931776 [0.30000000000000004, 0.7000000000000001, -1.1102230246251565e-16]

best perplexity :  543.8526464931776 [0.30000000000000004, 0.7000000000000001, -1.1102230246251565e-16]

checking lambda1 = 0.4, lambda2 = 0.0, lambda3 = 0.6


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2480399.39it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:11<00:00, 217567.37it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 85.19it/s]


current perplexity :  643.2913926622717 [0.4, 0.0, 0.6]

checking lambda1 = 0.4, lambda2 = 0.1, lambda3 = 0.5


Creating dict: 100%|██████████| 15602595/15602595 [00:05<00:00, 2621284.28it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:05<00:00, 236749.41it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 102.60it/s]


current perplexity :  602.310833341694 [0.4, 0.1, 0.5]

checking lambda1 = 0.4, lambda2 = 0.2, lambda3 = 0.39999999999999997


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2547688.52it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [00:58<00:00, 265856.09it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 124.33it/s]


current perplexity :  581.587686319141 [0.4, 0.2, 0.39999999999999997]

checking lambda1 = 0.4, lambda2 = 0.30000000000000004, lambda3 = 0.29999999999999993


Creating dict: 100%|██████████| 15602595/15602595 [00:05<00:00, 2677008.72it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [00:50<00:00, 306593.91it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 115.59it/s]


current perplexity :  566.6926729615899 [0.4, 0.30000000000000004, 0.29999999999999993]

checking lambda1 = 0.4, lambda2 = 0.4, lambda3 = 0.19999999999999996


Creating dict: 100%|██████████| 15602595/15602595 [00:05<00:00, 2656961.15it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [00:55<00:00, 282471.14it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 104.16it/s]


current perplexity :  554.9791580148996 [0.4, 0.4, 0.19999999999999996]

checking lambda1 = 0.4, lambda2 = 0.5, lambda3 = 0.09999999999999998


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2600167.38it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:05<00:00, 239994.19it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 102.72it/s]


current perplexity :  545.3662963479687 [0.4, 0.5, 0.09999999999999998]

checking lambda1 = 0.4, lambda2 = 0.6000000000000001, lambda3 = -1.1102230246251565e-16


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2531889.61it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:09<00:00, 226095.53it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 101.20it/s]


current perplexity :  537.325912027445 [0.4, 0.6000000000000001, -1.1102230246251565e-16]

best perplexity :  537.325912027445 [0.4, 0.6000000000000001, -1.1102230246251565e-16]

checking lambda1 = 0.5, lambda2 = 0.0, lambda3 = 0.5


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2468073.83it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:22<00:00, 188759.24it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 70.12it/s]


current perplexity :  619.2815800489943 [0.5, 0.0, 0.5]

checking lambda1 = 0.5, lambda2 = 0.1, lambda3 = 0.4


Creating dict: 100%|██████████| 15602595/15602595 [00:08<00:00, 1815500.50it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:37<00:00, 159781.75it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:02<00:00, 36.38it/s]


current perplexity :  583.6078169114224 [0.5, 0.1, 0.4]

checking lambda1 = 0.5, lambda2 = 0.2, lambda3 = 0.3


Creating dict: 100%|██████████| 15602595/15602595 [00:08<00:00, 1894191.89it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:37<00:00, 159903.45it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 67.17it/s]


current perplexity :  565.241740887437 [0.5, 0.2, 0.3]

checking lambda1 = 0.5, lambda2 = 0.30000000000000004, lambda3 = 0.19999999999999996


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2588527.82it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:04<00:00, 240245.13it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 99.93it/s]


current perplexity :  551.954827865251 [0.5, 0.30000000000000004, 0.19999999999999996]

checking lambda1 = 0.5, lambda2 = 0.4, lambda3 = 0.09999999999999998


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2592209.09it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:06<00:00, 234471.64it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 99.98it/s]


current perplexity :  541.475353183595 [0.5, 0.4, 0.09999999999999998]

checking lambda1 = 0.5, lambda2 = 0.5, lambda3 = 0.0


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2576451.45it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:18<00:00, 199542.56it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 100.03it/s]


current perplexity :  532.889727125189 [0.5, 0.5, 0.0]

best perplexity :  532.889727125189 [0.5, 0.5, 0.0]

checking lambda1 = 0.6000000000000001, lambda2 = 0.0, lambda3 = 0.3999999999999999


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2530862.17it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:20<00:00, 193626.21it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 58.31it/s]


current perplexity :  600.1363073460391 [0.6000000000000001, 0.0, 0.3999999999999999]

checking lambda1 = 0.6000000000000001, lambda2 = 0.1, lambda3 = 0.29999999999999993


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2319832.98it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:49<00:00, 141882.59it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:05<00:00, 19.90it/s]


current perplexity :  568.3447494179367 [0.6000000000000001, 0.1, 0.29999999999999993]

checking lambda1 = 0.6000000000000001, lambda2 = 0.2, lambda3 = 0.1999999999999999


Creating dict: 100%|██████████| 15602595/15602595 [00:10<00:00, 1547380.50it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:40<00:00, 155050.64it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 54.58it/s]


current perplexity :  551.7694615380008 [0.6000000000000001, 0.2, 0.1999999999999999]

checking lambda1 = 0.6000000000000001, lambda2 = 0.30000000000000004, lambda3 = 0.09999999999999987


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2282302.39it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:44<00:00, 149039.40it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 50.87it/s]


current perplexity :  539.7315822015868 [0.6000000000000001, 0.30000000000000004, 0.09999999999999987]

checking lambda1 = 0.6000000000000001, lambda2 = 0.4, lambda3 = -1.1102230246251565e-16


Creating dict: 100%|██████████| 15602595/15602595 [00:07<00:00, 2150957.67it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:20<00:00, 194504.68it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 65.91it/s]


current perplexity :  530.2429407301797 [0.6000000000000001, 0.4, -1.1102230246251565e-16]

best perplexity :  530.2429407301797 [0.6000000000000001, 0.4, -1.1102230246251565e-16]

checking lambda1 = 0.7000000000000001, lambda2 = 0.0, lambda3 = 0.29999999999999993


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2484024.45it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [02:47<00:00, 93078.12it/s] 
calculating perplexity for articles....: 100%|██████████| 100/100 [00:02<00:00, 35.05it/s]


current perplexity :  584.3459988081714 [0.7000000000000001, 0.0, 0.29999999999999993]

checking lambda1 = 0.7000000000000001, lambda2 = 0.1, lambda3 = 0.19999999999999993


Creating dict: 100%|██████████| 15602595/15602595 [00:08<00:00, 1925778.16it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:29<00:00, 174674.87it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 61.40it/s]


current perplexity :  555.5307922057027 [0.7000000000000001, 0.1, 0.19999999999999993]

checking lambda1 = 0.7000000000000001, lambda2 = 0.2, lambda3 = 0.09999999999999992


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2556715.57it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:32<00:00, 168158.33it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 64.26it/s]


current perplexity :  540.3806768399796 [0.7000000000000001, 0.2, 0.09999999999999992]

checking lambda1 = 0.7000000000000001, lambda2 = 0.30000000000000004, lambda3 = -1.1102230246251565e-16


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2525617.36it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:17<00:00, 201767.58it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 100.76it/s]


current perplexity :  529.3690305594669 [0.7000000000000001, 0.30000000000000004, -1.1102230246251565e-16]

best perplexity :  529.3690305594669 [0.7000000000000001, 0.30000000000000004, -1.1102230246251565e-16]

checking lambda1 = 0.8, lambda2 = 0.0, lambda3 = 0.19999999999999996


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2295310.33it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:24<00:00, 184606.74it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 73.69it/s]


current perplexity :  571.0157408879486 [0.8, 0.0, 0.19999999999999996]

checking lambda1 = 0.8, lambda2 = 0.1, lambda3 = 0.09999999999999995


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2418121.88it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:04<00:00, 243084.22it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 101.60it/s]


current perplexity :  544.5612474699558 [0.8, 0.1, 0.09999999999999995]

checking lambda1 = 0.8, lambda2 = 0.2, lambda3 = -5.551115123125783e-17


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2541356.60it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:06<00:00, 234628.58it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 104.35it/s]


current perplexity :  530.595068614198 [0.8, 0.2, -5.551115123125783e-17]

checking lambda1 = 0.9, lambda2 = 0.0, lambda3 = 0.09999999999999998


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2586357.07it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:11<00:00, 218166.05it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:01<00:00, 98.78it/s]


current perplexity :  559.5803196715702 [0.9, 0.0, 0.09999999999999998]

checking lambda1 = 0.9, lambda2 = 0.1, lambda3 = -2.7755575615628914e-17


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2556997.68it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:18<00:00, 197833.02it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 100.47it/s]


current perplexity :  535.0532618833385 [0.9, 0.1, -2.7755575615628914e-17]

checking lambda1 = 1.0, lambda2 = 0.0, lambda3 = 0.0


Creating dict: 100%|██████████| 15602595/15602595 [00:06<00:00, 2548944.30it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:15<00:00, 207777.58it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 100.54it/s]

current perplexity :  549.6792421554559 [1.0, 0.0, 0.0]

529.3690305594669 [0.7000000000000001, 0.30000000000000004, -1.1102230246251565e-16]





## below is for the validation dataset

In [None]:
print(best_perplexity, best_lambdas)

529.3690305594669 [0.7000000000000001, 0.30000000000000004, -1.1102230246251565e-16]


## below is for test dataset 

In [13]:
interpolated_probabilities = compute_interpolated_probabilities(unigram_probabilities, bigram_probabilities, trigram_probabilities, lambda1=0.7, lambda2=0.3, lambda3=0)
print(compute_perplexity(test_data, interpolated_probabilities, trigram_n_1_counts, n = 3, K = 1, V=trigram_vocab_size))

Creating dict: 100%|██████████| 15602595/15602595 [00:07<00:00, 2178432.05it/s]
interpolating probabilities....: 100%|██████████| 15602595/15602595 [01:03<00:00, 247386.37it/s]
calculating perplexity for articles....: 100%|██████████| 100/100 [00:00<00:00, 114.31it/s]

531.9069643347831



