In [None]:
import pandas as pd
import numpy as np
import pickle
import math

# Function to load any n-gram pkl file
def load_ngram(path):
    """Read a pickled n-gram table and return it as an ``{ngram: count}`` dict.

    The unpickled object is indexed with the keys "Ngram" and "Count"; the two
    resulting sequences are paired up positionally. If an n-gram appears more
    than once, the count from its last occurrence is kept.

    NOTE: ``pickle.load`` can execute arbitrary code while loading, so only
    point this at files you created yourself or otherwise trust.
    """
    with open(path, "rb") as handle:
        table = pickle.load(handle)
    ngrams, counts = table["Ngram"], table["Count"]
    return {ngram: count for ngram, count in zip(ngrams, counts)}


# Load all n-grams
# Directory that holds the pickled n-gram count tables. It is spelled out once
# so that running the notebook on another machine means editing a single line.
NGRAM_DIR = "C:/Users/ashis/OneDrive/Desktop/NLP/N-Grams"

unigram_c = load_ngram(f"{NGRAM_DIR}/unigram.pkl")
bigram_c = load_ngram(f"{NGRAM_DIR}/bigram.pkl")
trigram_c = load_ngram(f"{NGRAM_DIR}/trigram.pkl")
quadrigram_c = load_ngram(f"{NGRAM_DIR}/quadrigram.pkl")


# Quick check: number of distinct n-grams loaded per order
print("Unigrams loaded:", len(unigram_c))
print("Bigrams loaded:", len(bigram_c))
print("Trigrams loaded:", len(trigram_c))
print("Quadrigrams loaded:", len(quadrigram_c))

Unigrams loaded: 299475
Bigrams loaded: 3466685
Trigrams loaded: 9694653
Quadrigrams loaded: 14174147


In [None]:
from collections import Counter

# The number of distinct unigram entries is used as the vocabulary size; it is
# later passed to the smoothing step to size the space of possible n-grams.
vocab_size = len(unigram_c)
print("Vocabulary size:", vocab_size)


Vocabulary size: 299475


In [None]:
# Cell 3: Good-Turing Smoothing
def good_turing_probs(counter, vocab_size, n):
    """Estimate Good-Turing smoothed joint probabilities for one n-gram order.

    An n-gram observed ``c`` times receives the adjusted count
    ``c* = (c + 1) * N_{c+1} / N_c``, where ``N_c`` is the number of distinct
    n-grams observed exactly ``c`` times, and the probability ``c* / N`` with
    ``N`` the total number of observed n-gram tokens.

    When no n-gram was observed ``c + 1`` times the formula would yield
    ``c* = 0``. That always happens for the most frequent n-gram and for any
    count followed by a gap, so those n-grams would be treated as impossible.
    In that case the raw count ``c`` is kept instead.

    Args:
        counter: Mapping from n-gram to its observed count.
        vocab_size: Number of distinct word types; ``vocab_size ** n`` is taken
            as the number of possible n-grams.
        n: Order of the n-grams stored in ``counter``.

    Returns:
        A pair ``(probs, p_unseen)``. ``probs`` maps every observed n-gram to
        its smoothed probability. ``p_unseen`` is the probability assigned to
        one individual unseen n-gram: the mass ``N_1 / N`` divided equally
        among the ``vocab_size ** n - len(counter)`` unseen n-grams (at least
        one). ``p_unseen`` is 0 when no n-gram has count 1. For an empty
        ``counter`` (or one whose counts sum to 0) ``probs`` is empty and the
        whole probability mass is spread over the unseen n-grams.

    Note:
        The adjusted counts are not renormalised, so the returned values are
        not guaranteed to sum to exactly 1.
    """
    N = sum(counter.values())

    # vocab_size ** 1 == vocab_size, so one formula covers every order.
    unseen_count = max(1, vocab_size**n - len(counter))

    if N == 0:
        # Nothing was observed: every possible n-gram is unseen.
        return {}, 1.0 / unseen_count

    freq_of_freq = Counter(counter.values())

    # c* depends only on c, so work it out once per distinct count instead of
    # once per n-gram.
    adjusted = {}
    for c, Nc in freq_of_freq.items():
        Nc1 = freq_of_freq.get(c + 1, 0)
        # Fall back to the raw count where the Good-Turing estimate is
        # undefined (no n-gram with count c + 1).
        adjusted[c] = (c + 1) * Nc1 / Nc if Nc1 > 0 else float(c)

    probs = {ng: adjusted[c] / N for ng, c in counter.items()}

    N1 = freq_of_freq[1]
    p_unseen = (N1 / N) / unseen_count
    return probs, p_unseen
# Fit one Good-Turing model per n-gram order (1 = unigram ... 4 = quadrigram).
# Each call returns (probabilities of seen n-grams, probability of one unseen n-gram).
(uni_probs, uni_pu), (bi_probs, bi_pu), (tri_probs, tri_pu), (quad_probs, quad_pu) = (
    good_turing_probs(ngram_counts, vocab_size, order)
    for order, ngram_counts in enumerate(
        (unigram_c, bigram_c, trigram_c, quadrigram_c), start=1
    )
)

print("Good-Turing models built successfully.")


Good-Turing models built successfully.


In [14]:
# Cell 4: Sentence Probability (log-space to avoid underflow)
def sentence_log_prob(sentence, probs, p_unseen, n):
    """Return the natural-log score of ``sentence`` under one n-gram model.

    The sentence is stripped, split on whitespace, padded with ``n - 1``
    "<s>" tokens in front and a single "</s>" at the end. Every window of
    ``n`` consecutive tokens is joined with single spaces and looked up in
    ``probs``; n-grams that are missing use ``p_unseen``. Probabilities that
    are not strictly positive are replaced by 1e-15 so the logarithm stays
    finite. The logs are accumulated one by one, in sentence order.

    Args:
        sentence: Raw sentence text.
        probs: Mapping from space-joined n-gram to its probability.
        p_unseen: Probability used for n-grams absent from ``probs``.
        n: Order of the model.

    Returns:
        Sum of the log probabilities of all n-grams in the padded sentence.
    """
    padded = ["<s>"] * (n - 1)
    padded.extend(sentence.strip().split())
    padded.append("</s>")

    total = 0.0
    for start in range(len(padded) - n + 1):
        key = " ".join(padded[start:start + n])
        prob = probs.get(key, p_unseen)
        if not prob > 0:
            # floor for zero or otherwise non-positive probabilities
            prob = 1e-15
        total += math.log(prob)
    return total

def sentence_perplexity(sentence, probs, p_unseen, n):
    """Return the perplexity of ``sentence`` under one n-gram model.

    The sentence is stripped, split on whitespace, padded with ``n - 1``
    "<s>" tokens in front and one "</s>" at the end, and every window of ``n``
    tokens is looked up in ``probs`` (``p_unseen`` when missing; values that
    are not strictly positive are floored at 1e-15).

    Perplexity is ``exp(-log_prob / M)``, where ``M`` is the number of n-grams
    that were scored, i.e. the number of words plus one for "</s>". The "<s>"
    pads only provide left context and contribute no probability term, so they
    are not counted. Counting them would shrink the result by an amount that
    grows with ``n``, which makes perplexities of different orders
    incomparable.

    Args:
        sentence: Raw sentence text.
        probs: Mapping from space-joined n-gram to its probability.
        p_unseen: Probability used for n-grams absent from ``probs``.
        n: Order of the model.

    Returns:
        The perplexity as a float; a model that gives every scored n-gram
        probability ``p`` yields exactly ``1 / p``.
    """
    tokens = ["<s>"] * (n - 1) + sentence.strip().split() + ["</s>"]
    num_scored = len(tokens) - n + 1  # words + "</s>"

    log_prob = 0.0
    for i in range(num_scored):
        ng = " ".join(tokens[i:i + n])
        p = probs.get(ng, p_unseen)
        log_prob += math.log(p if p > 0 else 1e-15)

    # Normalise by the number of terms actually summed, not by the padded length.
    return math.exp(-log_prob / num_scored)


In [15]:
# Cell 5: Load validation & test sets
val = pd.read_csv("val_sentences.csv")
test = pd.read_csv("test_sentences.csv")

# Pull the "sentence" column of each split out as a plain Python list.
val_sentences, test_sentences = (
    split["sentence"].tolist() for split in (val, test)
)

print("Validation sentences:", len(val_sentences))
print("Test sentences:", len(test_sentences))


Validation sentences: 1000
Test sentences: 1000


In [18]:
# Cell 6: Compute and store sentence probabilities for validation and test sets
def _score_sentences(sentences):
    """Score each sentence with the four Good-Turing models.

    Returns one dict per sentence, in input order, holding the sentence text
    under "sentence" followed by its unigram, bigram, trigram and quadrigram
    log-probabilities.
    """
    # (output column, n-gram probabilities, unseen probability, order)
    models = (
        ("unigram_logprob", uni_probs, uni_pu, 1),
        ("bigram_logprob", bi_probs, bi_pu, 2),
        ("trigram_logprob", tri_probs, tri_pu, 3),
        ("quadrigram_logprob", quad_probs, quad_pu, 4),
    )
    rows = []
    for sentence in sentences:
        row = {"sentence": sentence}
        for column, probs, p_unseen, order in models:
            row[column] = sentence_log_prob(sentence, probs, p_unseen, order)
        rows.append(row)
    return rows


# Validation and test go through the same scoring code so the two exports
# cannot drift apart.
val_results = _score_sentences(val_sentences)
test_results = _score_sentences(test_sentences)

# Save to CSV
val_df = pd.DataFrame(val_results)
test_df = pd.DataFrame(test_results)
val_df.to_csv("val_sentence_logprobabilities.csv", index=False)
test_df.to_csv("test_sentence_logprobabilities.csv", index=False)

In [17]:
# Cell 7: Task 3 - Good-Turing Frequency Tables

def good_turing_table(counter, top_k=100):
    """Tabulate Good-Turing adjusted counts for the smallest observed counts.

    For each distinct count ``c`` found in ``counter`` (ascending, at most
    ``top_k`` of them) the table lists ``N_c``, the number of n-grams observed
    exactly ``c`` times, and the adjusted count
    ``c* = (c + 1) * N_{c+1} / N_c``.

    When no n-gram was observed ``c + 1`` times the estimate is undefined (the
    formula would report 0), so ``c*`` is left equal to ``c`` for that row.

    Args:
        counter: Mapping from n-gram to its observed count.
        top_k: Maximum number of distinct counts (rows) to include.

    Returns:
        A DataFrame with columns "c", "Nc" and "c*", one row per count.
    """
    # Build frequency-of-frequency: count value -> number of n-grams with it
    freq_of_freq = Counter(counter.values())

    rows = []
    for c in sorted(freq_of_freq)[:top_k]:
        Nc = freq_of_freq[c]  # always >= 1 because c is a key of the table
        Nc1 = freq_of_freq.get(c + 1, 0)
        # The guard has to be on N_{c+1}: that is the term that can be missing.
        c_star = (c + 1) * Nc1 / Nc if Nc1 > 0 else float(c)
        rows.append((c, Nc, c_star))

    return pd.DataFrame(rows, columns=["c", "Nc", "c*"])

# Generate tables
# One frequency table per n-gram order, built before anything is displayed.
uni_table, bi_table, tri_table, quad_table = (
    good_turing_table(ngram_counts)
    for ngram_counts in (unigram_c, bigram_c, trigram_c, quadrigram_c)
)

# Show the first 20 rows of each table under its own heading.
for heading, table in (
    ("Top Good-Turing frequencies for Unigrams:", uni_table),
    ("\nTop Good-Turing frequencies for Bigrams:", bi_table),
    ("\nTop Good-Turing frequencies for Trigrams:", tri_table),
    ("\nTop Good-Turing frequencies for Quadrigrams:", quad_table),
):
    print(heading)
    display(table.head(20))


Top Good-Turing frequencies for Unigrams:


Unnamed: 0,c,Nc,c*
0,1,64458,1.202861
1,2,38767,1.398586
2,3,18073,2.438997
3,4,11020,3.397913
4,5,7489,4.408866
5,6,5503,5.481192
6,7,4309,6.444187
7,8,3471,7.537597
8,9,2907,8.121775
9,10,2361,10.07751



Top Good-Turing frequencies for Bigrams:


Unnamed: 0,c,Nc,c*
0,10,15033,10.696335
1,11,14618,10.101245
2,12,12305,11.294839
3,13,10691,12.060612
4,14,9210,13.315961
5,15,8176,13.976517
6,16,7142,15.100532
7,17,6344,16.541614
8,18,5830,17.285763
9,19,5304,18.322021



Top Good-Turing frequencies for Trigrams:


Unnamed: 0,c,Nc,c*
0,8,6412,33.669994
1,9,23988,7.871436
2,10,18882,9.08336
3,11,15592,9.841201
4,12,12787,11.017518
5,13,10837,11.690136
6,14,9049,13.106973
7,15,7907,13.897559
8,16,6868,14.695545
9,17,5937,15.935321



Top Good-Turing frequencies for Quadrigrams:


Unnamed: 0,c,Nc,c*
0,5,18867,11.699793
1,6,36790,4.801441
2,7,25235,5.719041
3,8,18040,6.861752
4,9,13754,7.615966
5,10,10475,9.201146
6,11,8762,9.630678
7,12,7032,10.496871
8,13,5678,11.825291
9,14,4796,12.848207
