In [56]:
# Cell 1: Imports & Counter Loader
import pandas as pd
import numpy as np
import math
from collections import Counter

def load_counter(csv_file):
    df = pd.read_csv(csv_file)
    df["Ngram"] = df["Ngram"].astype(str)   # ensure keys are strings
    df["Count"] = df["Count"].astype(int)
    return Counter(dict(zip(df["Ngram"], df["Count"])))

# Load n-gram counts
unigram_c    = load_counter(r"C:\Users\evilk\OneDrive\Desktop\III YEAR\LABS\NLP\LAB4\unigram.csv")
bigram_c     = load_counter(r"C:\Users\evilk\OneDrive\Desktop\III YEAR\LABS\NLP\LAB4\bigram.csv")
trigram_c    = load_counter(r"C:\Users\evilk\OneDrive\Desktop\III YEAR\LABS\NLP\LAB4\trigram.csv")
quadrigram_c = load_counter(r"C:\Users\evilk\OneDrive\Desktop\III YEAR\LABS\NLP\LAB4\quadrigram.csv")

print("Unigrams:", len(unigram_c))
print("Bigrams:", len(bigram_c))
print("Trigrams:", len(trigram_c))
print("Quadrigrams:", len(quadrigram_c))


Unigrams: 199997
Bigrams: 200000
Trigrams: 200000
Quadrigrams: 200000


In [57]:
# Cell 2: Build Vocabulary
vocab = set()
for ng in unigram_c:
    for w in ng.split():
        vocab.add(w)

vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)


Vocabulary size: 199997


In [58]:
# Cell 3: Good-Turing Smoothing
def good_turing_probs(counter, vocab_size, n):
    N = sum(counter.values())
    freq_of_freq = Counter(counter.values())
    N1 = freq_of_freq[1]

    probs = {}
    for ng, c in counter.items():
        Nc = freq_of_freq[c]
        Nc1 = freq_of_freq.get(c+1, 0)
        if Nc > 0:
            c_star = (c+1) * Nc1 / Nc
        else:
            c_star = c
        probs[ng] = c_star / N

    # unseen events
    if n == 1:
        unseen_count = vocab_size - len(counter)
    else:
        unseen_count = vocab_size**n - len(counter)

    p_unseen = (N1 / N) / max(1, unseen_count)
    return probs, p_unseen

# Build models
uni_probs, uni_pu   = good_turing_probs(unigram_c, vocab_size, 1)
bi_probs, bi_pu     = good_turing_probs(bigram_c, vocab_size, 2)
tri_probs, tri_pu   = good_turing_probs(trigram_c, vocab_size, 3)
quad_probs, quad_pu = good_turing_probs(quadrigram_c, vocab_size, 4)

print("Good-Turing models built successfully.")


Good-Turing models built successfully.


In [59]:
# Cell 4: Sentence Probability (log-space to avoid underflow)
def sentence_log_prob(sentence, probs, p_unseen, n):
    tokens = ["<s>"]*(n-1) + sentence.strip().split() + ["</s>"]
    log_prob = 0.0
    for i in range(len(tokens)-n+1):
        ng = " ".join(tokens[i:i+n])
        p = probs.get(ng, p_unseen)
        log_prob += math.log(p if p > 0 else 1e-15)
    return log_prob

def sentence_perplexity(sentence, probs, p_unseen, n):
    tokens = ["<s>"]*(n-1) + sentence.strip().split() + ["</s>"]
    log_prob = 0.0
    for i in range(len(tokens)-n+1):
        ng = " ".join(tokens[i:i+n])
        p = probs.get(ng, p_unseen)
        log_prob += math.log(p if p > 0 else 1e-15)
    length = len(tokens)   # include <s> and </s>
    return math.exp(-log_prob / length)


In [60]:
# Cell 5: Load validation & test sets
val = pd.read_csv("val_sentences.csv")
test = pd.read_csv("test_sentences.csv")

val_sentences = val["sentence"].tolist()
test_sentences = test["sentence"].tolist()

print("Validation sentences:", len(val_sentences))
print("Test sentences:", len(test_sentences))


Validation sentences: 1000
Test sentences: 1000


In [61]:
# Cell 6: Evaluate on first 10 validation sentences with all models
models = [
    ("Unigram", uni_probs, uni_pu, 1),
    ("Bigram", bi_probs, bi_pu, 2),
    ("Trigram", tri_probs, tri_pu, 3),
    ("Quadgram", quad_probs, quad_pu, 4)
]

for s in val_sentences[:10]:
    print(f"\nSentence: {s}")
    for name, probs, pu, n in models:
        logp = sentence_log_prob(s, probs, pu, n)
        ppl  = sentence_perplexity(s, probs, pu, n)
        print(f"  {name:8s} -> LogProb: {logp:.4f}, Perplexity: {ppl:.4f}")



Sentence: कम  से  कम  यह  तो  बता  दिया  जाए  कि  आखिर  उन्हें  गैस  कहां  से  मिलेगी ।
  Unigram  -> LogProb: -587.1592, Perplexity: 999999999999984.6250
  Bigram   -> LogProb: -376.8639, Perplexity: 1238152138.7032
  Trigram  -> LogProb: -521.7073, Perplexity: 841356465741.7955
  Quadgram -> LogProb: -520.5458, Perplexity: 201144458758.3000

Sentence: शारिब  रूदौलवी  ने  कहा  कि  जिगर  से  मेरा  गहरा  ताल्लुक  रहा  है  यही  कारण  है  कि  उनके  मृत्यु  के  बाद  मैंने  इनपर  किताब  लिखने  का  फैसला  किया  1960 में  जिगर  का  देहांत  हुआ  तो  मुझे  बहुत  तकलीफ  हुई  कि  उनपर  कुछ  ज्यादा  लिखा  नहीं  गया  था  कुछ  मजामीन  के ।
  Unigram  -> LogProb: -1475.6091, Perplexity: 3678447867683.1069
  Bigram   -> LogProb: -1357.9808, Perplexity: 219587321638.9611
  Trigram  -> LogProb: -1599.3909, Perplexity: 12758081684434.4805
  Quadgram -> LogProb: -1694.0536, Perplexity: 42112419859281.9844

Sentence: िंध  का  समूचा  इलाका  संपदाओं  से  लबरेज  उर्वशी  रौतेला  का  जन्म  25 - 02 - 1994 को  भ

In [62]:
# Cell 7: Task 3 - Good-Turing Frequency Tables

def good_turing_table(counter, top_k=100):
    # Build frequency-of-frequency
    freq_of_freq = Counter(counter.values())
    rows = []
    
    for c in sorted(freq_of_freq.keys())[:top_k]:
        Nc = freq_of_freq[c]
        Nc1 = freq_of_freq.get(c+1, 0)
        if Nc > 0:
            c_star = (c+1) * Nc1 / Nc
        else:
            c_star = c
        rows.append((c, Nc, c_star))
    
    df = pd.DataFrame(rows, columns=["c", "Nc", "c*"])
    return df

# Generate tables
uni_table  = good_turing_table(unigram_c)
bi_table   = good_turing_table(bigram_c)
tri_table  = good_turing_table(trigram_c)
quad_table = good_turing_table(quadrigram_c)

print("Top Good-Turing frequencies for Unigrams:")
display(uni_table.head(20))

print("\nTop Good-Turing frequencies for Bigrams:")
display(bi_table.head(20))

print("\nTop Good-Turing frequencies for Trigrams:")
display(tri_table.head(20))

print("\nTop Good-Turing frequencies for Quadrigrams:")
display(quad_table.head(20))


Top Good-Turing frequencies for Unigrams:


Unnamed: 0,c,Nc,c*
0,3,4921,24.277992
1,4,29868,3.336514
2,5,19931,4.328935
3,6,14380,5.354172
4,7,10999,6.438403
5,8,8852,7.328513
6,9,7208,8.325472
7,10,6001,9.284286
8,11,5065,10.483712
9,12,4425,11.028701



Top Good-Turing frequencies for Bigrams:


Unnamed: 0,c,Nc,c*
0,41,17,10188.705882
1,42,4124,39.517459
2,43,3790,43.164116
3,44,3718,43.741259
4,45,3614,43.912562
5,46,3450,44.793043
6,47,3288,47.19708
7,48,3233,47.060006
8,49,3105,47.600644
9,50,2956,48.187754



Top Good-Turing frequencies for Trigrams:


Unnamed: 0,c,Nc,c*
0,35,1391,156.940331
1,36,6064,34.565468
2,37,5665,36.336452
3,38,5417,36.768137
4,39,5107,39.075778
5,40,4989,38.698938
6,41,4709,39.092376
7,42,4383,41.077116
8,43,4187,43.401003
9,44,4130,41.992736



Top Good-Turing frequencies for Quadrigrams:


Unnamed: 0,c,Nc,c*
0,20,4482,53.118474
1,21,11337,19.970186
2,22,10291,20.525896
3,23,9184,22.20993
4,24,8499,22.343805
5,25,7596,24.16535
6,26,7060,24.384136
7,27,6376,26.26537
8,28,5981,26.517472
9,29,5469,28.156884
