In [5]:
import pandas as pd
import numpy as np
import math
from collections import Counter
from sklearn.model_selection import KFold

def load_counter(csv_file):
    df = pd.read_csv(csv_file)
    df["Ngram"] = df["Ngram"].astype(str)
    df["Count"] = df["Count"].astype(int)
    return Counter(dict(zip(df["Ngram"], df["Count"])))

# Load n-gram counts
unigram_c    = load_counter(r"C:\Users\evilk\OneDrive\Desktop\III YEAR\LABS\NLP\LAB4\unigram.csv")
bigram_c     = load_counter(r"C:\Users\evilk\OneDrive\Desktop\III YEAR\LABS\NLP\LAB4\bigram.csv")
trigram_c    = load_counter(r"C:\Users\evilk\OneDrive\Desktop\III YEAR\LABS\NLP\LAB4\trigram.csv")
quadrigram_c = load_counter(r"C:\Users\evilk\OneDrive\Desktop\III YEAR\LABS\NLP\LAB4\quadrigram.csv")

print("Unigrams:", len(unigram_c))
print("Bigrams:", len(bigram_c))
print("Trigrams:", len(trigram_c))
print("Quadrigrams:", len(quadrigram_c))


Unigrams: 199997
Bigrams: 200000
Trigrams: 200000
Quadrigrams: 200000


In [6]:
vocab = set()
for ng in unigram_c:
    for w in ng.split():
        vocab.add(w)

vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)


Vocabulary size: 199997


In [7]:
def estimate_lambdas(quad_c, tri_c, bi_c, uni_c):
    lambda_counts = [0,0,0,0]

    for qng, qcount in quad_c.items():
        if qcount < 2:  # skip too rare events
            continue

        w1,w2,w3,w4 = qng.split()
        trig = " ".join([w2,w3,w4])
        bigr = " ".join([w3,w4])
        unigr = w4

        # leave-one-out counts
        trig_prefix = " ".join([w1,w2,w3])
        trig_count = tri_c.get(trig_prefix, 0)
        bigr_count = bi_c.get(" ".join([w2,w3]), 0)
        unigr_count = uni_c.get(w3, 0)

        probs = [
            (qcount-1) / max(1, trig_count-1),             # quadrigram
            tri_c.get(trig,0) / max(1, bigr_count-1),      # trigram
            bi_c.get(bigr,0) / max(1, unigr_count-1),      # bigram
            uni_c.get(unigr,0) / sum(uni_c.values())       # unigram
        ]

        max_index = np.argmax(probs)
        lambda_counts[max_index] += qcount

    total = sum(lambda_counts)
    if total == 0:
        return [0.25,0.25,0.25,0.25]  # fallback
    return [c/total for c in lambda_counts]


In [None]:
# Load validation sentences
val = pd.read_csv("val_sentences.csv")
val_sentences = val["sentence"].tolist()

kf = KFold(n_splits=5, shuffle=True, random_state=42)
all_lambdas = []

for train_idx, test_idx in kf.split(val_sentences):
  
    lambdas = estimate_lambdas(quadrigram_c, trigram_c, bigram_c, unigram_c)
    all_lambdas.append(lambdas)

df_lambda = pd.DataFrame(all_lambdas, columns=["λ4 (Quad)","λ3 (Tri)","λ2 (Bi)","λ1 (Uni)"])
df_lambda.loc["Average"] = df_lambda.mean()
print("Estimated λ values (per fold and average):")
display(df_lambda)


Estimated λ values (per fold and average):


Unnamed: 0,λ4 (Quad),λ3 (Tri),λ2 (Bi),λ1 (Uni)
0,0.708414,0.205019,0.084701,0.001866
1,0.708414,0.205019,0.084701,0.001866
2,0.708414,0.205019,0.084701,0.001866
3,0.708414,0.205019,0.084701,0.001866
4,0.708414,0.205019,0.084701,0.001866
Average,0.708414,0.205019,0.084701,0.001866


In [9]:
def interpolated_prob(ngram, lambdas):
    w = ngram.split()
    if len(w) != 4: 
        return 1e-12

    quad = quadrigram_c.get(" ".join(w),0) / max(1,trigram_c.get(" ".join(w[:3]),0))
    tri  = trigram_c.get(" ".join(w[1:]),0) / max(1,bigram_c.get(" ".join(w[1:3]),0))
    bi   = bigram_c.get(" ".join(w[2:]),0) / max(1,unigram_c.get(w[2],0))
    uni  = unigram_c.get(w[3],0) / sum(unigram_c.values())

    return lambdas[0]*quad + lambdas[1]*tri + lambdas[2]*bi + lambdas[3]*uni


In [10]:
def sentence_prob_interp(sentence, lambdas):
    tokens = ["<s>","<s>","<s>"] + sentence.strip().split() + ["</s>"]
    log_prob = 0.0
    for i in range(len(tokens)-3):
        ng = " ".join(tokens[i:i+4])
        p = interpolated_prob(ng, lambdas)
        log_prob += math.log(p if p>0 else 1e-15)
    return log_prob

def sentence_perplexity_interp(sentence, lambdas):
    tokens = sentence.strip().split()
    logp = sentence_prob_interp(sentence, lambdas)
    return math.exp(-logp / max(1,len(tokens)))


In [None]:
best_lambdas = df_lambda.loc["Average"].values.tolist()

print("Using averaged λ values:", best_lambdas)

for s in val_sentences[:5]:
    logp = sentence_prob_interp(s, best_lambdas)
    ppl = sentence_perplexity_interp(s, best_lambdas)
    print(f"\nSentence: {s}")
    print(f"  LogProb: {logp:.4f}, Perplexity: {ppl:.4f}")


Using averaged λ values: [0.708413735452884, 0.2050189253402777, 0.08470105707429092, 0.0018662821325473665]

Sentence: कम  से  कम  यह  तो  बता  दिया  जाए  कि  आखिर  उन्हें  गैस  कहां  से  मिलेगी ।
  LogProb: -126.7709, Perplexity: 2760.5391

Sentence: शारिब  रूदौलवी  ने  कहा  कि  जिगर  से  मेरा  गहरा  ताल्लुक  रहा  है  यही  कारण  है  कि  उनके  मृत्यु  के  बाद  मैंने  इनपर  किताब  लिखने  का  फैसला  किया  1960 में  जिगर  का  देहांत  हुआ  तो  मुझे  बहुत  तकलीफ  हुई  कि  उनपर  कुछ  ज्यादा  लिखा  नहीं  गया  था  कुछ  मजामीन  के ।
  LogProb: -524.3225, Perplexity: 35826.7231

Sentence: िंध  का  समूचा  इलाका  संपदाओं  से  लबरेज  उर्वशी  रौतेला  का  जन्म  25 - 02 - 1994 को  भारत  के  उत्तराखंड  राज्य  के  कोटद्वार , पौड़ी  गढ़वाल  में  हुआ  था ।
  LogProb: -315.1057, Perplexity: 36443.6559

Sentence: इन  केंद्रों  पर  आगामी  28 फरवरी  तक  28 हजार  400 मीट्रिक  टन  धान  की  खरीद  की  जाएगी ।
  LogProb: -99.5014, Perplexity: 251.6033

Sentence: इंदौर ।
  LogProb: -12.0790, Perplexity: 419.6738
