In [2]:
import pandas as pd
import numpy as np
import math, pickle
from sklearn.model_selection import KFold


def load_ngram(path):
    with open(path, "rb") as f:
        df = pickle.load(f)
    return dict(zip(df["Ngram"], df["Count"]))


# Load all n-grams
unigram_c = load_ngram("C:/Users/ashis/OneDrive/Desktop/NLP/N-Grams/unigram.pkl")
bigram_c = load_ngram("C:/Users/ashis/OneDrive/Desktop/NLP/N-Grams/bigram.pkl")
trigram_c = load_ngram("C:/Users/ashis/OneDrive/Desktop/NLP/N-Grams/trigram.pkl")
quadrigram_c = load_ngram("C:/Users/ashis/OneDrive/Desktop/NLP/N-Grams/quadrigram.pkl")


# Quick check
print("Unigrams loaded:", len(unigram_c))
print("Bigrams loaded:", len(bigram_c))
print("Trigrams loaded:", len(trigram_c))
print("Quadrigrams loaded:", len(quadrigram_c))


Unigrams loaded: 299475
Bigrams loaded: 3466685
Trigrams loaded: 9694653
Quadrigrams loaded: 14174147


In [3]:
vocab = set()
for ng in unigram_c:
    for w in ng.split():
        vocab.add(w)

vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)


Vocabulary size: 299475


In [4]:
def estimate_lambdas(quad_c, tri_c, bi_c, uni_c):
    lambda_counts = [0, 0, 0, 0]

    for qng, qcount in quad_c.items():
        if qcount < 2:  # skip too rare events
            continue

        w1, w2, w3, w4 = qng.split()
        trig = f"{w2} {w3} {w4}"
        bigr = f"{w3} {w4}"
        unigr = w4

        # prefix counts
        quad_prefix = f"{w1} {w2} {w3}"
        trig_prefix = f"{w2} {w3}"
        bigr_prefix = w3

        quad_denom = tri_c.get(quad_prefix, 0)
        trig_denom = bi_c.get(trig_prefix, 0)
        bigr_denom = uni_c.get(bigr_prefix, 0)
        total_unigrams = sum(uni_c.values())

        probs = [
            (qcount - 1) / (quad_denom - 1) if quad_denom > 1 else 0,  # quadrigram
            tri_c.get(trig, 0) / (trig_denom - 1) if trig_denom > 1 else 0,  # trigram
            bi_c.get(bigr, 0) / (bigr_denom - 1) if bigr_denom > 1 else 0,   # bigram
            uni_c.get(unigr, 0) / total_unigrams if total_unigrams > 0 else 0 # unigram
        ]

        max_index = np.argmax(probs)
        lambda_counts[max_index] += qcount

    total = sum(lambda_counts)
    if total == 0:
        return [0.25, 0.25, 0.25, 0.25]  # fallback
    return [c / total for c in lambda_counts]


In [5]:
from collections import Counter

def build_ngram_counts(sentences, n):
    counts = Counter()
    for sent in sentences:
        tokens = sent.split()
        for i in range(len(tokens) - n + 1):
            ngram = " ".join(tokens[i:i+n])
            counts[ngram] += 1
    return counts

kf = KFold(n_splits=5, shuffle=True, random_state=42)
all_lambdas = []

val = pd.read_csv("val_sentences.csv")
val_sentences = val["sentence"].tolist()

for train_idx, test_idx in kf.split(val_sentences):
    train_sents = [val_sentences[i] for i in train_idx]

    uni_c = build_ngram_counts(train_sents, 1)
    bi_c = build_ngram_counts(train_sents, 2)
    tri_c = build_ngram_counts(train_sents, 3)
    quad_c = build_ngram_counts(train_sents, 4)

    lambdas = estimate_lambdas(quad_c, tri_c, bi_c, uni_c)
    all_lambdas.append(lambdas)

df_lambda = pd.DataFrame(all_lambdas, columns=["λ4 (Quad)","λ3 (Tri)","λ2 (Bi)","λ1 (Uni)"])
df_lambda.loc["Average"] = df_lambda.mean()

print(df_lambda)


         λ4 (Quad)  λ3 (Tri)  λ2 (Bi)  λ1 (Uni)
0         0.409836  0.590164      0.0       0.0
1         0.358974  0.641026      0.0       0.0
2         0.359649  0.640351      0.0       0.0
3         0.358696  0.641304      0.0       0.0
4         0.294737  0.705263      0.0       0.0
Average   0.356378  0.643622      0.0       0.0


In [6]:
def interpolated_prob(ngram, lambdas):
    w = ngram.split()
    if len(w) != 4: 
        return 1e-12

    quad = quadrigram_c.get(" ".join(w),0) / max(1,trigram_c.get(" ".join(w[:3]),0))
    tri  = trigram_c.get(" ".join(w[1:]),0) / max(1,bigram_c.get(" ".join(w[1:3]),0))
    bi   = bigram_c.get(" ".join(w[2:]),0) / max(1,unigram_c.get(w[2],0))
    uni  = unigram_c.get(w[3],0) / sum(unigram_c.values())

    return lambdas[0]*quad + lambdas[1]*tri + lambdas[2]*bi + lambdas[3]*uni


In [7]:
def sentence_prob_interp(sentence, lambdas):
    tokens = ["<s>","<s>","<s>"] + sentence.strip().split() + ["</s>"]
    log_prob = 0.0
    for i in range(len(tokens)-3):
        ng = " ".join(tokens[i:i+4])
        p = interpolated_prob(ng, lambdas)
        log_prob += math.log(p if p>0 else 1e-15)
    return log_prob

def sentence_perplexity_interp(sentence, lambdas):
    tokens = sentence.strip().split()
    logp = sentence_prob_interp(sentence, lambdas)
    return math.exp(-logp / max(1,len(tokens)))


In [8]:
best_lambdas = df_lambda.loc["Average"].values.tolist()

print("Using averaged λ values:", best_lambdas)

for s in val_sentences[:5]:
    logp = sentence_prob_interp(s, best_lambdas)
    ppl = sentence_perplexity_interp(s, best_lambdas)
    print(f"\nSentence: {s}")
    print(f"  LogProb: {logp:.4f}, Perplexity: {ppl:.4f}")


Using averaged λ values: [0.3563784083268647, 0.6436215916731354, 0.0, 0.0]

Sentence: वोल्टेज के आधार पर स्कूटर की स्पीड को बढ़ाया या घटाया जा सकता है।
  LogProb: -518.0816, Perplexity: 11787686347935690.0000

Sentence: भाजपा ने 350 से अधिक सीटों पर जीत दर्ज करने का लक्ष्य रखा है .
  LogProb: -552.6204, Perplexity: 9999999999999822.0000

Sentence: पीएम ने लॉकडाउन के सकारात्मक परिणामों को रेखांकित किया और कहा कि देश पिछले एक - डेढ़ महीन में हजारों लोगों की जान बचाने में कामयाब रहा है।
  LogProb: -1001.6245, Perplexity: 3433320018281917.0000

Sentence: उन्होंने यह निर्देश स्वच्छ भारत मिशन - शहरी क्षेत्र की राज्य स्तरीय सर्वोच्च समिति की बैठक में स्वच्छ भारत मिशन - शहरी के तहत तहत ठोस अपशिष्ट प्रबंधन प्रणाली के कामकाज का जायजा लेते हुए दिए।
  LogProb: -1277.9347, Perplexity: 2610157215682480.0000

Sentence: स्टार्टअप टेक उद्यमी जैसे भारत पे ( एक भारतीय भुगतान ऐप ) के संस्थापक निपुण मेहरा और ऑथेंटिकेशन टेक्नोलॉजी कंपनी ऑथब्रिज के अजय त्रेहन से लेकर रतन टाटा और गौतम अडानी जैसे शीर्ष उद्यमी