<h1>Sampling 1000 random validation lines and 1000 random test lines from tokenized_sentences.txt</h1>

In [None]:
import random

input_file = "tokenized_sentences.txt"
train_file = "train.txt"
valid_file = "validation.txt"
test_file = "test.txt"

# Held-out set sizes, in lines.
valid_size, test_size = 1000, 1000
valid, test = [], []

# Dedicated, seeded generator: re-running the cell reproduces the same split
# and the global `random` state used elsewhere is left untouched.
SPLIT_SEED = 42
rng = random.Random(SPLIT_SEED)

# A single reservoir holds the validation and test lines together. Sampling
# them jointly and splitting afterwards makes every input line equally likely
# to land in either held-out set; two chained reservoirs that share one line
# counter do not have that property.
held_out_size = valid_size + test_size
held_out = []

with open(input_file, "r", encoding="utf-8") as infile, \
     open(train_file, "w", encoding="utf-8", buffering=1024*1024) as train_out, \
     open(valid_file, "w", encoding="utf-8", buffering=1024*1024) as valid_out, \
     open(test_file, "w", encoding="utf-8", buffering=1024*1024) as test_out:

    for idx, line in enumerate(infile, 1):
        # The last line of a file may lack a newline; add one so it cannot be
        # glued to whatever line is written after it.
        if not line.endswith("\n"):
            line += "\n"

        # Fill the reservoir with the first `held_out_size` lines.
        if len(held_out) < held_out_size:
            held_out.append(line)
            continue

        # Reservoir sampling (Algorithm R): line number `idx` enters the
        # reservoir with probability held_out_size / idx, replacing a
        # uniformly chosen slot. Whatever is not held out goes to train.
        slot = rng.randrange(idx)
        if slot < held_out_size:
            train_out.write(held_out[slot])
            held_out[slot] = line
        else:
            train_out.write(line)

    # Shuffle before splitting so validation/test membership does not depend
    # on the position a line happened to occupy in the reservoir.
    rng.shuffle(held_out)
    valid = held_out[:valid_size]
    test = held_out[valid_size:]

    # At the end, flush validation & test
    valid_out.writelines(valid)
    test_out.writelines(test)

<h1>Applying Good-Turing smoothing</h1>

In [None]:
import pickle
import collections
import math
import csv

# ---------------- Load Pickle ----------------
def load_ngram_counts(n):
    """Load the pickled count table for n-grams of order ``n``.

    The table is read from ``ngram_counts_{n}gram.pkl`` in the current working
    directory and returned exactly as it was unpickled.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you produced.
    pickle_path = f"ngram_counts_{n}gram.pkl"
    with open(pickle_path, "rb") as handle:
        return pickle.load(handle)

# ---------------- Vocabulary ----------------
def get_vocabulary(unigrams):
    """Return the set made of the first element of every key in ``unigrams``."""
    return {key[0] for key in unigrams}

# ---------------- Good-Turing Smoothing ----------------
def good_turing_smoothing(ngrams, vocab_size, n):
    """Apply Good-Turing count re-estimation to one n-gram count table.

    Args:
        ngrams: mapping from n-gram key to its observed count ``c``.
        vocab_size: vocabulary size ``V``; ``V ** n`` is taken as the number
            of possible n-grams.
        n: n-gram order.

    Returns:
        ``(smoothed_probs, unseen_prob)`` where ``smoothed_probs[ngram]`` is
        ``c* / N`` and ``unseen_prob`` is the mass ``N1 / N`` shared equally
        among the n-grams that were never observed.

    Notes:
        * ``c* = (c + 1) * N_{c+1} / N_c``; when ``N_{c+1}`` is zero the raw
          count is kept (``c* = c``).
        * The values are joint relative frequencies (divided by the total
          n-gram count ``N``), not conditional probabilities, and they are
          not renormalised.
        * If every count is zero (``N == 0``) all seen probabilities are
          ``0.0`` and ``unseen_prob`` is ``1e-12``.
    """
    N = sum(ngrams.values())  # total number of observed n-gram tokens
    Nc = collections.Counter(ngrams.values())  # frequency of frequencies
    N1 = Nc.get(1, 0)

    smoothed_probs = {}
    for ngram, c in ngrams.items():
        Nc_plus1 = Nc.get(c + 1, 0)
        # Nc[c] is at least 1 here because c itself was read from `ngrams`.
        if Nc_plus1 > 0:
            c_star = (c + 1) * (Nc_plus1 / Nc[c])  # re-estimated count
        else:
            c_star = c
        # Probability of a *seen* n-gram; guarded like unseen_prob below so
        # an all-zero table cannot raise ZeroDivisionError.
        smoothed_probs[ngram] = c_star / N if N > 0 else 0.0

    # Number of n-grams never observed (at least 1 to keep the division safe).
    # For n == 1 this reduces to vocab_size - len(ngrams).
    num_unseen = max(1, (vocab_size ** n) - len(ngrams))

    unseen_prob = (N1 / N) / num_unseen if N > 0 else 1e-12

    return smoothed_probs, unseen_prob

# ---------------- Sentence Probability ----------------
def sentence_logprob(sentence, smoothed_probs, unseen_prob, n):
    """Sum the natural-log probabilities of every n-gram window in a sentence.

    The whitespace-split sentence is padded with ``n - 1`` ``<s>`` markers in
    front and one ``</s>`` marker at the end. Windows missing from
    ``smoothed_probs`` use ``unseen_prob``; any probability that is not
    positive is replaced by ``1e-12`` before taking the log.
    """
    padded = ["<s>"] * (n - 1)
    padded.extend(sentence.strip().split())
    padded.append("</s>")

    total = 0.0
    window_count = len(padded) - n + 1
    for start in range(window_count):
        window = tuple(padded[start:start + n])
        p = smoothed_probs.get(window, unseen_prob)
        if not p > 0:
            p = 1e-12  # floor so that log() is always defined
        total += math.log(p)
    return total

# ---------------- File Processing ----------------
def process_file(input_file, output_csv, uni, bi, tri, quad, uni_un, bi_un, tri_un, quad_un):
    """Score every non-blank line of ``input_file`` and write the results as CSV.

    Each output row holds the stripped sentence followed by its log-probability
    under the unigram, bigram, trigram and quadrigram tables, in that order.
    A progress message is printed whenever the 1-based line number of a scored
    line is a multiple of 100.
    """
    header = ["sentence", "unigram_logprob", "bigram_logprob", "trigram_logprob", "quadrigram_logprob"]
    # (probability table, unseen probability, order) for each model, low to high.
    models = ((uni, uni_un, 1), (bi, bi_un, 2), (tri, tri_un, 3), (quad, quad_un, 4))

    with open(input_file, "r", encoding="utf-8") as src, \
         open(output_csv, "w", newline="", encoding="utf-8") as dst:
        csv_out = csv.writer(dst)
        csv_out.writerow(header)

        for line_no, raw_line in enumerate(src, 1):
            text = raw_line.strip()
            if text:
                scores = [sentence_logprob(text, table, unseen, order)
                          for table, unseen, order in models]
                csv_out.writerow([text] + scores)

                if line_no % 100 == 0:
                    print(f"Processed {line_no} sentences from {input_file}")

# ---------------- Main ----------------
if __name__ == "__main__":
    # Count tables for orders 1..4 (collected from the training set).
    unigrams, bigrams, trigrams, quadrigrams = [
        load_ngram_counts(order) for order in (1, 2, 3, 4)
    ]

    vocab = get_vocabulary(unigrams)
    V = len(vocab)

    # One Good-Turing model per order.
    uni_probs, uni_unseen = good_turing_smoothing(unigrams, V, 1)
    bi_probs, bi_unseen = good_turing_smoothing(bigrams, V, 2)
    tri_probs, tri_unseen = good_turing_smoothing(trigrams, V, 3)
    quad_probs, quad_unseen = good_turing_smoothing(quadrigrams, V, 4)

    print("Good-Turing models built.")

    # Score the validation split first, then the test split, with the same models.
    model_args = (uni_probs, bi_probs, tri_probs, quad_probs,
                  uni_unseen, bi_unseen, tri_unseen, quad_unseen)
    for split_path, csv_path in (("validation.txt", "validation_probs.csv"),
                                 ("test.txt", "test_probs.csv")):
        process_file(split_path, csv_path, *model_args)

    print("Sentence log-probabilities saved to validation_probs.csv and test_probs.csv")

Good-Turing models built.
Processed 100 sentences from validation.txt
Processed 200 sentences from validation.txt
Processed 300 sentences from validation.txt
Processed 400 sentences from validation.txt
Processed 500 sentences from validation.txt
Processed 600 sentences from validation.txt
Processed 700 sentences from validation.txt
Processed 800 sentences from validation.txt
Processed 900 sentences from validation.txt
Processed 1000 sentences from validation.txt
Processed 100 sentences from test.txt
Processed 200 sentences from test.txt
Processed 300 sentences from test.txt
Processed 400 sentences from test.txt
Processed 500 sentences from test.txt
Processed 600 sentences from test.txt
Processed 700 sentences from test.txt
Processed 800 sentences from test.txt
Processed 900 sentences from test.txt
Processed 1000 sentences from test.txt
Sentence log-probabilities saved to validation_probs.csv and test_probs.csv


In [4]:
import pandas as pd
import matplotlib.pyplot as plt
# Read back the per-sentence log-probability tables (validation first, then test)
# and display the test table as the cell output.
df_valid, df_test = (
    pd.read_csv(csv_name) for csv_name in ("validation_probs.csv", "test_probs.csv")
)
df_test

Unnamed: 0,sentence,unigram_logprob,bigram_logprob,trigram_logprob,quadrigram_logprob
0,આગળ રાજીવે કહ્યું કે મને આ ફિલ્મ કરવાનો આનંદ એ...,-224.847594,-491.371379,-886.177848,-1382.079797
1,"અને કરેલાં વ્યવહારો કઈ રીતે દર્શાવવા, જેથી કર ...",-130.983638,-284.341029,-475.941343,-726.576650
2,"વાસ્તવિક ફેશન વલણ ચળકતા હાથ છે, જે ચમકતા સોના ...",-136.326965,-267.453882,-538.103259,-778.606157
3,આ ઉપરાંત એક્ટ્રસ બાની અને લીઝા રેની વચ્ચેના બો...,-137.102001,-284.064090,-485.387381,-676.323635
4,પાણપુરની મોબાઇલ શોપમાંથી રૂપિયા 70 હજારના મોબા...,-82.430779,-202.665324,-367.116529,-486.196637
...,...,...,...,...,...
995,"તે જરૂરી છે, તે કામ ન હતી ""તેના પોતાના ખાતર એક...",-97.987791,-209.637243,-349.890256,-596.159632
996,જયાં તેમને છાતીમાં દુઃખાવો અને શ્વાસમાં તકલીફ ...,-93.343205,-161.654636,-273.037570,-484.139258
997,"ઘર ને ઝળહળવા માટે ,",-49.170553,-108.861620,-215.743625,-281.867989
998,"સૌથી મહત્ત્વપૂર્ણ વાત એ છે કે, રોકાણકારોએ ઇન્વ...",-163.884697,-349.020348,-589.921431,-1008.808841


In [5]:
# Show the validation-set log-probability table read from validation_probs.csv.
df_valid

Unnamed: 0,sentence,unigram_logprob,bigram_logprob,trigram_logprob,quadrigram_logprob
0,વાસ્તવિક અંકુશ રેખા નજીક આવેલા પેન્ગોગ ત્સો એટ...,-154.389792,-299.341855,-577.936643,-891.714668
1,#1.,-6.836237,-56.104956,-81.581451,-108.043697
2,અને બીજાં નવા નિશાળિયા.,-29.758443,-105.562530,-170.693897,-223.618390
3,આ ગામના વાલીઓએ ફીની બબાલથી બચવા કરી અનોખી પહેલ.,-85.194901,-218.421505,-343.487599,-449.684891
4,74 ડિગ્રી ઉત્તર અક્ષાંશ અને 104.,-65.418081,-154.719851,-257.283170,-336.620199
...,...,...,...,...,...
995,..,-10.520790,-14.352074,-14.352074,-14.352074
996,ગુરવિંદર સિંહ (ખેડૂત)- બે ઇજા અને ઢસડાવાના નિશ...,-65.092633,-239.882524,-385.803876,-540.218486
997,થી શરૂ થઈને 399 રૂ.,-47.496788,-101.425329,-168.768514,-243.722973
998,જપ્ત કરાયેલો વિદેશી દારૂની કિંમત આશરે રૂ.,-70.338676,-132.690521,-271.360805,-392.330664


<h1>Good-Turing count table (C, Nc, C*)</h1>

In [None]:
import pandas as pd
import collections

def good_turing_table(ngrams, top_k=100):
    """Tabulate Good-Turing adjusted counts for the raw counts ``0 .. top_k - 1``.

    Returns a DataFrame with one row per raw count ``c`` and the columns
    ``"C (MLE)"`` (the raw count), ``"Nc"`` (how many n-grams occur exactly
    ``c`` times) and ``"C*"`` (the adjusted count). ``C*`` is ``0`` when no
    n-gram has count ``c``, and falls back to ``c`` when no n-gram has count
    ``c + 1``.
    """
    freq_of_freq = collections.Counter(ngrams.values())

    def adjusted_count(count):
        here = freq_of_freq.get(count, 0)
        if not here > 0:
            return 0
        above = freq_of_freq.get(count + 1, 0)
        if above > 0:
            return (count + 1) * (above / here)
        return count

    table_rows = [
        [count, freq_of_freq.get(count, 0), adjusted_count(count)]
        for count in range(top_k)
    ]
    return pd.DataFrame(table_rows, columns=["C (MLE)", "Nc", "C*"])

if __name__ == "__main__":
    # Count tables for orders 1..4.
    unigrams, bigrams, trigrams, quadrigrams = [
        load_ngram_counts(order) for order in (1, 2, 3, 4)
    ]

    # One Good-Turing table (raw counts 0..99) per order.
    uni_table, bi_table, tri_table, quad_table = [
        good_turing_table(counts, 100)
        for counts in (unigrams, bigrams, trigrams, quadrigrams)
    ]

    # (table, CSV destination, preview heading) for each order.
    table_outputs = (
        (uni_table, "good_turing_table_unigrams.csv", "\nUnigram Table (first 10 rows):\n"),
        (bi_table, "good_turing_table_bigrams.csv", "\nBigram Table (first 10 rows):\n"),
        (tri_table, "good_turing_table_trigrams.csv", "\nTrigram Table (first 10 rows):\n"),
        (quad_table, "good_turing_table_quadrigrams.csv", "\nQuadrigram Table (first 10 rows):\n"),
    )

    # Write every table to disk first ...
    for table, csv_name, _heading in table_outputs:
        table.to_csv(csv_name, index=False)

    # ... then print a short preview of each.
    for table, _csv_name, heading in table_outputs:
        print(heading, table.head(10))


Unigram Table (first 10 rows):
    C (MLE)      Nc        C*
0        0       0  0.000000
1        1  430533  0.393187
2        2   84640  1.352942
3        3   38171  2.344398
4        4   22372  3.375425
5        5   15103  4.377938
6        6   11020  5.380853
7        7    8471  6.289694
8        8    6660  7.470270
9        9    5528  8.420767

Bigram Table (first 10 rows):
    C (MLE)       Nc        C*
0        0        0  0.000000
1        1  4420859  0.229718
2        2   507775  1.083687
3        3   183423  2.075727
4        4    95184  3.052771
5        5    58115  4.013800
6        6    38877  5.044062
7        7    28014  6.075534
8        8    21275  7.015981
9        9    16585  8.027736

Trigram Table (first 10 rows):
    C (MLE)       Nc        C*
0        0        0  0.000000
1        1  8963253  0.108275
2        2   485247  0.860933
3        3   139255  1.849643
4        4    64393  2.867082
5        5    36924  3.832792
6        6    23587  4.911901
7        7   

<h1>Implement the deleted-interpolation smoothing technique for the quadrigram model and find
the best parameters (interpolation weights).</h1>

In [None]:
import pickle
import collections
import math
import csv
import json
from pathlib import Path
from tqdm import tqdm 

# ---------------- Load Pickle ----------------
def load_ngram_counts(n, folder="."):
    """Load ``ngram_counts_{n}gram.pkl`` from ``folder`` and return its content.

    The unpickled object is returned unchanged; the rest of this notebook
    treats it as a dict mapping a tuple of ``n`` tokens to an integer count
    (unigram keys being 1-tuples such as ``(token,)``).
    """
    # NOTE: unpickling can execute arbitrary code; only load files you produced.
    pickle_path = Path(folder) / f"ngram_counts_{n}gram.pkl"
    with open(pickle_path, "rb") as handle:
        return pickle.load(handle)

# ---------------- Helpers ----------------
def get_total_unigram_tokens(unigrams):
    """Return N, the sum of all counts stored in ``unigrams``."""
    token_total = sum(unigrams.values())
    return token_total

def safe_div(num, den):
    """Return ``num / den``, or ``0.0`` when ``den`` is zero or negative."""
    return 0.0 if den <= 0 else num / den

# ---------------- Deleted interpolation lambdas ----------------
def compute_deleted_interpolation_lambdas(unigrams, bigrams, trigrams, quadrigrams):
    """Estimate interpolation weights for orders 1..4 by deleted interpolation.

    For every quadrigram with a positive count, the occurrence is "deleted"
    (one is subtracted from the n-gram count and from its context count) and
    the resulting relative frequency is computed at each order. The order with
    the largest value (the lowest order on ties) receives the quadrigram's
    count as votes. The votes are normalised to sum to one.

    Returns:
        ``[lambda1, lambda2, lambda3, lambda4]`` (unigram .. quadrigram
        weight); ``[0.25, 0.25, 0.25, 0.25]`` if nothing was accumulated.
    """
    def held_out_ratio(count, context_count):
        # (count - 1) / (context_count - 1), or 0.0 when the reduced context
        # count is not positive.
        reduced_context = context_count - 1
        if reduced_context > 0:
            return safe_div(count - 1, reduced_context)
        return 0.0

    total_tokens = get_total_unigram_tokens(unigrams)
    votes = [0.0] * 4  # votes[k] belongs to order k + 1

    progress = tqdm(quadrigrams.items(), desc="Deleted-Interpolation", unit="quad")
    for quad, quad_count in progress:
        if quad_count <= 0:
            continue

        w1, w2, w3, w4 = quad

        # Candidate estimates, lowest order first.
        candidates = [
            held_out_ratio(unigrams.get((w4,), 0), total_tokens),
            held_out_ratio(bigrams.get((w3, w4), 0), unigrams.get((w3,), 0)),
            held_out_ratio(trigrams.get((w2, w3, w4), 0), bigrams.get((w2, w3), 0)),
            held_out_ratio(quad_count, trigrams.get((w1, w2, w3), 0)),
        ]

        # First index holding the maximum, so ties go to the lowest order.
        winner = candidates.index(max(candidates))
        votes[winner] += quad_count

    vote_sum = sum(votes)
    if vote_sum == 0:
        # Nothing accumulated: fall back to uniform weights.
        return [0.25, 0.25, 0.25, 0.25]
    return [share / vote_sum for share in votes]

# ---------------- Interpolated probability ----------------
def interpolated_prob(w_prev3, w_prev2, w_prev1, w, unigrams, bigrams, trigrams, quadrigrams, lambdas,
                      total_tokens=None):
    """Linearly interpolated probability of ``w`` given its three predecessors.

    The (non-deleted) maximum-likelihood estimate is computed at each order
    and combined as ``lambda1*P1 + lambda2*P2 + lambda3*P3 + lambda4*P4``.
    An estimate whose context count is zero contributes ``0.0``.

    Args:
        w_prev3, w_prev2, w_prev1: the three preceding tokens, oldest first.
        w: the token being predicted.
        unigrams, bigrams, trigrams, quadrigrams: count tables keyed by token tuples.
        lambdas: ``[lambda1, lambda2, lambda3, lambda4]``.
        total_tokens: optional pre-computed sum of all unigram counts. Pass it
            when scoring many tokens so the unigram table is not re-summed on
            every call; when ``None`` it is computed here.

    Returns:
        The interpolated probability (``0.0`` if every order gives zero).
    """
    # P4 = c(w_prev3,w_prev2,w_prev1,w) / c(w_prev3,w_prev2,w_prev1)
    c4 = quadrigrams.get((w_prev3, w_prev2, w_prev1, w), 0)
    c_context4 = trigrams.get((w_prev3, w_prev2, w_prev1), 0)
    p4 = safe_div(c4, c_context4)

    # P3 = c(w_prev2,w_prev1,w) / c(w_prev2,w_prev1)
    c3 = trigrams.get((w_prev2, w_prev1, w), 0)
    c_context3 = bigrams.get((w_prev2, w_prev1), 0)
    p3 = safe_div(c3, c_context3)

    # P2 = c(w_prev1,w) / c(w_prev1)
    c2 = bigrams.get((w_prev1, w), 0)
    c_context2 = unigrams.get((w_prev1,), 0)
    p2 = safe_div(c2, c_context2)

    # P1 = c(w) / N
    c1 = unigrams.get((w,), 0)
    N = get_total_unigram_tokens(unigrams) if total_tokens is None else total_tokens
    p1 = safe_div(c1, N)

    lamb1, lamb2, lamb3, lamb4 = lambdas
    return lamb1*p1 + lamb2*p2 + lamb3*p3 + lamb4*p4

# ---------------- Sentence log-prob & perplexity ----------------
def sentence_logprob_interpolated(sentence, unigrams, bigrams, trigrams, quadrigrams, lambdas, n=4,
                                  total_tokens=None):
    """Log-probability of one sentence under the interpolated model.

    The whitespace-split sentence is padded with ``n - 1`` ``<s>`` markers and
    one ``</s>`` marker. Every token after the start padding (including
    ``</s>``) is scored; a non-positive probability is floored to ``1e-12``.

    Args:
        total_tokens: optional pre-computed sum of all unigram counts; it is
            computed once per call when ``None``.

    Returns:
        ``(logp, word_count)``: the summed natural-log probability and the
        number of tokens that were scored.
    """
    tokens = sentence.strip().split()
    # pad with start tokens
    tokens = ["<s>"]*(n-1) + tokens + ["</s>"]

    # Sum the unigram table once per sentence rather than once per token.
    if total_tokens is None:
        total_tokens = get_total_unigram_tokens(unigrams)

    logp = 0.0
    word_count = 0
    for i in range(n-1, len(tokens)):
        w = tokens[i]
        # With n < 4 the history can reach before the start of the list;
        # those positions are treated as start markers.
        w_prev1 = tokens[i-1] if i-1 >= 0 else "<s>"
        w_prev2 = tokens[i-2] if i-2 >= 0 else "<s>"
        w_prev3 = tokens[i-3] if i-3 >= 0 else "<s>"
        prob = interpolated_prob(w_prev3, w_prev2, w_prev1, w,
                                 unigrams, bigrams, trigrams, quadrigrams, lambdas,
                                 total_tokens=total_tokens)
        if prob <= 0:
            prob = 1e-12
        logp += math.log(prob)
        word_count += 1
    return logp, word_count

def evaluate_perplexity(file_path, unigrams, bigrams, trigrams, quadrigrams, lambdas):
    """Perplexity of the interpolated model over the non-blank lines of a file.

    Computed as ``exp(-total_logprob / total_words)`` where ``total_words`` is
    the number of scored tokens (each sentence's ``</s>`` included).

    Raises:
        ZeroDivisionError: if the file contains no non-blank line.
    """
    # Loop-invariant: the unigram total is identical for every token scored.
    total_tokens = get_total_unigram_tokens(unigrams)

    total_logprob = 0.0
    total_words = 0
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            s = line.strip()
            if not s:
                continue
            lp, wc = sentence_logprob_interpolated(s, unigrams, bigrams, trigrams, quadrigrams, lambdas,
                                                   total_tokens=total_tokens)
            total_logprob += lp
            total_words += wc

    # perplexity = exp(- total_logprob / total_words)
    avg_neg_log_likelihood = - total_logprob / total_words
    perp = math.exp(avg_neg_log_likelihood)
    return perp

# ---------------- Main ----------------
if __name__ == "__main__":
    # Count tables for orders 1..4.
    unigrams, bigrams, trigrams, quadrigrams = [
        load_ngram_counts(order) for order in (1, 2, 3, 4)
    ]

    # Interpolation weights estimated from the count tables themselves.
    lambdas = compute_deleted_interpolation_lambdas(unigrams, bigrams, trigrams, quadrigrams)
    print("Deleted-interpolation lambdas (lambda1..lambda4):", lambdas)

    # Held-out files to score.
    val_file, test_file = "validation.txt", "test.txt"

    print("Evaluating on validation set...")
    val_ppl = evaluate_perplexity(val_file, unigrams, bigrams, trigrams, quadrigrams, lambdas)
    print(f"Validation Perplexity (interpolated quadrigram): {val_ppl:.4f}")

    print("Evaluating on test set...")
    test_ppl = evaluate_perplexity(test_file, unigrams, bigrams, trigrams, quadrigrams, lambdas)
    print(f"Test Perplexity (interpolated quadrigram): {test_ppl:.4f}")

    # Persist the weights as JSON.
    with open("quad_interpolation_lambdas.json", "w", encoding="utf-8") as lambda_file:
        lambda_file.write(json.dumps({"lambdas": lambdas}, indent=2))
    print("Lambdas saved to quad_interpolation_lambdas.json")

Deleted-Interpolation: 100%|██████████| 11619213/11619213 [00:48<00:00, 241725.84quad/s]


Deleted-interpolation lambdas (lambda1..lambda4): [0.40355213390901956, 0.38137513613787494, 0.15748241207594238, 0.0575903178771631]
Evaluating on validation set...
Validation Perplexity (interpolated quadrigram): 1921.6896
Evaluating on test set...
Test Perplexity (interpolated quadrigram): 2188.2764
Lambdas saved to quad_interpolation_lambdas.json
