In [1]:
import pandas as pd
import re
import math
import random
from collections import Counter

In [2]:
BOS = "<s>"
EOS = "</s>"
UNK = "<unk>"

In [3]:
def tokenize_lines(lines):
    pattern = re.compile(r"([\u0900-\u097F]+|[।!?.,;:])")
    for line in lines:
        line = str(line).strip()
        if not line:
            continue
        tokens = pattern.findall(line)
        yield tokens

In [4]:
def add_sentence_boundaries(tokens, order=4):
    bos = [BOS] * (order - 1)
    return bos + tokens + [EOS]

In [5]:
def build_vocab(train_tokens, min_freq=1):
    freq = Counter(train_tokens)
    vocab = {w for w, c in freq.items() if c >= min_freq or w in {BOS, EOS, UNK}}
    return vocab

In [6]:
def map_to_unk(tokens, vocab):
    return [t if t in vocab else UNK for t in tokens]

In [7]:
def ngrams(tokens, n):
    for i in range(n - 1, len(tokens)):
        yield tuple(tokens[i - n + 1 : i + 1])

In [8]:
class NgramLM4DeletedInterp:
    def __init__(self):
        self.order = 4
        self.counts = {1: Counter(), 2: Counter(), 3: Counter(), 4: Counter()}
        self.context_counts = {1: 0, 2: Counter(), 3: Counter(), 4: Counter()}
        self.vocab = set()
        self.N_tokens = 0
        self.lambdas = [0.25, 0.25, 0.25, 0.25]

    def fit(self, sentences, min_freq=1):
        flat = []
        for sent in sentences:
            s = add_sentence_boundaries(sent, order=self.order)
            flat.extend(s)
        self.N_tokens = len(flat)
        self.vocab = build_vocab(flat, min_freq=min_freq)
        flat = map_to_unk(flat, self.vocab)

        for n in [1, 2, 3, 4]:
            self.counts[n].update(ngrams(flat, n))

        self.context_counts[1] = self.N_tokens
        for n in [2, 3, 4]:
            ctx_counter = Counter()
            for ng, c in self.counts[n].items():
                ctx = ng[:-1]
                ctx_counter[ctx] += c
            self.context_counts[n] = ctx_counter

    def _p_ml(self, w, context):
        k = len(context) + 1
        if k == 1:
            return self.counts[1][(w,)] / max(1, self.context_counts[1])
        elif k == 2:
            return self.counts[2][context + (w,)] / self.context_counts[2][context] if self.context_counts[2][context] > 0 else 0.0
        elif k == 3:
            return self.counts[3][context + (w,)] / self.context_counts[3][context] if self.context_counts[3][context] > 0 else 0.0
        elif k == 4:
            return self.counts[4][context + (w,)] / self.context_counts[4][context] if self.context_counts[4][context] > 0 else 0.0

    def prob(self, w, context3):
        w = w if w in self.vocab else UNK
        c3, c2, c1 = context3, context3[1:], context3[2:]
        p4, p3, p2, p1 = self._p_ml(w, c3), self._p_ml(w, c2), self._p_ml(w, c1), self._p_ml(w, ())
        λ1, λ2, λ3, λ4 = self.lambdas
        return λ4*p4 + λ3*p3 + λ2*p2 + λ1*p1

    def learn_lambdas_deleted_interpolation(self):
        assign_counts = [0, 0, 0, 0]
        for quad, c in self.counts[4].items():
            w1, w2, w3, w4 = quad
            c4, d4 = c - 1, self.context_counts[4][quad[:-1]] - 1
            p4 = (c4/d4) if d4 > 0 and c4 > 0 else 0.0

            c3, d3 = self.counts[3][(w2, w3, w4)] - 1, self.context_counts[3][(w2, w3)] - 1
            p3 = (c3/d3) if d3 > 0 and c3 > 0 else 0.0

            c2, d2 = self.counts[2][(w3, w4)] - 1, self.context_counts[2][(w3,)] - 1
            p2 = (c2/d2) if d2 > 0 and c2 > 0 else 0.0

            c1, d1 = self.counts[1][(w4,)] - 1, self.context_counts[1] - 1
            p1 = (c1/d1) if d1 > 0 and c1 > 0 else 0.0

            probs = [p1, p2, p3, p4]
            k = probs.index(max(probs))
            assign_counts[k] += c

        total = sum(assign_counts)
        self.lambdas = [cnt / total for cnt in assign_counts] if total > 0 else [0.25]*4

    def sent_logprob(self, sent_tokens):
        s = add_sentence_boundaries(sent_tokens, order=self.order)
        s = map_to_unk(s, self.vocab)
        logp = 0.0
        for i in range(3, len(s)):
            w, ctx = s[i], (s[i-3], s[i-2], s[i-1])
            p = self.prob(w, ctx) or 1e-12
            logp += math.log(p)
        return logp

    def corpus_perplexity(self, sentences):
        total_logp, total_tokens = 0.0, 0
        for sent in sentences:
            total_logp += self.sent_logprob(sent)
            total_tokens += len(sent) + 1
        return math.exp(-total_logp / max(1, total_tokens))

In [9]:
def split_train_dev(data, dev_ratio=0.1, seed=42):
    random.Random(seed).shuffle(data)
    k = int(len(data) * (1 - dev_ratio))
    return data[:k], data[k:]

In [12]:
import pandas as pd

# Load CSV
df = pd.read_csv("train.csv", encoding="utf-8-sig")

# Extract sentences (plain text)
sentences = df["sentence"].dropna().tolist()

# Tokenize (replace with your own tokenizer)
tokenized_sents = list(tokenize_lines(sentences))

# Split train/dev
train_sents, dev_sents = split_train_dev(tokenized_sents, dev_ratio=0.1)

# Train model
model = NgramLM4DeletedInterp()
model.fit(train_sents, min_freq=2)  # filter rare words
model.learn_lambdas_deleted_interpolation()

print("Learned λ (uni, bi, tri, quad):", model.lambdas)
print("Train PPL:", model.corpus_perplexity(train_sents))
print("Dev PPL:", model.corpus_perplexity(dev_sents))


Learned λ (uni, bi, tri, quad): [0.0019316206297083252, 0.25671238168823646, 0.27583542592234883, 0.4655205717597064]
Train PPL: 2.8126192310489193
Dev PPL: 2.8973247418014534
