In [1]:
from collections import Counter, defaultdict
import math
import pandas as pd
import ast

In [2]:
class KneserNeyLM:
    def __init__(self, n=4, discount=0.75):
        self.n = n  #4 for quadrigram
        self.D = discount
        self.ngram_counts = [Counter() for _ in range(n)]
        self.context_counts = [Counter() for _ in range(n)]
        self.vocab = set()

    def train(self, corpus):
        
        for sent in corpus:
            sent = ["<s>"]*(self.n-1) + sent + ["</s>"]
            self.vocab.update(sent)

            for i in range(len(sent)):
                for k in range(1, self.n+1):
                    if i-k+1 >= 0:
                        ngram = tuple(sent[i-k+1:i+1])
                        context = ngram[:-1]
                        self.ngram_counts[k-1][ngram] += 1
                        self.context_counts[k-1][context] += 1

    def prob(self, word, context):
        """
        Compute Kneser-Ney probability P(word | context).
        context length = n-1
        """
        return self._kn_prob(tuple(context), word, self.n)

    def _kn_prob(self, context, word, order):
        if order == 1:
            # Continuation probability for unigrams
            unique_contexts = sum(1 for ng in self.ngram_counts[0] if ng[0] == word)
            total_unique_contexts = len(self.ngram_counts[0])
            return unique_contexts / total_unique_contexts if total_unique_contexts > 0 else 1/len(self.vocab)

        ngram = context + (word,)
        count_ngram = self.ngram_counts[order-1][ngram]
        count_context = self.context_counts[order-1][context]

        # First term: discounted probability mass
        first_term = max(count_ngram - self.D, 0) / count_context if count_context > 0 else 0.0

        # Lambda: leftover probability mass redistributed
        unique_continuations = len([w for w in self.vocab if self.ngram_counts[order-1][context + (w,)] > 0])
        lambda_term = (self.D * unique_continuations) / count_context if count_context > 0 else 1.0

        # Recursive backoff
        backoff = self._kn_prob(context[1:], word, order-1)

        return first_term + lambda_term * backoff

In [3]:
df = pd.read_csv("../ass5/train.csv")

In [5]:
import ast

corpus = []

for row in df["sentence"]:
    # Step 1: Convert string repr to Python list/dict if possible
    if isinstance(row, str):
        if row.strip().startswith("[") and row.strip().endswith("]"):
            try:
                row = ast.literal_eval(row)
            except Exception:
                # if eval fails, treat as plain text
                row = [{"text": row}]
        else:
            row = [{"text": row}]
    
    # Step 2: Extract text and tokenize
    for item in row:
        text = item.get("text", "").strip()
        if text:
            tokens = text.split()  # simple whitespace tokenization
            corpus.append(tokens)

print("Example tokens:", corpus[0][:15])


Example tokens: ['યુનાઇટેડ', 'સ્ટેટ્સ', 'બ્યુરો', 'ઓફ', 'લેબર', 'સ્ટેટિસ્ટિક્સ', 'અનુસાર,', '2008', 'માં', 'અપરાધ', 'દ્રશ્ય', 'તપાસકર્તાઓ', 'માટેનો', 'સરેરાશ', 'પગાર']


In [6]:
kneser_ney_quad_model = KneserNeyLM(n=4, discount=0.75)
kneser_ney_quad_model.train(corpus)

context = ["ચેતવણી", "પણ", "આપે"]   
word = "છે"
print("P(છે | ચેતવણી પણ આપે) =", kneser_ney_quad_model.prob(word, context))

P(છે | ચેતવણી પણ આપે) = 0.8082539202130228


In [7]:
import pickle

with open("kneser_ney_quad_model.pkl", "wb") as f:
    pickle.dump(kneser_ney_quad_model, f)