In [2]:
import pickle
import pandas as pd

# Function to load any n-gram pkl file
def load_ngram(path):
    """Read a pickled n-gram table and return it as an ``{ngram: count}`` dict.

    Args:
        path: Location of the pickle file, passed straight to ``open``.

    Returns:
        A dict pairing each item of ``table["Ngram"]`` with the item at the
        same position in ``table["Count"]``. If an n-gram appears more than
        once, the last count wins; pairing stops at the shorter column.

    The unpickled object only needs to support ``["Ngram"]`` and ``["Count"]``
    lookups (presumably it is a pandas DataFrame -- confirm against whatever
    wrote the file).
    """
    # pickle can execute arbitrary code while loading: only open trusted files.
    with open(path, "rb") as handle:
        table = pickle.load(handle)

    ngram_column = table["Ngram"]
    count_column = table["Count"]
    return {ngram: count for ngram, count in zip(ngram_column, count_column)}


# Load all n-grams
# Single place to repoint the notebook at another machine's copy of the data;
# every table below is read from this directory.
NGRAM_DIR = "C:/Users/ashis/OneDrive/Desktop/NLP/N-Grams"

unigrams = load_ngram(f"{NGRAM_DIR}/unigram.pkl")
bigrams = load_ngram(f"{NGRAM_DIR}/bigram.pkl")
trigrams = load_ngram(f"{NGRAM_DIR}/trigram.pkl")
quadrigrams = load_ngram(f"{NGRAM_DIR}/quadrigram.pkl")


# Quick check: number of distinct n-grams in each table
print("Unigrams loaded:", len(unigrams))
print("Bigrams loaded:", len(bigrams))
print("Trigrams loaded:", len(trigrams))
print("Quadrigrams loaded:", len(quadrigrams))

Unigrams loaded: 299475
Bigrams loaded: 3466685
Trigrams loaded: 9694653
Quadrigrams loaded: 14174147


In [None]:
class KatzBackoff:
    """Katz-style back-off language model over n-gram count tables (orders 1-4).

    Each table maps an n-gram (a tuple of words, e.g. ``("the", "cat")``) to
    its count. For a history ``h`` and word ``w``:

    * if ``h`` was never seen, the leftmost word is dropped and the shorter
      n-gram is scored instead (no penalty);
    * if ``count(h, w) > k``, the estimate is ``d * count(h, w) / count(h)``;
    * otherwise the estimate is ``alpha(h) * P(w | shorter history)``.

    ``alpha(h)`` spreads the probability mass left over by the discounted
    high-count continuations across *every* word that is backed off (seen
    with a low count or not seen at all), so that the conditional
    distribution sums to one whenever the lower-order model does.

    The model caches per-history data. The caches are rebuilt automatically
    when ``k`` or ``d`` change, but NOT when the count tables are mutated
    after the first call to ``alpha``/``prob``.
    """

    def __init__(self, unigrams, bigrams, trigrams, quadrigrams, k=5, d=0.75):
        """Store the count tables and the two smoothing parameters.

        Args:
            unigrams, bigrams, trigrams, quadrigrams: dicts mapping n-gram
                tuples of length 1, 2, 3 and 4 to counts.
            k: Count threshold; only n-grams with ``count > k`` keep their
                own (discounted) estimate.
            d: Multiplicative discount applied to those estimates.
        """
        self.unigrams = unigrams
        self.bigrams = bigrams
        self.trigrams = trigrams
        self.quadrigrams = quadrigrams
        self.k = k
        self.d = d
        self.total_unigrams = sum(unigrams.values())
        self._reset_caches()

    # --- caches ---
    def _reset_caches(self):
        """Drop all cached data and remember the (k, d) it will be built for."""
        self._cache_params = (self.k, self.d)
        # order -> {history: [(ngram, count), ...]} restricted to count > k
        self._frequent_by_hist = {}
        # history -> alpha(history)
        self._alpha_cache = {}

    def _table_for_order(self, order):
        """Return the count table holding n-grams of length ``order`` (2-4), else None."""
        if order == 2:
            return self.bigrams
        if order == 3:
            return self.trigrams
        if order == 4:
            return self.quadrigrams
        return None

    def _frequent_continuations(self, hist):
        """Return the ``(ngram, count)`` pairs extending ``hist`` with ``count > k``.

        The index for a given order is built with one pass over its table on
        first use, so later lookups do not rescan millions of entries.
        """
        order = len(hist) + 1
        index = self._frequent_by_hist.get(order)
        if index is None:
            index = {}
            for ng, cnt in self._table_for_order(order).items():
                if cnt > self.k:
                    index.setdefault(ng[:-1], []).append((ng, cnt))
            self._frequent_by_hist[order] = index
        return index.get(hist, ())

    # --- counts ---
    def get_count(self, ngram):
        """Return the stored count of ``ngram`` (0 if unseen or not of length 1-4)."""
        if len(ngram)==1:
            return self.unigrams.get(ngram,0)
        if len(ngram)==2:
            return self.bigrams.get(ngram,0)
        if len(ngram)==3:
            return self.trigrams.get(ngram,0)
        if len(ngram)==4:
            return self.quadrigrams.get(ngram,0)
        return 0

    # --- base model: unigram ---
    def prob_unigram(self, w):
        """Return the relative frequency of word ``w`` (0.0 for an empty corpus)."""
        if not self.total_unigrams:
            # Empty unigram table: nothing can be estimated, avoid dividing by zero.
            return 0.0
        return self.unigrams.get((w,),0) / self.total_unigrams

    # --- recursive probability ---
    def prob(self, ngram):
        """Return the back-off estimate of P(last word | preceding words).

        Raises:
            IndexError: if ``ngram`` is empty.
        """
        if len(ngram) == 0:
            raise IndexError("cannot score an empty n-gram")

        # unigram base
        if len(ngram)==1:
            return self.prob_unigram(ngram[0])

        hist = ngram[:-1]
        count_hist = self.get_count(hist)

        # back off immediately if history nonexistent
        if count_hist == 0:
            return self.prob(ngram[1:])

        count_ngram = self.get_count(ngram)

        # high-count ngram -> discounted relative frequency
        if count_ngram > self.k:
            return self.d * (count_ngram / count_hist)

        # low-count or unseen -> scaled lower-order estimate
        return self.alpha(hist) * self.prob(ngram[1:])

    # --- backoff alpha(hist) ---
    def alpha(self, hist):
        """Return the back-off weight for history ``hist``.

        ``alpha = (1 - sum of discounted estimates of the high-count
        continuations) / (1 - sum of their lower-order probabilities)``.
        The denominator is the lower-order mass of all words that are backed
        off, including words never seen after ``hist``.

        Returns 1.0 when ``hist`` is unseen, when there is no table for
        ``len(hist) + 1``, or when the denominator is not positive.
        """
        count_hist = self.get_count(hist)
        if count_hist == 0:
            return 1.0

        if self._table_for_order(len(hist) + 1) is None:
            return 1.0

        # k or d were changed on the instance: cached values are stale.
        if (self.k, self.d) != self._cache_params:
            self._reset_caches()

        cached = self._alpha_cache.get(hist)
        if cached is not None:
            return cached

        discounted_mass = 0   # mass kept by high-count continuations
        lower_mass = 0        # lower-order mass of those same continuations
        for ng, cnt in self._frequent_continuations(hist):
            discounted_mass += self.d * (cnt / count_hist)
            # Recursion only ever moves to a shorter history, so it terminates.
            lower_mass += self.prob(ng[1:])

        leftover = 1 - discounted_mass
        denom = 1 - lower_mass

        weight = leftover / denom if denom > 0 else 1.0
        self._alpha_cache[hist] = weight
        return weight



# --- Example Usage ---
model = KatzBackoff(unigrams, bigrams, trigrams, quadrigrams, k=3, d=0.75)

# Score each quadrigram whose three-word history has a non-zero trigram count;
# the others are skipped.
results = []
for quad, freq in quadrigrams.items():
    history = quad[:-1]
    if not (model.get_count(history) > 0):
        continue

    row = {
        "Ngram": " ".join(quad),
        "Count": freq,
        "Probability": model.prob(quad),
    }
    results.append(row)

# One row per scored quadrigram.
df_results = pd.DataFrame(results)

print("Calculated probabilities for", len(df_results), "quadrigrams (with valid history).")
print(df_results.head())

# Persist the table next to the notebook.
df_results.to_csv(
    "quadrigram_probabilities_KB_filtered.csv",
    index=False,
    encoding="utf-8",
)