# N-gram Language Model Workflow

This notebook demonstrates building n-gram language models from a tokenized Hindi corpus. The workflow includes:

- **Loading Data:** The corpus is read from `sentence_tokenized.txt`.
- **Counting N-grams:** Unigram, bigram, trigram, and quadrigram frequencies are computed using Python's `Counter`.
- **Statistics:**  
    - Total tokens (`N`): 22,801,825 (this count includes the `<s>` and `</s>` boundary markers added to every sentence)  
    - Unique unigrams: 299,475  
    - Unique bigrams: 3,466,685  
    - Unique trigrams: 9,694,653  
    - Unique quadrigrams: 14,174,147
- **Saving Probabilities:** For the most frequent n-grams of each order, the raw count and the smoothed probabilities (Add-One, Add-K, Token-Type) are calculated and saved to CSV files for further analysis.

This process enables statistical modeling and probability estimation for natural language processing tasks.

In [18]:
# Imports
import csv
import io
import os
from collections import Counter

from tqdm import tqdm

In [19]:
# Path to the tokenized Hindi corpus file.
# The location can be overridden without editing the notebook by setting the
# NGRAM_CORPUS_PATH environment variable; when it is not set, the original
# local path is used, so existing runs behave exactly as before.
path = os.environ.get(
    "NGRAM_CORPUS_PATH",
    r"C:\Users\ashis\OneDrive\Desktop\NLP\LAB1\sentence_tokenized.txt",
)

In [20]:
from collections import Counter
import io, csv
from tqdm import tqdm


def build_counts(path):
    """
    Count 1- to 4-grams over a whitespace-tokenized corpus file.

    Each non-blank line is treated as one sentence. Before counting, the
    sentence is padded with three "<s>" markers in front and one "</s>" marker
    at the end, and the padded tokens feed all four counters (so the unigram
    counts include the boundary markers).

    path : location of the UTF-8 text file to read; bytes that cannot be
           decoded are dropped (errors="ignore").

    Returns (unigram_c, bigram_c, trigram_c, quadrigram_c). Unigram keys are
    plain token strings; the higher orders are keyed by tuples of tokens.
    """
    start_pad = ["<s>"] * 3
    end_pad = ["</s>"]

    unigram_c, bigram_c = Counter(), Counter()
    trigram_c, quadrigram_c = Counter(), Counter()

    with io.open(path, "r", encoding="utf-8", errors="ignore", buffering=1024*1024) as corpus:
        for raw_line in tqdm(corpus, desc="Building n-gram counts"):
            # split() with no argument already discards surrounding whitespace
            tokens = raw_line.split()
            if tokens:
                padded = start_pad + tokens + end_pad

                # zip() stops at its shortest argument, so shifted views of
                # the same list yield every window of the matching length.
                unigram_c.update(padded)
                bigram_c.update(zip(padded, padded[1:]))
                trigram_c.update(zip(padded, padded[1:], padded[2:]))
                quadrigram_c.update(zip(padded, padded[1:], padded[2:], padded[3:]))

    return unigram_c, bigram_c, trigram_c, quadrigram_c


# Build the four frequency tables from the corpus file
unigram_c, bigram_c, trigram_c, quadrigram_c = build_counts(path=path)

# Report how many distinct n-grams each table holds
for _label, _table in (
    ("unigrams", unigram_c),
    ("bigrams", bigram_c),
    ("trigrams", trigram_c),
    ("quadrigrams", quadrigram_c),
):
    print(f"Unique {_label}:", len(_table))

# Corpus size: the sum of all unigram occurrences
N = sum(unigram_c.values())
print("Total tokens (N):", N)


Building n-gram counts: 1000000it [00:54, 18440.71it/s]



Unique unigrams: 299475
Unique bigrams: 3466685
Unique trigrams: 9694653
Unique quadrigrams: 14174147
Total tokens (N): 22801825


In [24]:
# Vocabulary size is the number of distinct unigram types
vocab_size = len(unigram_c)
print(f"Vocab Size :  {vocab_size}")

Vocab Size :  299475


In [31]:
from collections import Counter
import io, csv
from tqdm import tqdm

def _history_count(base_counter, history):
    """Return the count of `history` (a tuple of tokens) in `base_counter`.

    Unigram counters are keyed by bare token strings rather than 1-tuples, so
    a one-token history that is not found as a tuple is looked up again by its
    single token.
    """
    if history in base_counter:
        return base_counter[history]
    if len(history) == 1:
        return base_counter.get(history[0], 0)
    return 0


def save_ngram_probs(counter, base_counter, filename_prefix, top_n=1000, k=0.6):
    """
    Save the most frequent n-grams with their smoothed probabilities to CSV.

    counter         : n-gram counts to export (token strings for unigrams,
                      tuples of tokens for higher orders)
    base_counter    : counts of the (n-1)-gram histories used as denominators
                      (None for unigrams)
    filename_prefix : output file is "<filename_prefix>.csv"
    top_n           : number of top n-grams to save
    k               : value for Add-K smoothing

    Columns written per n-gram, with c = its count, d = the history count
    (or the global token total `N` for unigrams), V = the global `vocab_size`
    and U = the number of distinct continuations seen after the history
    (V for unigrams or unseen histories):

        Count      = c
        Add-One    = (c + 1) / (d + V)
        Add-K      = (c + k) / (d + k * V)
        Token-Type = (c + k) / (d + k * U)

    Note: the function reads the module-level names `vocab_size` and `N`,
    so the cells defining them must be run first.
    """

    V = vocab_size
    csv_file = f"{filename_prefix}.csv"
    has_history = base_counter is not None

    # Number of distinct continuations per history. Every key of `counter` is
    # a unique (history, next-token) pair, so tallying histories over the keys
    # gives the same result as collecting the continuations into sets.
    if has_history:
        continuation_types = Counter(ng[:-1] for ng in counter)
    else:
        continuation_types = {}

    with open(csv_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Ngram", "Count", "Add-One", "Add-K", "Token-Type"])

        for ngram, count in tqdm(counter.most_common(top_n), desc=f"Saving {filename_prefix}"):
            key = " ".join(ngram) if isinstance(ngram, tuple) else ngram

            if has_history:
                history = ngram[:-1]
                # Bigram histories are 1-tuples while unigram counts are keyed
                # by plain strings; the helper bridges that mismatch.
                denom = _history_count(base_counter, history)
                U = continuation_types.get(history, V)  # fallback if missing
            else:  # unigram
                denom = N
                U = V

            add1 = (count + 1) / (denom + V)
            addk = (count + k) / (denom + k * V)
            tokT = (count + k) / (denom + k * U)

            writer.writerow([key, count, add1, addk, tokT])

    print(f"Saved top {top_n} {filename_prefix}s → {csv_file}")

# Export one CSV per n-gram order; each order uses the next-lower order's
# counts as its history denominators (unigrams have none).
for _counts, _history_counts, _name in (
    (unigram_c, None, "unigram"),
    (bigram_c, unigram_c, "bigram"),
    (trigram_c, bigram_c, "trigram"),
    (quadrigram_c, trigram_c, "quadrigram"),
):
    save_ngram_probs(_counts, _history_counts, _name, top_n=200000)


Saving unigram: 100%|██████████| 200000/200000 [00:01<00:00, 197216.92it/s]



Saved top 200000 unigrams → unigram.csv


Saving bigram: 100%|██████████| 200000/200000 [00:01<00:00, 144674.43it/s]



Saved top 200000 bigrams → bigram.csv


Saving trigram: 100%|██████████| 200000/200000 [00:01<00:00, 143648.54it/s]



Saved top 200000 trigrams → trigram.csv


Saving quadrigram: 100%|██████████| 200000/200000 [00:01<00:00, 128759.18it/s]
Saving quadrigram: 100%|██████████| 200000/200000 [00:01<00:00, 128759.18it/s]


Saved top 200000 quadrigrams → quadrigram.csv


In [23]:
import pickle
import pandas as pd

# Save n-gram counters to pickle format (2 columns: Ngram, Count)
def save_ngrams_to_pickle(counter, filename_prefix):
    """
    Pickle an n-gram counter as a two-column pandas DataFrame.

    counter         : mapping from n-gram (token string or tuple of tokens)
                      to its count
    filename_prefix : output file is "<filename_prefix>.pkl"

    The frame always has the columns "Ngram" and "Count", one row per entry
    of `counter` in iteration order, even when the counter is empty.
    """
    # Build the frame column-wise: two flat lists need far less memory than
    # one dict per n-gram when the counter holds millions of entries.
    df = pd.DataFrame(
        {
            "Ngram": list(counter.keys()),
            "Count": list(counter.values()),
        },
        columns=["Ngram", "Count"],
    )
    pickle_file = f"{filename_prefix}.pkl"

    with open(pickle_file, "wb") as f:
        pickle.dump(df, f)

    print(f"✅ Saved {filename_prefix} → {pickle_file} ({len(df)} n-grams)")

# Write every n-gram counter to its own pickle file
for _counts, _name in (
    (unigram_c, "unigram"),
    (bigram_c, "bigram"),
    (trigram_c, "trigram"),
    (quadrigram_c, "quadrigram"),
):
    save_ngrams_to_pickle(_counts, _name)

✅ Saved unigram → unigram.pkl (299475 n-grams)
✅ Saved bigram → bigram.pkl (3466685 n-grams)
✅ Saved bigram → bigram.pkl (3466685 n-grams)
✅ Saved trigram → trigram.pkl (9694653 n-grams)
✅ Saved trigram → trigram.pkl (9694653 n-grams)
✅ Saved quadrigram → quadrigram.pkl (14174147 n-grams)
✅ Saved quadrigram → quadrigram.pkl (14174147 n-grams)
