In [None]:
from collections import defaultdict
import pickle

def count_ngrams(file_path, num_lines=None, n_list=[1,2,3,4]):
    """
    Count n-grams (1-gram, 2-gram, 3-gram, 4-gram) from a large tokenized file.
    
    Args:
        file_path: Path to the corpus (one tokenized sentence per line)
        num_lines: Number of lines to read (None = read all)
        n_list: List of n-gram sizes to count (e.g., [1,2,3,4])
    
    Returns:
        dict: {n: defaultdict(counts)}
    """
    ngram_counts = {n: defaultdict(int) for n in n_list}

    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if num_lines and i >= num_lines:   ##if num of lines> 1M THEN EXIT  
                break

            tokens = line.strip().split()
            for n in n_list:
                # Padding only for n > 1
                if n > 1:
                    tokens_n = ["<s>"]*(n-1) + tokens + ["</s>"]
                else:
                    tokens_n = tokens
                for j in range(len(tokens_n) - n + 1):
                    ngram = tuple(tokens_n[j:j+n])
                    ngram_counts[n][ngram] += 1

            if (i+1) % 100000 == 0:
                print(f"Processed {i+1} lines...")

    return ngram_counts


def save_counts_to_file(ngram_counts, prefix="ngram_counts"):
    """
    Save n-gram counts to disk using pickle.
    """
    for n, counts in ngram_counts.items():
        file_name = f"{prefix}_{n}gram.pkl"
        with open(file_name, "wb") as f:
            pickle.dump(dict(counts), f)
        print(f"Saved {n}-gram counts to {file_name}")


if __name__ == "__main__":
    corpus_file = "tokenized_sentences.txt"
    num_lines_to_process = 1_000_000  

    # Count 1-gram to 4-gram
    counts = count_ngrams(corpus_file, num_lines=num_lines_to_process, n_list=[1,2,3,4])

    # Save to disk
    save_counts_to_file(counts, prefix="ngram_counts")

Processed 100000 lines...
Processed 200000 lines...
Processed 300000 lines...
Processed 400000 lines...
Processed 500000 lines...
Processed 600000 lines...
Processed 700000 lines...
Processed 800000 lines...
Processed 900000 lines...
Processed 1000000 lines...
Saved 1-gram counts to ngram_counts_1gram.pkl
Saved 2-gram counts to ngram_counts_2gram.pkl
Saved 3-gram counts to ngram_counts_3gram.pkl
Saved 4-gram counts to ngram_counts_4gram.pkl


<h1>printing 10 lines of all the pickle file

In [2]:
import pickle

def read_ngram_pickle(file_path, num_lines=10):
    """
    Read n-gram pickle file and print first `num_lines` items.
    """
    with open(file_path, "rb") as f:
        ngram_counts = pickle.load(f)

    print(f"First {num_lines} items from {file_path}:")
    for i, (ngram, count) in enumerate(ngram_counts.items()):
        print(ngram, count)
        if i+1 >= num_lines:
            break
    print("\n")


if __name__ == "__main__":
    for n in range(1, 5):
        file_name = f"ngram_counts_{n}gram.pkl"
        read_ngram_pickle(file_name, num_lines=10)

First 10 items from ngram_counts_1gram.pkl:
('આ',) 161255
('વીડિયો',) 4200
('જુઓ:',) 194
('ઊંઝા',) 112
('માર્કેટયાર્ડ',) 27
('આજથી',) 977
('25',) 1939
('જુલાઈ',) 683
('સુધી',) 23874
('બંધ',) 8839


First 10 items from ngram_counts_2gram.pkl:
('<s>', 'આ') 67993
('આ', 'વીડિયો') 873
('વીડિયો', 'જુઓ:') 36
('જુઓ:', 'ઊંઝા') 1
('ઊંઝા', 'માર્કેટયાર્ડ') 1
('માર્કેટયાર્ડ', 'આજથી') 1
('આજથી', '25') 2
('25', 'જુલાઈ') 5
('જુલાઈ', 'સુધી') 113
('સુધી', 'બંધ') 136


First 10 items from ngram_counts_3gram.pkl:
('<s>', '<s>', 'આ') 67993
('<s>', 'આ', 'વીડિયો') 487
('આ', 'વીડિયો', 'જુઓ:') 36
('વીડિયો', 'જુઓ:', 'ઊંઝા') 1
('જુઓ:', 'ઊંઝા', 'માર્કેટયાર્ડ') 1
('ઊંઝા', 'માર્કેટયાર્ડ', 'આજથી') 1
('માર્કેટયાર્ડ', 'આજથી', '25') 1
('આજથી', '25', 'જુલાઈ') 1
('25', 'જુલાઈ', 'સુધી') 1
('જુલાઈ', 'સુધી', 'બંધ') 5


First 10 items from ngram_counts_4gram.pkl:
('<s>', '<s>', '<s>', 'આ') 67993
('<s>', '<s>', 'આ', 'વીડિયો') 487
('<s>', 'આ', 'વીડિયો', 'જુઓ:') 36
('આ', 'વીડિયો', 'જુઓ:', 'ઊંઝા') 1
('વીડિયો', 'જુઓ:', 'ઊંઝા', 'મ

<h1>applying smoothing techniques

In [None]:
import pickle

# ---------------- Load Pickle ----------------
def load_ngram_counts(n):
    file_name = f"ngram_counts_{n}gram.pkl"
    with open(file_name, "rb") as f:
        counts = pickle.load(f)
    return counts

# ---------------- Vocabulary ----------------
def get_vocabulary(unigrams):                          ##extract unique words from unigram keys
    return set([token[0] for token in unigrams.keys()])

# ---------------- Generator for Add-K / Add-One Smoothing ----------------
def smoothing_generator(ngrams, lower_ngrams=None, vocab=None, k=1.0):
    """
    Generator to yield (ngram, smoothed_prob) one by one.
    k=1.0 => Add-One Smoothing
    k=<other> => Add-K Smoothing
    """
    V = len(vocab)
    
    if lower_ngrams is None:  # Unigrams
        N = sum(ngrams.values())
        for ngram, count in ngrams.items():
            yield ngram, (count + k) / (N + k * V)
    else:  # n-grams n>1
        for ngram, count in ngrams.items():
            prefix = ngram[:-1]
            prefix_count = lower_ngrams.get(prefix, 0)
            yield ngram, (count + k) / (prefix_count + k * V)

# ---------------- Generator for Token Type Smoothing ----------------
def token_type_smoothing_generator(ngrams):
    """
    Yield (ngram, count + number_of_token_types) one by one
    """
    token_types = len(set([w for ngram in ngrams.keys() for w in ngram]))
    for ngram, count in ngrams.items():
        yield ngram, count + token_types

# ---------------- Save generator to file in chunks ----------------
def save_generator_to_file(generator, out_file, batch_size=1_000_000):
    batch = {}
    for i, (ngram, value) in enumerate(generator, 1):
        batch[ngram] = value
        if i % batch_size == 0:
            with open(out_file, "ab") as f:
                pickle.dump(batch, f)
            batch = {}
    # Save remaining
    if batch:
        with open(out_file, "ab") as f:
            pickle.dump(batch, f)

# ---------------- Main ----------------
if __name__ == "__main__":
    # Load all n-grams
    unigrams = load_ngram_counts(1)
    bigrams = load_ngram_counts(2)
    trigrams = load_ngram_counts(3)
    quadrigrams = load_ngram_counts(4)

    vocab = get_vocabulary(unigrams)

    # --- Add-One Smoothing (k=1.0) ---
    save_generator_to_file(smoothing_generator(unigrams, vocab=vocab, k=1.0), "unigram_addone.pkl")
    save_generator_to_file(smoothing_generator(bigrams, lower_ngrams=unigrams, vocab=vocab, k=1.0), "bigram_addone.pkl")
    save_generator_to_file(smoothing_generator(trigrams, lower_ngrams=bigrams, vocab=vocab, k=1.0), "trigram_addone.pkl")
    save_generator_to_file(smoothing_generator(quadrigrams, lower_ngrams=trigrams, vocab=vocab, k=1.0), "quadrigram_addone.pkl")

    # --- Add-K Smoothing (k=0.5) ---
    save_generator_to_file(smoothing_generator(unigrams, vocab=vocab, k=0.5), "unigram_addk.pkl")
    save_generator_to_file(smoothing_generator(bigrams, lower_ngrams=unigrams, vocab=vocab, k=0.5), "bigram_addk.pkl")
    save_generator_to_file(smoothing_generator(trigrams, lower_ngrams=bigrams, vocab=vocab, k=0.5), "trigram_addk.pkl")
    save_generator_to_file(smoothing_generator(quadrigrams, lower_ngrams=trigrams, vocab=vocab, k=0.5), "quadrigram_addk.pkl")

    # --- Token Type Smoothing ---
    save_generator_to_file(token_type_smoothing_generator(unigrams), "unigram_toktype.pkl")
    save_generator_to_file(token_type_smoothing_generator(bigrams), "bigram_toktype.pkl")
    save_generator_to_file(token_type_smoothing_generator(trigrams), "trigram_toktype.pkl")
    save_generator_to_file(token_type_smoothing_generator(quadrigrams), "quadrigram_toktype.pkl")

    print("All smoothed n-grams saved successfully in a memory-efficient way.")

All smoothed n-grams saved successfully in a memory-efficient way.


In [4]:
import pickle

def read_chunked_pickle(file_path, num_lines=10):
    """
    Read a pickle file saved in batches and print the first `num_lines` n-grams.
    """
    print(f"\nReading file: {file_path}")
    count_printed = 0

    with open(file_path, "rb") as f:
        while True:
            try:
                batch = pickle.load(f)  # load one batch at a time
                for ngram, value in batch.items():
                    print(ngram, value)
                    count_printed += 1
                    if count_printed >= num_lines:
                        return  # stop after printing required lines
            except EOFError:
                break  # end of file

# ---------------- Example Usage ----------------
if __name__ == "__main__":
    files_to_check = [
        "unigram_addone.pkl",
        "bigram_addone.pkl",
        "trigram_addone.pkl",
        "quadrigram_addone.pkl",
        "unigram_addk.pkl",
        "bigram_addk.pkl",
        "trigram_addk.pkl",
        "quadrigram_addk.pkl",
        "unigram_toktype.pkl",
        "bigram_toktype.pkl",
        "trigram_toktype.pkl",
        "quadrigram_toktype.pkl"
    ]

    for file in files_to_check:
        read_chunked_pickle(file, num_lines=10)


Reading file: unigram_addone.pkl
('આ',) 0.011662559910782024
('વીડિયો',) 0.0003038300229771003
('જુઓ:',) 1.4103036058208653e-05
('ઊંઝા',) 8.172528587577323e-06
('માર્કેટયાર્ડ',) 2.0250513314350887e-06
('આજથી',) 7.073215007655417e-05
('25',) 0.00014030712796371688
('જુલાઈ',) 4.946911109648574e-05
('સુધી',) 0.001726717876357598
('બંધ',) 0.0006393376346387923

Reading file: bigram_addone.pkl
('<s>', 'આ') 0.09848408038168846
('આ', 'વીડિયો') 0.0010262299201207992
('વીડિયો', 'જુઓ:') 5.326760782371589e-05
('જુઓ:', 'ઊંઝા') 2.8960324355632785e-06
('ઊંઝા', 'માર્કેટયાર્ડ') 2.8963763435565762e-06
('માર્કેટયાર્ડ', 'આજથી') 2.89673291977643e-06
('આજથી', '25') 4.339128963251917e-06
('25', 'જુલાઈ') 8.666199654796381e-06
('જુલાઈ', 'સુધી') 0.00016495704605340268
('સુધી', 'બંધ') 0.0001918015344122753

Reading file: trigram_addone.pkl
('<s>', '<s>', 'આ') 0.09848408038168846
('<s>', 'આ', 'વીડિયો') 0.0006434607640569146
('આ', 'વીડિયો', 'જુઓ:') 5.3523975124370914e-05
('વીડિયો', 'જુઓ:', 'ઊંઝા') 2.896695160491

<h1>applying the model on 1000 test sentences

In [None]:
import pickle
import math
import csv

# -------------------------------
# Helper function to load models
# -------------------------------
def load_model(filename):
    with open(filename, "rb") as f:
        return pickle.load(f)

# -------------------------------
# Configuration
# -------------------------------
n_models = ["unigram", "bigram", "trigram", "quadrigram"]
smoothings = ["addone", "addk", "toktype"]

# Load all models dynamically into a dictionary
models = {
    f"{n}_{smooth}": load_model(f"{n}_{smooth}.pkl")
    for n in n_models
    for smooth in smoothings
}

# -------------------------------
# Probability calculator
# -------------------------------
def sentence_prob(sentence_tokens, model, n):
    """Compute raw & log probability of a sentence given an n-gram model."""
    prob = 1.0
    log_prob = 0.0
    padded = ["<s>"] * (n - 1) + sentence_tokens + ["</s>"]

    for i in range(n - 1, len(padded)):
        ngram = tuple(padded[i - n + 1 : i + 1])
        p = model.get(ngram, 1e-12)  # fallback tiny value if missing
        prob *= p
        log_prob += math.log(p)      #adding the log probability to avoid underflow
    return prob, log_prob

# -------------------------------
# Read test sentences
# -------------------------------
with open("test_sentences.txt", "r", encoding="utf-8") as f:
    sentences = [line.strip().split() for line in f]

# -------------------------------
# Calculate probabilities
# -------------------------------
results = []
for sent in sentences:
    res = {"sentence": " ".join(sent)}
    for n in n_models:
        for smooth in smoothings:
            model_key = f"{n}_{smooth}"
            n_val = n_models.index(n) + 1  # unigram=1, bigram=2, etc.
            res[model_key] = sentence_prob(sent, models[model_key], n_val)
    results.append(res)

# -------------------------------
# Save results to CSV
# -------------------------------
with open("sentence_probabilities.csv", "w", newline="", encoding="utf-8") as csvfile:
    fieldnames = ["sentence"]
    for n in n_models:
        for smooth in smoothings:
            fieldnames += [f"{n}_{smooth}_prob", f"{n}_{smooth}_logprob"]

    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for r in results:
        row = {"sentence": r["sentence"]}
        for n in n_models:
            for smooth in smoothings:
                p, lp = r[f"{n}_{smooth}"]
                row[f"{n}_{smooth}_prob"] = p
                row[f"{n}_{smooth}_logprob"] = lp
        writer.writerow(row)

print("✅ Sentence probabilities saved to 'sentence_probabilities.csv'")

✅ Sentence probabilities saved to 'sentence_probabilities.csv'


In [16]:
import pandas as pd
df=pd.read_csv('sentence_probabilities.csv')
df

Unnamed: 0,sentence,unigram_addone_prob,unigram_addone_logprob,unigram_addk_prob,unigram_addk_logprob,unigram_toktype_prob,unigram_toktype_logprob,bigram_addone_prob,bigram_addone_logprob,bigram_addk_prob,...,trigram_addk_prob,trigram_addk_logprob,trigram_toktype_prob,trigram_toktype_logprob,quadrigram_addone_prob,quadrigram_addone_logprob,quadrigram_addk_prob,quadrigram_addk_logprob,quadrigram_toktype_prob,quadrigram_toktype_logprob
0,આજે અમદાવાદમાં હવામાન ખૂબ જ ગરમ છે.,3.131978e-34,-77.146228,3.734935e-34,-76.970163,1.306414e+29,67.042254,1.100848e-42,-96.612493,4.372406e-41,...,1.356943e-63,-144.757626,2.287596e-25,-56.737126,1.458486e-79,-181.526823,5.601282e-79,-180.181227,4.798937e-61,-138.889296
1,ભારતીય ક્રિકેટ ટીમે મેચ જીતી લીધી.,6.533053e-36,-81.016189,7.586069e-36,-80.866750,1.109357e+23,53.063238,2.456280e-34,-77.389245,1.474797e-32,...,7.163913e-51,-115.462783,2.278161e-13,-29.110238,5.401421e-60,-136.468444,4.024127e-59,-134.460213,3.299564e-31,-70.186347
2,સરકારે નવી શિક્ષણ નીતિની જાહેરાત કરી.,1.161830e-35,-80.440482,1.339398e-35,-80.298258,1.121741e+23,53.074339,6.180491e-43,-97.189761,1.835523e-41,...,4.690165e-53,-120.491542,2.274777e-13,-29.111724,1.074730e-75,-172.621813,2.148011e-75,-171.929339,6.911490e-67,-152.340016
3,શેર બજારમાં આજે તેજી જોવા મળી.,5.026476e-35,-78.975759,5.835135e-35,-78.826581,1.141304e+23,53.091628,3.662220e-29,-65.476898,3.922311e-27,...,5.373735e-47,-106.539976,1.569093e+05,11.963423,7.300742e-70,-159.192981,2.653842e-69,-157.902363,4.767060e-49,-111.264940
4,આગામી અઠવાડિયે નવી ફિલ્મ રિલીઝ થશે.,1.774352e-34,-77.714458,2.061284e-34,-77.564564,1.129755e+23,53.081458,2.344175e-34,-77.435960,1.353665e-32,...,9.773927e-53,-119.757292,2.275468e-13,-29.111420,4.365154e-62,-141.286622,2.668692e-61,-139.476102,3.295230e-31,-70.187662
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,વિકાસના કાર્યોને વેગ આપવા માટે કેન્દ્ર સરકારે ...,1.699204e-64,-146.835286,2.162622e-64,-146.594124,2.926201e+34,79.361598,1.186722e-89,-204.758879,7.943761e-88,...,5.255832e-121,-276.953458,3.291321e-91,-208.343954,1.000000e-144,-331.572253,1.000000e-144,-331.572253,1.000000e-144,-331.572253
996,ભારતની મહિલા હોકી ટીમે એશિયન ગેમ્સમાં ગોલ્ડ મે...,4.313552e-67,-152.811440,5.180189e-67,-152.628360,1.194296e+58,133.727492,2.730453e-104,-238.464382,2.025489e-102,...,7.550708e-134,-306.524761,3.292742e-103,-235.974544,5.460555e-148,-339.085043,1.090663e-147,-338.393223,6.907840e-139,-318.126671
997,આગામી બજેટમાં નાણા મંત્રી દ્વારા મોટા નિર્ણયો ...,3.842829e-49,-111.480461,5.058505e-49,-111.205599,3.065299e+52,120.854570,1.255164e-50,-114.901988,1.118341e-47,...,1.104421e-100,-230.159189,1.608961e-55,-126.166592,7.157109e-119,-272.039520,5.609494e-118,-269.980580,3.299798e-91,-208.341382
998,શહેરીકરણની સમસ્યાઓને ઉકેલવા માટે નવી શહેરી વિક...,1.310153e-54,-124.069451,1.559412e-54,-123.895286,2.945801e+46,106.999295,8.936709e-91,-207.345076,2.300550e-89,...,1.593233e-125,-287.357371,6.904130e-115,-262.865166,1.000000e-132,-303.941232,1.000000e-132,-303.941232,1.000000e-132,-303.941232
