In [1]:

import io, csv
import os
from collections import Counter

from tqdm import tqdm


In [2]:

# Location of the whitespace-tokenized corpus (one sentence per line).
# Set the HINDI_CORPUS_PATH environment variable to point at the file on
# another machine; otherwise the original local path is used unchanged.
path = os.environ.get(
    "HINDI_CORPUS_PATH",
    r"C:\Users\evilk\OneDrive\Desktop\III YEAR\LABS\NLP\LAB1\tokenized_hindi.txt",
)


In [3]:

def build_counts(path):
    """Count 1- to 4-grams over a whitespace-tokenized text file.

    Each non-blank line is treated as one sentence and wrapped in the
    boundary markers ``<s>`` and ``</s>`` before counting. Blank or
    whitespace-only lines are skipped.

    Parameters
    ----------
    path : path of a UTF-8 text file, one sentence per line.

    Returns
    -------
    tuple of four ``Counter`` objects: unigram counts keyed by token string,
    then bigram, trigram and quadrigram counts keyed by token tuples.
    """
    singles, pairs, triples, quads = Counter(), Counter(), Counter(), Counter()

    with io.open(path, "r", encoding="utf-8", buffering=1024*1024) as corpus:
        for raw_line in tqdm(corpus, desc="Building n-gram counts"):
            tokens = raw_line.split()
            if tokens:
                padded = ["<s>", *tokens, "</s>"]

                # zip() stops at its shortest argument, so shifted views of
                # the same sentence produce exactly the sliding n-gram windows.
                singles.update(padded)
                pairs.update(zip(padded, padded[1:]))
                triples.update(zip(padded, padded[1:], padded[2:]))
                quads.update(zip(padded, padded[1:], padded[2:], padded[3:]))

    print("Finished building counts")
    return singles, pairs, triples, quads


In [4]:

# Build all four n-gram tables from the corpus file in a single pass.
unigram_c, bigram_c, trigram_c, quadrigram_c = build_counts(path)

# Number of distinct n-grams per order.
V_uni = len(unigram_c)
V_bi = len(bigram_c)
V_tri = len(trigram_c)

print("Unique unigrams:", V_uni)
print("Unique bigrams:", V_bi)
print("Unique trigrams:", V_tri)
print("Unique quadrigrams:", len(quadrigram_c))

# Total number of counted tokens (sentence boundary markers included).
N = sum(unigram_c.values())

print("Total tokens (N):", N)


Building n-gram counts: 4800107it [07:22, 10845.16it/s]

Finished building counts
Unique unigrams: 804388
Unique bigrams: 10059390
Unique trigrams: 33277451
Unique quadrigrams: 54638391
Total tokens (N): 100840225





In [5]:
# Modified save function to also store counts
def _count_word_types(ngram_counter):
    """Return the number of distinct tokens occurring in the keys of ``ngram_counter``.

    Tuple keys contribute each of their elements; any other key is counted
    as a single token.
    """
    word_types = set()
    for key in ngram_counter:
        if isinstance(key, tuple):
            word_types.update(key)
        else:
            word_types.add(key)
    return len(word_types)


def save_ngram_probs(counter, base_counter, filename_prefix, top_n=1000, k=0.6,
                     total_tokens=None, vocab_size=None):
    """
    Save n-gram counts and probabilities into ``<filename_prefix>.csv``.

    One row is written per n-gram with the columns
    ``Ngram, Count, Raw, Add-One, Add-K, Token-Type``.

    counter        : Counter of n-grams (tuple keys, or plain tokens for unigrams)
    base_counter   : Counter of the (n-1)-gram contexts used as denominators
                     (None or empty for unigrams)
    filename_prefix: "unigram", "bigram", etc.; ".csv" is appended
    top_n          : number of most frequent n-grams to save
    k              : value for Add-K smoothing
    total_tokens   : denominator for unigrams, and fallback denominator when a
                     context is missing from base_counter. When None it is the
                     sum of the counts in base_counter (or in counter for
                     unigrams), so no notebook-level global is required.
    vocab_size     : V in the smoothing formulas. When None it is the number of
                     distinct tokens found in the keys of base_counter (or of
                     counter for unigrams). Pass it explicitly to skip that
                     scan on very large counters.

    Returns None; the result is the CSV file plus a confirmation message.
    """
    is_unigram = not base_counter
    reference = counter if is_unigram else base_counter

    # V must be the number of word types, not the number of distinct contexts,
    # otherwise the smoothed columns do not follow the add-one / add-k formula
    # for n > 2.
    V = _count_word_types(reference) if vocab_size is None else vocab_size

    if total_tokens is None:
        total_tokens = sum(reference.values())

    csv_file = f"{filename_prefix}.csv"

    with open(csv_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        # Added "Count" column
        writer.writerow(["Ngram", "Count", "Raw", "Add-One", "Add-K", "Token-Type"])

        for ngram, count in tqdm(counter.most_common(top_n), desc=f"Saving {filename_prefix}"):
            key = " ".join(ngram) if isinstance(ngram, tuple) else ngram

            if is_unigram:
                denom = total_tokens
            else:
                context = ngram[:-1]
                denom = base_counter.get(context, 0)
                if denom == 0 and len(context) == 1:
                    # A bigram's context is the 1-tuple ("w",), but a unigram
                    # counter is keyed by the bare token "w"; retry with it so
                    # the denominator is the real context count.
                    denom = base_counter.get(context[0], 0)
                if denom <= 0:
                    # Context genuinely absent: keep the best-effort fallback
                    # of normalising by the overall total.
                    denom = total_tokens

            # Probabilities
            raw  = count / denom
            add1 = (count + 1) / (denom + V)
            addk = (count + k) / (denom + k*V)
            tokT = (count + V) / (denom + V*V)

            # Now store count as well
            writer.writerow([key, count, raw, add1, addk, tokT])

    print(f"Saved top {top_n} {filename_prefix}s → {csv_file}")


In [6]:
# Write one CSV per n-gram order, keeping the 200,000 most frequent entries.
# Each higher-order table is normalised by the table one order below it.
save_ngram_probs(counter=unigram_c, base_counter=None, filename_prefix="unigram", top_n=200_000)
save_ngram_probs(counter=bigram_c, base_counter=unigram_c, filename_prefix="bigram", top_n=200_000)
save_ngram_probs(counter=trigram_c, base_counter=bigram_c, filename_prefix="trigram", top_n=200_000)
save_ngram_probs(counter=quadrigram_c, base_counter=trigram_c, filename_prefix="quadrigram", top_n=200_000)


Saving unigram: 100%|██████████| 200000/200000 [00:01<00:00, 147983.69it/s]


Saved top 200000 unigrams → unigram.csv


Saving bigram: 100%|██████████| 200000/200000 [00:01<00:00, 112917.05it/s]


Saved top 200000 bigrams → bigram.csv


Saving trigram: 100%|██████████| 200000/200000 [00:02<00:00, 89021.69it/s] 


Saved top 200000 trigrams → trigram.csv


Saving quadrigram: 100%|██████████| 200000/200000 [00:04<00:00, 43557.76it/s]

Saved top 200000 quadrigrams → quadrigram.csv



