In [2]:
import json
from collections import defaultdict, Counter
with open("tokenized.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [10]:
# Flatten sentences to tokens
all_tokens = [token for sentence in data["tokens"] for token in sentence]
print("Total individual tokens:", len(all_tokens))
print("First 30 individual tokens:", all_tokens[:30])

Total individual tokens: 21856
First 30 individual tokens: ['આ', 'વીડિયો', 'જુઓ', 'ઊંઝા', 'માર્કેટયાર્ડ', 'આજથી', 'જુલાઈ', 'સુધી', 'બંધ', 'મિથેનોલ', 'આવ્યો', 'ક્યાંથી', 'આખરે', 'ત્રણ', 'રાજ્યોમાં', 'મળેલ', 'હાર', 'પર', 'કોંગ્રેસ', 'અધ્યક્ષ', 'રાહુલ', 'ગાંધી', 'દ્વારા', 'પ્રથમ', 'પ્રતિક્રિયા', 'આપવામાં', 'આવી', 'છે', 'તેમણે', 'કહ્યું']


In [None]:
def build_ngram_model(tokens, n=1):
    ngrams = []
    for i in range(len(tokens) - n + 1):
        ngram = tuple(tokens[i:i+n])
        ngrams.append(ngram)

    counts = Counter(ngrams) #counts frequency of each ngram.
    model = defaultdict(dict)

    for ngram, freq in counts.items(): #maps prefix -> {next_word: frequency}
        prefix = ngram[:-1] if n > 1 else ()
        model[prefix][ngram[-1]] = freq

    # Normalize to probabilities (conditional probablity model)
    for prefix in model:
        total_prefix = sum(model[prefix].values())
        for word in model[prefix]:
            model[prefix][word] /= total_prefix

    return model

In [13]:
unigram_model = build_ngram_model(all_tokens, 1)
print("\nUnigram probabilities (top 10):")
print(dict(list(unigram_model[()].items())[:10]))


Unigram probabilities (top 10):
{'આ': 0.012628111273792094, 'વીડિયો': 0.0002287701317715959, 'જુઓ': 0.00018301610541727673, 'ઊંઝા': 4.575402635431918e-05, 'માર્કેટયાર્ડ': 4.575402635431918e-05, 'આજથી': 0.0002745241581259151, 'જુલાઈ': 0.00018301610541727673, 'સુધી': 0.001509882869692533, 'બંધ': 0.0006863103953147877, 'મિથેનોલ': 4.575402635431918e-05}


In [24]:
bigram_model = build_ngram_model(all_tokens, 2)
print("\nBigram probabilities after ('આ')")
print(bigram_model[("આ",)])


Bigram probabilities after ('આ')
{'વીડિયો': 0.014492753623188406, 'ક્ષેત્રના': 0.007246376811594203, 'આંકડો': 0.007246376811594203, 'ઠેકાઓ': 0.0036231884057971015, 'કેદીઓને': 0.007246376811594203, 'નિર્ણય': 0.010869565217391304, 'અઠવાડિયે': 0.0036231884057971015, 'ખાસ': 0.0036231884057971015, 'ક્ષેત્રે': 0.0036231884057971015, 'સારા': 0.0036231884057971015, 'સજ્જન': 0.0036231884057971015, 'જાદુઈ': 0.0036231884057971015, 'મુદ્દે': 0.0036231884057971015, 'આરોપી': 0.007246376811594203, 'આરોપીને': 0.0036231884057971015, 'સિન્ડ્રોમ': 0.0036231884057971015, 'કિસ્સામાં': 0.021739130434782608, 'ઉદ્ઘાટન': 0.0036231884057971015, 'ભયના': 0.0036231884057971015, 'બધી': 0.0036231884057971015, 'સમયે': 0.010869565217391304, 'ઉપરાંત': 0.021739130434782608, 'સિવાય': 0.010869565217391304, 'માટે': 0.021739130434782608, 'વખતે': 0.021739130434782608, 'બનાવ': 0.0036231884057971015, 'સેલ': 0.0036231884057971015, 'ગઠબંધન': 0.0036231884057971015, 'ગઠબંધનનું': 0.0036231884057971015, 'ગઠબંધનનો': 0.00362318840579

In [23]:
trigram_model = build_ngram_model(all_tokens, 3)
print("\nTrigram probabilities after ('આ','વીડિયો'):")
print(trigram_model[("આ","વીડિયો")])


Trigram probabilities after ('આ','વીડિયો'):
{'જુઓ': 0.25, 'સેલિબ્રિટી': 0.25, 'વાઈરલ': 0.25, 'વધુને': 0.25}


In [25]:
quadgram_model = build_ngram_model(all_tokens, 4)
print("\nQuadrigram probabilities after ('આ','વીડિયો','જુઓ'):")
print(quadgram_model[("આ","વીડિયો","જુઓ")])


Quadrigram probabilities after ('આ','વીડિયો','જુઓ'):
{'ઊંઝા': 1.0}


In [26]:
import pickle

with open("unigram_model.pkl", "wb") as f:
   pickle.dump(unigram_model, f)

with open("bigram_model.pkl", "wb") as f:
   pickle.dump(bigram_model, f)

with open("trigram_model.pkl", "wb") as f:
   pickle.dump(trigram_model, f)

with open("quadgram_model.pkl", "wb") as f:
    pickle.dump(quadgram_model, f)