<a href="https://colab.research.google.com/github/AsmitaJha/2program/blob/main/maithili1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

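This notebook uploads a text file, counts the unique unigrams, bigrams, and trigrams it contains, and predicts the next word for a given context. Note that the preprocessing step keeps only Devanagari characters, so it is intended for text written in Devanagari script (here, Maithili).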

In [None]:
from google.colab import files

# Upload a text file through the Colab file picker and preview its contents.
uploaded = files.upload()

if not uploaded:
    raise ValueError("No file was uploaded; please select a UTF-8 text file.")

# files.upload() returns {original filename: file bytes}. Decode the bytes that
# were just uploaded instead of re-opening a hard-coded "khattarkaka.txt": when
# a file with that name already exists, Colab stores the new upload under a
# different name (e.g. "khattarkaka (1).txt"), so the hard-coded path would
# silently read the stale copy, and any other file name would fail outright.
# If several files are selected, only the first one is used.
uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
text = uploaded_bytes.decode("utf-8")

print(text[:1000])  # preview the first 1000 characters (not words) of the file

Saving khattarkaka.txt to khattarkaka (1).txt
पुरातन सभ्यता खट्टर ककाक तरंग
लेखक : हरिमोहन झा
ओहि दिन खट्टर कका सॅं पुरातन सभ्य्ता पर गप्प छिड़ि गेल ।

हम कहलिऎन्ह – देखू, खट्टर कका, ताहि दिनक ऋषि-मुनि केहन त्यागपूर्ण जीवन व्यतीत करथि ! गुफा-कंदरा में रहि कंदमूल खा तपस्या करथि। ब्राह्म मुहूर्त्त में उठि, नदी में स्नान कय, बल्कल पहिरने, कंमडलु में जल भरने, कुटी में आबि, कुशासन पर बैसि देवता क ध्यान धरथि। केहन पवित्र सात्त्विक जीवन छलैन्ह ? औखन धरि दाढी ओ गेरुआ वस्त्र देखि कऽ लोक कैं श्रद्धा उत्पन्न भऽ जाइ छैक।

खट्टर कका भाङक पत्ती धोइत बजलाह- हौ, जंगल में हजाम नहिं भेटैन्ह, तैं दाढी। धोबी नहिं भेटैन्ह, तैं कषाय रंग। तेल क अभाव में जटा। वस्त्रक अभाव में बल्कल। अन्नक अभाव में कंद-मूल। तकरो अभाव में एकभुक्त वा उपवास। लोटाक अभाव में कमंडलु सॅं पानि पीबथि। थारीक अभाव में पात पर खाथि। अथवा हाथे पर भोजन कय करपात्री बनि जाथि। ई सभ त्याग क सूचक नहिं अभाव क सूचक थीक। अप्राप्तिस्तत्र कारणम्।

हम-परन्तु ॱ ॱ ॱ ॱ ॱ

ख०-परन्तु की? यदि हुनका लोकनि कैं पकमान भेटितैन्ह त पकोहा किएक जोहितथि ? गुलाबजामुन भ

In [None]:

import re

def preprocess_text(text):
    """Clean Devanagari (e.g. Maithili) text and split it into word tokens.

    Steps:
      1. Lower-case the text (Devanagari has no letter case, and Latin letters
         are dropped in step 3, so this does not alter the resulting tokens).
      2. Delete Devanagari digits (० to ९).
      3. Replace every character that is neither whitespace nor inside the
         Devanagari Unicode block (U+0900 to U+097F) with a space.
      4. Split on whitespace.

    Step 3 substitutes a space rather than an empty string so that words
    joined only by punctuation (e.g. a hyphen or a comma with no following
    space) stay separate tokens instead of being fused into one bogus word.

    Punctuation that lives inside the Devanagari block, such as the danda
    "।", is not removed and therefore appears as a token of its own when it
    is surrounded by whitespace.

    Args:
        text: The raw input string.

    Returns:
        A list of word tokens in their original order; empty if nothing
        survives the cleaning.
    """
    text = text.lower()
    text = re.sub(r'[०-९]', '', text)  # Remove Devanagari digits
    # Use ' ' (not '') so punctuation acts as a word boundary.
    text = re.sub(r'[^\u0900-\u097F\s]', ' ', text)
    return text.split()

# Tokenize the uploaded text once so the cleaning step can be inspected by eye.
cleaned_text = preprocess_text(text)
print(cleaned_text[:20])  # Show only the first 20 tokens, not the whole list

['पुरातन', 'सभ्यता', 'खट्टर', 'ककाक', 'तरंग', 'लेखक', 'हरिमोहन', 'झा', 'ओहि', 'दिन', 'खट्टर', 'कका', 'सॅं', 'पुरातन', 'सभ्य्ता', 'पर', 'गप्प', 'छिड़ि', 'गेल', '।']


In [None]:
from collections import defaultdict, Counter

class NGramModel:
    """Word-level n-gram frequency model with simple next-word prediction.

    The model tokenizes the given text with ``preprocess_text`` and records:

    * ``unigrams``: the list of all tokens, in corpus order;
    * ``bigrams``:  ``{word: Counter({next_word: count})}``;
    * ``trigrams``: ``{(word1, word2): Counter({next_word: count})}``.

    Prediction backs off from the trigram table to the bigram table and
    finally to overall token frequency.
    """

    def __init__(self, text, n=3):
        """Tokenize ``text`` and build the frequency tables.

        Args:
            text: Raw corpus string.
            n: Stored on the instance as ``self.n``. It is not consulted by
                any method of this class; unigram, bigram and trigram tables
                are always built.
        """
        self.n = n
        self.text = preprocess_text(text)
        self.unigrams = []
        self._unigram_counts = Counter()
        self.bigrams = defaultdict(Counter)
        self.trigrams = defaultdict(Counter)
        self.build_model()

    def build_model(self):
        """(Re)build the unigram, bigram and trigram tables from ``self.text``.

        The tables are reset first, so calling this method more than once
        does not inflate the counts.
        """
        tokens = self.text
        self.unigrams = list(tokens)
        # Cached so the unigram fallback in predict_next_word does not have to
        # recount the whole corpus on every call.
        self._unigram_counts = Counter(tokens)
        self.bigrams = defaultdict(Counter)
        self.trigrams = defaultdict(Counter)

        for first, second in zip(tokens, tokens[1:]):
            self.bigrams[first][second] += 1

        for first, second, third in zip(tokens, tokens[1:], tokens[2:]):
            self.trigrams[(first, second)][third] += 1

    def predict_next_word(self, words):
        """Predict the most frequent continuation of ``words``.

        The context is tokenized with ``preprocess_text``, the same function
        used for the corpus, so punctuation or digits in the input cannot
        prevent a match. Only the last two context tokens are used:

        1. if the last two tokens were seen together, use the trigram table;
        2. otherwise, if the last token was seen, use the bigram table;
        3. otherwise (including empty input), use overall token frequency.

        Ties are resolved in favour of the candidate that was inserted first
        into the relevant counter.

        Args:
            words: Context string of zero or more whitespace-separated words.

        Returns:
            The predicted next token, or ``None`` if the model holds no tokens.
        """
        context = preprocess_text(words)

        candidates = None
        if len(context) >= 2:
            candidates = self.trigrams.get((context[-2], context[-1]))
        if not candidates and context:  # guard: empty context has no last word
            candidates = self.bigrams.get(context[-1])
        if not candidates:
            candidates = self._unigram_counts  # Use unigrams if no context

        if candidates:
            return max(candidates, key=candidates.get)  # Most frequent word
        return None  # No prediction available

    def count_unigrams(self):
        """Return the number of distinct tokens in the corpus."""
        return len(set(self.unigrams))

    def count_bigrams(self):
        """Return the number of distinct (word, next_word) pairs."""
        return sum(len(value) for value in self.bigrams.values())

    def count_trigrams(self):
        """Return the number of distinct (word1, word2, next_word) triples."""
        return sum(len(value) for value in self.trigrams.values())


In [None]:
# Build the model from the uploaded text, then report how many distinct
# unigrams, bigrams and trigrams it recorded (one line per statistic).
ngram_model = NGramModel(text)

print(
    f"Count of Unique Unigrams: {ngram_model.count_unigrams()}",
    f"Count of Unique Bigrams: {ngram_model.count_bigrams()}",
    f"Count of Unique Trigrams: {ngram_model.count_trigrams()}",
    sep="\n",
)

Number of Unique Unigrams: 1197
Number of Unique Bigrams: 2243
Number of Unique Trigrams: 2505


Hence, the text file contains 1197 unique unigrams, 2243 unique bigrams, and 2505 unique trigrams.

## Checking next-word prediction

The cells below test the prediction against the following sentence from the source document:
नीतिक वचन छैक जे, ‘काल्हि करै सो आज कर, आज करै सो अब’।

In [None]:
# One-word context: too short for a trigram lookup, so predict_next_word
# looks the word up in the bigram table (falling back to unigram counts).
input_text = "नीतिक"  # single-word context
predicted_word = ngram_model.predict_next_word(input_text)

print(f"Predicted next word: {predicted_word}")

Predicted next word: वचन


In [None]:
# Two-word context: the word pair is looked up in the trigram table first;
# the bigram and unigram tables are only used if the pair was never seen.
input_text = "नीतिक वचन" # two-word context
predicted_word = ngram_model.predict_next_word(input_text)

print(f"Predicted next word: {predicted_word}")

Predicted next word: छैक


In [None]:
# Three-word context: predict_next_word only uses the last two words
# ("वचन छैक") for its lookup; the first word is ignored.
input_text = "नीतिक वचन छैक" # three-word context
predicted_word = ngram_model.predict_next_word(input_text)

print(f"Predicted next word: {predicted_word}")

Predicted next word: जे


In [None]:
# Four-word context: the model holds no 4-gram table, so this is still a
# lookup on the last two words only ("छैक जे").
input_text = "नीतिक वचन छैक जे"  # four-word context; only the last two words are used
predicted_word = ngram_model.predict_next_word(input_text)

print(f"Predicted next word: {predicted_word}")

Predicted next word: काल्हि
