In [71]:
from nltk import ngrams
from collections import Counter
from indicnlp.tokenize import indic_tokenize
import re

# Load and preprocess Malayalam text from a text file
def load_data_from_file(file_path):
    """Read a UTF-8 text file and return its contents as a list of tokens.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped. The remaining lines are concatenated, each
    followed by a single space. Every character that is neither whitespace
    nor inside the Unicode range U+0D00-U+0D7F is then deleted, the result is
    passed through ``str.lower()``, and finally handed to
    ``indic_tokenize.trivial_tokenize`` with ``lang='ml'``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list: The tokens produced by the tokenizer, in corpus order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = [raw.strip() for raw in handle]

    # Every kept line contributes its text plus one trailing space.
    corpus = "".join(text + " " for text in stripped_lines if text)

    # Drop anything outside the Malayalam block (whitespace survives).
    cleaned = re.sub(r'[^\u0D00-\u0D7F\s]', '', corpus).lower()

    return list(indic_tokenize.trivial_tokenize(cleaned, lang='ml'))

# Tokenize the training corpus; all n-gram statistics below derive from it
tokens_ml = load_data_from_file('Dataset/train.txt')

# Materialize every run of 2 and of 3 consecutive tokens
bigrams_ml, trigrams_ml = (list(ngrams(tokens_ml, order)) for order in (2, 3))

# Occurrence counts for single tokens, token pairs, and token triples
word_freq_ml = Counter(tokens_ml)
bigram_freq_ml = Counter(bigrams_ml)
trigram_freq_ml = Counter(trigrams_ml)




In [139]:
# Prediction function using trigrams with back-off to bigrams
def predict_next_word_ngram(words, top_n=5):
    """Rank likely next words for a context using the module-level n-gram counts.

    With two or more context words, the last two are looked up in
    ``trigram_freq_ml`` and each continuation is scored as
    ``count(w1, w2, w3) / count(w1, w2)``. If no trigram matches and ``words``
    is a list or tuple, the function backs off to the bigram table using only
    the last word, scoring ``count(w, w2) / count(w)``. A single-word context
    goes straight to the bigram table.

    The input sequence is never modified, and ``top_n`` is honoured on the
    back-off path as well.

    Args:
        words: Context tokens, oldest first. Only the last one or two are used.
            NOTE(review): a plain ``str`` is indexed character by character
            and does not trigger back-off - callers are presumably expected to
            pass a token list; confirm.
        top_n: Maximum number of candidates to return.

    Returns:
        A list of ``(word, probability)`` tuples sorted by descending
        probability (at most ``top_n`` entries, possibly empty), or the dict
        ``{"error": "Insufficient words for prediction"}`` when ``words`` is
        empty.
    """
    if len(words) == 0:
        return {"error": "Insufficient words for prediction"}

    candidates = []
    may_back_off = True

    if len(words) >= 2:
        # Trigram model (two words as context)
        word1, word2 = words[-2], words[-1]
        candidates = [
            (w3, count / bigram_freq_ml[(word1, word2)])
            for (w1, w2, w3), count in trigram_freq_ml.items()
            if w1 == word1 and w2 == word2
        ]
        # Only real token sequences back off; other indexables (e.g. a raw
        # string) keep the trigram-only result.
        may_back_off = isinstance(words, (list, tuple))

    if not candidates and may_back_off:
        # Bigram model (last word as context). Indexing instead of popping
        # leaves the caller's sequence untouched.
        word = words[-1]
        candidates = [
            (w2, count / word_freq_ml[word])
            for (w1, w2), count in bigram_freq_ml.items()
            if w1 == word
        ]

    # Sort candidates by probability (stable, so ties keep table order)
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:top_n]

In [141]:
words = "അവൾ പുസ്തകം"
word_list = words.split()

# Query the model once and reuse the result for both the check and the display
predictions = predict_next_word_ngram(word_list)
print(predictions if len(predictions) != 0 else "no word")

[('വില്ലൻ', 0.09947643979057591), ('പുറത്തിറക്കിയിട്ടുണ്ട്', 0.020942408376963352), ('പ്രസിദ്ധീകരിച്ചത്', 0.020942408376963352), ('ലത്തീനിലുള്ളതിനുള്ളതിനോട്', 0.020942408376963352), ('എന്ന', 0.020942408376963352)]


In [119]:
# Scratch check: pop(0) removes the first element in place and the object
# is still a list (not a str), so 0 is printed
w = ["1", "2"]
w.pop(0)
print(1 if isinstance(w, str) else 0)

0


In [121]:
# Display the list to confirm that the earlier pop(0) modified it in place
w

['2']

In [6]:

from indicnlp.tokenize import indic_tokenize
import re

# Example Malayalam sentence, cleaned in one step: every character that is
# neither whitespace nor in U+0D00-U+0D7F is removed (this drops the final
# full stop). No lowercasing is applied here.
text_ml = re.sub(
    r'[^\u0D00-\u0D7F\s]',
    '',
    "അവൻ സ്കൂളിലേക്ക് പോയി.",
)

# Tokenize the cleaned sentence and show the tokens
tokens_ml = list(indic_tokenize.trivial_tokenize(text_ml, lang='ml'))
print(tokens_ml)


['അവൻ', 'സ്കൂളിലേക്ക്', 'പോയി']


In [19]:
# NOTE(review): predict_next_word_ml is not defined in any visible cell (the
# visible predictor is predict_next_word_ngram) - confirm it is defined
# elsewhere, otherwise this cell raises NameError on a fresh kernel.
predict_next_word_ml(word_list)

[]

[]
