In [159]:
from nltk import ngrams
from collections import defaultdict,Counter
from indicnlp.tokenize import indic_tokenize
import re

# Load and preprocess Malayalam text from a text file
def load_data_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()  # Read lines from the text file
    corpus = ""
    for line in lines:
        line = line.strip()  # Remove leading/trailing whitespaces
        if line:  # Ignore empty lines
            corpus += line + " "
    
    # Preprocessing: Remove non-Malayalam characters and tokenize
    corpus = re.sub(r'[^\u0D00-\u0D7F\s]', '', corpus).lower()  # Keep Malayalam chars only
    tokens = list(indic_tokenize.trivial_tokenize(corpus, lang='ml'))
    
    return tokens

# Build n-gram model from tokenized text
tokens_ml = load_data_from_file('Dataset/train.txt')



In [163]:


# Create bigrams and trigrams
bigrams_ml = list(ngrams(tokens_ml, 2))
trigrams_ml = list(ngrams(tokens_ml, 3))
unigrams_ml = list(ngrams(tokens_ml, 1))
# Frequency counts for bigrams and trigrams
unigram_freq_ml = Counter(unigrams_ml)
bigram_freq_ml = Counter(bigrams_ml)
trigram_freq_ml = Counter(trigrams_ml)
word_freq_ml = Counter(tokens_ml)


In [160]:
# Create n-gram models
n_grams_freq = defaultdict(Counter)
max_n = 3  # Supports up to 5-grams

# Build frequency dictionaries for n-grams
for n in range(1, max_n + 1):
    n_grams = list(ngrams(tokens_ml, n))
    n_grams_freq[n].update(n_grams)

# Continuation Probability for higher n-grams
def continuation_prob_ml(word, n_grams):
    unique_preceding = set(w1 for *w1, w2 in n_grams if w2 == word)
    return len(unique_preceding) / len(n_grams)

In [166]:
# Kneser-Ney with Interpolation
def kneser_ney_interpolated(words, d=0.75):
    n = len(words)
    if n == 1:
        # Unigram case
        w1 = words[0]
        return continuation_prob_ml(w1, n_grams_freq[2])

    # Calculate probabilities
    p_kn = 0
    for i in range(1, n + 1):
        n_gram = tuple(words[-i:])
        lower_n_gram = tuple(words[-i + 1:]) if i > 1 else ()
        count_ngram = n_grams_freq[i][n_gram]
        count_lower = n_grams_freq[i - 1][lower_n_gram] if i > 1 else len(tokens_ml)

        # Absolute discounting
        if count_lower > 0:
            p_ngram = max(count_ngram - d, 0) / count_lower
        else:
            p_ngram = 0

        # Backoff factor
        lambda_factor = d * count_lower / (count_lower + len(tokens_ml))
        p_continuation = continuation_prob_ml(n_gram[-1], n_grams_freq[i])

        # Interpolated probability
        p_kn += p_ngram + lambda_factor * p_continuation

    return p_kn

# Prediction function using higher-order n-grams
def predict_next_word_ml(words, top_n=5):
    if len(words) < 1:
        return {"error": "Provide at least one word for context."}

    last_n = words[-(max_n - 1):]  # Consider the last few words
    candidates = [
        (w_next, kneser_ney_interpolated(last_n + [w_next]))
        for w_next in unigram_freq_ml
    ]
    candidates = sorted(candidates, key=lambda x: x[1], reverse=True)
    return candidates[:top_n]

In [171]:
words = "മുമ്പുണ്ടായ ഒരു"
word_list = words.split()

print(predict_next_word_ml(word_list))

KeyboardInterrupt: 

In [172]:
# Continuation Probability (count of unique preceding bigram heads)
def continuation_prob_ml(word):
    unique_preceding = set(w1 for (w1, w2) in bigram_freq_ml if w2 == word)
    return len(unique_preceding) / len(bigram_freq_ml)

# Kneser-Ney Smoothing for Trigrams
def kneser_ney_prob_ml(w1, w2, w3, d=0.75):
    trigram_count = trigram_freq_ml[(w1, w2, w3)]
    bigram_count = bigram_freq_ml[(w1, w2)]

    if bigram_count > 0:
        p_trigram = max(trigram_count - d, 0) / bigram_count
    else:
        p_trigram = 0

    lambda_factor = d * bigram_count / (bigram_count + unigram_freq_ml[w2])
    p_continuation = continuation_prob_ml(w3)

    p_kneser_ney = p_trigram + lambda_factor * p_continuation
    return p_kneser_ney

# Prediction function using bigrams or trigrams (higher-order n-grams)
def predict_next_word_ngram(words, top_n=5):
    # print("top: ", words)
    if len(words) == 1:
        # print("Bigram model (single word as context)")
        word = words[0]
        candidates = [
            (w2, bigram_freq_ml[(word, w2)] / word_freq_ml[word]) 
            for (w1, w2) in bigram_freq_ml if w1 == word
        ]
        # print(candidates)
    elif len(words) >= 2:
        # Trigram model (two words as context)
        w1, w2 = words[-2], words[-1]
        candidates = [
        (w3, kneser_ney_prob_ml(w1, w2, w3))
        for (_, _, w3) in trigram_freq_ml if (_ == w1 and _ == w2)
        ]
        print("c1: ", candidates)
        # word1, word2 = words[-2], words[-1]
        # candidates = [
        #     (w3, trigram_freq_ml[(word1, word2, w3)] / bigram_freq_ml[(word1, word2)])
        #     for (w1, w2, w3) in trigram_freq_ml if w1 == word1 and w2 == word2
        # ]
        if (len(candidates) == 0) and (isinstance(words,list)) :
            # print(type(words))
            # print(words)
            words.pop(0)
            return predict_next_word_ngram(words)
    else:
        return {"error": "Insufficient words for prediction"}
    
    # Sort candidates by probability
    # print("c1: ", candidates)
    candidates = sorted(candidates, key=lambda x: x[1], reverse=True)
    return candidates[:top_n]

In [173]:
words = "മുമ്പുണ്ടായ ഒരു"
word_list = words.split()
if len(predict_next_word_ngram(word_list)) == 0:
    print("no word")
else:
    print(predict_next_word_ngram(word_list))

c1:  []
[('പ്രധാന', 0.015134279637964291), ('വലിയ', 0.010831396211484247), ('ചെറിയ', 0.00964439388693803), ('പ്രത്യേക', 0.009594935456748603), ('ദിവസം', 0.005242593600079133)]


In [177]:
from nltk import ngrams
from collections import defaultdict, Counter
from indicnlp.tokenize import indic_tokenize
import re

# Load and preprocess Malayalam text
def load_data_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    corpus = " ".join([line.strip() for line in lines if line.strip()])
    corpus = re.sub(r'[^\u0D00-\u0D7F\s]', '', corpus).lower()
    tokens = list(indic_tokenize.trivial_tokenize(corpus, lang='ml'))
    return tokens

# Load the corpus
tokens_ml = load_data_from_file('Dataset/train.txt')

# Create n-gram models
n_grams_freq = defaultdict(Counter)
max_n = 5  # Supports up to 5-grams

# Build frequency dictionaries for n-grams
for n in range(1, max_n + 1):
    n_grams = list(ngrams(tokens_ml, n))
    n_grams_freq[n].update(n_grams)

# Continuation Probability for higher n-grams
def continuation_prob_ml(word, n_grams):
    unique_preceding = set(w1 for *w1, w2 in n_grams if w2 == word)
    return len(unique_preceding) / len(n_grams)

# Kneser-Ney with Interpolation
def kneser_ney_interpolated(words, d=0.75):
    n = len(words)
    if n == 1:
        # Unigram case
        w1 = words[0]
        return continuation_prob_ml(w1, n_grams_freq[2])

    # Calculate probabilities
    p_kn = 0
    for i in range(1, n + 1):
        # print(i)
        n_gram = tuple(words[-i:])
        lower_n_gram = tuple(words[-i + 1:]) if i > 1 else ()
        count_ngram = n_grams_freq[i][n_gram]
        count_lower = n_grams_freq[i - 1][lower_n_gram] if i > 1 else len(tokens_ml)

        # Absolute discounting
        if count_lower > 0:
            p_ngram = max(count_ngram - d, 0) / count_lower
        else:
            p_ngram = 0

        # Backoff factor
        lambda_factor = d * count_lower / (count_lower + len(tokens_ml))
        p_continuation = continuation_prob_ml(n_gram[-1], n_grams_freq[i])

        # Interpolated probability
        p_kn += p_ngram + lambda_factor * p_continuation

    return p_kn

# Prediction function using higher-order n-grams
def predict_next_word_ml(words, top_n=5):
    print("j")
    if len(words) < 1:
        return {"error": "Provide at least one word for context."}

    last_n = words[-(max_n - 1):]  # Consider the last few words
    
    candidates = [
        (w_next, kneser_ney_interpolated(last_n + [w_next]))
        for w_next in n_grams_freq[1]
    ]
    candidates = sorted(candidates, key=lambda x: x[1], reverse=True)
    print("j")
    return candidates[:top_n]


In [183]:
print(len(unigram_freq_ml))

334519


In [178]:
words = "മുമ്പുണ്ടായ ഒരു"
word_list = words.split()
predict_next_word_ml(word_list)


1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3


KeyboardInterrupt: 

In [6]:

from indicnlp.tokenize import indic_tokenize
import re

# Example Malayalam text
text_ml = "അവൻ സ്കൂളിലേക്ക് പോയി."

# Preprocess: Lowercasing and punctuation removal
text_ml = re.sub(r'[^\u0D00-\u0D7F\s]', '', text_ml)  # Keep Malayalam chars only

# Tokenize
tokens_ml = list(indic_tokenize.trivial_tokenize(text_ml, lang='ml'))
print(tokens_ml)


['അവൻ', 'സ്കൂളിലേക്ക്', 'പോയി']


In [19]:
predict_next_word_ml(word_list)

[]

[]
