In [1]:
#  Trigram Language Model which predict a sequence of words by inferring a sequence of words/ sentence from corpora.

import random
from collections import defaultdict
from nltk.util import ngrams
import string

In [2]:
# Function to read Amharic words from a file
def read_amharic_words_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        amharic_words = file.read().split()
    return amharic_words

In [3]:
# Function to remove user-defined punctuations from a list of punctuations
def remove_user_defined_punctuations(punctuation_list, user_defined_punctuations):
    cleaned_punctuations = [p for p in punctuation_list if p not in user_defined_punctuations]
    return cleaned_punctuations

In [4]:
# Function to remove numbers from a list of words
def remove_numbers(words, numbers):
    cleaned_words = [p for p in words if p not in numbers]
    return cleaned_words

In [5]:
# Function to remove a specific character from a list of words
def remove_character(words, char_to_remove):
    cleaned_words = [string.replace(char_to_remove, '') for string in words]
    return cleaned_words

In [6]:
# Function to apply Laplace smoothing to the trigram model
def apply_laplace_smoothing(trigram_model, smoothing_factor=0.01):
    for trigram in trigram_model:
        total_count = sum(trigram_model[trigram].values()) + (len(trigram_model[trigram]) * smoothing_factor)
        for word in trigram_model[trigram]:
            trigram_model[trigram][word] = (trigram_model[trigram][word] + smoothing_factor) / total_count

In [7]:
# Read Amharic words from a file
file_path = 'amh_wikipedia_2021_30K-sentences.txt'  # Replace with the actual file path
amharic_words = read_amharic_words_from_file(file_path)

In [8]:
# Define the user-defined punctuations to be removed
user_defined_punctuations = ['፣', '፤', '፡', '፦', ';',"«","»"]  # Add or modify punctuations as needed

# Remove user-defined punctuations
cleaned_punctuations = remove_user_defined_punctuations(amharic_words, user_defined_punctuations)
print(cleaned_punctuations[:100])

['1', 'ሽቦዎች', ':', 'የኤሌክትሪክ', 'ሽቦ', 'በውሃ', 'በተመላ', 'ቱቦ', 'ይመሰላል።', '2', 'ኤሌክትሪክ', 'እምቅ', 'አቅም', 'ልዩነት', '(ቮልቴጅ)', ':', 'በሁለት', 'ነጥቦች', 'መካከል', 'ባለ', 'የጫና', 'ልዩነት', 'ይመሰላል።', '3', "'", "('ዱ'", '-', 'ሁለት፣', "'ሻንቤ'", '-', 'ቀን)', 'ነው።', '4', "'", 'የተባለው', 'አልበሙ', 'ሲሆን', 'ስለ', 'ፍቅር', 'መቻቻል', 'ህዝብ', 'ስሜት', 'እንዲሁም', 'ስለመንግስታት', 'እና', 'ስለ', 'ተቃዋሚዎች', 'በ', 'ፍቅር', 'ለ', 'ሃገር', 'እድገት', 'መስራት', 'ዘፍኖአል።', '5', "'", 'የተሰየመው', 'በዚህ', 'ቅልቅል', 'በመስፋፋቱ', 'ነበር።', '6', "'''", 'ክርስቶስ', 'ወንጌል', 'ቅዱስ', 'ማርቆስ', 'እንደ', 'ጻፈው።', '7', "''ቅዱስ", 'ጊዮርጊስም', 'ይህን', 'ሁሉ', 'ተግባር', 'ሠርቶ', 'በመገኘቱ', 'ብዙ', 'ምስክር', 'ተገኝቶለት', 'በ፮ኛው', 'መቶ', 'ዘመን', 'በደቡብ', 'ሶርያ', 'በምትገኘው', 'አድራ', 'ወይም', '(ይድራስ)', 'በተባለች', 'ቤተክርስቲያን', 'ተሰብስበው', 'የቤተክርስቲያን', 'አባቶች', 'እውነተኛ', 'ሰማዕትነቱንና', 'ቅድስናውን', 'በጉባኤ', 'አጽድቀው', 'ውሳኔውን']


In [9]:
# Remove English punctuations
cleaned_english_punctuations = [word for word in cleaned_punctuations if all(char not in string.punctuation for char in word)]

print(cleaned_english_punctuations[:100])

['1', 'ሽቦዎች', 'የኤሌክትሪክ', 'ሽቦ', 'በውሃ', 'በተመላ', 'ቱቦ', 'ይመሰላል።', '2', 'ኤሌክትሪክ', 'እምቅ', 'አቅም', 'ልዩነት', 'በሁለት', 'ነጥቦች', 'መካከል', 'ባለ', 'የጫና', 'ልዩነት', 'ይመሰላል።', '3', 'ሁለት፣', 'ነው።', '4', 'የተባለው', 'አልበሙ', 'ሲሆን', 'ስለ', 'ፍቅር', 'መቻቻል', 'ህዝብ', 'ስሜት', 'እንዲሁም', 'ስለመንግስታት', 'እና', 'ስለ', 'ተቃዋሚዎች', 'በ', 'ፍቅር', 'ለ', 'ሃገር', 'እድገት', 'መስራት', 'ዘፍኖአል።', '5', 'የተሰየመው', 'በዚህ', 'ቅልቅል', 'በመስፋፋቱ', 'ነበር።', '6', 'ክርስቶስ', 'ወንጌል', 'ቅዱስ', 'ማርቆስ', 'እንደ', 'ጻፈው።', '7', 'ጊዮርጊስም', 'ይህን', 'ሁሉ', 'ተግባር', 'ሠርቶ', 'በመገኘቱ', 'ብዙ', 'ምስክር', 'ተገኝቶለት', 'በ፮ኛው', 'መቶ', 'ዘመን', 'በደቡብ', 'ሶርያ', 'በምትገኘው', 'አድራ', 'ወይም', 'በተባለች', 'ቤተክርስቲያን', 'ተሰብስበው', 'የቤተክርስቲያን', 'አባቶች', 'እውነተኛ', 'ሰማዕትነቱንና', 'ቅድስናውን', 'በጉባኤ', 'አጽድቀው', 'ውሳኔውን', 'ለሕዝብ', 'አስተላልፈዋል።', '8', 'ናቸው።', '9', 'ሕገ', 'መንግሥት', '10', 'አገልግሎት', 'ሥራችሁን', 'ለመፈጽም', 'ያለባችሁ', 'ሰዎች', 'ይሉናል።»']


In [10]:
# Define numbers to be removed
numbers = ['1','2','3','4','5','6','7','8','9','0']

# Remove numbers
cleaned_words_without_numbers = remove_numbers(cleaned_english_punctuations, numbers)
print(cleaned_words_without_numbers[:100])

['ሽቦዎች', 'የኤሌክትሪክ', 'ሽቦ', 'በውሃ', 'በተመላ', 'ቱቦ', 'ይመሰላል።', 'ኤሌክትሪክ', 'እምቅ', 'አቅም', 'ልዩነት', 'በሁለት', 'ነጥቦች', 'መካከል', 'ባለ', 'የጫና', 'ልዩነት', 'ይመሰላል።', 'ሁለት፣', 'ነው።', 'የተባለው', 'አልበሙ', 'ሲሆን', 'ስለ', 'ፍቅር', 'መቻቻል', 'ህዝብ', 'ስሜት', 'እንዲሁም', 'ስለመንግስታት', 'እና', 'ስለ', 'ተቃዋሚዎች', 'በ', 'ፍቅር', 'ለ', 'ሃገር', 'እድገት', 'መስራት', 'ዘፍኖአል።', 'የተሰየመው', 'በዚህ', 'ቅልቅል', 'በመስፋፋቱ', 'ነበር።', 'ክርስቶስ', 'ወንጌል', 'ቅዱስ', 'ማርቆስ', 'እንደ', 'ጻፈው።', 'ጊዮርጊስም', 'ይህን', 'ሁሉ', 'ተግባር', 'ሠርቶ', 'በመገኘቱ', 'ብዙ', 'ምስክር', 'ተገኝቶለት', 'በ፮ኛው', 'መቶ', 'ዘመን', 'በደቡብ', 'ሶርያ', 'በምትገኘው', 'አድራ', 'ወይም', 'በተባለች', 'ቤተክርስቲያን', 'ተሰብስበው', 'የቤተክርስቲያን', 'አባቶች', 'እውነተኛ', 'ሰማዕትነቱንና', 'ቅድስናውን', 'በጉባኤ', 'አጽድቀው', 'ውሳኔውን', 'ለሕዝብ', 'አስተላልፈዋል።', 'ናቸው።', 'ሕገ', 'መንግሥት', '10', 'አገልግሎት', 'ሥራችሁን', 'ለመፈጽም', 'ያለባችሁ', 'ሰዎች', 'ይሉናል።»', '11', 'ተነሣ።', '12', 'ደግሞ', 'ቃሉ', 'የግሪክ', 'ሲሆን', 'ሥርዓት', 'ማለት']


In [11]:
# Define a character to be removed
char_to_remove = '።'

cleaned_words_without_character = remove_character(cleaned_words_without_numbers, char_to_remove)
print(cleaned_words_without_character[:100])

['ሽቦዎች', 'የኤሌክትሪክ', 'ሽቦ', 'በውሃ', 'በተመላ', 'ቱቦ', 'ይመሰላል', 'ኤሌክትሪክ', 'እምቅ', 'አቅም', 'ልዩነት', 'በሁለት', 'ነጥቦች', 'መካከል', 'ባለ', 'የጫና', 'ልዩነት', 'ይመሰላል', 'ሁለት፣', 'ነው', 'የተባለው', 'አልበሙ', 'ሲሆን', 'ስለ', 'ፍቅር', 'መቻቻል', 'ህዝብ', 'ስሜት', 'እንዲሁም', 'ስለመንግስታት', 'እና', 'ስለ', 'ተቃዋሚዎች', 'በ', 'ፍቅር', 'ለ', 'ሃገር', 'እድገት', 'መስራት', 'ዘፍኖአል', 'የተሰየመው', 'በዚህ', 'ቅልቅል', 'በመስፋፋቱ', 'ነበር', 'ክርስቶስ', 'ወንጌል', 'ቅዱስ', 'ማርቆስ', 'እንደ', 'ጻፈው', 'ጊዮርጊስም', 'ይህን', 'ሁሉ', 'ተግባር', 'ሠርቶ', 'በመገኘቱ', 'ብዙ', 'ምስክር', 'ተገኝቶለት', 'በ፮ኛው', 'መቶ', 'ዘመን', 'በደቡብ', 'ሶርያ', 'በምትገኘው', 'አድራ', 'ወይም', 'በተባለች', 'ቤተክርስቲያን', 'ተሰብስበው', 'የቤተክርስቲያን', 'አባቶች', 'እውነተኛ', 'ሰማዕትነቱንና', 'ቅድስናውን', 'በጉባኤ', 'አጽድቀው', 'ውሳኔውን', 'ለሕዝብ', 'አስተላልፈዋል', 'ናቸው', 'ሕገ', 'መንግሥት', '10', 'አገልግሎት', 'ሥራችሁን', 'ለመፈጽም', 'ያለባችሁ', 'ሰዎች', 'ይሉናል»', '11', 'ተነሣ', '12', 'ደግሞ', 'ቃሉ', 'የግሪክ', 'ሲሆን', 'ሥርዓት', 'ማለት']


In [12]:
# Define stopwords to be removed
stopwords = ["ዘፍኖአል","ይመሰላል", "ነው", "ነበር", "ግን", "አስታውቀዋል", "ይጠበቃል", "ብለዋል", "አሉ", "ላይ", "ጋር", "ውስጥ", "ስለዚህ", "እና", "ማለት", "መሆኑ", "ናቸው", "በዚሁ"]

# Remove stopwords
cleaned_words = [word for word in cleaned_words_without_character if word not in stopwords]
print(cleaned_words[:100])

['ሽቦዎች', 'የኤሌክትሪክ', 'ሽቦ', 'በውሃ', 'በተመላ', 'ቱቦ', 'ኤሌክትሪክ', 'እምቅ', 'አቅም', 'ልዩነት', 'በሁለት', 'ነጥቦች', 'መካከል', 'ባለ', 'የጫና', 'ልዩነት', 'ሁለት፣', 'የተባለው', 'አልበሙ', 'ሲሆን', 'ስለ', 'ፍቅር', 'መቻቻል', 'ህዝብ', 'ስሜት', 'እንዲሁም', 'ስለመንግስታት', 'ስለ', 'ተቃዋሚዎች', 'በ', 'ፍቅር', 'ለ', 'ሃገር', 'እድገት', 'መስራት', 'የተሰየመው', 'በዚህ', 'ቅልቅል', 'በመስፋፋቱ', 'ክርስቶስ', 'ወንጌል', 'ቅዱስ', 'ማርቆስ', 'እንደ', 'ጻፈው', 'ጊዮርጊስም', 'ይህን', 'ሁሉ', 'ተግባር', 'ሠርቶ', 'በመገኘቱ', 'ብዙ', 'ምስክር', 'ተገኝቶለት', 'በ፮ኛው', 'መቶ', 'ዘመን', 'በደቡብ', 'ሶርያ', 'በምትገኘው', 'አድራ', 'ወይም', 'በተባለች', 'ቤተክርስቲያን', 'ተሰብስበው', 'የቤተክርስቲያን', 'አባቶች', 'እውነተኛ', 'ሰማዕትነቱንና', 'ቅድስናውን', 'በጉባኤ', 'አጽድቀው', 'ውሳኔውን', 'ለሕዝብ', 'አስተላልፈዋል', 'ሕገ', 'መንግሥት', '10', 'አገልግሎት', 'ሥራችሁን', 'ለመፈጽም', 'ያለባችሁ', 'ሰዎች', 'ይሉናል»', '11', 'ተነሣ', '12', 'ደግሞ', 'ቃሉ', 'የግሪክ', 'ሲሆን', 'ሥርዓት', '13', 'ብለው', 'ይጠሩት', '14', 'ደግሞ', 'በአርሜንኛ', 'መጽሐፍ', 'ቅዱስ']


In [13]:
# Preprocess source text to suitable training corpus
trigrams_amharic = list(ngrams(cleaned_words, 3))  # Generate trigrams

In [14]:
# Train the language model using trigrams with Laplace smoothing
trigram_model_amharic = defaultdict(lambda: defaultdict(lambda: 0.0))
for trigram in trigrams_amharic:
    w1, w2, w3 = trigram
    trigram_model_amharic[(w1, w2)][w3] += 1


In [2]:
# code to display the model
counter = 0

for trigram, word_counts in trigram_model_amharic.items():
    print(f"Trigram: {trigram}")
    for word, count in word_counts.items():
        print(f"    Word: {word}, Count: {count}")
    print()  
    
    counter += 1
    
    if counter >= 100:
        break

NameError: name 'trigram_model_amharic' is not defined

In [16]:
# Apply Laplace smoothing to the trigram model
apply_laplace_smoothing(trigram_model_amharic)

In [17]:
# Function to generate a sequence of words based on the trigram model with Laplace smoothing
def generate_sequence(seed_words, model, length=3):
    current_words = seed_words.split()
    for i in range(length):
        if len(current_words) < 2:
            break  # Break if there are not enough words for the trigram
        w1, w2 = current_words[-2], current_words[-1]
        next_word_probs = model.get((w1, w2), defaultdict(lambda: 1.0 / len(model)))
        next_word = random.choices(list(next_word_probs.keys()), weights=next_word_probs.values())[0]
        if next_word is not None:
            current_words.append(next_word)
        else:
            break  # Break if no prediction is found
    return ' '.join(current_words)

In [23]:
seed_phrase = input("Enter a phrase: ")

predicted_sequence = generate_sequence(seed_phrase, trigram_model_amharic, length=3)
print("The Predicted words are : " + predicted_sequence)

Enter a phrase:   ሕገ መንግሥት


The Predicted words are : ሕገ መንግሥት በአንቀጽ 39 ንኡስ
