In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


Spell Checker

In [10]:
import difflib
import docx

# Replace with the actual path to your word file
word_file_path = '/content/drive/MyDrive/AI/Project/Sinhala_Word_Dictionary.docx'

def load_dictionary(file_path):
    doc = docx.Document(file_path)
    return [paragraph.text.strip() for paragraph in doc.paragraphs if paragraph.text.strip()]  # Extract text from each paragraph and ignore empty ones

# Load Sinhala words
dictionary = load_dictionary(word_file_path)

# Detect Incorrect Words
def detect_errors(paragraph, dictionary):
    words = paragraph.split()  # Split the paragraph into words
    misspelled = [word for word in words if word not in dictionary]
    return misspelled

# Auto-Correction with Edit Distance
def suggest_correction(word, dictionary):
    closest_match = difflib.get_close_matches(word, dictionary, n=1)
    return closest_match[0] if closest_match else word  # Suggest closest or return original

# Correct the Paragraph
def correct_paragraph(paragraph, dictionary):
    words = paragraph.split()
    corrected = [
        suggest_correction(word, dictionary) if word not in dictionary else word
        for word in words
    ]
    return " ".join(corrected)

# List of sample paragraphs with errors
sample_paragraphs = [
    "සුද මිනිසා ගගට දුවනව",
    "මිනිසා උද ඉඳලා රාත්‍රිය දැක්වා වැඩකරයි",
    "ගුරුවරු දරුවන පන්ත මගින් දැනුම ලබාදෙත",
    "ගමනේදී ගගක් හරහා පාලමක් දැකියහැක",
    "රජකාලේදී දැවැන්ත ගෘහනිර්මාණය සෑදීතිබේ"
]

# Process each sample paragraph
for i, sample_paragraph in enumerate(sample_paragraphs, start=1):
    print(f"Sample Sentence {i}:")
    print("Original Sentence:", sample_paragraph)

    # Detect errors
    misspelled_words = detect_errors(sample_paragraph, dictionary)
    if misspelled_words:
        print("Misspelled Words:", misspelled_words)

        # Correct the paragraph if there are mistakes
        corrected_paragraph = correct_paragraph(sample_paragraph, dictionary)
        print("Corrected Sentence:", corrected_paragraph)
    else:
        print("There is no mistake.")

    # Add an extra space between paragraphs' results
    print("\n")

Sample Sentence 1:
Original Sentence: සුද මිනිසා ගගට දුවනව
Misspelled Words: ['සුද', 'ගගට', 'දුවනව']
Corrected Sentence: සුදු මිනිසා ගඟට දුවනවා


Sample Sentence 2:
Original Sentence: මිනිසා උද ඉඳලා රාත්‍රිය දැක්වා වැඩකරයි
Misspelled Words: ['උද', 'දැක්වා']
Corrected Sentence: මිනිසා උදේ ඉඳලා රාත්‍රිය දක්වා වැඩකරයි


Sample Sentence 3:
Original Sentence: ගුරුවරු දරුවන පන්ත මගින් දැනුම ලබාදෙත
Misspelled Words: ['දරුවන', 'පන්ත', 'ලබාදෙත']
Corrected Sentence: ගුරුවරු දරුවන්ට පන්ති මගින් දැනුම ලබාදෙති


Sample Sentence 4:
Original Sentence: ගමනේදී ගගක් හරහා පාලමක් දැකියහැක
There is no mistake.


Sample Sentence 5:
Original Sentence: රජකාලේදී දැවැන්ත ගෘහනිර්මාණය සෑදීතිබේ
Misspelled Words: ['ගෘහනිර්මාණය']
Corrected Sentence: රජකාලේදී දැවැන්ත ගෘහනිර්මාණයන් සෑදීතිබේ




Grammer Checker

In [None]:
from docx import Document

# Load words from the .docx files
def load_words_from_docx(file_path):
    document = Document(file_path)
    words = []
    for paragraph in document.paragraphs:
        words.extend(paragraph.text.split())  # Split words by spaces
    return words

# File paths
nouns_file = '/content/drive/MyDrive/AI/Project/Sinhala_Nouns.docx'
verbs_mama_file = '/content/drive/MyDrive/AI/Project/verbs_mama_file.docx'
verbs_api_file = '/content/drive/MyDrive/AI/Project/verbs_api_file.docx'
other_verbs_file = '/content/drive/MyDrive/AI/Project/Other_Verbs.docx'

# Load words from each file
singular_nouns = load_words_from_docx(nouns_file)
verbs_mama = load_words_from_docx(verbs_mama_file)  # Verbs for "මම" (ending with "මි")
verbs_api = load_words_from_docx(verbs_api_file)    # Verbs for "අපි" (ending with "මු")
singular_verbs = load_words_from_docx(other_verbs_file)  # Verbs for singular nouns (ending with "යි")

# Helper to find the correct verb for a stem
def find_correct_verb(verbs_list, stem):
    for verb in verbs_list:
        if stem in verb:
            return verb
    return None

# Grammar Checker
def grammar_checker(sentence):
    words = sentence.split()
    if len(words) < 2:
        return "Error: Sentence is too short to check grammar."

    subject = words[0]  # The first word is the subject
    verb = words[-1]  # The last word is the verb
    verb_stem = verb[:-2]  # Extract the stem of the verb

    # Check grammar rules
    if subject == "මම":
        if verb not in verbs_mama:
            correct_verb = find_correct_verb(verbs_mama, verb_stem)
            if correct_verb:
                return f"Error: Sentence starting with 'මම' must end with a verb ending in 'මි'. Corrected sentence: {subject} " + ' '.join(words[1:-1]) + f" {correct_verb}"
        return "The sentence is grammatically correct."

    if subject == "අපි":
        if verb not in verbs_api:
            correct_verb = find_correct_verb(verbs_api, verb_stem)
            if correct_verb:
                return f"Error: Sentence starting with 'අපි' must end with a verb ending in 'මු'. Corrected sentence: {subject} " + ' '.join(words[1:-1]) + f" {correct_verb}"
        return "The sentence is grammatically correct."

    if subject in singular_nouns:
        if verb not in singular_verbs:
            correct_verb = find_correct_verb(singular_verbs, verb_stem)
            if correct_verb:
                return f"Error: Singular noun '{subject}' must end with a verb ending in 'යි'. Corrected sentence: {subject} " + ' '.join(words[1:-1]) + f" {correct_verb}"
        return "The sentence is grammatically correct."

    return "The sentence does not match any grammatical rule."

# Example Sentences
sentences = [
    "මම නටමි",    # Correct
    "මම නටමු",    # Incorrect
    "අපි කමු",     # Correct
    "අපි නටමි",    # Incorrect
    "ගම පසුබසයි",    # Correct
    "පාසල ජයගමු",     # Incorrect
]

# Test Grammar Checker
for sentence in sentences:
    print(f"Sentence: {sentence}")
    print(f"Feedback: {grammar_checker(sentence)}\n")


Sentence: මම නටමි
Feedback: The sentence is grammatically correct.

Sentence: මම නටමු
Feedback: Error: Sentence starting with 'මම' must end with a verb ending in 'මි'. Corrected sentence: මම  නටමි

Sentence: අපි කමු
Feedback: The sentence is grammatically correct.

Sentence: අපි නටමි
Feedback: Error: Sentence starting with 'අපි' must end with a verb ending in 'මු'. Corrected sentence: අපි  නටමු

Sentence: ගම පසුබසයි
Feedback: The sentence is grammatically correct.

Sentence: පාසල ජයගමු
Feedback: Error: Singular noun 'පාසල' must end with a verb ending in 'යි'. Corrected sentence: පාසල  ජයගයි



Accuracy Testing

In [11]:
import pandas as pd

# Define the grammar-checking function
def grammar_checker(sentence, singular_nouns, verbs_mama, verbs_api, singular_verbs):
    # Split sentence into words
    words = sentence.split()
    if not words:
        return "Error: Sentence is empty."

    # Extract the subject (first word)
    subject = words[0]

    # Extract the verb (last word)
    verb = words[-1]

    # Check grammar rules
    if subject == "මම":
        if verb in verbs_mama:
            return "The sentence is grammatically correct."
        else:
            correct_verb = next((v for v in verbs_mama if v.startswith(verb[:-2])), None)
            return f"Error: Sentence starting with 'මම' must end with a verb ending in 'මි'. Corrected sentence: {' '.join(words[:-1])} {correct_verb if correct_verb else 'මි'}"

    elif subject == "අපි":
        if verb in verbs_api:
            return "The sentence is grammatically correct."
        else:
            correct_verb = next((v for v in verbs_api if v.startswith(verb[:-2])), None)
            return f"Error: Sentence starting with 'අපි' must end with a verb ending in 'මු'. Corrected sentence: {' '.join(words[:-1])} {correct_verb if correct_verb else 'මු'}"

    elif subject in singular_nouns:
        if verb in singular_verbs:
            return "The sentence is grammatically correct."
        else:
            correct_verb = next((v for v in singular_verbs if v.startswith(verb[:-2])), None)
            return f"Error: Singular noun '{subject}' must end with a verb ending in 'යි'. Corrected sentence: {' '.join(words[:-1])} {correct_verb if correct_verb else 'යි'}"

    return "Error: Unable to process the sentence. Check input format."

# Load word files (placeholders, replace with your actual file paths)
nouns_file = '/content/drive/MyDrive/AI/Project/Sinhala_Nouns.docx'
verbs_mama_file = '/content/drive/MyDrive/AI/Project/verbs_mama_file.docx'
verbs_api_file = '/content/drive/MyDrive/AI/Project/verbs_api_file.docx'
other_verbs_file = '/content/drive/MyDrive/AI/Project/Other_Verbs.docx'

# Helper function to read words from Word files
def read_words_from_docx(file_path):
    from docx import Document
    doc = Document(file_path)
    words = []
    for para in doc.paragraphs:
        words.extend(para.text.split())
    return words

# Load words
singular_nouns = read_words_from_docx(nouns_file)
verbs_mama = read_words_from_docx(verbs_mama_file)
verbs_api = read_words_from_docx(verbs_api_file)
singular_verbs = read_words_from_docx(other_verbs_file)

# Load test data
test_file = '/content/drive/MyDrive/AI/Project/sinhala_test_data.xlsx'
test_data = pd.read_excel(test_file)

# Evaluate each sentence
results = []
for _, row in test_data.iterrows():
    sentence = row['Sentence']
    expected_feedback = row['Expected Feedback']

    # Get feedback from the grammar checker
    feedback = grammar_checker(sentence, singular_nouns, verbs_mama, verbs_api, singular_verbs)
    feedback_classification = "Correct" if "grammatically correct" in feedback.lower() else "Incorrect"
    expected_classification = "Correct" if "grammatically correct" in expected_feedback.lower() else "Incorrect"

    results.append({
        "Sentence": sentence,
        "Expected Feedback": expected_feedback,
        "System Feedback": feedback,
        "Expected Classification": expected_classification,
        "System Classification": feedback_classification
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Calculate metrics
TP = results_df[(results_df['Expected Classification'] == "Correct") & (results_df['System Classification'] == "Correct")].shape[0]
TN = results_df[(results_df['Expected Classification'] == "Incorrect") & (results_df['System Classification'] == "Incorrect")].shape[0]
FP = results_df[(results_df['Expected Classification'] == "Correct") & (results_df['System Classification'] == "Incorrect")].shape[0]
FN = results_df[(results_df['Expected Classification'] == "Incorrect") & (results_df['System Classification'] == "Correct")].shape[0]

accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP) if TP + FP > 0 else 0
recall = TP / (TP + FN) if TP + FN > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

# Output metrics and results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

# Save results to an Excel file
results_df.to_excel('/content/drive/MyDrive/sinhala_grammar_results.xlsx', index=False)


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


Spell Checker with Grammer Checekr

In [12]:
import difflib
from docx import Document

# Load words from .docx files (for spell checker)
def load_dictionary(file_path):
    doc = Document(file_path)
    return [paragraph.text.strip() for paragraph in doc.paragraphs if paragraph.text.strip()]

# Load words from different .docx files (for grammar checker)
def load_words_from_docx(file_path):
    document = Document(file_path)
    words = []
    for paragraph in document.paragraphs:
        words.extend(paragraph.text.split())  # Split words by spaces
    return words

# File paths for word lists (dictionary) and grammar rules
word_file_path = '/content/drive/MyDrive/AI/Project/Sinhala_Word_Dictionary.docx'
nouns_file = '/content/drive/MyDrive/AI/Project/Sinhala_Nouns.docx'
verbs_mama_file = '/content/drive/MyDrive/AI/Project/verbs_mama_file.docx'
verbs_api_file = '/content/drive/MyDrive/AI/Project/verbs_api_file.docx'
other_verbs_file = '/content/drive/MyDrive/AI/Project/Other_Verbs.docx'

# Load words for spell checker and grammar checker
dictionary = load_dictionary(word_file_path)
singular_nouns = load_words_from_docx(nouns_file)
verbs_mama = load_words_from_docx(verbs_mama_file)
verbs_api = load_words_from_docx(verbs_api_file)
singular_verbs = load_words_from_docx(other_verbs_file)

# Helper to find the correct verb for a stem
def find_correct_verb(verbs_list, stem):
    for verb in verbs_list:
        if stem in verb:
            return verb
    return None

# Spell Checker
def detect_errors(paragraph, dictionary):
    words = paragraph.split()
    misspelled = [word for word in words if word not in dictionary]
    return misspelled

def suggest_correction(word, dictionary):
    closest_match = difflib.get_close_matches(word, dictionary, n=1)
    return closest_match[0] if closest_match else word

def correct_paragraph(paragraph, dictionary):
    words = paragraph.split()
    corrected = [
        suggest_correction(word, dictionary) if word not in dictionary else word
        for word in words
    ]
    return " ".join(corrected)

# Grammar Checker
def grammar_checker(sentence):
    words = sentence.split()
    if len(words) < 2:
        return "Error: Sentence is too short to check grammar."

    subject = words[0]  # The first word is the subject
    verb = words[-1]  # The last word is the verb
    verb_stem = verb[:-2]  # Extract the stem of the verb

    # Check grammar rules
    if subject == "මම":
        if verb not in verbs_mama:
            correct_verb = find_correct_verb(verbs_mama, verb_stem)
            if correct_verb:
                return f"Error: Sentence starting with 'මම' must end with a verb ending in 'මි'. Corrected sentence: {subject} " + ' '.join(words[1:-1]) + f" {correct_verb}"
        return "The sentence is grammatically correct."

    if subject == "අපි":
        if verb not in verbs_api:
            correct_verb = find_correct_verb(verbs_api, verb_stem)
            if correct_verb:
                return f"Error: Sentence starting with 'අපි' must end with a verb ending in 'මු'. Corrected sentence: {subject} " + ' '.join(words[1:-1]) + f" {correct_verb}"
        return "The sentence is grammatically correct."

    if subject in singular_nouns:
        if verb not in singular_verbs:
            correct_verb = find_correct_verb(singular_verbs, verb_stem)
            if correct_verb:
                return f"Error: Singular noun '{subject}' must end with a verb ending in 'යි'. Corrected sentence: {subject} " + ' '.join(words[1:-1]) + f" {correct_verb}"
        return "The sentence is grammatically correct."

    return "The sentence does not match any grammatical rule."

# Combine Spell Checker and Grammar Checker
def check_and_correct_paragraph(paragraph, dictionary):
    # Correct spelling first
    corrected_paragraph = correct_paragraph(paragraph, dictionary)

    # Detect any spelling errors after correction
    misspelled_words = detect_errors(corrected_paragraph, dictionary)
    if misspelled_words:
        print(f"Misspelled Words: {misspelled_words}")

    # Check for grammar errors
    grammar_feedback = grammar_checker(corrected_paragraph)

    # Return corrected paragraph and grammar feedback
    return corrected_paragraph, grammar_feedback

# Test Sentences
sample_paragraphs = [
    "මිනිස ගගට දුවනව",  # Spelling error
    "මම නටමු",  # Grammar error
    "අපි කමු",  # Correct
    "ගුරුවරු දරුවන පන්ත මගින් දැනුම ලබාදෙය",  # Spelling error
    "මිනිසා උද ඉඳලා රාත්‍රිය දැක්වා වැඩකරමු",# Spelling error
]

# Process each sample paragraph
for i, sample_paragraph in enumerate(sample_paragraphs, start=1):
    print(f"Sample Sentence {i}:")
    print("Original Sentence:", sample_paragraph)

    # Combine spell and grammar check
    corrected_paragraph, grammar_feedback = check_and_correct_paragraph(sample_paragraph, dictionary)

    print("Corrected Sentence:", corrected_paragraph)
    print("Grammar Feedback:", grammar_feedback)
    print("\n")


Sample Sentence 1:
Original Sentence: මිනිස ගගට දුවනව
Corrected Sentence: මිනිසා ගඟට දුවනවා
Grammar Feedback: The sentence is grammatically correct.


Sample Sentence 2:
Original Sentence: මම නටමු
Misspelled Words: ['මම', 'නටමු']
Corrected Sentence: මම නටමු
Grammar Feedback: Error: Sentence starting with 'මම' must end with a verb ending in 'මි'. Corrected sentence: මම  නටමි


Sample Sentence 3:
Original Sentence: අපි කමු
Misspelled Words: ['අපි']
Corrected Sentence: අපි කෑම
Grammar Feedback: Error: Sentence starting with 'අපි' must end with a verb ending in 'මු'. Corrected sentence: අපි  කමු


Sample Sentence 4:
Original Sentence: ගුරුවරු දරුවන පන්ත මගින් දැනුම ලබාදෙය
Corrected Sentence: ගුරුවරු දරුවන්ට පන්ති මගින් දැනුම ලබාදෙති
Grammar Feedback: The sentence does not match any grammatical rule.


Sample Sentence 5:
Original Sentence: මිනිසා උද ඉඳලා රාත්‍රිය දැක්වා වැඩකරමු
Corrected Sentence: මිනිසා උදේ ඉඳලා රාත්‍රිය දක්වා වැඩකරයි
Grammar Feedback: The sentence is grammatically correct

Final Rule Based Model


In [13]:
import pandas as pd
from difflib import get_close_matches

# Step 1: Load Dictionary Words
def load_dataset(file_path):
    # Load the Excel file into a DataFrame
    df = pd.read_excel(file_path)
    # Extract the column named 'word', drop any NaN values, and return as a list
    return df['word'].dropna().tolist()

# File path to the dataset (update this path to your dataset's location)
file_path = "/content/drive/MyDrive/AI/Project/data-spell-checker.xlsx"

# Load dictionary words for spell checker
dictionary = load_dataset(file_path)

# Step 2: Spell Checker Functionality
def spell_checker(word, dictionary):
    # Check if the word exists in the dictionary
    if word in dictionary:
        return word  # Return the word if it exists in the dictionary
    else:
        # Use `get_close_matches` for finding the closest match in the dictionary
        suggestions = get_close_matches(word, dictionary, n=1, cutoff=0.8)
        if suggestions:
            return suggestions[0]  # Return the closest suggestion
        else:
            return word  # Return the original word if no suggestion is found

# Step 3: Grammar Checker Rules
def grammar_checker(sentence):
    words = sentence.split()
    if len(words) < 2:
        return "Error: Sentence is too short to check grammar."

    subject = words[0]
    verb = words[-1]

    # Rule 1: "මම" + "මි"
    if subject == "මම":
        if not verb.endswith("මි"):
            return f"Correct sentence: '{subject} ... {verb[:-2]}මි'."  # Suggested correction
        return "The sentence is grammatically correct."

    # Rule 2: "අපි" + "මු"
    if subject == "අපි":
        if not verb.endswith("මු"):
            return f"Correct sentence: '{subject} ... {verb[:-2]}මු'."  # Suggested correction
        return "The sentence is grammatically correct."

    # Rule 3: If sentence does not start with "මම" or "අපි", it must end with "යි"
    if subject not in ["මම", "අපි"]:
        if not verb.endswith("යි"):
            return f"Correct sentence: '{subject} ... {verb[:-2]}යි'."  # Suggested correction
        return "The sentence is grammatically correct."

    return "The sentence does not match any grammatical rule."

# Step 4: Process Sentences
sentences = [
    "මම අක්‍රමවතව නටමි",    # Correct
    "මම වද කලෙමු",    # Incorrect
    "අපි කමු",     # Correct
    "අපි නටමි",    # Incorrect
    "ගම වාණිජව පසුබසයි", # Correct
    "පාසල ජයගමු",  # Incorrect
]

# Step 5: Main Loop
for sentence in sentences:
    print(f"Sentence: {sentence}")

    # Check Spelling First
    corrected_words = []
    spelling_errors = []
    for word in sentence.split():
        corrected_word = spell_checker(word, dictionary)
        if corrected_word != word:
            spelling_errors.append((word, corrected_word))
        corrected_words.append(corrected_word)

    corrected_sentence = " ".join(corrected_words)
    if spelling_errors:
        print(f"Spelling Errors: {', '.join([f'{orig}->{corr}' for orig, corr in spelling_errors])}")
    print(f"Spell-Checked Sentence: {corrected_sentence}")

    # Check Grammar
    grammar_feedback = grammar_checker(corrected_sentence)
    print(f"Grammar Feedback: {grammar_feedback}\n")


Sentence: මම අක්‍රමවතව නටමි
Spelling Errors: අක්‍රමවතව->අක්‍රමවත්ව
Spell-Checked Sentence: මම අක්‍රමවත්ව නටමි
Grammar Feedback: The sentence is grammatically correct.

Sentence: මම වද කලෙමු
Spell-Checked Sentence: මම වද කලෙමු
Grammar Feedback: Correct sentence: 'මම ... කලෙමි'.

Sentence: අපි කමු
Spell-Checked Sentence: අපි කමු
Grammar Feedback: The sentence is grammatically correct.

Sentence: අපි නටමි
Spell-Checked Sentence: අපි නටමි
Grammar Feedback: Correct sentence: 'අපි ... නටමු'.

Sentence: ගම වාණිජව පසුබසයි
Spelling Errors: වාණිජව->වාණිජ, පසුබසයි->පසුබසින
Spell-Checked Sentence: ගම වාණිජ පසුබසින
Grammar Feedback: Correct sentence: 'ගම ... පසුබසයි'.

Sentence: පාසල ජයගමු
Spelling Errors: පාසල->පාසැල
Spell-Checked Sentence: පාසැල ජයගමු
Grammar Feedback: Correct sentence: 'පාසැල ... ජයගයි'.

