<a href="https://colab.research.google.com/github/AshSama12/Sinhala-Spell-and-Grammer-Checker/blob/master/sinhala_Spelling_%26GrammerCheck.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import re

# Load Sinhala dataset from Excel file
file_path = '/content/data-spell-checker.xlsx'

# Read the Excel file into a pandas DataFrame
sinhala_data = pd.read_excel(file_path, sheet_name=None)

# Check available sheet names
print("Sheet names:", sinhala_data.keys())

# Assuming the words are in the first sheet, load it
sinhala_words_df = pd.read_excel(file_path, sheet_name=0)

# Display first few rows
print(sinhala_words_df.head())


Sheet names: dict_keys(['Sheet1'])
        word  label
0  අභිචෝදකයා      1
1      අංකනය      1
2       අංකන      1
3       අංකය      1
4  අංකාන්තරය      1


In [9]:
# Combine all words into a list (assuming the words are in the first column)
sinhala_word_list = sinhala_words_df.iloc[:, 0].dropna().tolist()

# Convert to a set for quick lookup
sinhala_dictionary = set(sinhala_word_list)

print(f"Loaded {len(sinhala_dictionary)} unique Sinhala words.")


Loaded 101298 unique Sinhala words.


In [11]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.1/3.1 MB[0m [31m106.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.10.1


In [12]:
from rapidfuzz import process

# Function to tokenize Sinhala text
def tokenize_text(text):
    # Match Sinhala words using Unicode range
    words = re.findall(r'[\u0D80-\u0DFF]+', text)
    return words

# Function to check for misspellings
def check_spelling(tokens, dictionary):
    misspelled_words = []
    for word in tokens:
        if word not in dictionary:
            misspelled_words.append(word)
    return misspelled_words

# Function to suggest corrections
def suggest_corrections(word, dictionary, limit=3):
    suggestions = process.extract(word, dictionary, limit=limit)
    return [match[0] for match in suggestions]

# Function to auto-correct text
def correct_text(input_text, dictionary):
    tokens = tokenize_text(input_text)
    corrected_text = input_text
    for word in tokens:
        if word not in dictionary:
            suggestions = suggest_corrections(word, dictionary)
            if suggestions:
                corrected_word = suggestions[0]
                print(f"Correcting '{word}' to '{corrected_word}'")
                corrected_text = corrected_text.replace(word, corrected_word)
    return corrected_text

# Example usage
input_text = "අක්ක ඉගෙන ගන්න ගියා."  # Example sentence with potential errors
print("Original Text:", input_text)

corrected_text = correct_text(input_text, sinhala_dictionary)
print("Corrected Text:", corrected_text)


Original Text: අක්ක ඉගෙන ගන්න ගියා.
Correcting 'අක්ක' to 'අක්කාරම්'
Correcting 'ඉගෙන' to 'ඉගෙනුම'
Correcting 'ගන්න' to 'හරිගස්සගන්න'
Corrected Text: අක්කාරම් ඉගෙනුම හරිගස්සගන්න ගියා.


In [13]:
# Example rule for subject-verb agreement
def check_grammar(text):
    errors = []
    # Rule: Simple example for detecting "කනවා" misuse
    if re.search(r'කනවා', text):
        errors.append("Possible verb misuse: 'කනවා' - consider checking context.")
    return errors

# Function to correct grammar errors
def correct_grammar(text):
    corrected_text = text
    if 'කනවා' in text:
        corrected_text = corrected_text.replace('කනවා', 'කන්නේ')
        print("Corrected grammar: 'කනවා' to 'කන්නේ'")
    return corrected_text

# Example
input_text = "ආනන්ද කනවා"
print("Original:", input_text)

grammar_errors = check_grammar(input_text)
print("Grammar Errors:", grammar_errors)

corrected_text = correct_grammar(input_text)
print("Corrected Text:", corrected_text)


Original: ආනන්ද කනවා
Grammar Errors: ["Possible verb misuse: 'කනවා' - consider checking context."]
Corrected grammar: 'කනවා' to 'කන්නේ'
Corrected Text: ආනන්ද කන්නේ


In [14]:
# Sample paragraphs for testing
paragraphs = [
    "ආනන්ද කනවා.",
    "අක්ක ගිහින් ඉගෙන ගන්නවා.",
    "මම ගෙදර යනවා.",
    "උදේට බත් කනවා.",
    "අයියා පාඩම් කරනවා."
]

# Function to evaluate accuracy
def evaluate_accuracy(paragraphs, dictionary):
    total_words = 0
    corrected_words = 0

    for text in paragraphs:
        tokens = tokenize_text(text)
        misspelled_words = check_spelling(tokens, dictionary)
        total_words += len(tokens)
        corrected_words += len(tokens) - len(misspelled_words)

        # Correct grammar and spellings
        corrected_text = correct_text(text, dictionary)
        corrected_text = correct_grammar(corrected_text)
        print(f"Original: {text}")
        print(f"Corrected: {corrected_text}\n")

    accuracy = (corrected_words / total_words) * 100
    return accuracy

# Evaluate on sample paragraphs
accuracy = evaluate_accuracy(paragraphs, sinhala_dictionary)
print(f"Accuracy: {accuracy:.2f}%")


Correcting 'ආනන්ද' to 'ආනන්දය'
Corrected grammar: 'කනවා' to 'කන්නේ'
Original: ආනන්ද කනවා.
Corrected: ආනන්දය කන්නේ.

Correcting 'අක්ක' to 'අක්කාරම්'
Correcting 'ගිහින්' to 'ගිහි'
Correcting 'ඉගෙන' to 'ඉගෙනුම'
Original: අක්ක ගිහින් ඉගෙන ගන්නවා.
Corrected: අක්කාරම් ගිහි ඉගෙනුම ගන්නවා.

Original: මම ගෙදර යනවා.
Corrected: මම ගෙදර යනවා.

Correcting 'උදේට' to 'ද'
Corrected grammar: 'කනවා' to 'කන්නේ'
Original: උදේට බත් කනවා.
Corrected: ද බත් කන්නේ.

Correcting 'අයියා' to 'අය'
Correcting 'පාඩම්' to 'පාඩ'
Original: අයියා පාඩම් කරනවා.
Corrected: අය පාඩ කරනවා.

Accuracy: 53.33%


In [15]:
import pandas as pd
import re
from rapidfuzz import process

# Load Sinhala dataset from Excel file
file_path = '/content/data-spell-checker.xlsx'
sinhala_words_df = pd.read_excel(file_path, sheet_name=0)

# Extract words and convert to a set for fast lookup
sinhala_dictionary = set(sinhala_words_df.iloc[:, 0].dropna().tolist())
print(f"Loaded {len(sinhala_dictionary)} Sinhala words.")

# Function to tokenize Sinhala text
def tokenize_text(text):
    words = re.findall(r'[\u0D80-\u0DFF]+', text)
    return words

Loaded 101298 Sinhala words.


In [16]:
# Function to check for misspelled words
def check_spelling(tokens, dictionary):
    misspelled_words = []
    for word in tokens:
        if word not in dictionary:
            misspelled_words.append(word)
    return misspelled_words

# User input
input_text = input("Enter a paragraph in Sinhala: ")
tokens = tokenize_text(input_text)
misspelled = check_spelling(tokens, sinhala_dictionary)

print("Misspelled words:", misspelled)


Enter a paragraph in Sinhala: අක්ක ගිහින් ඉගෙන ගන්නව
Misspelled words: ['අක්ක', 'ගිහින්', 'ඉගෙන', 'ගන්නව']


In [17]:
# Function to suggest corrections for a word
def suggest_corrections(word, dictionary, limit=5):
    suggestions = process.extract(word, dictionary, limit=limit)
    return [match[0] for match in suggestions]


In [18]:
from collections import deque

# Function to interactively correct text
def interactive_correction(input_text, dictionary):
    tokens = tokenize_text(input_text)
    misspelled = check_spelling(tokens, dictionary)

    corrected_text = input_text
    queue = deque(misspelled)  # Queue of misspelled words

    while queue:
        word = queue.popleft()
        print(f"\nMisspelled Word: {word}")
        suggestions = suggest_corrections(word, dictionary)

        print("Suggestions:")
        for idx, suggestion in enumerate(suggestions, 1):
            print(f"{idx}. {suggestion}")

        print("0. Skip this word")
        choice = int(input("Enter the number of the correct word (0 to skip): "))

        if choice > 0 and choice <= len(suggestions):
            corrected_word = suggestions[choice - 1]
            corrected_text = corrected_text.replace(word, corrected_word, 1)
            print(f"Replaced '{word}' with '{corrected_word}'.")
        else:
            print(f"Skipped correction for '{word}'.")

    return corrected_text

# Perform interactive correction
corrected_text = interactive_correction(input_text, sinhala_dictionary)
print("\nCorrected Text:")
print(corrected_text)



Misspelled Word: අක්ක
Suggestions:
1. අක්කාරම්
2. අක්කොළ
3. අක්කරය
4. අක්කොල
5. අරක්කු
0. Skip this word
Enter the number of the correct word (0 to skip): 0
Skipped correction for 'අක්ක'.

Misspelled Word: ගිහින්
Suggestions:
1. ගිහි
2. හි
3. හ
4. සිහින්
5. ෂිහින්
0. Skip this word
Enter the number of the correct word (0 to skip): 4
Replaced 'ගිහින්' with 'සිහින්'.

Misspelled Word: ඉගෙන
Suggestions:
1. ඉගෙනුම
2. ඉගෙනීම
3. ඉගිලෙන
4. ඉගිළෙන
5. සුරැකගෙන
0. Skip this word
Enter the number of the correct word (0 to skip): 0
Skipped correction for 'ඉගෙන'.

Misspelled Word: ගන්නව
Suggestions:
1. ගන්නවා
2. බදාගන්නවා
3. සාදාගන්නවාට
4. සාදාගන්නවා
5. සිපගන්නවාට
0. Skip this word
Enter the number of the correct word (0 to skip): 1
Replaced 'ගන්නව' with 'ගන්නවා'.

Corrected Text:
අක්ක සිහින් ඉගෙන ගන්නවා


In [22]:
# Example function to calculate accuracy
def calculate_accuracy(corrected_texts, ground_truth_texts):
    correct = 0
    for corrected, truth in zip(corrected_texts, ground_truth_texts):
        if corrected == truth:
            correct += 1
# Example function to calculate accuracy
def calculate_accuracy(corrected_texts, ground_truth_texts):
    correct = 0
    for corrected, truth in zip(corrected_texts, ground_truth_texts):
        if corrected == truth:
            correct += 1
    return (correct / len(ground_truth_texts)) * 100

    print(calculate_accuracy)