<a href="https://colab.research.google.com/github/AshSama12/Sinhala-Spell-and-Grammer-Checker/blob/master/Dl_grammar_checker_with_spell_cheker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import re
import unicodedata
from difflib import get_close_matches

import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [32]:
# Mount Google Drive
drive.mount('/content/drive')

# Load the dataset
file_path = '/content/drive/MyDrive/AI project/correct and wrong sentences .csv'
df = pd.read_csv(file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
# Tokenization and padding setup
sentences = df.iloc[:, 0].tolist() + df.iloc[:, 1].tolist() + df.iloc[:, 2].tolist()
labels = [0] * len(df.iloc[:, 0]) + [1] * len(df.iloc[:, 1]) + [2] * len(df.iloc[:, 2])

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

In [34]:
# One-hot encode labels
labels = to_categorical(labels, num_classes=3)

# Train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [35]:
# Define and train the model
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 128
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    LSTM(128, return_sequences=False),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/50




[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 41ms/step - accuracy: 0.3105 - loss: 1.1016 - val_accuracy: 0.3015 - val_loss: 1.0990
Epoch 2/50
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 25ms/step - accuracy: 0.3792 - loss: 1.0846 - val_accuracy: 0.4570 - val_loss: 1.1187
Epoch 3/50
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.6700 - loss: 0.7290 - val_accuracy: 0.6344 - val_loss: 0.7430
Epoch 4/50
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.8411 - loss: 0.3937 - val_accuracy: 0.6235 - val_loss: 0.8573
Epoch 5/50
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 44ms/step - accuracy: 0.8776 - loss: 0.3023 - val_accuracy: 0.6112 - val_loss: 1.0169
Epoch 6/50
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 45ms/step - accuracy: 0.8895 - loss: 0.2592 - val_accuracy: 0.5784 - val_loss: 1.2652
Epoch 7/50
[1m92/92[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7f82e4767130>

In [42]:
# Save the trained model (optional)
model.save('/content/drive/MyDrive/AI project/grammar_model.h5')



In [36]:
# Load Sinhala dictionary
dictionary_path = '/content/drive/MyDrive/AI project/extended_sinhala_dictionary.txt'
with open(dictionary_path, 'r', encoding='utf-8') as file:
    sinhala_dictionary = set(file.read().splitlines())


In [37]:
# Spell Checker
def sinhala_spell_checker(sentence, dictionary):
    words = sentence.split()
    corrected_words = []
    misspelled_words = []

    for word in words:
        if word in dictionary:
            corrected_words.append(word)
        else:
            matches = get_close_matches(word, dictionary, n=3, cutoff=0.8)
            if matches:
                corrected_words.append(matches[0])
                misspelled_words.append((word, matches))
            else:
                corrected_words.append(word)

    return ' '.join(corrected_words), misspelled_words

In [38]:
# Sinhala Sentence Tokenizer
def sinhala_sent_tokenize(paragraph):
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s', paragraph)
    return [s.strip() for s in sentences if s.strip()]


In [39]:
# Grammar Checker
def grammar_checker(paragraph, model, tokenizer, max_length):
    sentences = sinhala_sent_tokenize(paragraph)
    results = []

    for sentence in sentences:
        sequence = tokenizer.texts_to_sequences([sentence])
        padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
        prediction = model.predict(padded_sequence)[0]

        if prediction[0] > 0.5:
            prediction_label = "Correct Grammar"
            suggestion = sentence
        elif prediction[1] > 0.5:
            prediction_label = "Wrong: Rule 1 Error"
            suggestion = "Consider revising the verb to match the subject."
        elif prediction[2] > 0.5:
            prediction_label = "Wrong: Rule 2 Error"
            suggestion = "Ensure the sentence follows Subject-Object-Verb order."
        else:
            prediction_label = "Uncertain Error"
            suggestion = "Review the sentence for possible grammatical mistakes."

        results.append({
            "sentence": sentence,
            "prediction": prediction_label,
            "suggestion": suggestion
        })
    return results

In [40]:
# Process Paragraph
def process_paragraph(paragraph, dictionary, model, tokenizer, max_length):
    corrected_paragraph, misspelled_words = sinhala_spell_checker(paragraph, dictionary)
    grammar_results = grammar_checker(corrected_paragraph, model, tokenizer, max_length)
    return corrected_paragraph, misspelled_words, grammar_results


In [54]:
from tensorflow.keras.models import load_model
# Main script
if __name__ == "__main__":
    # Load model
    model = load_model('/content/drive/MyDrive/AI project/grammar_model.h5')

    # User input
    print("Enter a Sinhala paragraph (Press Enter twice to finish):")
    paragraph = ""
    while True:
        line = input()
        if line.strip() == "":
            break
        paragraph += " " + line

    # Process the paragraph
    corrected_paragraph, misspelled_words, grammar_results = process_paragraph(
        paragraph, sinhala_dictionary, model, tokenizer, max_length
    )

    # Display results
    print("\nMisspelled Words and Suggestions:")
    if not misspelled_words:
        print("No misspelled words found.")
    else:
        for original, suggestions in misspelled_words:
            print(f"Original: {original} | Suggestions: {', '.join(suggestions)}")

    print("\nCorrected Paragraph:")
    print(corrected_paragraph)

    print("\nGrammar Results:")
    for result in grammar_results:
        print(f"Sentence: {result['sentence']}")
        print(f"Prediction: {result['prediction']}")
        print(f"Suggestion: {result['suggestion']}")



Enter a Sinhala paragraph (Press Enter twice to finish):
ඔහුහුවු උදෑසනින්ම පාසල් ගියහ. ඇය ලන්තෑරම රැගෙන පැමිනියාය. ඔහු බෝලයට පයින් ගසයි. අපි සෙමින් ඉදියට ගියෙමු.

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 184ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step

Misspelled Words and Suggestions:
Original: බෝලයට | Suggestions: බලයට, බෝගලට
Original: ගසයි. | Suggestions: ගයි., රසයි., ගයති.
Original: ඉදියට | Suggestions: දියට, ඉදිරියට, දිනයට

Corrected Paragraph:
ඔහුහුවු උදෑසනින්ම පාසල් ගියහ. ඇය ලන්තෑරම රැගෙන පැමිනියාය. ඔහු බලයට පයින් ගයි. අපි සෙමින් දියට ගියෙමු.

Grammar Results:
Sentence: ඔහුහුවු උදෑසනින්ම පාසල් ගියහ.
Prediction: Correct Grammar
Suggestion: ඔහුහුවු උදෑසනින්ම පාසල් ගියහ.
Sentence: ඇය ලන්තෑරම රැගෙන පැමිනියාය.
Prediction: Correct Grammar
Suggestion: ඇය ලන්තෑරම රැගෙන පැමිනියාය.
Sentence: 

In [55]:

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)

print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6475 - loss: 2.3420
Test Loss: 2.143390417098999
Test Accuracy: 0.6712141633033752
