In [None]:
# Mount Google Drive at /content/drive; every dataset, dictionary, tokenizer and
# model path used later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m143.4/244.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


LSTM Model


In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import difflib
import docx
import pickle  # To save and load the tokenizer

# Load dataset from Excel file
data_path = '/content/drive/MyDrive/AI/Project/Grammer_data_set.xlsx'
data = pd.read_excel(data_path)

# Rename the columns explicitly to avoid issues.
# The rename is positional: first column -> 'Sentence', second -> 'True_Sentence',
# third -> 'Label'. The raw sheet's third header is not a plain string (a later cell
# prints it as True), which is why the headers are replaced rather than used as-is.
# NOTE(review): assumes the sheet has exactly three columns — confirm.
data.columns = ['Sentence', 'True_Sentence', 'Label']  # Assuming 'Label' is the target column

# Data Preprocessing
def preprocess_data(data, save_path=None):
    """Tokenize and pad the 'Sentence' column, optionally pickling the tokenizer.

    Args:
        data: Mapping-like object indexed with 'Sentence' (the notebook passes
            the DataFrame read from the grammar dataset).
        save_path: If truthy, the fitted tokenizer is pickled to this path.

    Returns:
        Tuple ``(padded_sequences, tokenizer, max_len)`` where ``max_len`` is the
        length of the longest tokenized sentence and every sequence is
        post-padded to that length.
    """
    sentences = data['Sentence']

    # Fit the vocabulary on the sentences, mapping unseen words to "<OOV>".
    tokenizer = Tokenizer(oov_token="<OOV>")
    tokenizer.fit_on_texts(sentences)
    encoded = tokenizer.texts_to_sequences(sentences)

    # The longest encoded sentence defines the common padded length.
    max_len = max(map(len, encoded))
    padded_sequences = pad_sequences(encoded, maxlen=max_len, padding='post')

    # Persist the tokenizer only when the caller asked for it.
    if save_path:
        with open(save_path, 'wb') as tokenizer_file:
            pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

    return padded_sequences, tokenizer, max_len

# Function to load Sinhala dictionary from a Word file
def load_dictionary(file_path):
    """Read a Word document and return its non-blank paragraphs as a word list.

    Args:
        file_path: Path to the .docx file holding the dictionary entries.

    Returns:
        List of paragraph texts, whitespace-stripped, in document order;
        paragraphs that are empty after stripping are skipped.
    """
    entries = []
    for paragraph in docx.Document(file_path).paragraphs:
        text = paragraph.text.strip()
        if text:
            entries.append(text)
    return entries

# Load Sinhala words: one dictionary entry per non-empty paragraph of the Word file.
word_file_path = '/content/drive/MyDrive/AI/Project/Spell_correction_data.docx'
dictionary = load_dictionary(word_file_path)

# Spell Checking Functions
def detect_errors(paragraph, dictionary):
    """Return the whitespace-separated words of ``paragraph`` missing from ``dictionary``.

    Order and duplicates are preserved, so a word misspelled twice is listed twice.
    """
    unknown_words = []
    for token in paragraph.split():
        if token in dictionary:
            continue
        unknown_words.append(token)
    return unknown_words

def suggest_correction(word, dictionary):
    """Return the single closest dictionary entry to ``word``, or ``word`` itself.

    Uses ``difflib.get_close_matches`` with ``n=1`` and its default cutoff; when
    nothing is similar enough the input word is returned unchanged.
    """
    candidates = difflib.get_close_matches(word, dictionary, n=1)
    return next(iter(candidates), word)

def correct_paragraph(paragraph, dictionary):
    """Replace every unknown word in ``paragraph`` with its suggested correction.

    Words are split on whitespace and re-joined with single spaces, so runs of
    whitespace in the input are collapsed. Words already in ``dictionary`` are
    kept as-is; the rest go through ``suggest_correction``.
    """
    fixed_words = []
    for token in paragraph.split():
        if token in dictionary:
            fixed_words.append(token)
        else:
            fixed_words.append(suggest_correction(token, dictionary))
    return " ".join(fixed_words)

import os

# Save path for the tokenizer
tokenizer_save_path = '/content/drive/MyDrive/AI/Project/tokenizer.pkl'

# Process data: X is the post-padded integer matrix, max_len its row length.
X, tokenizer, max_len = preprocess_data(data, save_path=tokenizer_save_path)

# Convert labels to one-hot encoding for multi-class classification
# NOTE(review): num_classes is fixed at 3, yet the evaluation report below only
# lists classes 0 and 1 — confirm whether a third label is expected in the data.
y = to_categorical(data['Label'].values, num_classes=3)  # Adjust number of classes

# Split data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the Model
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is used for padding
embedding_dim = 128

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),
    LSTM(32),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(3, activation='softmax')  # Adjusted for 3 classes
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train Model
# NOTE(review): the held-out test split doubles as validation data here, so the
# per-epoch val_* numbers and the final report are measured on the same rows.
epochs = 20
batch_size = 32
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, batch_size=batch_size, verbose=1)

# Save Model
model_save_path = '/content/drive/MyDrive/AI/Project/results/sinhala_spell_grammar_model.h5'
# Nothing else in the notebook creates the 'results' folder, so make sure it
# exists before writing; otherwise the save fails after a full training run.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
model.save(model_save_path)

# Function to check grammar and spelling
def check_sentence(sentence, dictionary):
    """Spell-check ``sentence`` and classify the corrected text with the trained model.

    Relies on the notebook-level ``tokenizer``, ``max_len`` and ``model`` objects.

    Args:
        sentence: Raw input sentence.
        dictionary: Collection of known words used for spell checking.

    Returns:
        A single-line string: an optional spelling report followed by the class
        index with the highest predicted score.
    """
    # Spelling pass: list unknown words and build the corrected sentence.
    misspelled_words = detect_errors(sentence, dictionary)
    corrected_sentence = correct_paragraph(sentence, dictionary)

    # Model pass: encode and pad the corrected sentence exactly like the training data.
    encoded = tokenizer.texts_to_sequences([corrected_sentence])
    model_input = pad_sequences(encoded, maxlen=max_len, padding='post')
    class_scores = model.predict(model_input)
    predicted_class = np.argmax(class_scores, axis=1)[0]

    class_line = f"The predicted class for the sentence is: {predicted_class}"
    if not misspelled_words:
        return class_line

    spelling_line = f"Spelling mistakes detected: {', '.join(misspelled_words)}. Corrected sentence: {corrected_sentence}"
    return f"{spelling_line} {class_line}"

# Evaluate on the 20% test set: collapse the softmax outputs and the one-hot
# targets to class indices so they can be compared directly.
y_pred = np.argmax(model.predict(X_test), axis=1)
y_true = np.argmax(y_test, axis=1)

# Print classification report for evaluation (per-class precision/recall/F1)
print("Evaluation on 20% test set:")
print(classification_report(y_true, y_pred))


Epoch 1/20




[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 26ms/step - accuracy: 0.6203 - loss: 1.0197 - val_accuracy: 0.6677 - val_loss: 0.6696
Epoch 2/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6312 - loss: 0.6879 - val_accuracy: 0.6677 - val_loss: 0.5389
Epoch 3/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7303 - loss: 0.5132 - val_accuracy: 0.8544 - val_loss: 0.4439
Epoch 4/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8350 - loss: 0.4297 - val_accuracy: 0.9114 - val_loss: 0.2879
Epoch 5/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.8989 - loss: 0.2826 - val_accuracy: 0.9241 - val_loss: 0.1716
Epoch 6/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8915 - loss: 0.2220 - val_accuracy: 0.9241 - val_loss: 0.1655
Epoch 7/20
[1m40/40[0m [32m━━━━━━━━━━━━━━



[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 57ms/step
Evaluation on 20% test set:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       211
           1       0.95      0.92      0.94       105

    accuracy                           0.96       316
   macro avg       0.96      0.95      0.95       316
weighted avg       0.96      0.96      0.96       316



In [None]:
import numpy as np
import pickle
import docx
import difflib
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load Sinhala dictionary from a Word file
def load_dictionary(file_path):
    """Return the stripped, non-empty paragraph texts of the .docx at ``file_path``.

    Entries keep the document's paragraph order.
    """
    document = docx.Document(file_path)
    stripped_texts = (paragraph.text.strip() for paragraph in document.paragraphs)
    return [text for text in stripped_texts if text]

# Spell Checking Functions
def detect_errors(words, dictionary):
    """Return the items of ``words`` that are not in ``dictionary``.

    Input order and duplicates are preserved.
    """
    return list(filter(lambda word: word not in dictionary, words))

def suggest_correction(word, dictionary):
    """Return the best ``difflib`` match for ``word`` in ``dictionary``.

    Falls back to ``word`` unchanged when no entry is close enough.
    """
    matches = difflib.get_close_matches(word, dictionary, n=1)
    if not matches:
        return word
    return matches[0]

def correct_paragraph(sentence, dictionary):
    """Spell-correct only the interior words of ``sentence``.

    The first and last words are always kept verbatim. Sentences of two words
    or fewer are returned exactly as given (whitespace included); longer ones
    are re-joined with single spaces.
    """
    tokens = sentence.split()
    if len(tokens) < 3:
        # Nothing lies between the first and last word, so there is nothing to fix.
        return sentence

    first, *middle, last = tokens
    fixed_middle = []
    for token in middle:
        if token in dictionary:
            fixed_middle.append(token)
        else:
            fixed_middle.append(suggest_correction(token, dictionary))

    return " ".join([first, *fixed_middle, last])


# Length every input sequence is padded to before it is fed to the model.
# NOTE(review): the training cell derives max_len from the longest tokenized
# sentence in the dataset instead of fixing it at 30 — confirm the two agree.
max_len = 30  # Ensure this matches the value used during training

# Grammar Checking Based on Rules
def check_grammar(sentence):
    """Check subject/verb-ending agreement with three fixed rules.

    Rules (keyed on the first word, tested against the last word):
        * starts with 'මම'  -> last word must end with 'මි'
        * starts with 'අපි' -> last word must end with 'මු'
        * anything else     -> last word must end with 'යි'

    Args:
        sentence: Sentence to check; split on whitespace.

    Returns:
        True if the applicable rule holds, False otherwise. A sentence with no
        words returns True because there is nothing for a rule to reject
        (indexing the first word would otherwise raise IndexError).
    """
    words = sentence.split()
    if not words:
        return True

    # Subject -> required verb ending; every other subject takes the third rule.
    required_ending = {"මම": "මි", "අපි": "මු"}.get(words[0], "යි")
    return words[-1].endswith(required_ending)

# Function to check grammar and spelling
def check_sentence(sentence, dictionary, model, tokenizer, max_len):
    """Spell-check the interior words of ``sentence`` and ask the model about grammar.

    Args:
        sentence: Raw input sentence.
        dictionary: Collection of known words.
        model: Object exposing ``predict``; its first output score is thresholded.
        tokenizer: Object exposing ``texts_to_sequences``.
        max_len: Length the encoded sentence is post-padded to.

    Returns:
        Newline-joined feedback: the spelling report followed by the grammar verdict.
    """
    tokens = sentence.split()

    # Spelling: the first and last words are excluded from the check.
    misspelled_words = detect_errors(tokens[1:-1], dictionary)
    corrected_sentence = correct_paragraph(sentence, dictionary)

    # Grammar: score the corrected sentence with the trained model.
    encoded = tokenizer.texts_to_sequences([corrected_sentence])
    model_input = pad_sequences(encoded, maxlen=max_len, padding='post')
    scores = model.predict(model_input)
    # NOTE(review): this treats the first output score alone as "grammatical";
    # confirm that output index 0 is the "correct" class of the trained model.
    is_grammar_correct = scores[0][0] > 0.5

    if misspelled_words:
        lines = [
            f"Spelling mistakes detected: {', '.join(misspelled_words)}.",
            f"Corrected sentence: {corrected_sentence}",
        ]
    else:
        lines = ["No spelling mistakes detected."]

    lines.append(
        "The sentence is grammatically correct."
        if is_grammar_correct
        else "The sentence is grammatically incorrect according to the model."
    )
    return "\n".join(lines)

# Load the trained model for inference only. compile=False skips restoring the
# training configuration, which avoids the load-time warning without re-compiling
# the network with binary_crossentropy (it was trained as a 3-way softmax with
# categorical_crossentropy, and predict() needs no loss at all).
model = load_model('/content/drive/MyDrive/AI/Project/results/sinhala_spell_grammar_model.h5', compile=False)

# Load the tokenizer (ensure the path is correct)
# SECURITY: pickle.load executes arbitrary code from the file; only load a
# tokenizer.pkl that this notebook's training cell produced.
tokenizer_save_path = '/content/drive/MyDrive/AI/Project/tokenizer.pkl'
with open(tokenizer_save_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load Sinhala words dictionary
word_file_path = '/content/drive/MyDrive/AI/Project/Spell_correction_data.docx'
dictionary = load_dictionary(word_file_path)

# Interactive Function for Sentence Checking
def check_multiple_sentences():
    """Prompt for sentences in a loop and print the spelling/grammar feedback.

    Each line read from ``input`` is stripped of surrounding whitespace. Blank
    lines are skipped, and typing 'exit' (any case) ends the loop. Every other
    line is passed to ``check_sentence`` together with the notebook-level
    ``dictionary``, ``model``, ``tokenizer`` and ``max_len``.
    """
    print("Enter sentences to check for spelling and grammar. Type 'exit' to stop.")

    while True:
        # Strip so that " exit " still exits and stray spaces never reach the checker.
        sentence = input("Enter a sentence: ").strip()

        if not sentence:
            # Nothing was typed; ask again instead of analysing an empty string.
            continue

        if sentence.lower() == 'exit':
            print("Exiting...")
            break

        result = check_sentence(sentence, dictionary, model, tokenizer, max_len)
        print(f"Sentence: '{sentence}'\nResult:\n{result}\n")

# Start the interactive prompt loop; it keeps reading input until 'exit' is typed.
check_multiple_sentences()




Enter sentences to check for spelling and grammar. Type 'exit' to stop.
Enter a sentence: තාත්තා ගුවන්විදුලියට සවන්දෙයි
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 496ms/step
Sentence: 'තාත්තා ගුවන්විදුලියට සවන්දෙයි'
Result:
No spelling mistakes detected.
The sentence is grammatically correct.

Enter a sentence: ඇය ඇපල් කමු
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Sentence: 'ඇය ඇපල් කමු'
Result:
No spelling mistakes detected.
The sentence is grammatically correct.

Enter a sentence: මම කොත්තු කමි
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Sentence: 'මම කොත්තු කමි'
Result:
No spelling mistakes detected.
The sentence is grammatically incorrect according to the model.

Enter a sentence: මම කොත්තු කමු
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Sentence: 'මම කොත්තු කමු'
Result:
No spelling mistakes detected.
The sentence is grammatically incorrect according to the model.

Enter a sente

Updated version: rule-based grammar check with dataset-backed corrections


In [None]:
import numpy as np
import pickle
import docx
import difflib
import pandas as pd
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load Sinhala dictionary from a Word file
def load_dictionary(file_path):
    """Collect the non-blank paragraphs of a Word file as dictionary entries.

    Each paragraph's text is stripped; entries are returned in document order.
    """
    entries = []
    for paragraph in docx.Document(file_path).paragraphs:
        cleaned = paragraph.text.strip()
        if not cleaned:
            continue
        entries.append(cleaned)
    return entries

# Spell Checking Functions
def detect_errors(words, dictionary):
    """List the entries of ``words`` that ``dictionary`` does not contain.

    The result keeps the input order, including repeated words.
    """
    missing = []
    for candidate in words:
        if candidate not in dictionary:
            missing.append(candidate)
    return missing

def suggest_correction(word, dictionary):
    """Pick the closest dictionary entry for ``word`` via ``difflib``.

    Returns ``word`` itself when ``get_close_matches`` finds no candidate.
    """
    best = difflib.get_close_matches(word, dictionary, n=1)
    return (best or [word])[0]

def correct_paragraph(sentence, dictionary):
    """Return ``sentence`` with its interior words spell-corrected.

    Only words strictly between the first and the last are looked up; unknown
    ones are replaced through ``suggest_correction``. Inputs with at most two
    words come back untouched, longer ones are re-joined with single spaces.
    """
    words = sentence.split()
    if len(words) <= 2:
        return sentence

    def fix(token):
        # Known words pass through; unknown ones get the closest suggestion.
        return token if token in dictionary else suggest_correction(token, dictionary)

    interior = [fix(token) for token in words[1:-1]]
    return " ".join(words[:1] + interior + words[-1:])

# Length every input sequence is padded to before it is fed to the model.
# NOTE(review): the training cell derives max_len from the longest tokenized
# sentence in the dataset instead of fixing it at 30 — confirm the two agree.
max_len = 30  # Ensure this matches the value used during training

# Grammar Checking Based on Rules
def check_grammar(sentence):
    """Check subject/verb-ending agreement and propose a corrected sentence.

    Rules (keyed on the first word, tested against the last word):
        * starts with 'මම'  -> last word must end with 'මි'
        * starts with 'අපි' -> last word must end with 'මු'
        * anything else     -> last word must end with 'යි'

    Args:
        sentence: Sentence to check; split on whitespace.

    Returns:
        ``(is_correct, corrected_sentence)``. When the rule holds, or the
        sentence has no words, the result is ``(True, sentence)``. Otherwise
        the last word's ending is swapped for the required one: a recognised
        ending ('මි', 'මු', 'යි') is replaced, and any other last word gets the
        required ending appended. Trailing whitespace is dropped from the
        corrected sentence.
    """
    words = sentence.split()
    if not words:
        # Nothing to check; indexing words[0] would raise IndexError.
        return True, sentence

    expected_ending = {"මම": "මි", "අපි": "මු"}.get(words[0], "යි")
    last_word = words[-1]
    if last_word.endswith(expected_ending):
        return True, sentence

    # Keep the verb stem: drop whichever known ending is present, if any.
    stem = last_word
    for ending in ("මි", "මු", "යි"):
        if last_word.endswith(ending):
            stem = last_word[:-len(ending)]
            break

    # Slice the last word off by length. str.rstrip(last_word) would strip a
    # *set of characters*, which removes nothing when whitespace follows the word.
    trimmed = sentence.rstrip()
    prefix = trimmed[:len(trimmed) - len(last_word)]
    return False, prefix + stem + expected_ending

# Function to check grammar and spelling
def check_sentence(sentence, dictionary, model, tokenizer, max_len, grammar_data):
    """Build spelling and grammar feedback for one sentence.

    Args:
        sentence: Raw input sentence.
        dictionary: Collection of known words for the spell check.
        model: Object exposing ``predict``.
        tokenizer: Object exposing ``texts_to_sequences``.
        max_len: Length the encoded sentence is post-padded to.
        grammar_data: Mapping from a known sentence to its corrected form.

    Returns:
        Newline-joined feedback lines, or ``"No sentence provided."`` when the
        input contains no words.
    """
    words = sentence.split()
    if not words:
        # check_grammar indexes the first word, so blank input would raise IndexError.
        return "No sentence provided."

    # Step 1: Spell check middle words (first and last are left untouched)
    misspelled_words = detect_errors(words[1:-1], dictionary)
    corrected_sentence = correct_paragraph(sentence, dictionary)

    # Step 2: Grammar check using the trained model
    sequence = tokenizer.texts_to_sequences([corrected_sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    prediction = model.predict(padded_sequence)
    # NOTE(review): the model's verdict is computed but does not drive any
    # feedback below. The reported verdict comes from the rule check in step 3,
    # even though the message credits "the model". Decide which one is intended.
    is_grammar_correct = prediction[0][0] > 0.5

    # Step 3: Grammar rule check (the rule-based rewrite is not reported)
    grammar_is_correct, _ = check_grammar(corrected_sentence)

    # Step 4: Provide feedback
    feedback = []
    if misspelled_words:
        feedback.append(f"Spelling mistakes detected: {', '.join(misspelled_words)}.")
        feedback.append(f"Corrected sentence: {corrected_sentence}")
    else:
        feedback.append("No spelling mistakes detected.")

    if not grammar_is_correct:
        feedback.append("The sentence is grammatically incorrect according to the model.")
    else:
        feedback.append("The sentence is grammatically correct.")

    # Step 5: Look up the known correction for this exact sentence in the dataset.
    # Only non-empty strings count: an empty spreadsheet cell is read as NaN, which
    # is truthy and would otherwise be printed as "Corrected Sentence: nan".
    known_correction = grammar_data.get(sentence)
    if isinstance(known_correction, str) and known_correction:
        feedback.append(f"Corrected Sentence: {known_correction}")

    return "\n".join(feedback)

# Load the trained model for inference only. compile=False skips restoring the
# training configuration, which avoids the load-time warning without re-compiling
# the network with binary_crossentropy (it was trained as a 3-way softmax with
# categorical_crossentropy, and predict() needs no loss at all).
model = load_model('/content/drive/MyDrive/AI/Project/results/sinhala_spell_grammar_model.h5', compile=False)

# Load the tokenizer (ensure the path is correct)
# SECURITY: pickle.load executes arbitrary code from the file; only load a
# tokenizer.pkl that this notebook's training cell produced.
tokenizer_save_path = '/content/drive/MyDrive/AI/Project/tokenizer.pkl'
with open(tokenizer_save_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load Sinhala words dictionary
word_file_path = '/content/drive/MyDrive/AI/Project/Spell_correction_data.docx'
dictionary = load_dictionary(word_file_path)

# Load Grammar correction dataset
grammar_data_path = '/content/drive/MyDrive/AI/Project/Grammer_data_set.xlsx'
df = pd.read_excel(grammar_data_path)

# Check the column names to verify what they are
print("Column names in the dataset:", df.columns)

# Assuming the columns are 'Sentence' and 'True Sentence'
# Map the incorrect sentence to the correct sentence in a dictionary for fast lookup.
# If a sentence occurs more than once, the last row's correction wins.
grammar_data = dict(zip(df['Sentence'], df['True Sentence']))

# Interactive Function for Sentence Checking
def check_multiple_sentences():
    """Prompt for sentences in a loop and print the spelling/grammar feedback.

    Each line read from ``input`` is stripped of surrounding whitespace. Blank
    lines are skipped, and typing 'exit' (any case) ends the loop. Every other
    line is passed to ``check_sentence`` together with the notebook-level
    ``dictionary``, ``model``, ``tokenizer``, ``max_len`` and ``grammar_data``.
    """
    print("Enter sentences to check for spelling and grammar. Type 'exit' to stop.")

    while True:
        # Strip so " exit " still exits and the dataset lookup sees the bare sentence.
        sentence = input("Enter a sentence: ").strip()

        if not sentence:
            # Nothing was typed; ask again instead of analysing an empty string.
            continue

        if sentence.lower() == 'exit':
            print("Exiting...")
            break

        result = check_sentence(sentence, dictionary, model, tokenizer, max_len, grammar_data)
        print(f"Sentence: '{sentence}'\nResult:\n{result}\n")

# Start the interactive prompt loop; it keeps reading input until 'exit' is typed.
check_multiple_sentences()



Column names in the dataset: Index(['Sentence', 'True Sentence', True], dtype='object')
Enter sentences to check for spelling and grammar. Type 'exit' to stop.
Enter a sentence: අපි පලතුරු කයි
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 380ms/step
Sentence: 'අපි පලතුරු කයි'
Result:
No spelling mistakes detected.
The sentence is grammatically incorrect according to the model.
Corrected Sentence: අපි පලතුරු කමු

Enter a sentence: අපි ගීත කියමි
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Sentence: 'අපි ගීත කියමි'
Result:
No spelling mistakes detected.
The sentence is grammatically incorrect according to the model.
Corrected Sentence: අපි ගීත කියමු

Enter a sentence: මම පන්සල යමි
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Sentence: 'මම පන්සල යමි'
Result:
Spelling mistakes detected: පන්සල.
Corrected sentence: මම පන්සල් යමි
The sentence is grammatically correct.

Enter a sentence: නංගී සාරිය අඳිමු
[1m1/1[0m [32m━━━