# **• Statistical Models(N games):**

In [9]:
import re
import pandas as pd
from collections import defaultdict, Counter
import random

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Path to the dataset on Google Drive
dataset_path = '/content/drive/My Drive/Colab Notebooks/combined_sentences_dataset.csv'

# Load the dataset
data = pd.read_csv(dataset_path, header=None, names=["Category", "Sentence"])

# Inspect the first few rows and column names
print(data.head())
print(data.columns)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   Category   Sentence
0         0     මම යති
1         0   මම යත්වා
2         0    මම යනවා
3         0  මම යනවාලා
4         0     මම යනු
Index(['Category', 'Sentence'], dtype='object')


In [10]:
# Filter and process data
train_data = data[data['Category'] == 1]['Sentence'].tolist()  # Correct sentences as training
test_data = data['Sentence'].tolist()  # All sentences for testing

In [11]:
# Generate n-grams
def generate_ngrams(text, n=2):
    words = text.split()
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

# Train n-gram model
def train_ngram_model(sentences, n=2):
    model = defaultdict(Counter)
    for sentence in sentences:
        ngrams = generate_ngrams(sentence, n)
        for w1, w2 in ngrams:
            model[w1][w2] += 1
    return model

# Predict next word
def predict_next_word(model, word):
    if word in model:
        return random.choices(list(model[word].keys()), weights=model[word].values())[0]
    return None

# Evaluate accuracy
def evaluate_model(model, test_sentences):
    correct, total = 0, 0
    for sentence in test_sentences:
        words = sentence.split()
        for i in range(len(words) - 1):
            if predict_next_word(model, words[i]) == words[i + 1]:
                correct += 1
            total += 1
    return correct / total

# Train the bigram model
bigram_model = train_ngram_model(train_data, n=2)

# Evaluate the model
accuracy = evaluate_model(bigram_model, test_data)
print(f"Bigram Model Accuracy: {accuracy * 100:.2f}%")

Bigram Model Accuracy: 40.87%


In [18]:
import re
import pandas as pd
from collections import defaultdict, Counter
import random

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Path to the dataset on Google Drive
dataset_path = '/content/drive/My Drive/Colab Notebooks/combined_sentences_dataset.csv'

# Load the dataset
data = pd.read_csv(dataset_path, header=None, names=["Category", "Sentence"])

# Filter and process data
train_data = data[data['Category'] == 1]['Sentence'].tolist()  # Correct sentences as training
test_data = data['Sentence'].tolist()  # All sentences for testing

# Generate n-grams
def generate_ngrams(text, n=2):
    words = text.split()
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

# Train n-gram model
def train_ngram_model(sentences, n=2):
    model = defaultdict(Counter)
    for sentence in sentences:
        ngrams = generate_ngrams(sentence, n)
        for w1, w2 in ngrams:
            model[w1][w2] += 1
    return model

# Predict next word
def predict_next_word(model, word):
    if word in model:
        return random.choices(list(model[word].keys()), weights=model[word].values())[0]
    return None

# Evaluate accuracy for spell correction and grammar suggestions
def evaluate_model(model, test_sentences):
    correct_spell, correct_grammar, total_spell, total_grammar = 0, 0, 0, 0

    for sentence in test_sentences:
        words = sentence.split()
        for i in range(len(words) - 1):
            predicted = predict_next_word(model, words[i])

            # Grammar suggestion accuracy
            if predicted == words[i + 1]:
                correct_grammar += 1
            total_grammar += 1

            # For spell correction, check similarity (basic Levenshtein or exact match)
            if words[i] == words[i].lower():  # Example heuristic for a "spelling error"
                total_spell += 1
                if predicted == words[i]:
                    correct_spell += 1

    grammar_accuracy = correct_grammar / total_grammar if total_grammar else 0
    spell_accuracy = correct_spell / total_spell if total_spell else 0
    return grammar_accuracy, spell_accuracy

# Train the bigram model
bigram_model = train_ngram_model(train_data, n=2)

# Evaluate the model across 5 runs
results = []
for run in range(5):
    random.seed(run)  # Set random seed for reproducibility
    grammar_acc, spell_acc = evaluate_model(bigram_model, test_data)
    results.append((run + 1, grammar_acc, spell_acc))

# Print results
for run, grammar_acc, spell_acc in results:
    print(f"Run {run}: Grammar Accuracy = {grammar_acc * 100:.2f}%")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Run 1: Grammar Accuracy = 40.90%
Run 2: Grammar Accuracy = 40.86%
Run 3: Grammar Accuracy = 40.89%
Run 4: Grammar Accuracy = 40.77%
Run 5: Grammar Accuracy = 40.92%


--------------------------------------------------------------------------------


In [21]:
import pandas as pd
from difflib import get_close_matches
import random

# Path to the dataset
dataset_path = '/content/drive/My Drive/Colab Notebooks/combined_sentences_dataset.csv'

# Load the dataset
data = pd.read_csv(dataset_path, header=None, names=["Category", "Sentence"])

# Step 1: Create the dictionary from binary 1 categorized sentences
# Filter correct sentences
correct_sentences = data[data['Category'] == 1]['Sentence']

# Tokenize words and create a unique dictionary
dictionary = set()
for sentence in correct_sentences:
    words = sentence.split()
    dictionary.update(words)
dictionary = list(dictionary)  # Convert back to list

# Function to correct spelling
def correct_spelling(word, dictionary):
    matches = get_close_matches(word, dictionary)
    return matches[0] if matches else word

# Step 2: Evaluate spell checker using the dataset
# Simulate a spell-checker dataset with misspelled and correct words (for testing)
spell_checker_data = pd.DataFrame({
    'misspelled_word': ['මම', 'මම', 'යා', 'අපි'],  # Example misspelled words
    'correct_word': ['මම', 'මම', 'යනවා', 'අපි']   # Corresponding correct words
})

# Run evaluations for five different runs
results = []
for run in range(5):
    correct, total = 0, 0
    random.seed(run)  # Set a random seed for reproducibility

    # Shuffle the spell-checker dataset to simulate variability
    spell_checker_data = spell_checker_data.sample(frac=1).reset_index(drop=True)

    # Evaluate the spell checker
    for misspelled, correct_word in zip(spell_checker_data['misspelled_word'], spell_checker_data['correct_word']):
        if correct_spelling(misspelled, dictionary) == correct_word:
            correct += 1
        total += 1

    # Compute accuracy
    accuracy = correct / total * 100 if total > 0 else 0
    results.append((run + 1, accuracy))

# Print results
for run, accuracy in results:
    print(f"Run {run}: Spell Checker Accuracy = {accuracy:.2f}%")


Run 1: Spell Checker Accuracy = 75.00%
Run 2: Spell Checker Accuracy = 75.00%
Run 3: Spell Checker Accuracy = 75.00%
Run 4: Spell Checker Accuracy = 75.00%
Run 5: Spell Checker Accuracy = 75.00%


--------------------------------------------------------------------------------


In [22]:
# Debug predictions
for sentence in test_data[:5]:  # Check predictions for the first 5 test sentences
    words = sentence.split()
    for i in range(len(words) - 1):
        predicted = predict_next_word(bigram_model, words[i])
        print(f"Input: {words[i]}, Expected: {words[i + 1]}, Predicted: {predicted}")

Input: මම, Expected: යති, Predicted: නැටුම්
Input: මම, Expected: යත්වා, Predicted: ගෙදර
Input: මම, Expected: යනවා, Predicted: නැටුම්
Input: මම, Expected: යනවාලා, Predicted: ඔහුගෙන්
Input: මම, Expected: යනු, Predicted: ගෙදර


In [23]:
print("Training Data Sample:", train_data[:5])
print("Test Data Sample:", test_data[:5])


Training Data Sample: ['මම යන්නෙමි', 'මම යන්නෙමිවා', 'මම යමි', 'මම යමිවා', 'මම යවන්නෙමි']
Test Data Sample: ['මම යති', 'මම යත්වා', 'මම යනවා', 'මම යනවාලා', 'මම යනු']


In [24]:
# Inspect the bigram model for a specific word
word_to_check = "මම"  # Replace with a common word in your dataset
print("Bigram Model for word:", word_to_check)
print(bigram_model[word_to_check])


Bigram Model for word: මම
Counter({'නැටුම්': 571, 'ගෙදර': 72, 'පොත': 43, 'බත්': 40, 'වේගයෙන්': 24, 'ඔහුගෙන්': 24, 'ඔහුට': 24, 'එය': 4, 'කතා': 4, 'යන්නෙමි': 3, 'යන්නෙමිවා': 3, 'යමි': 3, 'යමිවා': 3, 'යවන්නෙමි': 3, 'යවමි': 3, 'ගියෙමි': 3, 'යැවෙමි': 3, 'පොතක්': 2, 'මුදල්': 2, 'පිළිකුල්': 2, 'ඔහුව': 2, 'වැරදි': 2, 'ඊයේ': 2, 'පුදුම': 2, 'මේකට': 2, 'මලක්': 1, 'අම්මාට': 1, 'නගරයට': 1, 'පාඩමි': 1, 'අවදි': 1, 'දත්\u200c': 1, 'මුහුණ': 1, 'පාඩම්\u200c': 1, 'සැමවිටම': 1, 'යාච්ඤා': 1, 'රිය': 1, 'ආදරය': 1, 'සිනා': 1, 'හරි': 1, 'උදව්': 1, 'යන්නෙ': 1, 'කථා': 1, 'ගන්නේ': 1, 'ආරක්ෂිතව': 1, 'කාඩ්': 1, 'දෙන්න': 1, 'වැනිලා': 1, 'අයිස්': 1, 'කැමති': 1, 'වැඩ': 1, 'කඩේට': 1, 'ආසයි': 1, 'කියවන්න': 1, 'තේ': 1, 'ෆිල්ම්': 1, 'කන්ද': 1})


In [25]:
# Simplified accuracy check
correct, total = 0, 0
for sentence in test_data[:100]:  # Limit to 100 sentences for debugging
    words = sentence.split()
    for i in range(len(words) - 1):
        predicted = predict_next_word(bigram_model, words[i])
        if predicted == words[i + 1]:
            correct += 1
        total += 1
print(f"Simple Accuracy: {correct / total * 100:.2f}%")


Simple Accuracy: 0.00%


In [26]:
print(f"Training Data Size: {len(train_data)}")
print(f"Test Data Size: {len(test_data)}")


Training Data Size: 3127
Test Data Size: 18106


------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [30]:
import pandas as pd
from collections import defaultdict, Counter
from difflib import get_close_matches
import random

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Path to the dataset on Google Drive
dataset_path = '/content/drive/My Drive/Colab Notebooks/combined_sentences_dataset.csv'

# Load the dataset
data = pd.read_csv(dataset_path, header=None, names=["Category", "Sentence"])

# Filter and process data
correct_sentences = data[data['Category'] == 1]['Sentence'].tolist()  # Correct sentences
incorrect_sentences = data[data['Category'] == 0]['Sentence'].tolist()  # Incorrect sentences

# Create a dictionary from correct sentences (for spell checking)
dictionary = set()
for sentence in correct_sentences:
    words = sentence.split()
    dictionary.update(words)
dictionary = list(dictionary)

# Generate n-grams for grammar checking
def generate_ngrams(text, n=2):
    words = text.split()
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

# Train n-gram model for grammar checking
def train_ngram_model(sentences, n=2):
    model = defaultdict(Counter)
    for sentence in sentences:
        ngrams = generate_ngrams(sentence, n)
        for w1, w2 in ngrams:
            model[w1][w2] += 1
    return model

# Predict the next word for grammar checking
def predict_next_word(model, word):
    if word in model:
        return random.choices(list(model[word].keys()), weights=model[word].values())[0]
    return None

# Function to correct spelling
def correct_spelling(word, dictionary):
    matches = get_close_matches(word, dictionary)
    return matches[0] if matches else word

# Evaluate grammar checker
def evaluate_grammar_model(model, sentences):
    correct_grammar, total_grammar = 0, 0
    for sentence in sentences:
        words = sentence.split()
        for i in range(len(words) - 1):
            predicted = predict_next_word(model, words[i])
            if predicted == words[i + 1]:
                correct_grammar += 1
            total_grammar += 1
    accuracy = correct_grammar / total_grammar if total_grammar else 0
    return accuracy

# Evaluate spell checker
def evaluate_spell_checker(dictionary, test_data):
    correct, total = 0, 0
    for misspelled, correct_word in test_data:
        if correct_spelling(misspelled, dictionary) == correct_word:
            correct += 1
        total += 1
    accuracy = correct / total if total > 0 else 0
    return accuracy

# Train the bigram model for grammar checking
bigram_model = train_ngram_model(correct_sentences, n=2)

# Simulate test data for spell checker
spell_checker_test_data = [
    ('මම', 'මම'),
    ('මම', 'මම'),
    ('යා', 'යනවා'),
    ('අපි', 'අපි')
]

# Run evaluations for 5 iterations
results = []
for run in range(5):
    random.seed(run)

    # Grammar checker accuracy
    grammar_accuracy = evaluate_grammar_model(bigram_model, incorrect_sentences)

    # Shuffle spell-checker test data for each run
    shuffled_spell_test_data = random.sample(spell_checker_test_data, len(spell_checker_test_data))
    spell_accuracy = evaluate_spell_checker(dictionary, shuffled_spell_test_data)

    results.append((run + 1, grammar_accuracy * 100, spell_accuracy * 100))

# Print results
for run, grammar_acc, spell_acc in results:
    print(f"Run {run}: Grammar Accuracy = {grammar_acc:.2f}%, Spell Checker Accuracy = {spell_acc:.2f}%")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Run 1: Grammar Accuracy = 40.75%, Spell Checker Accuracy = 75.00%
Run 2: Grammar Accuracy = 40.77%, Spell Checker Accuracy = 75.00%
Run 3: Grammar Accuracy = 40.89%, Spell Checker Accuracy = 75.00%
Run 4: Grammar Accuracy = 40.68%, Spell Checker Accuracy = 75.00%
Run 5: Grammar Accuracy = 40.94%, Spell Checker Accuracy = 75.00%


In [31]:
# Function to manually check grammar correction for a sentence
def check_grammar(sentence, ngram_model):
    words = sentence.split()
    corrected_sentence = [words[0]]  # Start with the first word
    for i in range(len(words) - 1):
        predicted = predict_next_word(ngram_model, corrected_sentence[-1])
        if predicted:
            corrected_sentence.append(predicted)
        else:
            corrected_sentence.append(words[i + 1])  # Use the original word if no prediction
    return ' '.join(corrected_sentence)

# Function to manually check spell correction for a word
def check_spelling(word, dictionary):
    corrected_word = correct_spelling(word, dictionary)
    return corrected_word


In [33]:
# Example manual tests
while True:
    print("\nChoose an option:")
    print("1. Check Grammar Correction for a Sentence")
    print("2. Check Spell Correction for a Word")
    print("3. Exit")
    choice = input("Enter your choice (1/2/3): ")

    if choice == '1':
        input_sentence = input("Enter a sentence: ")
        corrected = check_grammar(input_sentence, bigram_model)
        print(f"Corrected Sentence: {corrected}")
    elif choice == '2':
        input_word = input("Enter a word: ")
        corrected = check_spelling(input_word, dictionary)
        print(f"Corrected Word: {corrected}")
    elif choice == '3':
        print("Exiting...")
        break
    else:
        print("Invalid choice. Please try again.")



Choose an option:
1. Check Grammar Correction for a Sentence
2. Check Spell Correction for a Word
3. Exit
Enter your choice (1/2/3): 1
Enter a sentence: මම ය
Corrected Sentence: මම නැටුම්

Choose an option:
1. Check Grammar Correction for a Sentence
2. Check Spell Correction for a Word
3. Exit
Enter your choice (1/2/3): 2
Enter a word: යා
Corrected Word: ඔයා

Choose an option:
1. Check Grammar Correction for a Sentence
2. Check Spell Correction for a Word
3. Exit
Enter your choice (1/2/3): 3
Exiting...


In [34]:
import pandas as pd
from difflib import get_close_matches
from collections import defaultdict, Counter
import random

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Path to the dataset on Google Drive
dataset_path = '/content/drive/My Drive/Colab Notebooks/combined_sentences_dataset.csv'

# Load the dataset
dataset = pd.read_csv(dataset_path, header=None, names=["Category", "Sentence"])

# Filter correct and incorrect sentences
correct_sentences = dataset[dataset['Category'] == 1]['Sentence'].tolist()
incorrect_sentences = dataset[dataset['Category'] == 0]['Sentence'].tolist()

# --- Utility Functions ---

# Generate n-grams
def generate_ngrams(text, n=2):
    words = text.split()
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

# Train n-gram model
def train_ngram_model(sentences, n=2):
    model = defaultdict(Counter)
    for sentence in sentences:
        ngrams = generate_ngrams(sentence, n)
        for w1, w2 in ngrams:
            model[w1][w2] += 1
    return model

# Predict next word
def predict_next_word(model, word):
    if word in model:
        return random.choices(list(model[word].keys()), weights=model[word].values())[0]
    return None

# Function to find the closest correct sentence
def find_closest_correct_sentence(input_sentence, correct_sentences):
    matches = get_close_matches(input_sentence, correct_sentences, n=1, cutoff=0.6)
    return matches[0] if matches else None

# Function to check and correct grammar based on dataset
def correct_grammar_with_dataset(input_sentence, dataset):
    # Check if the sentence exists in the dataset
    if input_sentence in dataset['Sentence'].values:
        # Get the category (1 for correct, 0 for incorrect)
        category = dataset.loc[dataset['Sentence'] == input_sentence, 'Category'].values[0]
        if category == 1:
            return input_sentence  # Sentence is already correct
        else:
            # Find a close correct sentence
            closest_sentence = find_closest_correct_sentence(input_sentence, correct_sentences)
            return closest_sentence if closest_sentence else "No close correct sentence found."
    else:
        # Sentence doesn't exist in the dataset; find a close correct sentence
        closest_sentence = find_closest_correct_sentence(input_sentence, correct_sentences)
        return closest_sentence if closest_sentence else "No close correct sentence found."

# Function to correct spelling
def correct_spelling(word, dictionary):
    matches = get_close_matches(word, dictionary)
    return matches[0] if matches else word

# --- Train Statistical Model ---

# Train the bigram model
bigram_model = train_ngram_model(correct_sentences, n=2)

# --- Manual Testing Section ---

while True:
    print("\nChoose an option:")
    print("1. Check Grammar Correction for a Sentence")
    print("2. Check Spell Correction for a Word")
    print("3. Exit")
    choice = input("Enter your choice (1/2/3): ")

    if choice == '1':
        input_sentence = input("Enter a sentence: ")
        corrected = correct_grammar_with_dataset(input_sentence, dataset)
        print(f"Corrected Sentence: {corrected}")
    elif choice == '2':
        input_word = input("Enter a word: ")
        # Create a dictionary from correct sentences
        word_dictionary = set()
        for sentence in correct_sentences:
            words = sentence.split()
            word_dictionary.update(words)
        corrected_word = correct_spelling(input_word, list(word_dictionary))
        print(f"Corrected Word: {corrected_word}")
    elif choice == '3':
        print("Exiting...")
        break
    else:
        print("Invalid choice. Please try again.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Choose an option:
1. Check Grammar Correction for a Sentence
2. Check Spell Correction for a Word
3. Exit
Enter your choice (1/2/3): 1
Enter a sentence: මම යනවා
Corrected Sentence: මම යමිවා

Choose an option:
1. Check Grammar Correction for a Sentence
2. Check Spell Correction for a Word
3. Exit
Enter your choice (1/2/3): මම ය
Invalid choice. Please try again.

Choose an option:
1. Check Grammar Correction for a Sentence
2. Check Spell Correction for a Word
3. Exit
Enter your choice (1/2/3): 1
Enter a sentence: මම ය
Corrected Sentence: මම යමි

Choose an option:
1. Check Grammar Correction for a Sentence
2. Check Spell Correction for a Word
3. Exit
Enter your choice (1/2/3): 1
Enter a sentence: අමුතු වාක්‍ය
Corrected Sentence: No close correct sentence found.

Choose an option:
1. Check Grammar Correction for a Sentence
2. Check Spell Correction for a Word
3.

# **Neural Network-Based Approach:**

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

In [35]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from difflib import get_close_matches
import random

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Path to the dataset on Google Drive
dataset_path = '/content/drive/My Drive/Colab Notebooks/combined_sentences_dataset.csv'

# Load the dataset
data = pd.read_csv(dataset_path, header=None, names=["Category", "Sentence"])

# Separate data into correct and incorrect categories
correct_sentences = data[data['Category'] == 1]['Sentence'].tolist()  # Correct sentences
incorrect_sentences = data[data['Category'] == 0]['Sentence'].tolist()  # Incorrect sentences
all_sentences = data['Sentence'].tolist()
labels = data['Category'].tolist()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
# Preprocessing: Convert sentences into bag-of-words features
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(all_sentences)  # Feature matrix
y = labels  # Labels (1 for correct, 0 for incorrect)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Neural Network (Multi-Layer Perceptron) for grammar correction
grammar_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
grammar_model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = grammar_model.predict(X_test)
grammar_accuracy = accuracy_score(y_test, y_pred)
print(f"Grammar Correction Model Accuracy: {grammar_accuracy * 100:.2f}%")


Grammar Correction Model Accuracy: 79.54%


In [37]:
# Create a dictionary from correct sentences
dictionary = set()
for sentence in correct_sentences:
    words = sentence.split()
    dictionary.update(words)
dictionary = list(dictionary)

# Function to correct spelling using get_close_matches
def correct_spelling_nn(word, dictionary):
    matches = get_close_matches(word, dictionary)
    return matches[0] if matches else word

# Evaluate spell checker
spell_checker_test_data = [
    ('මම', 'මම'),
    ('මග', 'මම'),
    ('යා', 'යනවා'),
    ('අපි', 'අපි'),
]

correct_spell, total_spell = 0, 0
for misspelled, correct_word in spell_checker_test_data:
    if correct_spelling_nn(misspelled, dictionary) == correct_word:
        correct_spell += 1
    total_spell += 1
spell_accuracy = correct_spell / total_spell if total_spell > 0 else 0
print(f"Spell Checker Accuracy: {spell_accuracy * 100:.2f}%")


Spell Checker Accuracy: 50.00%


In [53]:
def basic_grammar_check(text):
    """
    Perform a basic grammar check on the given text.
    This function checks for repeated words, missing punctuation, and basic sentence structure issues.
    """
    grammar_issues = []
    words = tokenize(preprocess_text(text))

    # Check for repeated words
    for i in range(len(words) - 1):
        if words[i] == words[i + 1]:
            grammar_issues.append(f"Repeated word: '{words[i]}' at position {i + 1}")

    # Check if text ends with proper punctuation
    if not re.search(r'[.!?]$', text.strip()):
        grammar_issues.append("The text does not end with proper punctuation (e.g., '.', '!', '?').")

    # Additional grammar checks (example: subject-verb-object pattern)
    if len(words) < 3:
        grammar_issues.append("The text is too short to form a valid sentence.")
    elif len(words) > 3 and words[0] not in dictionary:
        grammar_issues.append(f"The first word '{words[0]}' is not in the dictionary, which might indicate a grammar issue.")

    return grammar_issues

def auto_correct_grammar(text):
    """
    Automatically correct basic grammar mistakes in the input text.
    This function fixes repeated words, ensures proper punctuation, and attempts to restructure sentences if needed.
    """
    words = tokenize(preprocess_text(text))
    corrected_words = []

    # Remove repeated words
    for i in range(len(words)):
        if i == 0 or words[i] != words[i - 1]:
            corrected_words.append(words[i])

    corrected_text = ' '.join(corrected_words)

    # Add proper punctuation if missing
    if not re.search(r'[.!?]$', corrected_text.strip()):
        corrected_text += '.'

    return corrected_text


In [64]:
# Function to predict whether a sentence is correct or incorrect
def check_grammar_nn(sentence, model, vectorizer):
    X = vectorizer.transform([sentence])
    prediction = model.predict(X)
    if prediction[0] == 1:
        return f"The sentence is correct: {sentence}"
    else:
        # Find closest correct sentence if incorrect
        matches = get_close_matches(sentence, correct_sentences)
        return f"The sentence is incorrect. Did you mean: {matches[0]}" if matches else "No suggestions available."

# Manual Test: Grammar Correction
input_sentence = input("Enter a sentence for grammar correction: ")
result = check_grammar_nn(input_sentence, grammar_model, vectorizer)
print(result)


Enter a sentence for grammar correction: අපි යයි\
The sentence is incorrect. Did you mean: අපි යමු


In [65]:
# Manual Test: Spell Correction
input_word = input("Enter a word for spell correction: ")
corrected_word = correct_spelling_nn(input_word, dictionary)
print(f"Corrected Word: {corrected_word}")


Enter a word for spell correction: අරනව
Corrected Word: කරනවා


In [42]:
results = []
for run in range(5):
    random.seed(run)

    # Grammar accuracy for the current run
    y_pred_run = grammar_model.predict(X_test)
    grammar_accuracy_run = accuracy_score(y_test, y_pred_run)

    # Spell accuracy for the current run (shuffling the test data)
    shuffled_spell_test_data = random.sample(spell_checker_test_data, len(spell_checker_test_data))
    correct_spell_run, total_spell_run = 0, 0
    for misspelled, correct_word in shuffled_spell_test_data:
        if correct_spelling_nn(misspelled, dictionary) == correct_word:
            correct_spell_run += 1
        total_spell_run += 1
    spell_accuracy_run = correct_spell_run / total_spell_run if total_spell_run > 0 else 0

    # Store results
    results.append((run + 1, grammar_accuracy_run * 100, spell_accuracy_run * 100))

# Print results for all runs
for run, grammar_acc, spell_acc in results:
    print(f"Run {run}: Grammar Accuracy = {grammar_acc:.2f}%, Spell Checker Accuracy = {spell_acc:.2f}%")


Run 1: Grammar Accuracy = 79.54%, Spell Checker Accuracy = 50.00%
Run 2: Grammar Accuracy = 79.54%, Spell Checker Accuracy = 50.00%
Run 3: Grammar Accuracy = 79.54%, Spell Checker Accuracy = 50.00%
Run 4: Grammar Accuracy = 79.54%, Spell Checker Accuracy = 50.00%
Run 5: Grammar Accuracy = 79.54%, Spell Checker Accuracy = 50.00%
