In [None]:
# Mount Google Drive at /content/drive so files stored on Drive can be
# accessed through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m194.6/244.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [None]:
import os

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.cuda.amp import autocast, GradScaler

# Select the compute device: CUDA when a GPU is visible, otherwise the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Read the spreadsheet and give its columns fixed names
data_path = '/content/drive/MyDrive/AI/Project/Grammer_data_set.xlsx'
column_names = ['Sentence', 'True_Sentence', 'Label']
data = pd.read_excel(data_path)
data.columns = column_names

# Preprocess dataset
def encode_labels(label):
    """Return ``label`` converted to a built-in ``int``."""
    encoded = int(label)
    return encoded

# Cast every entry of the label column to int
data['Label'] = data['Label'].apply(encode_labels)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['Sentence'],
    data['Label'],
    random_state=42,
    test_size=0.2,
)

# Tokenizer matching the multilingual BERT checkpoint used for the classifier
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-cased'
)

class GrammarDataset(Dataset):
    """Dataset that tokenizes one sentence per item.

    ``texts`` and ``labels`` are read positionally through ``.iloc``. Each
    item is a dict with ``input_ids``, ``attention_mask`` (both with the
    leading batch dimension squeezed away) and an int64 ``label`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sentence = self.texts.iloc[idx]
        target = self.labels.iloc[idx]

        # Pad/truncate every sentence to exactly max_len tokens
        tokenizer_kwargs = dict(
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer(sentence, **tokenizer_kwargs)

        # The tokenizer output carries a leading dimension of 1; drop it
        item = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        item['label'] = torch.tensor(target, dtype=torch.long)
        return item

# Wrap both splits in GrammarDataset
train_dataset = GrammarDataset(train_texts, train_labels, tokenizer)
test_dataset = GrammarDataset(test_texts, test_labels, tokenizer)

# Batch the datasets; only the training loader shuffles
batch_size = 32
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Pre-trained BERT with a classification head sized to the number of
# distinct values in the label column
num_classes = data['Label'].nunique(dropna=False)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=num_classes,
)
model = model.to(device)

# Freeze the BERT encoder so only the classification head is trained
for param in model.bert.parameters():
    param.requires_grad = False

# Number of passes over the training set; the scheduler length below is
# derived from it so the two can never disagree.
epochs = 10

# Mixed precision is only enabled on CUDA; on CPU both autocast and the
# gradient scaler are switched off explicitly.
use_amp = device.type == "cuda"

# Optimizer and scheduler.
# Only parameters that still require gradients are handed to the optimizer.
# NOTE(review): with the encoder frozen only the classifier head is updated;
# lr=2e-5 may be too small for that — TODO confirm / tune.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=2e-5)
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

scaler = GradScaler(enabled=use_amp)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        with autocast(enabled=use_amp):
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # Advance the learning-rate schedule once per optimizer step
        lr_scheduler.step()

        total_loss += loss.item()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}")

# Switch to inference mode and collect gold labels and predictions batch by batch
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for batch in test_loader:
        gold = batch['label'].to(device)
        logits = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits
        # The predicted class is the index of the largest logit
        predicted = logits.argmax(dim=-1)
        y_true += list(gold.cpu().numpy())
        y_pred += list(predicted.cpu().numpy())

print("Classification Report:")
print(classification_report(y_true, y_pred))

# Save the model weights (state_dict only)
model_save_path = '/content/drive/MyDrive/AI/Project/results/bert_grammar_spell_model.pt'
# torch.save does not create missing parent directories; make sure the target
# folder exists so a finished training run is not lost at the final step.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


Using device: cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()
  with autocast():


Epoch 1/10, Loss: 0.6700274765491485
Epoch 2/10, Loss: 0.6533434569835663
Epoch 3/10, Loss: 0.6428937152028084
Epoch 4/10, Loss: 0.6396181434392929
Epoch 5/10, Loss: 0.6411849915981293
Epoch 6/10, Loss: 0.6341244257986546
Epoch 7/10, Loss: 0.6421281337738037
Epoch 8/10, Loss: 0.6383662566542625
Epoch 9/10, Loss: 0.6393151909112931
Epoch 10/10, Loss: 0.6396233320236206
Classification Report:
              precision    recall  f1-score   support

           0       0.67      1.00      0.80       211
           1       0.00      0.00      0.00       105

    accuracy                           0.67       316
   macro avg       0.33      0.50      0.40       316
weighted avg       0.45      0.67      0.53       316



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model saved to /content/drive/MyDrive/AI/Project/results/bert_grammar_spell_model.pt


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import difflib
from docx import Document

# Load the trained BERT model
model_save_path = '/content/drive/MyDrive/AI/Project/results/bert_grammar_spell_model.pt'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_classes = 2  # Adjust based on your dataset labels (e.g., 0: Incorrect, 1: Correct)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=num_classes
).to(device)
# The checkpoint is a plain state_dict, so weights_only=True is sufficient and
# prevents torch.load from unpickling arbitrary Python objects.
state_dict = torch.load(model_save_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.eval()

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Load the dictionary from .docx file
def load_dictionary_from_docx(file_path):
    """Return the set of whitespace-separated tokens found in the paragraphs
    of the .docx document at ``file_path``."""
    document = Document(file_path)
    return {
        token
        for paragraph in document.paragraphs
        for token in paragraph.text.split()
    }

# Build the spelling dictionary from the .docx word list and report its size
dictionary_path = '/content/drive/MyDrive/AI/Project/Spell_correction_data.docx'
dictionary = load_dictionary_from_docx(dictionary_path)
print(f"Loaded {len(dictionary)} words from the dictionary.\n")

# Grammar rules for sentence endings
def check_grammar(sentence):
    """Check that the last word's ending agrees with the first word.

    Rules (matched on the first whitespace-separated word):
      * 'මම'  -> last word must end with 'මි'
      * 'අපි' -> last word must end with 'මු'
      * any other first word -> last word must end with 'යි'

    Args:
        sentence: The sentence to check.

    Returns:
        ``(True, None)`` when the rule is satisfied, otherwise
        ``(False, hint)`` where ``hint`` is ``"<first word> ---> <expected ending>"``.
        An empty or whitespace-only sentence yields ``(False, "Empty sentence")``
        instead of raising ``IndexError``.
    """
    words = sentence.split()
    if not words:
        # No first/last word to inspect
        return False, "Empty sentence"

    subject, last_word = words[0], words[-1]

    # Expected ending per subject; everything else falls back to 'යි'
    expected_ending = {"මම": "මි", "අපි": "මු"}.get(subject, "යි")

    if last_word.endswith(expected_ending):
        return True, None
    return False, f"{subject} ---> {expected_ending}"

# Spelling checking functions
def detect_errors(word, dictionary):
    """Return ``word`` when it is absent from ``dictionary``, else ``None``."""
    if word in dictionary:
        return None
    return word

def suggest_correction(word, dictionary):
    """Return the closest entry of ``dictionary`` to ``word`` according to
    ``difflib.get_close_matches``, or ``word`` itself when nothing is close enough."""
    candidates = difflib.get_close_matches(word, dictionary, n=1)
    if not candidates:
        return word
    return candidates[0]

# Grammar validation function using BERT
def check_grammar_with_bert(sentence, model, tokenizer, device, max_len=64):
    """Classify ``sentence`` with ``model`` and report whether class 1 was predicted.

    The sentence is tokenized (padded/truncated to ``max_len``), moved to
    ``device`` and run through the model without gradient tracking.

    Returns:
        ``True`` when the argmax over the logits is 1, otherwise ``False``.
    """
    encoded = tokenizer(
        sentence,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    ids_on_device = encoded['input_ids'].to(device)
    mask_on_device = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(ids_on_device, attention_mask=mask_on_device).logits
        predicted_class = logits.argmax(dim=-1).item()

    return predicted_class == 1

# Interactive function for sentence checking
def check_user_input():
    """Prompt for sentences in a loop and report spelling and grammar checks.

    For each sentence the function:
      1. spell-checks the second word against the module-level ``dictionary``,
      2. applies the rule-based ``check_grammar``,
      3. asks the BERT classifier via ``check_grammar_with_bert``.

    The loop ends when the user types ``exit`` (case-insensitive, surrounding
    whitespace ignored). Blank input is skipped with a warning instead of
    reaching the checks, which need at least one word.
    """
    print("👨‍💻 Enter a sentence to check for spelling and grammar. Type 'exit' to stop.\n")

    while True:
        # Strip so that stray spaces neither hide the exit command nor count as input
        sentence = input("✍️ Enter a sentence: ").strip()

        if sentence.lower() == 'exit':
            print("\nExiting... Goodbye! 👋\n")
            break

        if not sentence:
            # check_grammar indexes the first and last word; nothing to check here
            print("⚠️ Please enter a non-empty sentence.\n")
            continue

        print("\n🔍 Checking spelling and grammar...\n")

        # Step 1: Spell check (only the second word is examined)
        words = sentence.split()
        if len(words) > 1:
            misspelled_word = detect_errors(words[1], dictionary)
            if misspelled_word:
                corrected_word = suggest_correction(misspelled_word, dictionary)
                print(f"❌ Spelling mistake detected in the second word: {misspelled_word}")
                print(f"🔄 Corrected second word: {corrected_word}")
            else:
                print("✔️ No spelling mistakes detected in the second word.")
        else:
            print("⚠️ Insufficient words for spell-checking.")

        # Step 2: Grammar check
        is_correct, correction = check_grammar(sentence)
        if is_correct:
            print("✔️ The sentence is grammatically correct.")
        else:
            print("❌ The sentence is grammatically incorrect.")
            print(f"🔄 Suggested correction based on grammar rules: {correction}")

        # Step 3: BERT Grammar Model Check
        bert_prediction = check_grammar_with_bert(sentence, model, tokenizer, device)
        if bert_prediction:
            print("✔️ BERT Prediction: The sentence is grammatically correct.")
        else:
            print("❌ BERT Prediction: The sentence is grammatically incorrect.")

        print("\n" + "="*50 + "\n")

# Start the interactive session; it blocks on input() until the user types 'exit'
check_user_input()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_save_path, map_location=device))


Loaded 182 words from the dictionary.

👨‍💻 Enter a sentence to check for spelling and grammar. Type 'exit' to stop.

✍️ Enter a sentence: අපි පලතර කයි

🔍 Checking spelling and grammar...

❌ Spelling mistake detected in the second word: පලතර
🔄 Corrected second word: පලතුරු
❌ The sentence is grammatically incorrect.
🔄 Suggested correction based on grammar rules: අපි ---> මු
❌ BERT Prediction: The sentence is grammatically incorrect.


✍️ Enter a sentence: අපි ගත කියමි

🔍 Checking spelling and grammar...

❌ Spelling mistake detected in the second word: ගත
🔄 Corrected second word: ගීත
❌ The sentence is grammatically incorrect.
🔄 Suggested correction based on grammar rules: අපි ---> මු
❌ BERT Prediction: The sentence is grammatically incorrect.


✍️ Enter a sentence: ඇය පල් කමු

🔍 Checking spelling and grammar...

❌ Spelling mistake detected in the second word: පල්
🔄 Corrected second word: ඇපල්
❌ The sentence is grammatically incorrect.
🔄 Suggested correction based on grammar rules: ඇය --->