## Grammar checker using Deep learning approach

In [1]:
from transformers import T5Tokenizer, BertTokenizer
from datasets import Dataset
import sentencepiece
import pandas as pd
import shutil

In [2]:
file_path = r"D:\7th Semester\AI\Tamil-SpellGrammar-Checker\data\grammar_checker_dataset\final_dataset.csv"
df = pd.read_csv(file_path)

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)

In [3]:
dataset

Dataset({
    features: ['Error Sentence', 'Error Type', 'Corrected Sentence'],
    num_rows: 999
})

In [4]:
# Clear cache
cache_dir = "C:\\Users\\Hp\\.cache\\huggingface\\hub"
shutil.rmtree(cache_dir, ignore_errors=True)

In [5]:
tokenizer = BertTokenizer.from_pretrained(r"D:\models\bert-base-uncased")

In [6]:
# Tokenization function
def tokenize_function(examples):
    inputs = tokenizer(examples['Error Sentence'], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(examples['Corrected Sentence'], padding="max_length", truncation=True, max_length=128)
    inputs['labels'] = targets['input_ids']
    return inputs

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Check the tokenized dataset
print(tokenized_dataset[0])

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

{'Error Sentence': 'அவள் பாடல்கள் பாடுகிறான்.', 'Error Type': 'Subject-Verb Agreement', 'Corrected Sentence': 'அவள் பாடல்கள் பாடுகிறாள்.', 'input_ids': [101, 100, 1388, 29931, 29920, 29928, 29918, 29929, 100, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1,

In [None]:
tokenized_dataset[0]

In [16]:
from accelerate import Accelerator
from transformers import Trainer

print("Accelerate and Transformers are properly installed!")


Accelerate and Transformers are properly installed!


In [15]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

# Load pre-trained T5 model
model = T5ForConditionalGeneration.from_pretrained(r'C:\Users\Hp\.cache\huggingface\hub\models--t5-small')

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    save_steps=1000,
    eval_strategy="steps",  # Updated argument
    use_cpu=True  # Replaces `no_cuda=True` for CPU-only training
)

# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # Use a validation set here if available
)

trainer.train()

# Save the model after training
model.save_pretrained("./Models")
tokenizer.save_pretrained("./Models")

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

In [None]:
def correct_sentence(incorrect_sentence):
    inputs = tokenizer(incorrect_sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
    outputs = model.generate(inputs['input_ids'], max_length=128, num_beams=5, early_stopping=True)
    corrected_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_sentence

# Example usage
incorrect_sentence = "நான் பள்ளிக்குப் போகின்றன"
corrected = correct_sentence(incorrect_sentence)
print(corrected)  # Output: "நான் பள்ளிக்குப் போகின்றேன்"
