# Fine-tune T5 for Grammar Correction (STT Errors)

This notebook fine-tunes a T5 model specifically for correcting grammar errors from speech-to-text transcriptions.

**Steps:**
1. Install dependencies
2. Create/load dataset of incorrect→correct pairs
3. Fine-tune T5-small model
4. Test the model
5. Export for use in your application

## 1. Install Dependencies

In [None]:
# Install the Hugging Face libraries imported by later cells (transformers, datasets);
# -q keeps pip's output quiet. accelerate and sentencepiece are not imported directly
# in this notebook - presumably they are runtime requirements of the trainer and the
# T5 tokenizer respectively (TODO confirm).
!pip install transformers datasets accelerate sentencepiece -q

## 2. Create Training Dataset

Create a dataset of common STT grammar errors and their corrections.

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Sample dataset of (erroneous sentence, corrected sentence) pairs, grouped by
# error type - expand this with more examples!
# Each pair becomes one {"incorrect": ..., "correct": ...} record.
training_data = [
    {"incorrect": wrong, "correct": right}
    for wrong, right in (
        # Word order errors
        ("i have a tomorrow match", "i have a match tomorrow"),
        ("i have a today meeting", "i have a meeting today"),
        ("i have a tonight party", "i have a party tonight"),
        ("we have a yesterday game", "we had a game yesterday"),

        # Article errors
        ("this is a umbrella", "this is an umbrella"),
        ("this is an laptop", "this is a laptop"),
        ("i have a apple", "i have an apple"),
        ("she is an teacher", "she is a teacher"),

        # Subject-verb agreement
        ("he dont like it", "he doesn't like it"),
        ("she have a car", "she has a car"),
        ("they was going", "they were going"),
        ("he do his homework", "he does his homework"),

        # Common mistakes
        ("i could of done it", "i could have done it"),
        ("i should of known", "i should have known"),
        ("its a nice day", "it's a nice day"),
        ("your going home", "you're going home"),

        # Tense errors
        ("i go yesterday", "i went yesterday"),
        ("she see me tomorrow", "she will see me tomorrow"),
        ("we was there", "we were there"),

        # Preposition errors
        ("different than you", "different from you"),
        ("married with her", "married to her"),
        ("in the weekend", "on the weekend"),

        # Add more examples here...
        ("i dont have no money", "i don't have any money"),
        ("he go always to school", "he always goes to school"),
        ("she is more better", "she is better"),
    )
]

# Convert to DataFrame
df = pd.DataFrame(training_data)

# Shuffle before splitting: training_data is grouped by error category, so a
# plain head/tail slice would put only the last categories into validation and
# keep them out of training entirely. The fixed seed keeps the split
# reproducible; reset_index gives the shuffled frame a clean 0..n-1 index.
shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into train/validation (80/20)
train_size = int(0.8 * len(shuffled_df))
train_df = shuffled_df.iloc[:train_size]
val_df = shuffled_df.iloc[train_size:]

# Create Hugging Face datasets; preserve_index=False keeps the pandas index
# from being stored as an extra column next to "incorrect"/"correct".
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(val_dataset)}")
print("\nSample:")
print(train_dataset[0])

## 3. Prepare Data for T5

In [None]:
from transformers import AutoTokenizer

# Checkpoint to fine-tune; the tokenizer is loaded from the same checkpoint name
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of incorrect/correct sentence pairs for T5.

    Args:
        examples: Batch mapping with "incorrect" and "correct" keys, each
            holding a list of sentences.

    Returns:
        The tokenizer output for the prefixed input sentences, plus a
        "labels" entry containing the target token ids with every padding
        position replaced by -100.
    """
    # Add prefix for T5
    inputs = ["grammar: " + text for text in examples["incorrect"]]
    targets = examples["correct"]

    # Tokenize
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    # Targets are padded out to 128 tokens, so most label positions are padding.
    # Mark them with -100 (the label value the loss ignores) so the model is
    # trained on the real target tokens only, not on predicting padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Run the preprocessing over every split; batched=True hands the function
# lists of sentences rather than one example at a time
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

print("Dataset tokenized successfully!")

## 4. Fine-tune T5 Model

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np

import inspect

import torch

# Load model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# fp16 mixed precision needs a CUDA device; requesting it on a CPU-only
# runtime makes the training arguments fail validation, so enable it only
# when a GPU is actually present.
use_fp16 = torch.cuda.is_available()

# The per-epoch evaluation setting is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones; pick the
# name this installed version accepts so the cell runs on both.
_training_arg_names = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./grammar-correction-t5",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,  # evaluate on generated text, as compute_metrics expects
    fp16=use_fp16,  # Use mixed precision for faster training (GPU only)
    push_to_hub=False,
    **{eval_strategy_key: "epoch"},
)

# Metric for evaluation
def compute_metrics(eval_pred):
    """Compute exact-match accuracy between generated and reference sentences.

    Args:
        eval_pred: A (predictions, labels) pair of token-id arrays.

    Returns:
        A dict {"accuracy": value} where value is the fraction of decoded
        predictions that equal their decoded label after stripping
        surrounding whitespace (0.0 when there are no labels).
    """
    predictions, labels = eval_pred
    # Predictions can arrive as a tuple whose first element holds the token ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is an ignore/padding marker, not a vocabulary id, and cannot be
    # decoded. It can show up in predictions as well as labels, so map it
    # back to the pad token in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Avoid dividing by zero on an empty evaluation set.
    if not decoded_labels:
        return {"accuracy": 0.0}

    # Simple accuracy metric
    correct = sum(
        pred.strip() == label.strip()
        for pred, label in zip(decoded_preds, decoded_labels)
    )
    return {"accuracy": correct / len(decoded_labels)}

import inspect

# Newer transformers releases take the tokenizer as `processing_class`, older
# ones as `tokenizer`; pass it under whichever name this version accepts.
_trainer_arg_names = inspect.signature(Seq2SeqTrainer.__init__).parameters
tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

# Create trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    **{tokenizer_key: tokenizer},
)

# Start training
print("Starting training...")
trainer.train()
print("Training complete!")

## 5. Test the Model

In [None]:
def correct_grammar(text):
    """Correct the grammar of one sentence with the fine-tuned model.

    Args:
        text: Sentence to correct, e.g. a speech-to-text transcription.

    Returns:
        The model's corrected sentence. Empty or whitespace-only input is
        returned unchanged because there is nothing to correct.
    """
    if not text or not text.strip():
        return text

    # Switch to inference mode so dropout cannot perturb the generated output,
    # whatever state the training loop left the model in.
    model.eval()

    # Same task prefix the training inputs were given
    input_text = "grammar: " + text
    # Move the encoded input to the device the model lives on
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    outputs = model.generate(**inputs, max_length=128)
    corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return corrected

# Test examples. Every sentence here also appears in training_data, so this is
# a sanity check of what the model has learned, not a measure of how well it
# generalises to unseen sentences.
test_sentences = [
    "i have a tomorrow match",
    "this is an laptop",
    "he dont like it",
    "i could of done it",
    "she have a car",
]

print("Testing fine-tuned model:\n")
for sentence in test_sentences:
    # One print per sentence: input line, corrected line, then a blank line
    print(f"Input:     {sentence}\nCorrected: {correct_grammar(sentence)}\n")

## 6. Save and Export Model

In [None]:
import shutil

# Save model locally
model.save_pretrained("./grammar-correction-t5-final")
tokenizer.save_pretrained("./grammar-correction-t5-final")

print("Model saved to ./grammar-correction-t5-final")

# Create zip file
archive_path = shutil.make_archive('grammar-correction-model', 'zip', './grammar-correction-t5-final')

# Download to your computer. The download helper only exists inside Google
# Colab, so outside Colab report where the archive is instead of crashing.
try:
    from google.colab import files
except ImportError:
    print(f"\nNot running in Google Colab; model archive written to {archive_path}")
else:
    files.download('grammar-correction-model.zip')
    print("\nModel downloaded! Extract and use in your application.")

## 7. (Optional) Push to Hugging Face Hub

In [None]:
# Uncomment and run if you want to upload to Hugging Face

# from huggingface_hub import notebook_login
# notebook_login()

# model.push_to_hub("your-username/grammar-correction-stt")
# tokenizer.push_to_hub("your-username/grammar-correction-stt")

# print("Model uploaded to Hugging Face Hub!")

## 8. How to Use in Your Application

After downloading the model, use it in your Python code:

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Folder containing the extracted fine-tuned model and tokenizer files
model_dir = "./grammar-correction-t5-final"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)


def correct_grammar(text):
    """Return the model's corrected version of `text`."""
    # Same "grammar: " task prefix that was used during fine-tuning
    encoded = tokenizer("grammar: " + text, return_tensors="pt")
    generated = model.generate(**encoded, max_length=128)
    return tokenizer.decode(generated[0], skip_special_tokens=True)


# Use it
result = correct_grammar("i have a tomorrow match")
print(result)  # Expected output (if training succeeded): "i have a match tomorrow"
```