In [None]:
!pip install kaggle -q
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
!kaggle datasets download -d satishgunjal/grammar-correction
!unzip grammar-correction.zip -d grammar_dataset


Dataset URL: https://www.kaggle.com/datasets/satishgunjal/grammar-correction
License(s): apache-2.0
Downloading grammar-correction.zip to /content
  0% 0.00/62.4k [00:00<?, ?B/s]
100% 62.4k/62.4k [00:00<00:00, 200MB/s]
Archive:  grammar-correction.zip
  inflating: grammar_dataset/Grammar Correction.csv  


In [None]:
import pandas as pd

# Load CSV
df = pd.read_csv("/content/grammar_dataset/Grammar Correction.csv")
print("Dataset shape:", df.shape)
print(df.head())


Dataset shape: (2018, 4)
   Serial Number         Error Type              Ungrammatical Statement  \
0              1  Verb Tense Errors        I goes to the store everyday.   
1              2  Verb Tense Errors  They was playing soccer last night.   
2              3  Verb Tense Errors     She have completed her homework.   
3              4  Verb Tense Errors            He don't know the answer.   
4              5  Verb Tense Errors            The sun rise in the east.   

                       Standard English  
0           I go to the store everyday.  
1  They were playing soccer last night.  
2       She has completed her homework.  
3           He doesn't know the answer.  
4            The sun rises in the east.  


In [None]:
!pip install transformers datasets peft sacrebleu evaluate torch --quiet

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
import evaluate


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Load CSV (replace path with your Kaggle file path)
from datasets import Dataset
df = pd.read_csv("/content/grammar_dataset/Grammar Correction.csv")

# Keep only the columns we need
df = df[['Ungrammatical Statement', 'Standard English']]
df = df.rename(columns={
    'Ungrammatical Statement': 'inputs',
    'Standard English': 'outputs'
})

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)
print(dataset[0])


{'inputs': 'I goes to the store everyday.', 'outputs': 'I go to the store everyday.'}


In [None]:
model_name = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

peft_config = LoraConfig(
    task_type = TaskType.SEQ_2_SEQ_LM,
    inference_mode = False,
    r = 8,
    lora_alpha = 16,
    lora_dropout = 0.01
)

model = get_peft_model(model, peft_config)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Grammar correction dataset (JFLEG or GEC-like)
dataset = load_dataset("jfleg")  # if not available, fallback to custom JSON

# Example structure
print(dataset)
print(dataset['validation'][0])


In [None]:
max_length = 64  # Adjust if needed

def preprocess_function(examples):
    inputs = ["Correct the grammar: " + s for s in examples["inputs"]]
    targets = examples["outputs"]
    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=max_length)
    labels = tokenizer(targets, truncation=True, padding="max_length", max_length=max_length)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/2018 [00:00<?, ? examples/s]

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = Seq2SeqTrainingArguments(
    output_dir="./grammar_model",
    learning_rate=5e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    predict_with_generate=True,
    logging_steps=50,
    save_strategy="epoch"
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized.shuffle(seed=42).select(range(1500)),
    tokenizer=tokenizer,
)



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(


In [None]:
trainer.train()



Step,Training Loss
50,16.9858
100,2.8406
150,0.7718
200,0.3377
250,0.2364
300,0.1962
350,0.1774
400,0.1617
450,0.1597


TrainOutput(global_step=470, training_loss=2.3331354136162616, metrics={'train_runtime': 182.6503, 'train_samples_per_second': 41.062, 'train_steps_per_second': 2.573, 'total_flos': 644508057600000.0, 'train_loss': 2.3331354136162616, 'epoch': 5.0})

In [None]:
model.save_pretrained("./grammar_model_lora")
tokenizer.save_pretrained("./grammar_model_lora")

('./grammar_model_lora/tokenizer_config.json',
 './grammar_model_lora/special_tokens_map.json',
 './grammar_model_lora/spiece.model',
 './grammar_model_lora/added_tokens.json',
 './grammar_model_lora/tokenizer.json')

In [None]:
import re

def rule_based_correction(text):
    # Example fixes
    text = re.sub(r"\bI has\b", "I have", text)
    text = re.sub(r"\bShe go\b", "She goes", text)
    text = re.sub(r"\bHe go\b", "He goes", text)
    text = re.sub(r"\bThey is\b", "They are", text)
    text = re.sub(r"\bThis apples\b", "These apples", text)
    return text


In [None]:
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
gleu = evaluate.load("google_bleu")

def evaluate_model(sentences_to_evaluate, references=None):
    inputs = sentences_to_evaluate
    # References are optional for cases where you only want predictions

    preds = []
    # Check if CUDA is available and move model to GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for inp in inputs:
        inputs_tokenized = tokenizer(inp, return_tensors="pt")
        # Move input tensors to the same device as the model
        inputs_tokenized = {name: tensor.to(device) for name, tensor in inputs_tokenized.items()}
        outputs = model.generate(**inputs_tokenized, max_length=128)
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        #pred = rule_based_correction(pred) # Uncomment to apply rule-based correction
        preds.append(pred)

    if references:
        # Ensure references is a list of lists for BLEU calculation
        references_for_bleu = [[r] for r in references]
        bleu_score = bleu.compute(predictions=preds, references=references_for_bleu)
        rouge_score = rouge.compute(predictions=preds, references=references)
        gleu_score = gleu.compute(predictions=preds, references=references)

        print("BLEU:", bleu_score)
        print("ROUGE:", rouge_score)
        print("GLEU:", gleu_score)
    else:
        print("Predictions:")
        for i, pred in enumerate(preds):
            print(f"Input: {sentences_to_evaluate[i]} -> Prediction: {pred}")

In [None]:
!pip install rouge_score --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
test_sentences = [
    "She go market every morning.",
    "They was late for the meeting.",
    "I has two brother and one sister.",
    "We is waiting at the bus stop.",
    "He don’t like to read books.",
    "My mother cooking dinner right now.",
    "The dogs barks loud in the night.",
    "I seen him at the park yesterday.",
    "She are a good singer.",
    "He want to plays football.",
    "They was go to the cinema last night.",
    "This apple are sweet.",
    "He don’t knows the answer.",
    "She walking to school every day.",
    "We has finished the project already.",
    "I am agree with your opinion.",
    "She cat is very cute.",
    "The boys is running fast.",
    "I didn’t went to the party.",
    "She don’t understands the question.",
    "The book are on the table.",
    "He doesn’t likes pizza.",
    "I am go to the library tomorrow.",
    "She were very happy yesterday.",
    "They is playing in the garden.",
    "We enjoys the movie a lot.",
    "My father work in an office.",
    "The flowers is blooming beautifully.",
    "He go to gym every day.",
    "I was study when you called.",
    "The teacher give us homework yesterday.",
    "She have a red dress.",
    "He don’t want to goes outside.",
    "They goes to school by bus.",
    "This shoes is too big for me.",
    "She not like ice cream.",
    "We was watching TV last night.",
    "He am very tired today.",
    "I likes to play cricket.",
    "She go shopping yesterday.",
    "The child cry loudly.",
    "He don’t play guitar anymore.",
    "We has to complete the assignment.",
    "She doesn’t knows how to swim.",
    "They is waiting outside.",
    "I has been there before.",
    "The sun rise from the west.",
    "She have finished her work.",
    "He go there last week.",
    "We enjoys playing together.",
    "The dog run fastly.",
    "I were busy all day.",
    "She cooking food when I arrive.",
    "They doesn’t like coffee.",
    "The baby cry whole night.",
    "He go to office by train.",
    "She not understand the problem.",
    "I has seen that movie already.",
    "The students was very noisy.",
    "We goes to park on Sunday.",
    "He doesn’t knows my name.",
    "She eat lunch now.",
    "They was play football yesterday.",
    "My friend have a car.",
    "I don’t knows the answer.",
    "She are very clever.",
    "He walking to office every morning.",
    "We was tired after the trip.",
    "She goes not to college today.",
    "They enjoys the concert a lot.",
    "The boy have a toy.",
    "She am reading a book.",
    "He don’t likes to dance.",
    "I seen her in the classroom.",
    "We goes for shopping every weekend.",
    "She doesn’t wants to help.",
    "He is go to the market.",
    "The cat sleep in the sofa.",
    "They goes for walk daily.",
    "I has no time for games.",
    "She cooking when I come home.",
    "The car run very fast.",
    "He not speak English well.",
    "We was plan a trip.",
    "She don’t goes to school regularly.",
    "They is watching a movie.",
    "I am agrees with you.",
    "The teacher explain us yesterday.",
    "He go office at 9am.",
    "She have two dog.",
    "We was play outside.",
    "They enjoys reading novels.",
    "The baby cry when hungry.",
    "I don’t remembers his name.",
    "She are my best friend.",
    "He doesn’t wants to work today.",
    "We goes swimming in summer.",
    "They was walking in the rain.",
    "The sun rise early morning.",
    "She am cooking pasta.",
    "I has a pen and a book."
]


# Check if CUDA is available and move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for sent in test_sentences:
    inputs = tokenizer(sent, return_tensors="pt")
    # Move input tensors to the same device as the model
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    outputs = model.generate(**inputs, max_length=128)
    corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)
    #polished = rule_based_correction(corrected)
    print(f"Input: {sent}")
    print(f"Corrected: {corrected}")
    print("------")

Input: She go market every morning.
Corrected: She went to market every morning.
------
Input: They was late for the meeting.
Corrected: They were late for the meeting.
------
Input: I has two brother and one sister.
Corrected: I have two brothers and one sister.
------
Input: We is waiting at the bus stop.
Corrected: We are waiting at the bus stop.
------
Input: He don’t like to read books.
Corrected: He doesn’t like to read books.
------
Input: My mother cooking dinner right now.
Corrected: My mother cooking dinner right now.
------
Input: The dogs barks loud in the night.
Corrected: The dogs bark loudly in the night.
------
Input: I seen him at the park yesterday.
Corrected: I saw him at the park yesterday.
------
Input: She are a good singer.
Corrected: She is a good singer.
------
Input: He want to plays football.
Corrected: He wants to play football.
------
Input: They was go to the cinema last night.
Corrected: They went to the cinema last night.
------
Input: This apple are swe

In [None]:
# Use the modified evaluate_model function to get predictions for test_sentences
evaluate_model(test_sentences)

Predictions:
Input: She go market every morning. -> Prediction: She went market every morning.
Input: They was late for the meeting. -> Prediction: They were late for the meeting.
Input: I has two brother and one sister. -> Prediction: I have two brothers and one sister.
Input: We is waiting at the bus stop. -> Prediction: We are waiting at the bus stop.
Input: He don’t like to read books. -> Prediction: He doesn’t like reading books.
Input: My mother cooking dinner right now. -> Prediction: My mother cooked dinner right now.
Input: The dogs barks loud in the night. -> Prediction: The dogs bark loudly in the night.
Input: I seen him at the park yesterday. -> Prediction: I saw him at the park yesterday.
Input: She are a good singer. -> Prediction: She is a good singer.
Input: He want to plays football. -> Prediction: He wants to play football.
Input: They was go to the cinema last night. -> Prediction: They were going to the cinema last night.
Input: This apple are sweet. -> Prediction:

In [None]:
import evaluate

# Load metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
# gleu = evaluate.load("gleu") # Removed gleu due to error

# Select a subset for testing (e.g., 100 samples)
test_dataset = dataset.select(range(100))

predictions = []
references = []

# Assuming 'dataset' is your Hugging Face Dataset object with 'inputs' and 'outputs' columns
for example in test_dataset:
    input_sentence = example['inputs']
    # Generate prediction using your model (assuming you have a model loaded and ready, e.g., 'model' and 'tokenizer')
    # You might need to adapt this part based on how your model is set up for inference
    inputs_tokenized = tokenizer("Correct the grammar: " + input_sentence, return_tensors="pt").to(model.device) # Assuming model is on a device
    outputs = model.generate(**inputs_tokenized, max_length=128) # Increased max_length for potentially longer corrections
    pred_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)


    predictions.append(pred_sentence)
    references.append([example['outputs']])  # reference should be a list of lists for some metrics like BLEU

# Compute scores
bleu_score = bleu.compute(predictions=predictions, references=references)
rouge_score = rouge.compute(predictions=predictions, references=references)
# gleu_score = gleu.compute(predictions=predictions, references=references) # Removed gleu computation

print("BLEU:", round(bleu_score['bleu'], 4))
print("ROUGE-L:", round(rouge_score['rougeL'], 4))
# print("GLEU:", round(gleu_score['gleu'], 4)) # Removed gleu print

BLEU: 0.7855
ROUGE-L: 0.9067


In [None]:
model.save_pretrained("./saved_grammar_model")

# Save tokenizer
tokenizer.save_pretrained("./saved_grammar_model")

('./saved_grammar_model/tokenizer_config.json',
 './saved_grammar_model/special_tokens_map.json',
 './saved_grammar_model/spiece.model',
 './saved_grammar_model/added_tokens.json',
 './saved_grammar_model/tokenizer.json')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Save model to Drive
model.save_pretrained("/content/drive/MyDrive/grammar_model")
tokenizer.save_pretrained("/content/drive/MyDrive/grammar_model")


Mounted at /content/drive


('/content/drive/MyDrive/grammar_model/tokenizer_config.json',
 '/content/drive/MyDrive/grammar_model/special_tokens_map.json',
 '/content/drive/MyDrive/grammar_model/spiece.model',
 '/content/drive/MyDrive/grammar_model/added_tokens.json',
 '/content/drive/MyDrive/grammar_model/tokenizer.json')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Save model to Drive
model.save_pretrained("/content/drive/MyDrive/saved_grammar_model")
tokenizer.save_pretrained("/content/drive/MyDrive/saved_grammar_model")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


('/content/drive/MyDrive/saved_grammar_model/tokenizer_config.json',
 '/content/drive/MyDrive/saved_grammar_model/special_tokens_map.json',
 '/content/drive/MyDrive/saved_grammar_model/spiece.model',
 '/content/drive/MyDrive/saved_grammar_model/added_tokens.json',
 '/content/drive/MyDrive/saved_grammar_model/tokenizer.json')