In [10]:
!pip install torch torchvision torchaudio transformers datasets pandas rouge_score evaluate

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=63b3cd4297e047a29c31c56bd95c957d6e307962c14bbbca8801ba98948117fb
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.2 rouge_score-0.1.2


In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The evaluation CSV is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_metric, Dataset as HFDataset


In [4]:
# Load model and tokenizer
# Both are fetched from the Hugging Face Hub repo named by `model_name`;
# the T5 seq2seq weights and the tokenizer files are downloaded on first use.
model_name = "farelzii/GEC_Test_v1"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/913 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
class CustomDataset(Dataset):
    """Map-style dataset of (incorrect, correct) text pairs tokenized for a seq2seq model.

    Args:
        dataframe: Frame with an 'incorrect' column (model input text) and a
            'correct' column (target text). Any index is accepted; rows are
            addressed by position.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length',
            truncation=True, return_tensors="pt")`` that returns a mapping
            with 'input_ids' and 'attention_mask' tensors carrying a leading
            batch dimension of size 1.
        input_max_len: Padded/truncated length of the source sequence.
        target_max_len: Padded/truncated length of the target sequence.
    """

    def __init__(self, dataframe, tokenizer, input_max_len, target_max_len):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.input_max_len = input_max_len
        self.target_max_len = target_max_len

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Tokenize the pair at position ``index``.

        Returns:
            dict with 1-D tensors 'input_ids' and 'attention_mask' (source)
            and 'labels' (target token ids).
        """
        # Positional lookup: a DataLoader sampler yields 0..len-1, which only
        # coincides with .loc labels when the frame has a default RangeIndex
        # (it would break after filtering, sampling or shuffling the frame).
        source_text = self.dataframe['incorrect'].iloc[index]
        target_text = self.dataframe['correct'].iloc[index]

        source = self.tokenizer(source_text, max_length=self.input_max_len, padding='max_length', truncation=True, return_tensors="pt")
        target = self.tokenizer(target_text, max_length=self.target_max_len, padding='max_length', truncation=True, return_tensors="pt")

        # squeeze(0) removes only the batch dimension added by return_tensors="pt";
        # a bare squeeze() would also collapse a length-1 sequence to a 0-d tensor.
        source_ids = source['input_ids'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)
        attention_mask = source['attention_mask'].squeeze(0)

        return {
            'input_ids': source_ids,
            'attention_mask': attention_mask,
            'labels': target_ids
        }


In [6]:
# Evaluation set: up to the first 10,000 rows of the test CSV, relabelled so
# the two columns carry the names CustomDataset reads.
eval_data_path = "/content/drive/MyDrive/Final/Test.csv"
eval_df = pd.read_csv(eval_data_path, nrows=10000).set_axis(["incorrect", "correct"], axis=1)

# Batches of 16 served in file order (no shuffling).
dataset = CustomDataset(
    dataframe=eval_df,
    tokenizer=tokenizer,
    input_max_len=128,
    target_max_len=128,
)
dataloader = DataLoader(dataset, shuffle=False, batch_size=16)

In [8]:
def generate_predictions(model, tokenizer, dataloader, device, max_length=128):
    """Generate text for every batch and decode predictions and references.

    Args:
        model: Object exposing ``eval()`` and ``generate(input_ids=...,
            attention_mask=..., max_length=..., early_stopping=...)``.
        tokenizer: Object exposing ``batch_decode(sequences,
            skip_special_tokens=..., clean_up_tokenization_spaces=...)``.
        dataloader: Iterable of batches, each a mapping with "input_ids",
            "attention_mask" and "labels".
        device: Device the model inputs are moved to before generation.
        max_length: Length limit forwarded to ``model.generate``. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        Tuple ``(predictions, references)``: two lists of decoded strings in
        the order the dataloader produced them.
    """
    model.eval()
    predictions = []
    references = []
    decode_kwargs = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Inference only: no autograd bookkeeping needed.
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                early_stopping=True,
            )

        predictions.extend(tokenizer.batch_decode(generated_ids, **decode_kwargs))
        # Labels are only decoded, never fed to the model, so they are left
        # where they are instead of being copied to `device`.
        references.extend(tokenizer.batch_decode(batch["labels"], **decode_kwargs))

    return predictions, references

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
predictions, references = generate_predictions(
    model=model,
    tokenizer=tokenizer,
    dataloader=dataloader,
    device=device,
)


In [11]:
# Evaluate with BLEU and ROUGE metrics
# trust_remote_code=True pre-approves the metric loading scripts so this cell does
# not stop at an interactive "[y/N]" prompt during Restart & Run All.
# NOTE(review): this runs loader code fetched from the Hub - confirm that is acceptable here.
bleu_metric = load_metric("bleu", trust_remote_code=True)
# BLEU is fed whitespace-split tokens: one token list per prediction and a
# list of reference token lists (here a single reference) per example.
tokenized_predictions = [pred.split() for pred in predictions]
tokenized_references = [[ref.split()] for ref in references]
bleu_score = bleu_metric.compute(predictions=tokenized_predictions, references=tokenized_references)
print(f"BLEU Score: {bleu_score['bleu']:.4f}")

# ROUGE is fed the raw decoded strings.
rouge_metric = load_metric("rouge", trust_remote_code=True)
rouge_score = rouge_metric.compute(predictions=predictions, references=references)
print(f"ROUGE Scores: {rouge_score}")


BLEU Score: 0.6190
The repository for rouge contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/rouge.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
ROUGE Scores: {'rouge1': AggregateScore(low=Score(precision=0.787643213389826, recall=0.8075028989798412, fmeasure=0.7895559203195422), mid=Score(precision=0.7998847597891748, recall=0.8183498709914723, fmeasure=0.8007225405745824), high=Score(precision=0.8117286988673749, recall=0.8276613754357659, fmeasure=0.8113364604518941)), 'rouge2': AggregateScore(low=Score(precision=0.6240407777684528, recall=0.6401979083206834, fmeasure=0.6248926163140319), mid=Score(precision=0.6403596820257177, recall=0.6562331747956835, fmeasure=0.6401312064016904), high=Score(precision=0.6565552045292704, recall=0.6716408031698622, fmeasure=0.6555394252876374)), 'rougeL': AggregateScore(low

*BLEU (Bilingual Evaluation Understudy) scores range from 0 to 1, with 1 being a perfect match.* A BLEU score of **0.6190** indicates that the model's predictions share substantial n-gram overlap with the reference corrections, suggesting a good level of accuracy in generating grammatically correct text.