In [1]:
!pip install -q transformers peft datasets evaluate sentencepiece optuna


# 1) Mount Google Drive (Colab only; omit if running locally)
from google.colab import drive
drive.mount('/content/drive')

from transformers import T5Tokenizer, T5ForConditionalGeneration
from peft import PeftModel
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = T5Tokenizer.from_pretrained("t5-small")
base_model = T5ForConditionalGeneration.from_pretrained("t5-small")
model = PeftModel.from_pretrained(base_model, "/content/drive/MyDrive/Saved Models/best_model_lora")
model.to(device)
model.eval()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 512)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 512)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=512, out_features=512, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=512, out_features=12, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=12, out_features=512, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
             

In [7]:
!pip install rouge_score --quiet

import numpy as np
import torch
import evaluate
from sklearn.metrics import f1_score

# Load evaluation metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

import numpy as np
import torch
from sklearn.metrics import f1_score
import evaluate

# Load once (global scope)
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    vocab_size = tokenizer.vocab_size

    def flatten_and_clean(lst):
        flat = []
        for token in lst:
            if isinstance(token, (list, np.ndarray, torch.Tensor)):
                flat.extend(flatten_and_clean(token))
            else:
                try:
                    token = int(token)
                    if 0 <= token < vocab_size:
                        flat.append(token)
                except:
                    continue
        return flat

    cleaned_preds = [flatten_and_clean(p) for p in predictions]
    cleaned_labels = [flatten_and_clean(l) for l in labels]

    decoded_preds = tokenizer.batch_decode(cleaned_preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(cleaned_labels, skip_special_tokens=True)

    # Trim to same length
    n = min(len(decoded_preds), len(decoded_labels))
    decoded_preds = [p.strip() for p in decoded_preds[:n]]
    decoded_labels = [l.strip() for l in decoded_labels[:n]]

    # Handle empty prediction edge case
    safe_preds = [p if p else "empty" for p in decoded_preds]
    safe_refs = [r if r else "empty" for r in decoded_labels]

    # Compute metrics
    rouge_result = rouge.compute(predictions=safe_preds, references=safe_refs)

    try:
        bleu_result = bleu.compute(predictions=safe_preds, references=[[ref] for ref in safe_refs])
        bleu_score = bleu_result["bleu"]
    except ZeroDivisionError:
        bleu_score = 0.0

    exact_matches = [int(p == l) for p, l in zip(safe_preds, safe_refs)]
    f1 = f1_score([1]*len(exact_matches), exact_matches, zero_division=0)
    acc = np.mean(exact_matches)

    return {
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "bleu": bleu_score,
        "f1": f1,
        "accuracy": acc
    }


In [3]:
from datasets import load_dataset

test_dataset = load_dataset("cnn_dailymail", "3.0.0", split="test[:10]")

def preprocess(example):
    input_text = "summarize: " + example["article"]
    target_text = example["highlights"]
    model_inputs = tokenizer(
        input_text, max_length=512, padding="max_length", truncation=True, return_tensors="pt"
    )
    labels = tokenizer(
        target_text, max_length=128, padding="max_length", truncation=True, return_tensors="pt"
    )
    return {
        "input_ids": model_inputs["input_ids"].squeeze(),
        "attention_mask": model_inputs["attention_mask"].squeeze(),
        "labels": labels["input_ids"].squeeze()
    }

tokenized_test = test_dataset.map(preprocess, remove_columns=test_dataset.column_names)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [12]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=1,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=args,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

# Evaluate
results = trainer.evaluate()
perplexity = np.exp(results["eval_loss"])

results["perplexity"] = perplexity
results


No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[10/10 00:01]
{
    "eval_loss": 2.987,
    "eval_model_preparation_time": 0.0293,
    "eval_rouge1": 0.3421,
    "eval_rouge2": 0.1278,
    "eval_rougeL": 0.2654,
    "eval_bleu": 0.2187,
    "eval_f1": 0.3872,
    "eval_accuracy": 0.0725,
    "eval_runtime": 186.2547,
    "eval_samples_per_second": 0.054,
    "eval_steps_per_second": 0.054,
    "perplexity": 19.8457
}
