In [None]:
!pip install transformers datasets evaluate



In [None]:
! pip install -q evaluate rouge_score

  Preparing metadata (setup.py) ... done
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 84.0/84.0 kB 7.0 MB/s eta 0:00:00
  Building wheel for rouge_score (setup.py) ... done


In [None]:
!pip install -q transformers[torch] datasets

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
import numpy as np
import evaluate
import os


In [None]:
# Switch off the Weights & Biases integration for this session.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
# Map each split name to its tab-separated file.
data_files = dict(
    train="ref_data.tsv",
    validation="dev_mms.tsv",
)

# The "csv" builder reads TSV once the delimiter is overridden.
dataset = load_dataset(
    "csv",
    data_files=data_files,
    delimiter="\t",
)

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['id', 'date', 'headline', 'article', 'abstract'],
        num_rows: 3570
    })
    validation: Dataset({
        features: ['id', 'date', 'headline', 'article', 'abstract'],
        num_rows: 680
    })
})


In [None]:
# Load tokenizer and model up front: the data collator below needs the model
# *object* (not the checkpoint name) to use its decoder-input preparation hook.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")


def preprocess_function(examples):
    """Tokenize a batch of articles and their reference abstracts.

    Args:
        examples: A batch (dict of column name -> list) containing at least
            the ``"article"`` and ``"abstract"`` columns.

    Returns:
        The tokenizer output for the prefixed articles, with an added
        ``"labels"`` entry holding the token ids of the abstracts.
    """
    # T5 is prompted with a task prefix; "summarize: " selects summarization.
    inputs = ["summarize: " + doc for doc in examples["article"]]
    # NOTE(review): the t5-small tokenizer advertises a 512-token limit; 1024 is
    # kept from the original but increases memory use — confirm it is intended.
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # `text_target` tokenizes the abstracts explicitly as targets.
    labels = tokenizer(text_target=examples["abstract"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Pass the model instance: a string has no `prepare_decoder_input_ids_from_labels`
# attribute, so the collator would silently skip that step.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)



In [None]:
rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generated length for an evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also arrive as a tuple whose first element
            holds the generated ids.

    Returns:
        dict mapping each ROUGE metric name, plus ``"gen_len"``, to a value
        rounded to 4 decimal places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding sentinel, not a vocabulary id. Replace it in both arrays
    # before decoding; otherwise decoding can fail and the padded positions
    # would be counted as generated tokens in gen_len.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Generated length = number of non-padding tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Training / evaluation configuration for fine-tuning.
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-small-finetuned-mms",
    # NOTE(review): newer transformers releases rename this to `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    # Score ROUGE on generated text rather than teacher-forced logits.
    predict_with_generate=True,
    # Match the 128-token cap used for the labels. Without this, generation
    # falls back to the model's default length (the logged Gen Len of ~20 is
    # consistent with that), so ROUGE is computed on truncated summaries.
    generation_max_length=128,
    generation_num_beams=5,
    fp16=True,
    logging_dir="./logs",
    # Explicitly disable logging integrations, as the WANDB_DISABLED
    # deprecation warning recommends.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Name the two splits so the trainer call reads cleanly.
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset["validation"]

# Wire model, data, and metric function into a single trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

  trainer = Seq2SeqTrainer(


In [None]:
# Run fine-tuning. Left as the cell's last expression so the returned
# training summary is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,4.092807,0.1091,0.0298,0.0948,0.0943,19.9824
2,No log,3.995559,0.1072,0.0309,0.0957,0.0952,19.8765
3,4.427200,3.952527,0.1056,0.0327,0.0953,0.0949,19.9382
4,4.427200,3.933044,0.1071,0.0346,0.0974,0.097,19.9338
5,4.196700,3.92878,0.1105,0.0363,0.1002,0.0998,19.9147


TrainOutput(global_step=1120, training_loss=4.295892170497349, metrics={'train_runtime': 1328.1441, 'train_samples_per_second': 13.44, 'train_steps_per_second': 0.843, 'total_flos': 4831499299848192.0, 'train_loss': 4.295892170497349, 'epoch': 5.0})