In [None]:
! pip install transformers nltk datasets peft torch evaluate rouge_score

In [None]:
import numpy as np 
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from datasets import Dataset,load_from_disk,load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

In [3]:
import os
from kaggle_secrets import UserSecretsClient

# Read the Weights & Biases API key through the Kaggle user-secrets client and
# export it, together with the W&B project name, as environment variables.
user_secrets = UserSecretsClient()
os.environ.update(
    WANDB_API_KEY=user_secrets.get_secret("WANDB_API_KEY"),
    WANDB_PROJECT="T5B-FT-FNHR-Entitle",
)

In [4]:
# Log in to Weights & Biases from the shell.
# NOTE(review): presumably picks up the WANDB_API_KEY exported in the previous cell - confirm.
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mankonbh[0m ([33mankonbh-university-of-leeds[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Export the secret stored as "HF_ACC_TOK" under the HF_TOKEN environment variable.
# Relies on `user_secrets` created in the W&B setup cell.
# NOTE(review): presumably the Hugging Face Hub token needed for `push_to_hub=True` later on - confirm.
os.environ["HF_TOKEN"] = user_secrets.get_secret("HF_ACC_TOK")

# Modelling

In [6]:
# Checkpoint id shared by the tokenizer and by every model loaded in this notebook.
model_name="google/flan-t5-base"
# Tokenizer matching that checkpoint; also used later to decode predictions in `compute_metrics`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [7]:
# Load the dataset by its repository id; later cells use its "train", "val" and "test" splits.
# NOTE(review): the variable name suggests the data is already tokenized - confirm,
# since no tokenization step is visible in the cells shown here.
tokenized_dataset=load_dataset("Ankonbh/Financial-News-Headlines-Reuters")

README.md:   0%|          | 0.00/639 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/6.64M [00:00<?, ?B/s]

data/val-00000-of-00001.parquet:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19661 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/4916 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8193 [00:00<?, ? examples/s]

In [8]:
# Model instance that gets fine-tuned below, initialised from the `model_name` checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [9]:
# Per-device batch size used for both training and evaluation.
batch_size = 16

# Training configuration for the fine-tuning run.
args = Seq2SeqTrainingArguments(
    # `model_name` is a Hub id of the form "<org>/<name>". Keep only the last
    # component so checkpoints go to a single folder instead of a nested
    # "google/..." directory created by the slash.
    output_dir=f"{model_name.split('/')[-1]}-finetuned-FNHR",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    # Evaluate on generated sequences so `compute_metrics` can decode them.
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
    report_to="wandb",
)

In [10]:
# Batch collator built from the tokenizer and the model to be fine-tuned.
# The same collator instance is passed to both trainers created in this notebook.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [11]:
!pip install evaluate rouge_score

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=a557070f5122b77b04608d6af028adf3ebf60097764e72ff338f83b70638d63c
  Stored in directory: /ro

In [12]:
# Load the ROUGE metric; `compute_metrics` below reads it through the global `metric`.
from evaluate import load
metric = load("rouge")

Downloading builder script: 0.00B [00:00, ?B/s]

In [13]:
def compute_metrics(eval_pred):
    """Score generated sequences against reference sequences with ROUGE.

    Relies on the notebook-level ``tokenizer`` (decoding and pad id), ``metric``
    (the loaded ROUGE metric) and ``nltk`` (sentence splitting).

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays. Either
            array may contain ``-100`` as a filler value. ``predictions`` may
            also arrive as a tuple, in which case its first element is used.

    Returns:
        dict: every score returned by ``metric.compute`` multiplied by 100,
        plus ``gen_len`` (mean number of non-pad tokens per prediction). All
        values are rounded to 4 decimal places.
    """
    predictions, labels = eval_pred
    # The trainer can hand over a tuple of model outputs instead of a bare
    # array; the generated token ids are its first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 cannot be decoded, so replace it with the pad id on both sides.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        use_aggregator=True,
    )
    # Report scores as percentages.
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length (pad positions are not counted).
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [14]:
# Trainer for the fine-tuning run: ties together the model, its training
# arguments, the train/validation splits, the collator, the tokenizer and the
# ROUGE-based metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Launch the fine-tuning run configured in `args`.
trainer.train()

In [None]:
# Run the trainer's model over the "test" split; the returned object carries the metrics shown next.
prediction_output = trainer.predict(tokenized_dataset['test'])

In [21]:
# Test-split metrics for the fine-tuned model (keys carry a "test_" prefix).
prediction_output.metrics

{'test_loss': 1.8388476371765137,
 'test_rouge1': 46.9203,
 'test_rouge2': 22.3549,
 'test_rougeL': 42.7079,
 'test_rougeLsum': 42.7135,
 'test_gen_len': 15.9284,
 'test_runtime': 221.1821,
 'test_samples_per_second': 37.042,
 'test_steps_per_second': 1.162}

In [19]:
# Second, separate instance of the same checkpoint; in the cells shown it is never
# trained and serves as the baseline for comparison.
model_pt = AutoModelForSeq2SeqLM.from_pretrained(model_name)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [20]:
# Per-device batch size used for both training and evaluation.
batch_size = 16

# Arguments for the baseline trainer. Values mirror the fine-tuning
# configuration, except that nothing is pushed to the Hub.
args_pt = Seq2SeqTrainingArguments(
    # `model_name` is a Hub id of the form "<org>/<name>". Keep only the last
    # component so the output directory is a single folder instead of a nested
    # "google/..." directory created by the slash.
    output_dir=f"{model_name.split('/')[-1]}-pretrained-FNHR",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    # Evaluate on generated sequences so `compute_metrics` can decode them.
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    report_to="wandb",
)

# Trainer wrapping the untouched checkpoint; in the cells shown it is only
# used for prediction, never for training.
trainer_pt = Seq2SeqTrainer(
    model=model_pt,
    args=args_pt,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Score the baseline model on the same "test" split used for the fine-tuned model.
prediction_output_pt = trainer_pt.predict(tokenized_dataset['test'])

In [25]:
# Test-split metrics for the baseline model (keys carry a "test_" prefix).
prediction_output_pt.metrics

{'test_loss': 2.253880739212036,
 'test_rouge1': 39.1419,
 'test_rouge2': 16.754,
 'test_rougeL': 35.6027,
 'test_rougeLsum': 35.6048,
 'test_gen_len': 14.7954,
 'test_runtime': 222.9179,
 'test_samples_per_second': 36.753,
 'test_steps_per_second': 1.153}