In [None]:
! pip install transformers nltk datasets peft torch evaluate rouge_score

In [None]:
import numpy as np 
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from datasets import Dataset,load_from_disk,load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, TaskType, PeftConfig
import torch
from evaluate import load

In [5]:
import os
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
os.environ["WANDB_API_KEY"] =user_secrets.get_secret("WANDB_API_KEY")
os.environ["WANDB_PROJECT"] = "T5B-PEFT-FNHR-Entitle"

In [6]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mankonbh[0m ([33mankonbh-university-of-leeds[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
os.environ["HF_TOKEN"] = user_secrets.get_secret("HF_ACC_TOK")

In [None]:
model_name="google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
metric = load("rouge")

In [9]:
tokenized_dataset=load_dataset("Ankonbh/Financial-News-Headlines-Reuters")

README.md:   0%|          | 0.00/639 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/6.64M [00:00<?, ?B/s]

data/val-00000-of-00001.parquet:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19661 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/4916 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8193 [00:00<?, ? examples/s]

In [3]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05, 
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

print(lora_config)

LoraConfig(task_type=<TaskType.SEQ_2_SEQ_LM: 'SEQ_2_SEQ_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=16, target_modules=None, exclude_modules=None, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)


In [4]:
model = get_peft_model(model, lora_config)

In [5]:
model.print_trainable_parameters()

trainable params: 1,769,472 || all params: 249,347,328 || trainable%: 0.7096


In [20]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [23]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Extract a few results
    result = {key: value * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [22]:
batch_size = 16
args = Seq2SeqTrainingArguments(
    f"{model_name}-PEFT-FNHR",
    eval_strategy="epoch",
    learning_rate=3e-4, 
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=6,
    predict_with_generate=True,
    fp16=True,
    save_strategy="epoch",
    metric_for_best_model="eval_rougeL",
    report_to="wandb",
    save_total_limit=3,
    push_to_hub=False,
)

In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
trainer.train()

In [None]:
trainer.model.push_to_hub(repo_id="Ankonbh/flan-t5-base-PEFT-FNHR")

In [None]:
prediction_output = trainer.predict(tokenized_dataset['test'])

In [29]:
prediction_output.metrics

{'test_loss': 1.9054192304611206,
 'test_rouge1': 46.1543,
 'test_rouge2': 21.7384,
 'test_rougeL': 42.0157,
 'test_rougeLsum': 42.0307,
 'test_gen_len': 15.9956,
 'test_runtime': 254.6074,
 'test_samples_per_second': 32.179,
 'test_steps_per_second': 1.009}