In [1]:
# Ref
# https://huggingface.co/docs/evaluate/main/en/transformers_integrations
# https://www.kaggle.com/code/fadyelkbeer/mt5-multilingual-xlsum
# https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_&_Biases.ipynb
# https://docs.wandb.ai/guides/integrations/huggingface

In [2]:
%pip install evaluate rouge_score -q

# Wandb Setup

In [3]:
import wandb
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()

# I have saved my API token with "wandb_api" as Label. 
# If you use some other Label make sure to change the same below. 
wandb_api = user_secrets.get_secret("wandb_api") 

wandb.login(key=wandb_api)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Process data function

In [4]:
from transformers import AutoTokenizer
import re
tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum",legacy=False)

Downloading (…)okenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]



In [5]:
def preprocess_function(dataset):
    text_token_len = 512
    sum_token_len = 100
    # Convert the text and summary columns to tensors.
    input_text = dataset["text"]
    target_text = dataset["sum"]

    # this line just follow model card in huggingface
    WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
    input_text = [WHITESPACE_HANDLER(text) for text in input_text]
    
    # Tokenize the input and target text.
    text_token = tokenizer(input_text, truncation=True, padding=True, max_length=text_token_len)
    sum_token = tokenizer(target_text, truncation=True, padding=True, max_length=sum_token_len)

    # model input
    model_inputs = text_token
    model_inputs["labels"] = sum_token["input_ids"]
    return model_inputs

# Setup evaluation

In [6]:
import nltk
import evaluate
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")



Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [7]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

# Train model

In [8]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq
from datasets import load_dataset
import numpy as np

In [9]:
model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Downloading pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

In [10]:
project_name = "text-summarization-model"
use_artifact = "dylanon/text-summarization-model/slices:v2"
train_dataset = "slice_0.parquet"
train_rows = 10000
run_name = "demo"
# with wandb.init(project=project_name) as run:

run = wandb.init(project=project_name, name=run_name, job_type="train")

# load dataset-artifact
artifact = run.use_artifact("dylanon/text-summarization-model/slices:v2", type='dataset')
artifact_dir = artifact.download()
    
# read artifact
dataset = load_dataset('parquet',data_files=(artifact_dir + "/" + train_dataset), split="train[:{}]".format(train_rows))
dataset = dataset.train_test_split(test_size=0.2)
    
# prepare data
tokenized_dataset = dataset.map(preprocess_function, batched=True)
    
# train
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    report_to="wandb",
    auto_find_batch_size = True,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    fp16=True, # this can use with cuda only
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# remove unused variable to free memory
import gc
import torch

del dataset
del tokenized_dataset
gc.collect()
torch.cuda.empty_cache()

# train
trainer.train()

# export model to file
trainer.save_model("./exported_model")


[34m[1mwandb[0m: Currently logged in as: [33m63050123[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact slices:v2, 6632.04MB. 11 files... 
[34m[1mwandb[0m:   11 of 11 files downloaded.  
Done. 0:0:26.5


Downloading and preparing dataset parquet/default to /root/.cache/huggingface/datasets/parquet/default-ed68884e7a518a50/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/default-ed68884e7a518a50/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,3.183173,0.360435,0.139613,0.332533,0.33159


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,2.796121,0.382214,0.162603,0.361021,0.361516
2,No log,2.739464,0.390615,0.172929,0.369338,0.369474


Downloading and preparing dataset parquet/default to /root/.cache/huggingface/datasets/parquet/default-a1b4b9c41d8eef6c/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/default-a1b4b9c41d8eef6c/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?ba/s]



0,1
eval/loss,█▂▁
eval/rouge1,▁▆█
eval/rouge2,▁▆█
eval/rougeL,▁▆█
eval/rougeLsum,▁▇█
eval/runtime,█▂▁
eval/samples_per_second,▁▇█
eval/steps_per_second,▁▇█
train/epoch,▁▁██
train/global_step,▁▃██

0,1
eval/loss,2.73946
eval/rouge1,0.39062
eval/rouge2,0.17293
eval/rougeL,0.36934
eval/rougeLsum,0.36947
eval/runtime,19.906
eval/samples_per_second,10.047
eval/steps_per_second,0.653
train/epoch,2.0
train/global_step,200.0


ValueError: too many values to unpack (expected 2)