In [1]:
# Ref
# https://huggingface.co/docs/evaluate/main/en/transformers_integrations
# https://www.kaggle.com/code/fadyelkbeer/mt5-multilingual-xlsum
# https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_&_Biases.ipynb
# https://docs.wandb.ai/guides/integrations/huggingface

In [2]:
!pip install evaluate rouge_score wandb transformers[torch] sentencepiece -q
!pip install accelerate -U -q
!pip install datasets -U -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires pandas<1.6.0dev0,>=1.3, but you have pandas 2.0.2 which is incompatible.
cudf 23.8.0 requires protobuf<5,>=4.21, but you have protobuf 3.20.3 which is incompatible.
cuml 23.8.0 requires dask==2023.7.1, but you have dask 2023.9.0 which is incompatible.
dask-cuda 23.8.0 requires dask==2023.7.1, but you have dask 2023.9.0 which is incompatible.
dask-cuda 23.8.0 requires pandas<1.6.0dev0,>=1.3, but you have pandas 2.0.2 which is incompatible.
dask-cudf 23.8.0 requires dask==2023.7.1, but you have dask 2023.9.0 which is incompatible.
dask-cudf 23.8.0 requires pandas<1.6.0dev0,>=1.3, but you 

In [3]:
import os
import shutil

In [4]:
# When True, later cells load the model from the W&B model artifact and call
# trainer.train(resume_from_checkpoint=...); when False they start from the
# "csebuetnlp/mT5_multilingual_XLSum" checkpoint.
# (The spelling of the name is kept because other cells reference it.)
continute_train = False # @param {type:"boolean"}

## Wandb Setup

In [5]:
import wandb

# Never store an API key in the notebook. Called without `key`, wandb.login()
# uses the WANDB_API_KEY environment variable or an existing netrc entry, and
# otherwise prompts for the key interactively.
# NOTE: the key that was previously hardcoded in this cell has been exposed
# and should be revoked in the W&B account settings.
wandb.login()

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
# @title ##Initialize wandb
project_name = "text-summarization-model"

# Open the run that the rest of the notebook logs to and pulls artifacts through.
# A `name=...` argument can be added here to label the run explicitly.
run = wandb.init(
    project=project_name,
    job_type="train",
)

[34m[1mwandb[0m: Currently logged in as: [33mdylanonwic[0m ([33mdylanon[0m). Use [1m`wandb login --relogin`[0m to force relogin


## Process data function

In [7]:
from transformers import AutoTokenizer

# Tokenizer of the checkpoint that is fine-tuned below; `legacy=False` is
# passed through to the tokenizer unchanged.
tokenizer = AutoTokenizer.from_pretrained(
    "csebuetnlp/mT5_multilingual_XLSum",
    legacy=False,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]



## Setup evaluation

In [None]:
import nltk
import evaluate
import numpy as np  # imported here so this cell does not rely on a later cell

nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE scores for one evaluation pass.

    Args:
        eval_preds: a ``(predictions, labels)`` pair of token-id arrays as
            handed over by the trainer. ``predictions`` may itself be a tuple,
            in which case its first element is used.

    Returns:
        The dictionary produced by the ROUGE metric's ``compute``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id, so it
    # must be swapped for the pad token before decoding. This is done for the
    # predictions as well as the labels, since gathered predictions can be
    # padded with -100 too.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)



## Get model

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq
from datasets import load_dataset
import datasets
import numpy as np

In [None]:
model_dir = "./imported_model"

if continute_train:
    # Registering the artifact on the run also records it as an input of this run.
    artifact = run.use_artifact('dylanon/text-summarization-model/summarization-model:latest', type='model')

    # Download unless a saved model is already present. Checking for config.json
    # instead of the bare directory matters because the non-resume branch below
    # uses the same folder as its Hub cache: the folder can exist without
    # containing a model that from_pretrained(model_dir) can load.
    if not os.path.isfile(os.path.join(model_dir, "config.json")):
        artifact_dir = artifact.download(root=model_dir)

    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

else:
    # Fresh start from the public checkpoint; downloads are cached under model_dir.
    model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum", cache_dir=model_dir)

In [None]:
# Collator used by the trainer to assemble batches; it is given both the
# tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

## Get datasets

In [None]:
# load dataset-artifact
artifact_dir = './imported_datasets'
use_artifact = "dylanon/text-summarization-model/tokenized-dataset:latest"

# Only fetch the artifact when there is no local copy yet.
if not os.path.exists(artifact_dir):
    artifact = run.use_artifact(use_artifact, type='dataset')
    artifact_dir = artifact.download(root=artifact_dir)

tokenized_dataset = datasets.load_from_disk(artifact_dir)

# Keep the first 10,000 rows (or all of them if there are fewer). `select`
# keeps the dataset's schema and avoids copying the rows into Python objects,
# which slicing followed by Dataset.from_dict would do.
subset_size = min(10_000, len(tokenized_dataset))
tokenized_dataset = tokenized_dataset.select(range(subset_size))

# Unshuffled split: the last 10% of the subset becomes the "test" split.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=False, seed=1150)

In [None]:
tokenized_dataset

## Train config

In [None]:
# Hyperparameters; the next cell unpacks this mapping straight into
# Seq2SeqTrainingArguments, so every key must be a valid argument name.
# NOTE(review): this assignment rebinds the module attribute `wandb.config` to
# a plain dict instead of updating the active run's config object — confirm
# that this is intended.
wandb.config = dict(
    evaluation_strategy="steps",
    auto_find_batch_size=True,
    # per_device_train_batch_size=32,
    # per_device_eval_batch_size=8,
    # NOTE(review): the original comment called 5e-4 the default; Transformers'
    # documented default learning rate is 5e-5.
    learning_rate=5e-4,
    warmup_steps=50,
    # NOTE(review): the original comment called 0.01 the default; Transformers'
    # documented default weight decay is 0.
    weight_decay=0.01,
    # lr_scheduler_type="linear",
    gradient_accumulation_steps=32,  # 16
    eval_accumulation_steps=64,
    gradient_checkpointing=True,
    # optim="adamw_bnb_8bit",
    num_train_epochs=1,
    # max_steps=10000,

    save_steps=0,
    eval_steps=100,
    logging_steps=10,
    save_total_limit=0,
    load_best_model_at_end=False,
    fp16=True,  # this can use with cuda only
    predict_with_generate=True,
)

In [None]:
# Training arguments: fixed output/reporting settings plus every entry of the
# wandb.config mapping.
training_args = Seq2SeqTrainingArguments(output_dir="./results", report_to="wandb", **wandb.config)

# Trainer wiring: model, data splits, batching and the ROUGE metric callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# @title ##Train
# remove unused variable to free memory
# (this line was previously missing its `#`, which made the whole cell a SyntaxError)
import gc
import torch

# train
if continute_train:
    trainer.train(resume_from_checkpoint='./imported_model')
else:
    trainer.train()


directory_path = "./exported_model"

# Clear any previous export so the directory only holds files from this run.
# Deletion is best-effort: a failure is reported and the export still proceeds.
if os.path.exists(directory_path):
    try:
        # Delete the directory and its contents
        shutil.rmtree(directory_path)
        print(f"Directory '{directory_path}' deleted successfully.")
    except Exception as e:
        print(f"Error deleting '{directory_path}': {str(e)}")


# export model to file
trainer.save_model(directory_path)

In [None]:
# import gc
# from torch import cuda
# cuda.empty_cache()
# gc.collect()

In [None]:
# @title ##Evaluation
# Runs evaluation with the eval dataset the trainer was constructed with;
# an `eval_dataset=...` argument can be passed to evaluate a different split.
evaluation_results = trainer.evaluate()
evaluation_results

In [None]:
# @title ##Upload model to wandb
export_dir = "./exported_model"

# Package the exported model as a W&B model artifact. add_dir picks up the
# whole directory (including any sub-directories), which avoids the manual
# per-file loop and its shadowing of the builtin `dir`.
art = wandb.Artifact("summarization-model", type="model")
art.add_dir(export_dir)

# Log the evaluation metrics, then upload the artifact to the active run.
wandb.log(evaluation_results)
wandb.log_artifact(art)

In [None]:
# from time import time
# from google.colab import runtime
# time.sleep(300)
# runtime.unassign()