In [None]:
%%capture
%pip install unsloth
%pip install datasets
%pip install evaluate 
%pip install rouge_score

In [None]:
import os
import sys

# Add the parent of the current working directory to the import path
# (presumably so the project-local `src` package resolves from this notebook).
nb_dir = os.path.dirname(os.getcwd())
already_importable = nb_dir in sys.path
if not already_importable:
    sys.path.append(nb_dir)

In [None]:
import json
import re

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from dotenv import dotenv_values
from huggingface_hub import login
from transformers import DataCollatorForSeq2Seq, TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

from src.config import Config

In [None]:
# Read key/value pairs from the .env file one directory above the notebook,
# so the Hugging Face token is never hard-coded here.
config = dotenv_values("../.env")

# Fall back to the process environment when the .env file is missing or does
# not define a non-empty HF_TOKEN.
hf_token = config.get("HF_TOKEN") or os.environ.get("HF_TOKEN")
if not hf_token:
    # Same exception type as the plain dict lookup raised, but with a message
    # that says what needs fixing.
    raise KeyError("HF_TOKEN is not set in ../.env or in the environment")

login(token=hf_token)

##### Reference Links For Fine-Tuning Llama 3.2 With Unsloth

1. [Fine-tuning Llama 3.2 Using Unsloth](https://www.kdnuggets.com/fine-tuning-llama-using-unsloth)
2. [Fine-tuning Llama 3 with Unsloth: A Beginner’s Guide](https://medium.com/@seekmeai/fine-tuning-llama-3-with-unsloth-a-beginners-guide-d239d48eaf71)


In [None]:
# https://huggingface.co/docs/datasets/en/loading#hugging-face-hub
dataset = load_dataset("TRnlp/MixSub")

# Use singular column names throughout: "Highlights" becomes "Highlight".
dataset = dataset.rename_column("Highlights", "Highlight")

# By default the model is trained and evaluated on the entire splits.
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Set this to True while prototyping to work on a small prefix of each split.
IS_TRIM_DS = True

if IS_TRIM_DS:
    # Upper bounds on how many examples to use for training and evaluation
    # during setup.
    TRAIN_DATASET_LENGTH = 2000
    EVAL_DATASET_LENGTH = 1000

    # Take the first N examples of each split. The count is clamped to the
    # split size so that a split shorter than the requested length is used in
    # full instead of making `select` fail on out-of-range indices.
    train_dataset = train_dataset.select(
        range(min(TRAIN_DATASET_LENGTH, len(train_dataset)))
    )
    eval_dataset = eval_dataset.select(
        range(min(EVAL_DATASET_LENGTH, len(eval_dataset)))
    )


# Check train dataset before appending 'Prompt' column
# train_dataset.to_pandas().head()
# eval_dataset.to_pandas().head()

In [None]:
def format_abstract_highlight_as_prompt(examples: dict, tokenizer):
    """Render a batch of (abstract, highlight) pairs as chat-formatted text.

    Each pair becomes a three-message conversation (system instruction, user
    abstract, assistant highlight) that is rendered to a single string with the
    tokenizer's chat template.

    Args:
        examples: Column-oriented batch with equal-length "Abstract" and
            "Highlight" sequences (the shape produced by a batched map).
        tokenizer: Object exposing `eos_token` and `apply_chat_template`.

    Returns:
        A dict with one key, "Prompt", holding one rendered string per pair.
    """
    # A tokenizer without an EOS token simply gets nothing appended.
    eos_token = tokenizer.eos_token or ""
    prompts: list[str] = []

    for abstract, highlight in zip(examples["Abstract"], examples["Highlight"]):
        row_json = [
            {
                "role": "system",
                "content": Config.SUMMARIZER_INSTRUCTION,
            },
            {
                "role": "user",
                "content": abstract,
            },
            {
                "role": "assistant",
                "content": highlight,
            },
        ]

        # tokenize=False yields a plain string, so no tensor format is requested.
        prompt = tokenizer.apply_chat_template(
            row_json,
            tokenize=False,
            add_generation_prompt=False,
        )

        # The text must end with EOS, otherwise generation will go on forever.
        # Chat templates such as Llama 3's already close every turn with that
        # token, so it is appended only when the template did not emit it;
        # adding it unconditionally would train on a doubled terminator.
        if eos_token and not prompt.rstrip().endswith(eos_token):
            prompt += eos_token

        prompts.append(prompt)

    return {
        "Prompt": prompts,
    }


In [None]:
# Account and model name of the fine-tuned model, plus the combined
# "<account>/<name>" repository id.
TRAINED_MODEL_ACNT = "AdityaMayukhSom"
TRAINED_MODEL_NAME = "Llama-3.2-1B-Instruct-bnb-4bit-MixSub"
TRAINED_MODEL_REPO = "/".join((TRAINED_MODEL_ACNT, TRAINED_MODEL_NAME))

# Use bfloat16 when the hardware supports it, float16 otherwise.
compute_dtype = torch.float16
if is_bfloat16_supported():
    compute_dtype = torch.bfloat16

# Load the 4-bit base model together with its tokenizer.
fast_language_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    max_seq_length=2048,
    dtype=compute_dtype,
    load_in_4bit=True,
)

# LoRA settings; adapters are attached to the listed projection modules.
lora_settings = {
    "r": 16,
    "target_modules": [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "up_proj",
        "down_proj",
        "gate_proj",
    ],
    "lora_alpha": 16,
    "lora_dropout": 0,
    "bias": "none",
    "use_gradient_checkpointing": "unsloth",
    "random_state": 42,
    "use_rslora": False,
    "loftq_config": None,
}

# Wrap the base model into the PEFT model that is trained below.
model = FastLanguageModel.get_peft_model(fast_language_model, **lora_settings)

In [None]:
# Add the "Prompt" column (the text the model is trained on) to both splits.
# The formatter runs on whole batches and receives the tokenizer as a keyword
# argument.
prompt_map_options = {
    "batched": True,
    "fn_kwargs": {"tokenizer": tokenizer},
}

train_dataset = train_dataset.map(
    format_abstract_highlight_as_prompt, **prompt_map_options
)
eval_dataset = eval_dataset.map(
    format_abstract_highlight_as_prompt, **prompt_map_options
)

# Check train dataset after adding 'Prompt' column
# train_dataset.to_pandas().head()
# eval_dataset.to_pandas().head()

In [None]:
# Display the "Prompt" value of the first training example as a sanity check.
train_dataset[0]["Prompt"]

#### Setup The Evaluation Metrics Calculator Function

[Reference To Calculate ROUGE Metrics](https://medium.com/@MUmarAmanat/llm-evaluation-with-rouge-0ebf6cf2aed4)


In [None]:
# https://huggingface.co/learn/nlp-course/chapter3/3
# evaluate.list_evaluation_modules(
#     # change this to None to list all metrics, but here we are interested only
#     # in 'metric', not in 'comparison' or 'measurement' in case 'rouge' or 'bleu'
#     # doesn't show up in the results, we may need to change it to something else
#     # to check what other options are available as module in the evaluate library.
#     module_type="metric",
#     include_community=True,
#     with_details=False,
# )

In [None]:
# https://huggingface.co/docs/evaluate/package_reference/loading_methods#evaluate.load.path
# Trainer.py does not have this, the following snippet is sourced from this link.
# https://github.com/huggingface/trl/issues/862#issuecomment-1896074498

# We need to compute the two metrics separately and merge their results.
# https://discuss.huggingface.co/t/log-multiple-metrics-while-training/8115/2
metric_bleu = evaluate.load("bleu")
metric_rouge = evaluate.load("rouge")


def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to the index of the highest score along the last axis.

    Args:
        logits: A tensor, or a tuple whose first element is the tensor.
        labels: Accepted for the callback signature; not used here.

    Returns:
        The argmax over the last dimension of the logits tensor.
    """
    token_scores = logits[0] if isinstance(logits, tuple) else logits
    return token_scores.argmax(dim=-1)


# Sentence boundary heuristic: whitespace that follows ".", "!" or "?".
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")


def _one_sentence_per_line(text):
    """Return `text` with its sentences separated by newlines."""
    pieces = (piece.strip() for piece in _SENTENCE_BOUNDARY.split(text.strip()))
    return "\n".join(piece for piece in pieces if piece)


def compute_metrics(eval_preds):
    """Compute BLEU and ROUGE between predicted and reference token ids.

    Args:
        eval_preds: A `(preds, labels)` pair. `preds` holds predicted token
            ids (or a tuple whose first element does) and `labels` the
            reference ids, with -100 marking positions that are not scored.
            Both are expected to be 2-D (example, position).

    Returns:
        A flat dict with every BLEU result under a "bleu_" prefix and every
        ROUGE result under a "rouge_" prefix.
    """
    # Here preds are all_preds and labels are label_ids/all_labels.
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.asarray(preds)
    labels = np.asarray(labels)

    # For a causal LM the prediction at position t is the model's guess for
    # the token at position t + 1, so drop the last prediction and the first
    # label to line the two arrays up.
    preds = preds[:, :-1]
    labels = labels[:, 1:]

    # Only positions with a real label take part in the score. Everything else
    # (masked prompt tokens, padding) is replaced by the pad token in BOTH
    # arrays; otherwise the predictions would still contain text for positions
    # the references do not cover. -100 itself cannot be decoded.
    scored = labels != -100
    pad_id = tokenizer.pad_token_id
    preds = np.where(scored & (preds != -100), preds, pad_id)
    labels = np.where(scored, labels, pad_id)

    # Decode generated and reference summaries into text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a newline after each sentence. Joining the string itself
    # with "\n" would put every single character on its own line, so split
    # into sentences first.
    decoded_preds = [_one_sentence_per_line(pred) for pred in decoded_preds]
    decoded_labels = [_one_sentence_per_line(label) for label in decoded_labels]

    scores_bleu = metric_bleu.compute(
        predictions=decoded_preds, references=decoded_labels
    )
    scores_rouge = metric_rouge.compute(
        predictions=decoded_preds, references=decoded_labels
    )

    # Merge both result dicts, prefixing the keys so they cannot collide.
    scores: dict = {}

    for key, score in scores_bleu.items():
        scores[f"bleu_{key}"] = score

    for key, score in scores_rouge.items():
        scores[f"rouge_{key}"] = score

    return scores


#### Setup Hyperparameters

We need to determine the optimal train and eval batch sizes so that the GPU is fully utilized without the Kaggle runtime expiring or crashing.


In [None]:
# Hyperparameters a reader is most likely to tune.
PER_DEVICE_TRAIN_BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = 4
WARMUP_STEPS = 5
SAVE_STEPS = 4
EVAL_STEPS = 4
MAX_STEPS = 60  # only used if the max_steps line below is re-enabled
NUM_TRAIN_EPOCHS = 3
SAVE_TOTAL_LIMIT = 2
LEARNING_RATE = 2e-4
WEIGHT_DECAY = 0.01

# With push_to_hub enabled the Trainer uploads to the Hub on every checkpoint
# save (every SAVE_STEPS steps by default), so trial runs would show up in the
# Hugging Face history. Upload automatically only when training on the full
# dataset; the explicit trainer.push_to_hub(...) call further down still works
# either way.
PUSH_TO_HUB = not IS_TRIM_DS

training_args = TrainingArguments(
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=WARMUP_STEPS,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    num_train_epochs=NUM_TRAIN_EPOCHS,  # Set this to 1 for one full training run
    save_total_limit=SAVE_TOTAL_LIMIT,
    save_steps=SAVE_STEPS,
    # max_steps = MAX_STEPS,
    learning_rate=LEARNING_RATE,
    # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    optim="adamw_8bit",
    weight_decay=WEIGHT_DECAY,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir=TRAINED_MODEL_NAME,
    report_to="none",
    load_best_model_at_end=True,
    push_to_hub=PUSH_TO_HUB,
)


# Collator that batches the tokenized examples for the trainer.
prompt_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Train on the rendered chat text stored in the 'Prompt' column.
    dataset_text_field="Prompt",
    # max_seq_length=MAX_SEQ_LEN,
    dataset_num_proc=2,
    packing=False,
    data_collator=prompt_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Header markers that open the instruction and the response part of a prompt.
INSTRUCTION_HEADER = "<|start_header_id|>system<|end_header_id|>"
RESPONSE_HEADER = "<|start_header_id|>assistant<|end_header_id|>"

# Re-wrap the trainer; going by the helper's name, the loss is presumably
# restricted to the response part (confirm against the unsloth docs).
trainer = train_on_responses_only(
    trainer,
    instruction_part=INSTRUCTION_HEADER,
    response_part=RESPONSE_HEADER,
)

### Model Training


In [None]:
def _bytes_to_gb(num_bytes):
    """Convert a byte count to gigabytes (1024-based), rounded to 3 decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


# Snapshot of GPU memory before training starts.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = _bytes_to_gb(torch.cuda.max_memory_reserved())
max_memory = _bytes_to_gb(gpu_stats.total_memory)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

trainer_stats = trainer.train()

# Peak reserved memory after training, in absolute terms and relative to the
# device total; the "for training" figures subtract the pre-training snapshot.
used_memory = _bytes_to_gb(torch.cuda.max_memory_reserved())
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_runtime = trainer_stats.metrics["train_runtime"]
print(f"{train_runtime} seconds used for training.")
print(f"{round(train_runtime / 60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")


### IMPORTANT: DO NOT RUN THIS WHILE TRYING OUT, AS IT SHOWS UP IN HUGGINGFACE HISTORY


In [None]:
# Push the trainer's model to the Hugging Face Hub, passing TRAINED_MODEL_NAME
# as the model name.
# NOTE(review): the commit message says "first epoch" while NUM_TRAIN_EPOCHS
# is 3 in the hyperparameter cell; confirm the message matches the run.
trainer.push_to_hub(
    commit_message="first epoch fine tuning on mixsub",
    model_name=TRAINED_MODEL_NAME,
    # language="en",
    # finetuned_from=MODEL_NAME,
    # dataset=DATASET_NAME
)

### Model Evaluation


In [None]:
# Evaluate the fine-tuned model on the eval dataset. The returned dict holds
# the evaluation loss together with the values produced by compute_metrics.
results = trainer.evaluate()

In [None]:
# Pretty-print the evaluation results as indented JSON.
print(json.dumps(results, indent=4))

{
    "eval_loss": 3.1856908798217773,
    "eval_model_preparation_time": 0.0092,
    "eval_bleu_bleu": 0.14854094732470374,
    "eval_bleu_precisions": [
        0.19877656421979512,
        0.18098445376371416,
        0.1342828077314344,
        0.10077618017559486
    ],
    "eval_bleu_brevity_penalty": 1.0,
    "eval_bleu_length_ratio": 4.982493144906138,
    "eval_bleu_translation_length": 47244,
    "eval_bleu_reference_length": 9482,
    "eval_rouge_rouge1": 0.3459109751281285,
    "eval_rouge_rouge2": 0.31609199783984276,
    "eval_rouge_rougeL": 0.27451221994786584,
    "eval_rouge_rougeLsum": 0.34688877011766794,
    "eval_runtime": 45.5213,
    "eval_samples_per_second": 0.659,
    "eval_steps_per_second": 0.176
}
