# Overview

In this notebook, we will fine-tune Mistral 7B for a multilabel classification task.

In [None]:
!pip install transformers==4.36.2
!pip install accelerate==0.25.0
!pip install evaluate==0.4.1
!pip install datasets==2.15.0
!pip install peft==0.7.1
!pip install bitsandbytes==0.41.3

In [None]:
import os
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Credentials are read from the Kaggle secrets client rather than hard-coded.
user_secrets = UserSecretsClient()

# Authenticate against the Hugging Face Hub.
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

# Weights & Biases settings and the base checkpoint name are exposed as
# environment variables; later cells read them back with os.getenv.
os.environ.update({
    "WANDB_API_KEY": user_secrets.get_secret("WANDB_API_KEY"),
    "WANDB_PROJECT": "Fine-tune-models-with-QLoRA",
    "WANDB_NOTES": "Fine tune model with QLoRA",
    "WANDB_NAME": "ft-mistral-with-customize-ds-with-QLoRA",
    "MODEL_NAME": "mistralai/Mistral-7B-v0.1",
})

# Loading the dataset

Here we use the function `iterative_train_test_split` from skmultilearn. This creates an even split for unbalanced multilabel datasets for us. 

In [None]:
import csv
import random
import numpy as np
from skmultilearn.model_selection import iterative_train_test_split
from datasets import Dataset, DatasetDict

# Seed both RNGs: `random` drives the shuffle below, and NumPy's global RNG is
# seeded as well so that any NumPy-based randomness inside the stratified split
# is repeatable across runs.
random.seed(0)
np.random.seed(0)

# Read the CSV; the explicit encoding keeps parsing independent of the platform
# default, and the header row is consumed before the data rows are materialized.
with open('/kaggle/input/multilabel-classification-dataset/train.csv', newline='', encoding='utf-8') as instance:
    reader = csv.reader(instance, delimiter=',')
    header_row = next(reader)
    data = list(reader)

# shuffle data
random.shuffle(data)

# reshape: column 0 is the id, columns 1 and 2 are merged into one prompt string,
# and all remaining columns are treated as the binary label indicators
idx, text, labels = list(
    zip(*[(int(row[0]),
        f'Title:{row[1].strip()}\n\nAbstract: {row[2].strip()}',
        row[3:]) for row in data]))
labels = np.array(labels, dtype=int)

# create label weights: one minus each label's share of all positive entries,
# so labels with fewer positives get a larger weight
label_weights = 1 - labels.sum(axis=0) / labels.sum()

# stratified train test split for multilabel datasets; the row indices are split
# in place of the texts so the texts can be looked up afterwards
row_ids = np.arange(len(labels))
train_idx, y_train, val_idx, y_val = iterative_train_test_split(row_ids[:, np.newaxis], labels, test_size=0.1)
x_train = [text[i] for i in train_idx.flatten()]
x_val = [text[i] for i in val_idx.flatten()]

# create dataset in hf format
ds = DatasetDict({
    'train': Dataset.from_dict({'text': x_train, 'labels': y_train}),
    'val': Dataset.from_dict({'text': x_val, 'labels': y_val})
})
ds

# Loading tokenizer and define preprocess function

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the checkpoint named in MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(
    os.getenv('MODEL_NAME')
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer

In [None]:
import functools

def preprocess_func(examples, tokenizer):
    """Tokenize a batch of examples and carry its labels along.

    Args:
        examples: mapping with a 'text' entry (passed to the tokenizer) and a
            'labels' entry (copied through unchanged).
        tokenizer: callable applied to ``examples['text']``; its return value
            must support item assignment.

    Returns:
        The tokenizer output with an added 'labels' entry.
    """
    encoded = tokenizer(examples['text'])
    encoded['labels'] = examples['labels']
    return encoded

# Bind the tokenizer once, tokenize every split in batches, and have the
# resulting dataset return torch tensors.
tokenize_batch = functools.partial(preprocess_func, tokenizer=tokenizer)
tokenized_ds = ds.map(tokenize_batch, batched=True).with_format('torch')
tokenized_ds

# Loading the model

We download Mistral 7B and quantize it to 4-bit NF4 (as in QLoRA), then train it with LoRA adapters.

In [None]:
import torch
from transformers import BitsAndBytesConfig, AutoModelForSequenceClassification
from peft import LoraConfig, TaskType, prepare_model_for_kbit_training, get_peft_model

# 4-bit NF4 quantization settings used when loading the base model
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,  # bfloat16 for special hardware, we use float16
)

# LoRA adapter settings for a sequence-classification task
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,  # the dimension of the low-rank matrices
    lora_alpha=8,  # scaling factor for LoRA activations vs pre-trained weight activations
    lora_dropout=0.05,  # dropout probability of the LoRA layers
    bias='none',
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],  # modules that receive adapters
)

# Load the quantized base model with one output per label column.
model = AutoModelForSequenceClassification.from_pretrained(
    os.getenv('MODEL_NAME'),
    num_labels=labels.shape[1],
    quantization_config=quantization_config,
)

# Prepare the quantized model for k-bit training, then wrap it with the adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

# Tell the model which token id is used for padding.
model.config.pad_token_id = tokenizer.pad_token_id
model

# Training

Before we start training, we have to define some custom functions that our trainer will use: a data collator and metrics.

**Data Collator**

We need to tell the trainer how it should preprocess batches coming from the dataset before they can be passed to the model.

**Metrics**

Furthermore, we need to pass a function to the trainer which defines the evaluation metrics we want to compute in addition to the loss.

In [None]:
from sklearn.metrics import f1_score

# custom batch preprocessor: pads variable-length examples into one batch
def collate_fn(batch, tokenizer):
    """Assemble a list of tokenized examples into a padded batch.

    Args:
        batch: list of dicts, each holding tensors under 'input_ids',
            'attention_mask' and 'labels'.
        tokenizer: object whose ``pad_token_id`` is used to fill 'input_ids'.

    Returns:
        dict with 'input_ids' and 'attention_mask' right-padded to the longest
        example in the batch (batch dimension first) and 'labels' stacked
        along a new first dimension.
    """
    pad_sequence = torch.nn.utils.rnn.pad_sequence

    token_rows = [example['input_ids'] for example in batch]
    mask_rows = [example['attention_mask'] for example in batch]
    label_rows = [example['labels'] for example in batch]

    return {
        'input_ids': pad_sequence(
            token_rows, batch_first=True, padding_value=tokenizer.pad_token_id
        ),
        # padded positions get mask value 0
        'attention_mask': pad_sequence(
            mask_rows, batch_first=True, padding_value=0
        ),
        'labels': torch.stack(label_rows),
    }

# metrics reported at evaluation time
def compute_metrics(p):
    """Compute micro, macro and weighted F1 for multilabel predictions.

    Args:
        p: pair ``(predictions, labels)``; predictions are thresholded at 0,
            so they are presumably raw logits (a logit above 0 corresponds to
            a sigmoid probability above 0.5).

    Returns:
        dict with the keys 'f1_micro', 'f1_macro' and 'f1_weighted'.
    """
    scores, targets = p
    # binarize once and reuse the result for all three averages
    predicted = scores > 0
    return {
        'f1_micro': f1_score(targets, predicted, average='micro'),
        'f1_macro': f1_score(targets, predicted, average='macro'),
        'f1_weighted': f1_score(targets, predicted, average='weighted'),
    }

# Define custom trainer

We need to define a custom trainer class to be able to calculate our multilabel loss, which treats each output neuron as a binary classification instance. To be able to use our label weights for the loss, we also need to store them as an instance attribute in the `__init__` method so the `compute_loss` method has access to them.

In [None]:
# create custom trainer class to be able to pass label weights and calculate multilabel loss
from transformers import Trainer
import torch.nn.functional as F

class CustomTrainer(Trainer):
    """Trainer that optimizes a weighted multilabel binary cross-entropy loss.

    Each output logit is treated as an independent binary classifier, and the
    stored label weights are passed to the loss as ``pos_weight``.

    Args:
        label_weights: tensor of per-label weights used as ``pos_weight``.
        **kwargs: forwarded unchanged to ``Trainer.__init__``.
    """
    def __init__(self, label_weights, **kwargs):
        super().__init__(**kwargs)
        self.label_weights = label_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run a forward pass and return the weighted BCE-with-logits loss.

        Args:
            model: the model to call with ``inputs``.
            inputs: batch dict; its 'labels' entry is removed before the
                forward pass so the model does not compute its own loss.
            return_outputs: when True, return ``(loss, outputs)``.
            **kwargs: accepted and ignored, so the override keeps working if
                the Trainer passes additional keyword arguments.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")

        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Align the weights with the logits: same device, and float32 so the
        # loss is not silently promoted to float64 when the weights were built
        # from a float64 array.
        pos_weight = self.label_weights.to(device=logits.device, dtype=torch.float32)

        # compute custom loss in float32
        loss = F.binary_cross_entropy_with_logits(
            logits.to(torch.float32),
            labels.to(torch.float32),
            pos_weight=pos_weight
        )
        return (loss, outputs) if return_outputs else loss

In [None]:
from transformers import TrainingArguments

# The W&B run name doubles as the output directory name.
run_name = os.getenv('WANDB_NAME')

training_args = TrainingArguments(
    output_dir=run_name,
    run_name=run_name,
    report_to='wandb',
    # optimization
    learning_rate=1e-4,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluate and checkpoint once per epoch, then reload the best checkpoint
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# Collator with the tokenizer bound, so the trainer can call it with just the batch.
batch_collator = functools.partial(collate_fn, tokenizer=tokenizer)

trainer = CustomTrainer(
    label_weights=torch.tensor(label_weights, device=model.device),
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['val'],
    data_collator=batch_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Trainer.push_to_hub takes a commit message as its first positional argument,
# not a repository id, so the message is passed explicitly here. The target
# repository is derived from the training arguments (output_dir is set to
# WANDB_NAME, the same name the tokenizer is pushed under below).
trainer.push_to_hub(commit_message='End of training')

# PreTrainedTokenizer.push_to_hub does take the repository id first.
tokenizer.push_to_hub(os.getenv('WANDB_NAME'))