<a href="https://www.kaggle.com/code/aisuko/ft-llm-on-a-custom-ds-for-multi-classification?scriptVersionId=161597378" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Overview

In this notebook, we will try to fine-tune Mistal 7b for a multiclass classification task.

In [1]:
!pip install transformers==4.36.2
!pip install accelerate==0.25.0
!pip install evaluate==0.4.1
!pip install datasets==2.15.0
!pip install peft==0.7.1
!pip install bitsandbytes==0.41.3

Collecting transformers==4.36.2
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.37.0
    Uninstalling transformers-4.37.0:
      Successfully uninstalled transformers-4.37.0
Successfully installed transformers-4.36.2
Collecting accelerate==0.25.0
  Downloading accelerate-0.25.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25

In [2]:
import os
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()

login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

os.environ["WANDB_API_KEY"]=user_secrets.get_secret("WANDB_API_KEY")
os.environ["WANDB_PROJECT"] = "Fine-tune-models-with-QLoRA"
os.environ["WANDB_NOTES"] = "Fine tune model with QLoRA"
os.environ["WANDB_NAME"] = "ft-mistral-with-customize-ds-with-QLoRA"
os.environ["MODEL_NAME"] = "mistralai/Mistral-7B-v0.1"

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
!accelerate estimate-memory ${MODEL_NAME} --library_name transformers

Loading pretrained config for `mistralai/Mistral-7B-v0.1` from `transformers`...
config.json: 100%|█████████████████████████████| 571/571 [00:00<00:00, 3.16MB/s]
┌────────────────────────────────────────────────────────┐
│  Memory Usage for loading `mistralai/Mistral-7B-v0.1`  │
├───────┬─────────────┬──────────┬───────────────────────┤
│ dtype │Largest Layer│Total Size│  Training using Adam  │
├───────┼─────────────┼──────────┼───────────────────────┤
│float32│  864.03 MB  │ 27.49 GB │       109.96 GB       │
│float16│  432.02 MB  │ 13.74 GB │        54.98 GB       │
│  int8 │  216.01 MB  │ 6.87 GB  │        27.49 GB       │
│  int4 │   108.0 MB  │ 3.44 GB  │        13.74 GB       │
└───────┴─────────────┴──────────┴───────────────────────┘


# Loading the dataset

Here we use the function `iterative_train_test_split` from skmultilearn. This creates an even split for unbalanced multilabel datasets for us. 

In [4]:
import csv
import random
import numpy as np
from skmultilearn.model_selection import iterative_train_test_split
from datasets import Dataset, DatasetDict

random.seed(0)

with open('/kaggle/input/multilabel-classification-dataset/train.csv', newline='') as instance:
    data=list(csv.reader(instance, delimiter=','))
    header_row=data.pop(0)

# shuffle data
random.shuffle(data)

# reshape
idx, text, labels=list(
    zip(*[(int(row[0]), 
        f'Title:{row[1].strip()}\n\nAbstract: {row[2].strip()}',
        row[3:]) for row in data]))
labels=np.array(labels, dtype=int)

# create label weights
label_weights=1-labels.sum(axis=0)/labels.sum()

# stratified train test split for multilabel datasets
row_ids=np.arange(len(labels))
train_idx,y_train, val_idx, y_val=iterative_train_test_split(row_ids[:,np.newaxis], labels, test_size=0.1)
x_train=[text[i] for i in train_idx.flatten()]
x_val=[text[i] for i in val_idx.flatten()]

# create dataset in hf format
ds=DatasetDict({
    'train': Dataset.from_dict({'text':x_train,'labels':y_train}),
    'val': Dataset.from_dict({'text':x_val,'labels':y_val})
})
ds.shape

{'train': (18884, 2), 'val': (2088, 2)}

In [5]:
smaller_train=ds['train'].select(range(1000))
smaller_val=ds['val'].select(range(500))
ds['train']=smaller_train
ds['val']=smaller_val
ds.shape

{'train': (1000, 2), 'val': (500, 2)}

# Loading tokenizer and define preprocess function

In [6]:
from transformers import AutoTokenizer

tokenizer=AutoTokenizer.from_pretrained(os.getenv('MODEL_NAME'))
tokenizer.pad_token=tokenizer.eos_token
tokenizer

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

LlamaTokenizerFast(name_or_path='mistralai/Mistral-7B-v0.1', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [7]:
import functools

def preprocess_func(examples, tokenizer):
    tokenized_inputs=tokenizer(examples['text'])
    tokenized_inputs['labels']=examples['labels']
    return tokenized_inputs

tokenized_ds=ds.map(functools.partial(preprocess_func, tokenizer=tokenizer), batched=True)
tokenized_ds=tokenized_ds.with_format('torch')
tokenized_ds

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    val: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 500
    })
})

# Loading the model

We download Mistral 7B and quantize it with nf4(QLoRA). And training it by using LoRA.

In [8]:
import torch
from transformers import BitsAndBytesConfig, AutoModelForSequenceClassification
from peft import LoraConfig, TaskType, prepare_model_for_kbit_training, get_peft_model

# quantization config
quantization_config= BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16, #bfloat16 for special hardware, we use float16
    llm_int8_enable_fp32_cpu_offload=True
)

# lora config
lora_config=LoraConfig(
    r=16, # the dimension of the low-rank matrices
    lora_alpha=8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout=0.05, # dropout probability of the LoRA layers
    bias='none',
    task_type=TaskType.SEQ_CLS
)

# load model
model=AutoModelForSequenceClassification.from_pretrained(
    os.getenv('MODEL_NAME'),
    quantization_config=quantization_config,
    device_map='auto',
    torch_dtype=torch.float16,
    num_labels=labels.shape[1]
)

model=prepare_model_for_kbit_training(model)
model=get_peft_model(model, lora_config)
model.config.pad_token_id=tokenizer.pad_token_id
model

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): MistralForSequenceClassification(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
     

# Training

Before we do the training, we have to define some custom functions that our trianer will use, like: data collator and metrics.

**Data Collator**

We need to tell the trainer how it should preprocess batches coming from the dataset before they can be passed to the model

**Metrics**

We furthermore need pass a function to the trainer which defindes the evaluation metrics we want to compute in addition to the loss.

In [9]:
from sklearn.metrics import f1_score

# define custom batch preprocessor
def collate_fn(batch, tokenizer):
    dict_keys=['input_ids', 'attention_mask', 'labels']
    d={k:[dic[k] for dic in batch] for k in dict_keys}
    d['input_ids']=torch.nn.utils.rnn.pad_sequence(
        d['input_ids'], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    d['attention_mask']=torch.nn.utils.rnn.pad_sequence(
        d['attention_mask'], batch_first=True, padding_value=0
    )
    d['labels']=torch.stack(d['labels'])
    return d

# define which metrics to compute for evaluation
def compute_metrics(p):
    predictions,labels=p
    f1_micro=f1_score(labels, predictions>0, average='micro')
    f1_macro=f1_score(labels, predictions>0, average='macro')
    f1_weighted=f1_score(labels, predictions>0, average='weighted')
    return {
        'f1_micro':f1_micro,
        'f1_macro':f1_macro,
        'f1_weighted': f1_weighted
    }

# Define customer trainer

We need to define a custom trainer class to able to calculate our multilabel loss which treats each output neuron as a binary classification instance. To be able to use our label weights for the loss we also need to define it as a class attribute in the `__init__` method so the `compute_loss` method has access to it.

In [10]:
# create custom trainer class to be able to pass label weights and calculate mutilabel loss
from transformers import Trainer
import torch.nn.functional as F

class CustomTrainer(Trainer):
    def __init__(self, label_weights, **kwargs):
        super().__init__(**kwargs)
        self.label_weights=label_weights
    def compute_loss(self, model, inputs, return_outputs=False):
        labels=inputs.pop("labels")
        
        #forward pass
        outputs=model(**inputs)
        logits=outputs.get("logits")
        
        # compute custom loss
        loss=F.binary_cross_entropy_with_logits(
            logits, 
            labels.to(torch.float32),
            pos_weight=self.label_weights
        )
        return (loss, outputs) if return_outputs else loss

2024-02-04 00:33:56.374767: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-04 00:33:56.374896: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-04 00:33:56.549207: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


The detail of the training parameters please check the notebnook [Fine-tuning Microsoft phi2](https://www.kaggle.com/code/aisuko/fine-tuning-microsoft-phi2)

In [11]:
from transformers import TrainingArguments

training_args=TrainingArguments(
    output_dir=os.getenv('WANDB_NAME'),
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=5, # number of steps before optimizing
    num_train_epochs=1,
    weight_decay=0.01,
    max_steps=100, # Total number of training steps
    optim="paged_adamw_8bit", # Keep the optimizer state and quantize it
#     bf16=True, # Do not supported in Kaggle environment, require Ampere....
    fp16=True, # use fp16 16bit(mixed) precision training instead of 32-bit training.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    report_to='wandb',
    run_name=os.getenv('WANDB_NAME')
)

trainer=CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['val'],
    tokenizer=tokenizer,
    data_collator=functools.partial(collate_fn, tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    label_weights=torch.tensor(label_weights, device=model.device)
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33murakiny[0m ([33mcausal_language_trainer[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


KeyboardInterrupt: 

In [None]:
trainer.push_to_hub(os.getenv('WANDB_NAME'))
tokenizer.push_to_hub(os.getenv('WANDB_NAME'))

# Credit

* https://medium.com/@lukas.hauzenberger/multilabel-classification-using-mistral-7b-on-a-single-gpu-with-quantization-and-lora-8f848b5237f3