In [None]:
!pip install transformers==4.46.3 accelerate scikit-multilearn peft datasets sentence-transformers bitsandbytes trl wandb

In [None]:
!pip install flash-attn --no-build-isolation

In [3]:
import os
import re
import time
import json
import torch
import random
import pandas as pd
import numpy as np
from datetime import datetime
from pydantic import BaseModel
from collections import defaultdict
from dataclasses import dataclass
from torch.utils.data import DataLoader
from scipy.special import expit as sigmoid
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

from trl import SFTTrainer
from datasets import Dataset, load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorWithPadding, DataCollatorForLanguageModeling, BitsAndBytesConfig

### Keys

A Hugging Face access token is required to upload the fine-tuned models to the Hugging Face Hub, and a Weights & Biases (WandB) API key is required to record the training metrics in WandB.

In [None]:
# Never paste secrets into the notebook source. Tokens that are already present
# in the environment are kept as they are; missing ones are requested through a
# hidden prompt so they do not end up in the saved notebook or its outputs.
from getpass import getpass

for _secret_name in ("HF_TOKEN", "WANDB_API_KEY"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"{_secret_name}: ")

### Function to create quantization config

In [None]:
def quantization_config(config):
  """
  Builds the bitsandbytes quantization settings from the notebook configuration.

  Parameters
  ==========
  config (object) : Configuration object exposing the attributes load_in_4bit,
                    bnb_4bit_quant_type, bnb_4bit_compute_dtype and
                    bnb_4bit_use_double_quant.

  Returns
  =======
  BitsAndBytesConfig: Quantization configuration populated from those attributes.
  """
  # The attribute names on `config` are the same as the BitsAndBytesConfig
  # keyword names, so the keyword arguments can be gathered by name.
  option_names = (
      "load_in_4bit",
      "bnb_4bit_quant_type",
      "bnb_4bit_compute_dtype",
      "bnb_4bit_use_double_quant",
  )
  bnb_kwargs = {name: getattr(config, name) for name in option_names}
  return BitsAndBytesConfig(**bnb_kwargs)

### Function to update model related settings

In [None]:
def update_model_related_settings(checkpoint, config):
  """
  Points the configuration at a base checkpoint and derives a timestamped
  name for the fine-tuned model from it.

  Parameters
  ==========
  checkpoint (str): Name or path of the base model checkpoint.
  config (object): Configuration object; it is modified in place.

  Returns
  =======
  config (object): The same configuration object with checkpoint, model_name
                   and local_save_path set.
  """
  # Keep only the last path component, i.e. drop any "<owner>/" prefix.
  base_name = checkpoint.rsplit('/', 1)[-1]

  # Minute-resolution timestamp used to tell separate runs apart.
  timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M")

  config.checkpoint = checkpoint
  config.model_name = f"{base_name}_finetuned_with_classification_head_{timestamp}"
  # The model is saved locally in a directory named after the model itself.
  config.local_save_path = config.model_name
  return config

### Function to tokenize the dataset

In [12]:
def tokenize_dataset(tokenizer, config, dataset_name='bhujith10/multi_class_classification_dataset'):
  """
  Loads a dataset, tokenizes its 'text' column and casts its labels to float.

  Parameters
  ==========
  tokenizer (Tokenizer): The tokenizer applied to the 'text' column
  config (object): Configuration object; only config.max_length is read here
  dataset_name (str): The name or path of the dataset to load

  Returns
  =======
  The dataset with tokenizer outputs added, torch formatting enabled on every
  split and a float 'labels' column
  """

  def encode_texts(batch):
    """
    Tokenizes the 'text' entries of one batch.

    Parameters
    ==========
    batch (dict): A batch of rows containing a 'text' entry.

    Returns
    =======
    dict: The tokenizer output for that batch.
    """
    encoding_options = dict(
        truncation=True,
        padding=True,
        max_length=config.max_length,
        return_tensors='pt',
    )
    return tokenizer(batch['text'], **encoding_options)

  def labels_as_float(example):
    """Returns the row's labels as a float tensor under a temporary column name."""
    return {"labels_f": example["labels"].to(torch.float)}

  raw_splits = load_dataset(dataset_name)

  # batch_size=None: per the `datasets` documentation each split is handed to the
  # function as a single batch, so padding=True pads all rows of a split to one
  # common length (truncation caps that length at config.max_length).
  encoded_splits = raw_splits.map(encode_texts, batched=True, batch_size=None)

  # Have every split return torch tensors; the label cast below relies on `.to`.
  for split_name in encoded_splits:
    encoded_splits[split_name].set_format('torch')

  # Write the float labels to a temporary column, drop the original one and
  # then give the temporary column the original name.
  encoded_splits = encoded_splits.map(labels_as_float, remove_columns=["labels"])
  return encoded_splits.rename_column("labels_f", "labels")

In [None]:
def compute_metrics(pred):
    """
    Turns raw model outputs into micro and macro F1 scores.
    Passed to the Trainer as its `compute_metrics` callback during finetuning.

    Parameters
    ==========
    pred : Prediction object exposing:
        - pred.label_ids: The true labels.
        - pred.predictions: The raw predictions (logits) from the model.

    Returns
    =======
    dict: The scores under the keys "micro f1" and "macro f1".
    """
    true_labels = pred.label_ids

    # Sigmoid gives an independent probability per label; a label counts as
    # predicted when its probability is strictly above 0.5.
    label_probabilities = sigmoid(pred.predictions)
    hard_predictions = (label_probabilities > 0.5).astype(float)

    report = classification_report(
        true_labels,
        hard_predictions,
        zero_division=0,
        output_dict=True,
    )

    # Map each returned metric name to the report row it is taken from.
    report_rows = {"micro f1": "micro avg", "macro f1": "macro avg"}
    return {metric: report[row]["f1-score"] for metric, row in report_rows.items()}


### Function to calculate model size

In [13]:
def calculate_model_size(model):
  """
  Counts a model's parameters and the memory they occupy.

  Parameters
  ==========
  model: Object whose `parameters()` yields the tensors to count.

  Returns
  =======
  tuple: (total parameter count, trainable parameter count, parameter memory in MB).
  """
  total_params = 0
  trainable_params = 0
  param_size_in_bytes = 0

  # Single pass over the parameters, accumulating all three figures together.
  for param in model.parameters():
    element_count = param.numel()
    total_params += element_count
    if param.requires_grad:
      trainable_params += element_count
    param_size_in_bytes += element_count * param.element_size()

  # 1 MB is taken as 1024 * 1024 bytes.
  return total_params, trainable_params, param_size_in_bytes / (1024 ** 2)

### Function to convert probabilities into labels

In [None]:
def return_encodings(predicted_probabilities):
  """
  Marks the position(s) holding the largest value of a score vector.

  Parameters
  ==========
  predicted_probabilities (numpy array): Scores for each class.

  Returns
  =======
  numpy array: Integer array of the same shape with 1 wherever the input equals
               its maximum (every tied position is marked) and 0 elsewhere.
  """
  highest_score = np.max(predicted_probabilities)
  is_highest = predicted_probabilities == highest_score
  return is_highest.astype(int)

### Function to calculate F1 score

In [None]:
def calculate_f1_score(y_true, y_pred):
    """
    Computes micro and macro F1-scores for a set of predictions.

    Parameters
    ==========
    y_true (numpy array): Actual labels
    y_pred (numpy array): Predicted labels

    Returns
    =======
    dict: The scores under the keys "micro f1" and "macro f1".
    """
    # zero_division=0 is forwarded to classification_report; output_dict=True
    # makes it return a nested dict instead of a formatted string.
    report = classification_report(
        y_true,
        y_pred,
        zero_division=0,
        output_dict=True,
    )

    micro_f1 = report["micro avg"]["f1-score"]
    macro_f1 = report["macro avg"]["f1-score"]
    return {"micro f1": micro_f1, "macro f1": macro_f1}

### Function to load model and tokenizer

Load the model with a classification head.

In [4]:
def load_model_and_tokenizer(config,
                             add_pad_token=False,
                             quantization=False,
                             peft=False,
                             load_model_for_sequence_classification=False):
  """
  Loads a model and its tokenizer based on the provided configuration.

  Parameters
  ==========
  config (object): Configuration object. config.checkpoint names the model and
                   tokenizer to load; the classification, quantization and LoRA
                   attributes are only read when the matching flag is set.
  add_pad_token (bool): Whether to set a padding token on the tokenizer and
                        register its id on the model config.
  quantization (bool): Whether to load the classification model with the
                       quantization settings built by quantization_config(config).
                       Has no effect when load_model_for_sequence_classification
                       is False.
  peft (bool): Whether to wrap the model with LoRA adapters.
  load_model_for_sequence_classification (bool): Load the model with a
                       classification head (True) or the bare model (False).

  Returns
  =======
  tuple: (model, tokenizer)
  """
  tokenizer = AutoTokenizer.from_pretrained(config.checkpoint)

  if add_pad_token:
      if 'Llama' in tokenizer.name_or_path:
          # Reuse a padding token that already exists in the Llama vocabulary
          # instead of adding a new one, so the embeddings need no resizing.
          tokenizer.pad_token = '<|finetune_right_pad_id|>'
      else:
          # Other tokenizers get a brand-new <pad> token; the embedding matrix
          # is resized further below to make room for it.
          tokenizer.add_special_tokens({"pad_token":"<pad>"})

  # I faced some errors while right padding in Mistral models
  # Hence set padding_side as left for Mistral models alone
  if 'Mistral' in tokenizer.name_or_path:
      tokenizer.padding_side = "left"
  else:
      tokenizer.padding_side = "right"

  if load_model_for_sequence_classification:
      # Load the model with classification head
      # num_labels specifies the number of neurons in the output layer
      model =  AutoModelForSequenceClassification.from_pretrained(
          pretrained_model_name_or_path=config.checkpoint,
          quantization_config=quantization_config(config) if quantization else None,
          torch_dtype=torch.bfloat16 if config.bf16 else torch.float16,
          num_labels=config.num_labels,
          problem_type=config.problem_type
      )

  else:
      # Fixed: this used to read the notebook-level `checkpoint` variable, which
      # may be undefined or stale; the configured checkpoint is the one to load.
      model = AutoModel.from_pretrained(config.checkpoint)

  if add_pad_token:
      model.config.pad_token_id = tokenizer.pad_token_id
      if 'Llama' not in tokenizer.name_or_path:
          # A token was added to the vocabulary above, so grow the embeddings.
          model.resize_token_embeddings(len(tokenizer))
  if peft:
      peft_config = LoraConfig(
          task_type=TaskType.SEQ_CLS,
          r=config.lora_rank,
          lora_alpha=config.lora_alpha,
          lora_dropout=config.lora_dropout,
          bias=config.lora_bias,
          #target_modules=["q_proj", "k_proj"]
      )

      model = get_peft_model(model, peft_config)

  return model, tokenizer

### Function to finetune the model

In [None]:
def finetune_model(model,tokenizer,ds,config):
  """
  Trains the model with the Hugging Face Trainer and saves the result locally.

  Parameters
  ==========
  model: The model to fine-tune.
  tokenizer: The tokenizer for the model.
  ds: The dataset containing 'train' and 'val' splits.
  config: Configuration object with training parameters (epochs, batch size,
          gradient accumulation, precision flags, naming and save path).

  Returns
  =======
  The model held by the trainer after training.
  """
  # Training length is expressed in optimizer steps: every step consumes
  # batch_size * gradient_accumulation_steps examples (per device).
  examples_per_step = config.batch_size * config.gradient_accumulation_steps
  total_steps = int((len(ds['train']) * config.num_train_epochs) / examples_per_step)

  results_tag = f"{config.model_name}_results"

  optimisation = dict(
      max_steps=total_steps,
      learning_rate=3e-5,
      lr_scheduler_type="cosine",
      optim="paged_adamw_32bit",
      weight_decay=0.01,
      warmup_ratio=0.1,
  )
  batching = dict(
      per_device_train_batch_size=config.batch_size,
      per_device_eval_batch_size=config.batch_size,
      gradient_accumulation_steps=config.gradient_accumulation_steps,
      gradient_checkpointing=config.gradient_checkpointing,
  )
  # NOTE(review): save_strategy is "epoch" while save_steps is also supplied -
  # confirm which checkpoint cadence is intended. save_steps also becomes <= 0
  # whenever total_steps <= 100.
  cadence = dict(
      evaluation_strategy="steps",
      save_strategy="epoch",
      logging_strategy="steps",
      logging_steps=100,
      save_steps=total_steps-100,
      eval_steps=100,
  )
  tracking = dict(
      report_to="wandb",
      run_name=f"{config.repo_user_id}/{results_tag}",
  )
  precision = dict(fp16=config.fp16, bf16=config.bf16)

  training_args = TrainingArguments(
      output_dir=f"./{results_tag}",
      **optimisation,
      **batching,
      **cadence,
      **tracking,
      **precision,
  )

  trainer = Trainer(
      model=model,
      tokenizer=tokenizer,
      args=training_args,
      train_dataset=ds["train"],
      eval_dataset=ds["val"],
      compute_metrics=compute_metrics,
  )
  trainer.train()

  # Keep a local copy of the trained weights under config.local_save_path.
  tuned_model = trainer.model
  tuned_model.save_pretrained(config.local_save_path)
  return tuned_model

### Function to upload the model to HuggingFace

In [None]:
def upload_model_to_huggingface(finetuned_model, config, tokenizer=None, add_pad_token=False, peft=False, quantized=False):
  """
  Uploads a fine-tuned model and, when available, its tokenizer to the Hugging Face Hub

  Parameters
  ==========
  finetuned_model (PreTrainedModel): The fine-tuned model to be uploaded.
  config (object): Configuration object containing required settings such as repo_user_id and model_name.
  tokenizer (PreTrainedTokenizer): The tokenizer associated with the model. When None,
                                   the tokenizer upload is skipped (in the peft + quantized
                                   case the tokenizer loaded with the base model is used).
  add_pad_token (bool): Forwarded to load_model_and_tokenizer when the base model is
                        reloaded (peft + quantized case); should match the value used
                        when the model was loaded for finetuning.
  peft (bool): Indicates if the model uses Parameter-Efficient Fine-Tuning (PEFT).
  quantized (bool): Indicates if the model is quantized.

  Returns
  =======
  None: The function uploads the model and tokenizer directly to the Hugging Face Hub.
  """
  repo_id = f"{config.repo_user_id}/{config.model_name}"
  adapter_repo_id = f"{repo_id}_adapter"

  if peft:
    # The adapter weights are always published on their own first.
    finetuned_model.push_to_hub(adapter_repo_id, safe_serialization=True, max_shard_size='3GB')

    if quantized:
      # Reload the base model and attach the adapter that was just published.
      # When we load the base model, we don't need to again attach the adapters
      # Hence peft is set to False in the below load_model_and_tokenizer function
      base_model, tokenizer = load_model_and_tokenizer(config,
                                                       quantization=quantized,
                                                       add_pad_token=add_pad_token,
                                                       peft=False,
                                                       load_model_for_sequence_classification=True)
      model = PeftModel.from_pretrained(model=base_model, model_id=adapter_repo_id)
      model = model.merge_and_unload()
    else:
      model = finetuned_model.merge_and_unload()

  else:
    model = finetuned_model

  model.push_to_hub(repo_id, safe_serialization=True, max_shard_size='3GB')

  # tokenizer defaults to None; previously that crashed here after the model
  # had already been uploaded.
  if tokenizer is not None:
    tokenizer.push_to_hub(repo_id)

### Dataclass to hold the key parameters required while finetuning the models

In [6]:
@dataclass
class Config:
  """Key parameters shared by the loading, finetuning, upload and inference steps."""
  # --- Model / data ---
  checkpoint:str = "microsoft/deberta-v3-base"
  max_length:int = 512          # truncation length used when tokenizing
  num_labels:int = 6            # size of the classification head
  problem_type:str = "multi_label_classification"
  # --- LoRA ---
  lora_rank:int = 8
  lora_alpha:int = 32
  lora_dropout:float = 0.1
  lora_bias:str = "none"
  # --- Hardware ---
  device:str = "cuda" if torch.cuda.is_available() else "cpu"
  # Left unannotated on purpose: this is a plain class attribute, not a
  # dataclass field (a dict cannot be used as a dataclass field default).
  device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None
  # --- Naming / saving ---
  repo_user_id:str = "bhujith10"
  model_name:str = ""
  local_save_path:str = ""
  # --- Precision: bf16 when the GPU supports it, otherwise fp16 on GPU ---
  bf16:bool = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
  fp16:bool = torch.cuda.is_available() and not torch.cuda.is_bf16_supported()
  # --- bitsandbytes quantization ---
  load_in_4bit:bool = True
  bnb_4bit_quant_type:str = "nf4"          # was annotated bool, holds a string
  bnb_4bit_compute_dtype:str = "float16"   # was annotated bool, holds a string
  bnb_4bit_use_double_quant:bool = False
  # --- Training loop ---
  num_train_epochs:int = 1
  batch_size:int = 4
  gradient_accumulation_steps:int = 2
  gradient_checkpointing:bool = True

config = Config()

## Finetuning

This is the entry point: we load the model and tokenizer, tokenize the dataset, fine-tune the model, and upload it to Hugging Face.

In [None]:
# Set the preferred base model name
checkpoint = "meta-llama/Llama-3.1-8B-Instruct"

# Store the checkpoint in the shared config and derive the timestamped
# model_name / local_save_path from it (config is modified in place)
config = update_model_related_settings(checkpoint, config)

# Load the model and tokenizer from HuggingFace:
# quantized classification model with a pad token set and LoRA adapters attached
model, tokenizer = load_model_and_tokenizer(config=config,
                                            quantization=True,
                                            add_pad_token=True,
                                            peft=True,
                                            load_model_for_sequence_classification=True
                                            )

# Load the processed dataset from HuggingFace and tokenize it
ds = tokenize_dataset(tokenizer,
                      config,
                      dataset_name='bhujith10/multi_class_classification_dataset')

# Sample the dataset if needed
# ds['train'] = ds['train'].select(range(10000)).shuffle()
# ds['val'] = ds['val'].select(range(1000)).shuffle()

# Fine-tune the model; the result is also saved locally under config.local_save_path
finetuned_model = finetune_model(model,tokenizer,ds,config)

In [None]:
# Upload model to HuggingFace
# The flags mirror the ones used when the model was loaded for finetuning
# (pad token added, LoRA adapters, quantized base model)
upload_model_to_huggingface(finetuned_model, config, tokenizer=tokenizer, add_pad_token=True, peft=True, quantized=True)

## Inference

In [14]:
# Hub repository of the fine-tuned classifier to evaluate
checkpoint = "bhujith10/Mistral-7B-Instruct-v0.3_finetuned_with_classification_head_2024_12_26_08_34"

# load_model_and_tokenizer takes the model name from config.checkpoint, so the
# config has to point at the fine-tuned repository. Without this assignment the
# loader silently uses whatever checkpoint an earlier cell left in `config`.
config.checkpoint = checkpoint

# Load the model and tokenizer from HuggingFace
model, tokenizer = load_model_and_tokenizer(config=config,
                                            quantization=False,
                                            add_pad_token=False,
                                            peft=False,
                                            load_model_for_sequence_classification=True
                                            )

# Tokenize the dataset
ds_tokenized = tokenize_dataset(tokenizer, config, dataset_name='bhujith10/multi_class_classification_dataset')

README.md:   0%|          | 0.00/510 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/10.8M [00:00<?, ?B/s]

val-00000-of-00001.parquet:   0%|          | 0.00/692k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16771 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/1056 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3145 [00:00<?, ? examples/s]

Map:   0%|          | 0/16771 [00:00<?, ? examples/s]

Map:   0%|          | 0/1056 [00:00<?, ? examples/s]

Map:   0%|          | 0/3145 [00:00<?, ? examples/s]

Map:   0%|          | 0/16771 [00:00<?, ? examples/s]

Map:   0%|          | 0/1056 [00:00<?, ? examples/s]

Map:   0%|          | 0/3145 [00:00<?, ? examples/s]

In [None]:
# Move the model to the GPU before running inference
model.to('cuda')

MistralForSequenceClassification(
  (model): MistralModel(
    (embed_tokens): Embedding(32769, 4096, padding_idx=32768)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_atten

### Run inference on the test dataset in batches

In [None]:
 # DataLoader for batching
batch_size = 4
dataloader = DataLoader(ds_tokenized['test'], batch_size=batch_size)

# Run on whichever device the model currently lives on instead of hard-coding it
device = model.device

# Inference only: make sure the model is in evaluation mode
model.eval()

# Initialize lists to store predicted and actual labels
predicted_labels = []
actual_labels = []

start_time = time.time()

# Gradients are not needed for inference; disabling them avoids building the
# autograd graph for every batch and keeps GPU memory usage down.
with torch.no_grad():
    # Loop through each batch in the DataLoader
    for batch_idx, inputs in enumerate(dataloader):
        print(f"Batch {batch_idx}")

        # Move input tensors (input IDs and attention mask) to the model's device
        batch = {key: inputs[key].to(device) for key in ['input_ids', 'attention_mask']}

        # Extract and store the actual labels from the batch
        actual_labels.extend(list(row) for row in inputs['labels'].to(torch.int8).numpy())

        # Get predictions from the model
        outputs = model(**batch)

        # Extract logits; cast to float32 because numpy has no bfloat16 dtype
        # and float32 cannot overflow the way a float16 cast could
        logits_array = outputs.logits.float().cpu().numpy()

        # Convert logits into binary predicted labels.
        # NOTE(review): return_encodings keeps only the highest-scoring label per
        # example, whereas compute_metrics thresholds each label at 0.5 - confirm
        # that single-label decoding is intended here.
        predicted_labels.extend(list(return_encodings(arr)) for arr in logits_array)

end_time = time.time()

print('total time ', end_time - start_time)


In [None]:
# Micro and macro F1 of the predictions collected over the test split
calculate_f1_score(actual_labels,predicted_labels)