In [None]:
# Install the fine-tuning stack. Only transformers is pinned to a version here;
# the remaining packages resolve to whatever is current at install time.
!pip install transformers==4.46.3 accelerate scikit-multilearn peft datasets sentence-transformers bitsandbytes trl protobuf wandb

In [None]:
# Needed because load_model_and_tokenizer requests attn_implementation="flash_attention_2".
!pip install flash-attn --no-build-isolation

In [None]:
import ast
import os
import random
import re
import time
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from scipy.special import expit as sigmoid
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from torch.utils.data import DataLoader

from datasets import Dataset, load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model, TaskType, PeftModel, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorWithPadding, DataCollatorForLanguageModeling, BitsAndBytesConfig
from trl import SFTTrainer
from trl.trainer import SFTConfig

### Keys

We need a HuggingFace access token to upload the fine-tuned models to the HuggingFace Hub, and a Weights & Biases (WandB) API key to record the training metrics in WandB.

In [None]:
# Fill these in locally before running; do not commit real tokens with the notebook.
# HF_TOKEN: Hugging Face access token (models are pushed to the Hub later on).
# WANDB_API_KEY: Weights & Biases key (training metrics are reported to wandb).
os.environ["HF_TOKEN"]=""
os.environ["WANDB_API_KEY"]=""

In [None]:
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

### Function to create Quantization config

In [None]:
def quantization_config(config):
  """
  Builds the bitsandbytes quantization settings from the notebook configuration.

  Parameters
  ==========
  config (object) : Object exposing `load_in_4bit`, `bnb_4bit_quant_type`,
                    `bnb_4bit_compute_dtype` and `bnb_4bit_use_double_quant`.

  Returns
  =======
  BitsAndBytesConfig: Quantization settings populated from `config`.
  """
  # The config attribute names mirror the BitsAndBytesConfig keyword names one-to-one.
  option_names = (
      "load_in_4bit",
      "bnb_4bit_quant_type",
      "bnb_4bit_compute_dtype",
      "bnb_4bit_use_double_quant",
  )
  options = {name: getattr(config, name) for name in option_names}
  return BitsAndBytesConfig(**options)

### Function to update model related settings

In [None]:
def update_model_related_settings(checkpoint, config):
  """
  Points the configuration at a checkpoint and derives a timestamped name for the run.

  Parameters
  ==========
  checkpoint (str): Model checkpoint identifier or path; only the part after the last '/' is used in the name.
  config (object): Configuration object; it is modified in place.

  Returns
  =======
  config (object): The same configuration object with `checkpoint`, `model_name`
                   and `local_save_path` set.
  """
  # Keep only the last path component, e.g. "org/model" -> "model".
  base_name = checkpoint.rsplit('/', 1)[-1]
  # Minute-resolution timestamp keeps names from different runs apart.
  timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M")
  run_name = f"{base_name}_instruction_tuned_{timestamp}"

  config.checkpoint = checkpoint
  config.model_name = run_name
  # The local save directory is named after the model.
  config.local_save_path = run_name
  return config

### Function to calculate model size

In [None]:
def calculate_model_size(model):
  """
  Reports how many parameters a model has and how much memory they occupy.

  Parameters
  ==========
  model: Object whose `parameters()` yields tensors-like items offering
         `numel()`, `element_size()` and `requires_grad`.

  Returns
  =======
  tuple: (total parameter count, trainable parameter count, parameter size in MB).
  """
  total_params = 0
  trainable_params = 0
  param_size_in_bytes = 0

  # One pass over the parameters accumulates all three statistics.
  for param in model.parameters():
      count = param.numel()
      total_params += count
      param_size_in_bytes += count * param.element_size()
      if param.requires_grad:
          trainable_params += count

  # Bytes -> mebibytes.
  return total_params, trainable_params, param_size_in_bytes / (1024 ** 2)

### Function to return the subjects given the label encodings

In [None]:
def return_subjects(labels):
  """
  Translates a binary label vector into the names of the subjects it marks.

  Parameters
  ==========
  labels (list): Binary flags, one per subject in the fixed subject order; a value of 1 selects the subject.

  Returns
  =======
  list: Names of the subjects whose flag equals 1, in the fixed subject order.
  """
  subjects = ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']

  selected_subjects = []
  # Pairing stops at the shorter of the two sequences.
  for subject, flag in zip(subjects, labels):
      if flag == 1:
          selected_subjects.append(subject)
  return selected_subjects

### Function to format the text for finetuning

In [None]:
def format_text(sample, tokenizer):
  """
  Builds the supervised fine-tuning example for one sample using the tokenizer's chat template.

  The rendered conversation contains the classification instructions, one worked
  example, the sample's own title/abstract and the expected answer (the list of
  subjects derived from the sample's labels, followed by a tab and an end marker).

  Parameters
  ==========
  sample (dict): A dictionary with 'text' (title and abstract) and 'labels' (binary list, see return_subjects).
  tokenizer: Tokenizer exposing `name_or_path`, `eos_token` and `apply_chat_template`.

  Returns
  =======
  dict: The same sample dictionary with the rendered conversation stored under 'formatted_text'.
  """
  # NOTE(review): the system prompt in format_text_for_inference carries one extra line
  # ("You are provided with an example"); decide whether training should match it.
  system_content = """
  Given the title and abstract of a research paper, classify it into one or more of the following subjects based on its content: ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance'].

  Output Requirements:

  Return only the most appropriate subjects (1 to 3) from the given list.
  Do not include subjects outside the provided list.
  Avoid selecting all subjects; focus on those most relevant to the paper's content.

  RETURN ONLY A LIST AND NOTHING ELSE
  """
  user_content = sample['text']

  # Worked example shown to the model; kept word-for-word identical to the one used at
  # inference time ("appearing" was previously misspelled here only).
  example_user_input = """
  Title: Efficient methods for computing integrals in electronic structure calculations,
  Abstract: Efficient methods are proposed, for computing integrals appearing in electronic structure calculations.
  The methods consist of two parts: the first part is to represent the integrals as contour integrals and the second one is to evaluate the contour integrals by the Clenshaw-Curtis quadrature.
  The efficiency of the proposed methods is demonstrated through numerical experiments.
  """
  example_user_output = """
  ['Physics']
  """

  # The answer the model is trained to produce, e.g. "['Physics']".
  target_subjects = str(return_subjects(sample['labels']))

  if 'mistralai' in tokenizer.name_or_path:
      # For Mistral checkpoints the instructions, the example and the sample are folded
      # into a single user turn; no separate system turn is sent.
      assistant_content = target_subjects + '\t' + tokenizer.eos_token
      user_content = system_content + "\n" + example_user_input + "\n" + example_user_output + "\n" + user_content

      messages = [
          {"role": "user", "content": user_content},
          {"role": "assistant", "content": assistant_content},
      ]
  else:
      # NOTE(review): the end marker is hard-coded here instead of using tokenizer.eos_token
      # as the Mistral branch does - confirm it is right for every non-Mistral checkpoint.
      assistant_content = target_subjects + '\t' + '<|end_of_text|>'
      messages = [
          {"role": "system", "content": system_content},
          {"role": "user", "content": example_user_input},
          {"role": "assistant", "content": example_user_output},
          {"role": "user", "content": user_content},
          {"role": "assistant", "content": assistant_content},
      ]

  # Render to a string; tokenization is left to the trainer.
  sample["formatted_text"] = tokenizer.apply_chat_template(messages, tokenize=False)

  return sample

### Function to format text for inference

In [None]:
def format_text_for_inference(sample, tokenizer):
  """
  Builds the generation prompt for one sample using the tokenizer's chat template.

  The prompt contains the classification instructions, one worked example and the
  sample's title/abstract. Unlike format_text, no answer is appended; the template
  is asked to open the assistant turn so that generation starts at the answer.

  Parameters
  ==========
  sample (dict): A dictionary containing at least 'text' (title and abstract).
  tokenizer: Tokenizer exposing `name_or_path` and `apply_chat_template`.

  Returns
  =======
  dict: The input sample augmented with a 'formatted_text' key containing the prompt.
  """

  # Instruction prompt
  system_content = """
  Given the title and abstract of a research paper, classify it into one or more of the following subjects based on its content: ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance'].

  Output Requirements:

  Return only the most appropriate subjects (1 to 3) from the given list.
  Do not include subjects outside the provided list.
  Avoid selecting all subjects; focus on those most relevant to the paper's content.
  You are provided with an example

  RETURN ONLY A LIST AND NOTHING ELSE
  """

  # The actual text for classification
  user_content = sample['text']

  # Input and output of one solved example, for better context to the model
  example_user_input = """
  Title: Efficient methods for computing integrals in electronic structure calculations,
  Abstract: Efficient methods are proposed, for computing integrals appearing in electronic structure calculations.
  The methods consist of two parts: the first part is to represent the integrals as contour integrals and the second one is to evaluate the contour integrals by the Clenshaw-Curtis quadrature.
  The efficiency of the proposed methods is demonstrated through numerical experiments.
  """
  example_user_output = """
  ['Physics']
  """

  if 'mistralai' in tokenizer.name_or_path:
      # For Mistral checkpoints everything is folded into a single user turn.
      user_content = system_content + "\n" + example_user_input + "\n" + example_user_output + "\n" + user_content
      messages = [
          {"role": "user", "content": user_content},
      ]
  else:
      # Other tokenizers get a system turn plus the example as a user/assistant exchange.
      messages = [
          {"role": "system", "content": system_content},
          {"role": "user", "content": example_user_input},
          {"role": "assistant", "content": example_user_output},
          {"role": "user", "content": user_content},
      ]

  # add_generation_prompt=True makes the template end with the assistant-turn header,
  # so the model continues with its answer instead of first having to open the turn itself.
  sample["formatted_text"] = tokenizer.apply_chat_template(
      messages,
      tokenize=False,
      add_generation_prompt=True,
  )

  return sample

### Function to return encodings given the subjects

In [None]:
def return_encodings_from_subjects(predicted_subjects):
  """
  Encodes a collection of subject names as a fixed-length binary vector.

  Parameters
  ==========
  predicted_subjects (list): Subject names predicted by a model.

  Returns
  =======
  predicted_encodings (list): Six 0/1 flags, one per known subject in this order:
        Computer Science, Physics, Mathematics, Statistics, Quantitative Biology,
        Quantitative Finance. A flag is 1 when that subject is in `predicted_subjects`.
  """
  subjects = ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']

  predicted_encodings = []
  for subject in subjects:
      # Names outside the known subject list simply never set a flag.
      predicted_encodings.append(int(subject in predicted_subjects))
  return predicted_encodings

### Function to extract the predictions list from raw responses

In [None]:
def extract_list_from_raw_response(text):
  """
  Extracts a list of predicted subjects from a raw text response and returns their binary encoding.

  Parameters
  ==========
  text (str): A raw model response that may include a list of predicted subjects
              enclosed in square brackets, e.g. "['Physics', 'Statistics']".
              Non-string values (such as the NaN pandas uses for an empty CSV cell)
              are treated as "no list found".

  Returns
  =======
  predicted_encodings (list): A binary list of length 6 encoding the presence or absence of each predefined subject
        in the first list found in the text. If no valid list is found, returns [0, 0, 0, 0, 0, 0].
  """
  no_prediction = [0, 0, 0, 0, 0, 0]

  # An empty response does not survive a CSV round trip as a string, so guard
  # against non-string input instead of letting re.search raise a TypeError.
  if not isinstance(text, str):
      return no_prediction

  # Matches the FIRST bracketed list of single-quoted strings in the text.
  pattern = r"\[(?:'[^']*'(?:, )?)*\]"
  match = re.search(pattern, text)
  if match is None:
      return no_prediction

  try:
      # literal_eval only parses Python literals; it never executes code.
      predicted_subjects = ast.literal_eval(match.group())
  except (ValueError, SyntaxError):
      # The regex can match text that is not a valid literal (for example a quoted
      # string containing a line break or a trailing backslash).
      return no_prediction

  return return_encodings_from_subjects(predicted_subjects)

### Function to load the model and tokenizer from HuggingFace Repo given the name of the model

In [None]:
def load_model_and_tokenizer(config,
                             add_pad_token=True,
                             use_cache=False,
                             quantization=False):
  """
  Loads the causal language model named by `config.checkpoint` together with its tokenizer.

  Parameters
  ==========
  config (object): Configuration object providing `checkpoint`, `device_map`, `bf16`
                   and, when quantization is requested, the quantization attributes.
  add_pad_token (bool): Whether to configure a padding token on the tokenizer and the model.
  use_cache (bool): Forwarded to the model as `use_cache`; can stay False while fine-tuning.
  quantization (bool): Whether to load the model with the quantization settings built from `config`.

  Returns
  =======
  tuple: (model, tokenizer).
  """
  tokenizer = AutoTokenizer.from_pretrained(config.checkpoint, add_prefix_space=True)

  uses_llama_pad = False
  if add_pad_token:
      uses_llama_pad = 'Llama' in config.checkpoint
      if uses_llama_pad:
          # Llama checkpoints: select the existing '<|finetune_right_pad_id|>' token and pad on the right.
          tokenizer.pad_token='<|finetune_right_pad_id|>'
          tokenizer.padding_side = 'right'
      else:
          # Other checkpoints: register a new '<pad>' token and pad on the left.
          tokenizer.add_special_tokens({"pad_token":'<pad>'})
          tokenizer.padding_side = 'left'

  device_map = config.device_map
  bnb_settings = quantization_config(config) if quantization else None
  model_dtype = torch.bfloat16 if config.bf16 else torch.float16

  model = AutoModelForCausalLM.from_pretrained(
      config.checkpoint,
      attn_implementation="flash_attention_2",
      use_cache=use_cache,
      device_map=device_map,
      quantization_config=bnb_settings,
      torch_dtype=model_dtype,
      trust_remote_code=True
      )

  if add_pad_token:
      # Keep the model configuration in sync with the tokenizer's padding token.
      model.config.pad_token_id = tokenizer.pad_token_id
      if not uses_llama_pad:
          # A token was added to the vocabulary, so the embedding matrix has to grow with it.
          model.resize_token_embeddings(len(tokenizer))

  return model, tokenizer

### Function to finetune the model

In [None]:
def finetune_model(model,tokenizer,dataset,config,peft=False,dataset_text_field='text'):
  """
  Fine-tunes a model using the provided dataset and configuration.

  Parameters
  ==========
  model: The pre-trained model to fine-tune.
  tokenizer: The tokenizer for the model.
  dataset: The dataset containing 'train' and 'val' splits.
  config: Configuration object with training parameters (epochs, batch size, gradient
          accumulation, precision flags, naming, max_length and, when `peft` is set,
          the LoRA hyper-parameters).
  peft (bool): Whether to apply parameter-efficient fine-tuning (LoRA). When False the
               trainer receives no adapter configuration.
  dataset_text_field (str): Field in the dataset containing the text data.

  Returns
  =======
  Fine-tuned model (also saved to `config.local_save_path`).
  """
  num_train_epochs = config.num_train_epochs
  train_batch_size = config.batch_size
  gradient_accumulation_steps = config.gradient_accumulation_steps

  # Number of optimizer steps needed to see the training split `num_train_epochs` times.
  samples_to_process = len(dataset['train']) * num_train_epochs
  samples_per_step = train_batch_size * gradient_accumulation_steps
  max_steps = int(samples_to_process / samples_per_step)

  # Stays None for full fine-tuning; previously the name was undefined in that case
  # and building the trainer failed.
  peft_config = None
  if peft:
      # Load LoRA configuration
      peft_config = LoraConfig(
          r=config.lora_rank,
          lora_alpha=config.lora_alpha,
          lora_dropout=config.lora_dropout,
          bias=config.lora_bias,
          task_type="CAUSAL_LM",
          #target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
      )

  training_arguments = SFTConfig(
      max_steps=max_steps,
      per_device_train_batch_size=train_batch_size,
      gradient_accumulation_steps=gradient_accumulation_steps,
      # Honour the configuration when it carries the flag; default to the previous behaviour.
      gradient_checkpointing=getattr(config, "gradient_checkpointing", True),
      learning_rate=3e-4,
      fp16=config.fp16,
      bf16=config.bf16,
      output_dir=f"{config.model_name}_outputs",
      optim="paged_adamw_32bit",
      eval_steps=100,
      # Checkpoint 100 steps before the end; clamped so short runs still get a valid interval.
      save_steps=max(1, max_steps - 100),
      logging_steps=100,
      save_strategy="steps",
      evaluation_strategy="steps",
      warmup_ratio=0.02,
      report_to="wandb",
      run_name=f"{config.repo_user_id}/{config.model_name}_results",
      lr_scheduler_type="cosine",
      dataset_batch_size=4,
      max_seq_length=config.max_length,
      dataset_text_field=dataset_text_field
  )

  # Set supervised fine-tuning parameters
  trainer = SFTTrainer(
      model=model,
      peft_config=peft_config,
      train_dataset=dataset["train"],
      eval_dataset=dataset["val"],
      tokenizer=tokenizer,
      args=training_arguments
  )

  # Finetune the model
  trainer.train()

  # Save the model locally
  trainer.model.save_pretrained(config.local_save_path)

  return trainer.model


### Function to upload the finetuned model to HuggingFace

In [None]:
def upload_model_to_huggingface(finetuned_model, config, tokenizer=None, peft=False, quantized=False):
  """
  Publishes a fine-tuned model (and optionally its tokenizer) on the Hugging Face Hub.

  For PEFT models the adapter is pushed to "<repo_user_id>/<model_name>_adapter" first and
  a merged model is then pushed to "<repo_user_id>/<model_name>". For quantized PEFT models
  the base model is loaded again and the just-published adapter is merged into it.

  Parameters
  ==========
  finetuned_model (PreTrainedModel): The fine-tuned model to be uploaded.
  config (object): Configuration object containing required settings such as repo_user_id and model_name.
  tokenizer (PreTrainedTokenizer): The tokenizer associated with the model. In the quantized
                                   PEFT path it is replaced by the tokenizer loaded with the base model.
  peft (bool): Indicates if the model uses Parameter-Efficient Fine-Tuning (PEFT).
  quantized (bool): Indicates if the model is quantized.

  Returns
  =======
  None: The function uploads the model and tokenizer directly to the Hugging Face Hub.
  """
  hub_repo = f"{config.repo_user_id}/{config.model_name}"
  adapter_repo = f"{hub_repo}_adapter"

  if not peft:
      # Nothing to merge: the model is uploaded as it is.
      model = finetuned_model
  else:
      # Both PEFT paths publish the adapter weights on their own first.
      finetuned_model.push_to_hub(
          adapter_repo,
          safe_serialization=True,
          max_shard_size='3GB'
      )
      if quantized:
          # Load the base model again, attach the published adapter and merge it in.
          base_model, tokenizer = load_model_and_tokenizer(
              config, quantization=quantized, add_pad_token=True
          )
          adapter_model = PeftModel.from_pretrained(
              model=base_model,
              model_id=adapter_repo
          )
          model = adapter_model.merge_and_unload()
      else:
          # Merge the adapter directly into the model that was trained.
          model = finetuned_model.merge_and_unload()

  # Push the final model to the Hugging Face Hub
  model.push_to_hub(
      hub_repo,
      safe_serialization=True,
      max_shard_size='5GB'
  )

  # The tokenizer goes to the same repository as the final model.
  if tokenizer:
      tokenizer.push_to_hub(hub_repo)

### Dataclass to hold the key parameters required while finetuning the models

In [None]:
@dataclass
class Config:
  """
  Central settings for loading, quantizing, fine-tuning and uploading the models.

  Every annotated attribute is a dataclass field and can be overridden through the
  constructor. Hardware-dependent defaults (device, device_map, bf16, fp16) are
  evaluated once, when the class is defined.
  """
  # Checkpoint to load; replaced by update_model_related_settings before fine-tuning.
  checkpoint:str = "microsoft/deberta-v3-base"
  # Maximum sequence length handed to the trainer.
  max_length:int = 1024
  num_labels:int = 6
  # LoRA hyper-parameters (rank, scaling, dropout, bias handling).
  lora_rank:int = 8
  lora_alpha:int = 32
  lora_dropout:float = 0.1
  lora_bias:str = "none"
  device:str = "cuda" if torch.cuda.is_available() else "cpu"
  # Not annotated, so @dataclass treats it as a plain class attribute: it is shared by
  # all instances and is not a constructor parameter.
  device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None
  # Hub namespace the models are pushed to.
  repo_user_id:str = "bhujith10"
  # Both are filled in by update_model_related_settings.
  model_name:str = ""
  local_save_path:str = ""
  # Precision flags: bf16 when the GPU supports it, otherwise fp16 on GPU; both False on CPU.
  bf16:bool = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
  fp16:bool = torch.cuda.is_available() and not torch.cuda.is_bf16_supported()
  # bitsandbytes settings, forwarded by quantization_config.
  load_in_4bit:bool = True
  bnb_4bit_quant_type:str = "nf4"
  bnb_4bit_compute_dtype:str = "float16"
  bnb_4bit_use_double_quant:bool = False
  # Training schedule.
  num_train_epochs:int = 1
  batch_size:int = 4
  gradient_accumulation_steps:int = 2
  gradient_checkpointing:bool = True

config = Config()

## Finetuning

This is the entry point: here we load the dataset and the base model, fine-tune the model, and upload the result to HuggingFace.

In [None]:
# Set the preferred base model name
checkpoint = "meta-llama/Llama-3.1-8B-Instruct"

# Store the checkpoint in the config and derive the timestamped model name / local save path.
# The config object is modified in place and returned.
config = update_model_related_settings(checkpoint, config)

# Load the 4-bit quantized base model and its tokenizer from HuggingFace
model, tokenizer = load_model_and_tokenizer(config,
                                            add_pad_token=True,
                                            quantization=True
                                           )

# Load the processed dataset from HuggingFace
ds = load_dataset('bhujith10/multi_class_classification_dataset')

# Format the dataset with the respective model chat template.
# Adds a 'formatted_text' column; the existing columns are kept.
ds = ds.map(format_text,
            fn_kwargs={"tokenizer": tokenizer},
            #remove_columns=column_names,
            desc="Applying chat template")

# Train on a fixed-seed random subset of 5000 examples
# (assumes the train split has at least 5000 rows - TODO confirm).
sampled_train = ds['train'].shuffle(seed=42).select(range(5000))
# Validation and test splits are shuffled but kept at full size.
sampled_val = ds['val'].shuffle(seed=42).select(range(len(ds['val'])))
sampled_test = ds['test'].shuffle(seed=42).select(range(len(ds['test'])))

# Combine the sampled datasets into one DatasetDict
sampled_dataset = DatasetDict({
    'train': sampled_train,
    'val': sampled_val,
    'test': sampled_test
})

# Finetune the model with LoRA on the chat-formatted text
finetuned_model = finetune_model(model, tokenizer, sampled_dataset, config, peft=True, dataset_text_field='formatted_text')

# Upload the adapter, the merged model and the tokenizer to HuggingFace
upload_model_to_huggingface(finetuned_model, config, tokenizer, peft=True, quantized=True)

In [None]:
# (total parameters, trainable parameters, parameter memory in MB) of the fine-tuned model
calculate_model_size(finetuned_model)

(4544008192, 3407872, 5346.015625)

## Inference

Make predictions using the finetuned model

In [None]:
# NOTE(review): `checkpoint` is only used for the tokenizer on the next statement.
# load_model_and_tokenizer below reads config.checkpoint instead and also replaces
# `tokenizer`, so the adapter repository named here does not reach the model that is
# loaded - confirm config.checkpoint points at the fine-tuned model before evaluating.
checkpoint = "bhujith10/Meta-Llama-3.1-8B-Instruct_instruction_tuned_2024_12_05_13_48_adapter"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=True)

# use_cache=True for generation; no padding token is configured and no quantization is applied here.
model, tokenizer = load_model_and_tokenizer(config,
                                            add_pad_token=False,
                                            use_cache=True,
                                            quantization=False)

model.to('cuda')


In [None]:
# (total parameters, trainable parameters, parameter memory in MB) of the model used for inference
calculate_model_size(model)

(4545335296, 0, 5354.5390625)

In [None]:
# Apply the format_text_for_inference function to the test dataset.
# This builds prompts without the expected answer and stores them in 'formatted_text'.
test_ds = ds['test'].map(
    format_text_for_inference,
    fn_kwargs={"tokenizer": tokenizer},
    desc="Applying chat template"
)

### Run inference on the test dataset in batches

In [None]:
batch_size = 4
dataloader = DataLoader(test_ds, batch_size=batch_size)

# Initialize lists to store actual and generated subjects
actual_subjects = []
generated_texts = []

# Set the model to evaluation mode
model.eval()

# Wall-clock timing of the whole inference loop
start_time = time.time()

with torch.inference_mode():
    for i, batch in enumerate(dataloader):
        print(f"Batch {i}")

        # Tokenize the prompts of this batch, padded to a common length and cut at 1024 tokens
        inputs = tokenizer(
            batch['formatted_text'],
            padding=True,
            truncation=True,
            max_length=1024,
            return_tensors='pt'
        )

        # Move input tensors (input IDs and attention mask) to the GPU for faster computation
        tmp_batch = {key: inputs[key].to(config.device) for key in ['input_ids', 'attention_mask']}

        outputs = model.generate(**tmp_batch, max_new_tokens=256)

        # Keep only what comes after the (padded) prompt length of each row.
        # Note: the loop variable `input` shadows the Python builtin of the same name.
        generated_text = [
            tokenizer.decode(output[input.shape[-1]:], skip_special_tokens=True)
            for input, output in zip(inputs['input_ids'], outputs)
        ]

        generated_texts.extend(generated_text)
        # NOTE(review): presumably the default DataLoader collation regroups a per-sample
        # list column such as 'labels' position-wise (one entry per subject, not per
        # sample) - verify that iterating batch['labels'] yields one label vector per
        # sample. The scoring cells further down take the labels from ds['test'] instead.
        actual_subjects.extend([return_subjects(label) for label in batch['labels']])

end_time = time.time()

In [None]:
# Elapsed wall-clock time of the inference loop, in seconds
print(end_time - start_time)

1628.9085702896118


### Store the responses as csv

In [None]:
# Persist the raw generations as a single-column CSV.
# NOTE(review): the column and file names say "mistral_7b", and the extraction cell
# below reads a different file ('/content/qwen_finetuned_7b_raw_generated_outputs.csv') -
# align the names with the model that was actually evaluated.
pd.DataFrame({'generated_texts_mistral_7b':generated_texts}).to_csv('mistral_7b_raw_generated_outputs.csv',index=False)

## Extract predicted subjects list from the response

In [None]:
import re
import ast

# Read the csv file which contains the raw responses
raw_responses_df = pd.read_csv('/content/qwen_finetuned_7b_raw_generated_outputs.csv')
# The file has a single column; rename it so the code below does not depend on its original name
raw_responses_df.columns = ['text']

# Apply the function on the responses to extract the predicted subjects list.
# Each response becomes a 6-element binary encoding (all zeros when no list is found).
raw_responses_df['predicted_labels'] = raw_responses_df['text'].apply(extract_list_from_raw_response)

# Calculate F1 scores
predicted_labels = raw_responses_df['predicted_labels'].to_list()
# Ground truth comes from the test split (assumes the CSV rows are in the same order - TODO confirm)
actual_labels = ds['test']['labels']

# NOTE(review): calculate_f1_score is not defined in the visible part of this notebook.
calculate_f1_score(actual_labels,predicted_labels)

### Calculate F1 scores only for the valid responses

Valid responses are the ones which contain the predicted subjects in list format.

In [None]:
# Positions of the responses that produced at least one subject; an all-zero
# encoding is what the extraction step returns when no list was found.
valid_positions = [
    position
    for position, encoding in enumerate(predicted_labels)
    if encoding != [0, 0, 0, 0, 0, 0]
]

# Keep predictions and ground truth aligned by selecting the same positions from both.
modified_predicted_labels = [predicted_labels[position] for position in valid_positions]
modified_actual_labels = [actual_labels[position] for position in valid_positions]

calculate_f1_score(modified_actual_labels,modified_predicted_labels)

In [None]:
# Use this command to find the largest files in the disk
# !find / -type f -exec du -h {} + | sort -rh | head -n 10

# Use this command to delete it
# !rm -r /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/blobs/92ecfe1a2414458b4821ac8c13cf8cb70aed66b5eea8dc5ad9eeb4ff309d6d7b