<a href="https://colab.research.google.com/github/Chinjuj2017/Chinjuj2017/blob/main/cross_entropy__QLoRA_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Q-LoRA based fine-tuning of  PT5-XL-UniRef50 for multiclass classification**

This notebook is for the downstreaming task of classifying the protein sequences by fine-tuning PT5-XL-UniRef50 (https://arxiv.org/abs/2007.06225). We have used Q-LoRA for parameter optimization with a customised classifier head added to the LLM.

Installing all the required libraries

In [None]:
!pip install -q -U bitsandbytes #for quantization of LoRa weights
!pip install -q transformers #==4.30
!pip install -q -U git+https://github.com/huggingface/peft.git #For LoRa implementation
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install -q evaluate
!pip install -q accelerate
!pip install -q SentencePiece

#!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━

Import all the packages required

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.utils.data import DataLoader

import re
import numpy as np
import pandas as pd
import copy

import transformers,datasets
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.t5.modeling_t5 import T5Config, T5PreTrainedModel, T5Stack
from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
from transformers import T5EncoderModel, T5Tokenizer,AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, set_seed

from evaluate import load
from datasets import Dataset

from tqdm import tqdm
import random

from scipy import stats
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
print("Torch version: ",torch.__version__)
print("Cuda version: ",torch.version.cuda)
print("Numpy version: ",np.__version__)
print("Pandas version: ",pd.__version__)
print("Transformers version: ",transformers.__version__)
print("Datasets version: ",datasets.__version__)

Torch version:  2.2.1+cu121
Cuda version:  12.1
Numpy version:  1.25.2
Pandas version:  2.0.3
Transformers version:  4.40.0
Datasets version:  2.19.0


In [None]:
#BENCHMARKS_DIR = '/Desktop/DeepPPF/GPCR/FAMILY/SUBFAM'
#BENCHMARKS_DIR='/content/drive/MyDrive/DeepPPF/GPCR/FAMILY/SUBFAM/'
BENCHMARKS_DIR= '/content/drive/MyDrive/DeepPPF' #give path to your data

#BENCHMARKS_DIR = '/home/chinju/DeepPPF/GPCR/FAMILY/SUBFAM/'
BENCHMARK_NAME = 'subfam'

Set the environment variable if required

In [None]:
import os
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "9994"  # modify if RuntimeError: Address already in use
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"

**Loading data and data preparation**

In [None]:
# train_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.train.csv' % BENCHMARK_NAME)
train_set_file_path = os.path.join(BENCHMARKS_DIR,'%s.train.csv' % BENCHMARK_NAME)
#train_set = pd.read_csv(train_set_file_path).dropna().drop_duplicates()
#my_train=pd.read_csv('C:/Users/IIIT KOTTAYAM/Desktop/DeepPPF/GPCR/FAMILY/SUBFAM/subfam.train.csv')
my_train=pd.read_csv(train_set_file_path)
#train_set.shape

In [None]:
test_set_file_path = os.path.join(BENCHMARKS_DIR,'%s.test.csv' % BENCHMARK_NAME)
my_test=pd.read_csv(test_set_file_path)

In [None]:
#del my_train
#del my_valid
#del my_test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
lab = []
for i in my_train['flabel']:
  try:
    lab.append(int(i))
  except:
    lab.append(0)

In [None]:
my_train['label']=lab
my_train, my_valid = train_test_split(my_train, stratify = my_train['label'], test_size = 0.1, random_state = 0)

#test_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.test.csv' % BENCHMARK_NAME)
#test_set = pd.read_csv(test_set_file_path).dropna().drop_duplicates()

print(f'{len(my_train)} training set records, {len(my_valid)} validation set records, {len(my_test)} test set records.')

6395 training set records, 711 validation set records, 832 test set records.


In [None]:
my_train.rename(columns = {'seq':'sequence'}, inplace = True)
my_valid.rename(columns = {'seq':'sequence'}, inplace = True)
#my_train.rename(columns = {'flabel':'label'}, inplace = True)
my_test.rename(columns = {'seq':'sequence'}, inplace = True)
my_test.rename(columns = {'flabel':'label'}, inplace = True)

In [None]:
#my_train['label'] = my_train['label'].astype(int)
#my_valid['label'] = my_valid['label'].astype(int)
#my_test['label'] = my_test['label'].astype(int)

In [None]:
my_valid.head()

Unnamed: 0,Subsubfamilylabel,label1,flabel,Sfamily,slabel,sslabel,seq,label
1283,14,A,0,ClassA_Peptide,12,14,MASFSAETNSTDLLSQPWNEPPVILSMVILSLTFLLGLPGNGLVLW...,0
2588,69,C,2,ClassC_PutPher,34,69,VYLSPHFLQLSYGPFYSIFSDNEQYPYLYQMGPKDSSLALAMVSFI...,2
180,44,A,0,ClassA_Peptide,12,44,MPALGSQRRLLGSLNCTPPATLPFTLAPNRTGPQCLEVSIPDGLFL...,0
2884,83,A,0,ClassA_Peptide,12,83,MACSGVDCPVEASDYSPSTPVEGATTEPWLPTPGNNSGNITDTSDD...,0
760,38,A,0,ClassA_Interleukin8,6,38,MEYINWDNYSLEDLFGDIDNYTYNTEMPIIPADSAPCRPESLDINK...,0


For printing the number of trainable parameters after QLoRA matrices are incorporated in the LLM

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

Adding customised classifier head to the model

In [None]:

class ClassConfig:
    def __init__(self, dropout=0.2, num_labels=5):
        self.dropout_rate = dropout
        self.num_labels = num_labels

class T5EncoderClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config, class_config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(class_config.dropout_rate)
        self.out_proj = nn.Linear(config.hidden_size, class_config.num_labels)

    def forward(self, hidden_states):

        hidden_states =  torch.mean(hidden_states,dim=1)  # avg embedding

        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense(hidden_states)
        hidden_states = torch.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)

        return hidden_states

class T5EncoderForSimpleSequenceClassification(T5PreTrainedModel):

    def __init__(self, config: T5Config, class_config):
        super().__init__(config)
        self.num_labels = class_config.num_labels
        self.config = config

        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config, self.shared)

        self.dropout = nn.Dropout(class_config.dropout_rate)
        self.classifier = T5EncoderClassificationHead(config, class_config)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

    def parallelize(self, device_map=None):
        self.device_map = (
            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.classifier = self.classifier.to(self.encoder.first_device)
        self.model_parallel = True

    def deparallelize(self):
        self.encoder.deparallelize()
        self.encoder = self.encoder.to("cpu")
        #self.encoder = self.encoder.to(self.device_map)
        self.model_parallel = False
        self.device_map = None
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)

    def get_encoder(self):
        return self.encoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            #labels=labels,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        logits = self.classifier(hidden_states)
        #print(logits)

        #hidden_states = outputs[0]
        #logits = self.classifier(hidden_states)

        loss = None

        if labels is not None:

            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                #print("single_label")
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                 #print("multi_label")
                 loss_fct = BCEWithLogitsLoss()
                 loss = loss_fct(logits, labels)


          #print("printing labels :",labels)
          #num_labels = self.num_labels
          #logits = logits.view(-1, num_labels)
          #labels = labels.view(1, num_labels)
          #labels= torch.tensor([[0.0,1.0,2.0,3.0,4.0]]) #give the target labels of your classification task here
          #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
          #labels=labels.to(device)
          #print("printing labels defined:",labels)
          #print("printing logits",logits)


          #print('printing loss',loss)


        loss.requires_grad = True
        if not return_dict:

            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

LoRA Configuration

In [None]:
#num_labels=5
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_kbit_training
def PT5_classification_model(num_labels):
    # Load PT5 and tokenizer
    # possible to load the half preciion model (thanks to @pawel-rezo for pointing that out)
    '''
    half_precision=False
    if not half_precision:
        model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_bfd")
        tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_bfd")
    elif half_precision and torch.cuda.is_available() :
        tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)
        model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc", torch_dtype=torch.float16).to(torch.device('cuda'))
    else:
          raise ValueError('Half precision can be run on GPU only.')
    '''
    l_config = LoraConfig(
    r=8,
    lora_alpha=32,
    #target_modules=["query_key_value"],
    target_modules = "all-linear",
    #target_modules =  ".*SelfAttention|.*EncDecAttention",
    #lora_layers = "q|k|v|o",
    bias="none",
    task_type="TaskType.SEQ_CLS", #"CAUSAL_LM"
)

    model_id = "Rostlab/prot_t5_xl_uniref50"
    tokenizer = T5Tokenizer.from_pretrained(model_id)
    model = T5EncoderModel.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0},num_labels=num_labels)
    #model = model.to(torch.device("cuda"))

    # Create new Classifier model with PT5 dimensions
    class_config=ClassConfig(num_labels=num_labels)
    class_model=T5EncoderForSimpleSequenceClassification(model.config,class_config)
    model = get_peft_model(class_model,l_config)
    print_trainable_parameters(model)

    # Set encoder and embedding weights to checkpoint weights
    class_model.shared=model.shared
    class_model.encoder=model.encoder

    # Delete the checkpoint model
    model=class_model
    del class_model

    #Print number of trainable parameters
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print("ProtT5_Classfier\nTrainable Parameter: "+ str(params))


    '''
    # Freeze Embeddings and Encoder (except LoRA)
    for (param_name, param) in model.shared.named_parameters():
                param.requires_grad = False
    for (param_name, param) in model.encoder.named_parameters():
                param.requires_grad = False

    for (param_name, param) in model.named_parameters():
            if re.fullmatch(model.config.trainable_param_names, param_name):
             param.requires_grad = True

    # Print trainable Parameter
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print("ProtT5_LoRA_Classfier\nTrainable Parameter: "+ str(params) + "\n")
    '''


    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)
    #model.config
    return model, tokenizer

In [None]:
from datasets import Dataset
# Set random seeds for reproducibility of your trainings run
def set_seeds(s):
    torch.manual_seed(s)
    np.random.seed(s)
    random.seed(s)
    set_seed(s)

# Dataset creation
def create_dataset(tokenizer,seqs,labels):
    tokenized = tokenizer(seqs, max_length=1024, padding=True, truncation=True)
    dataset = Dataset.from_dict(tokenized)
    #datas=Dataset.from_dict
    dataset = dataset.add_column("labels", labels)

    return dataset

# Main training fuction
def train_per_protein(
        train_df,         #training data
        valid_df,         #validation data
        num_labels= 5,    #1 for regression, >1 for classification

        # effective training batch size is batch * accum
        # we recommend an effective batch size of 8
        batch= 8,         #for training
        accum= 2,         #gradient accumulation

        val_batch = 16,   #batch size for evaluation
        epochs= 10,       #training epochs
        lr= 3e-4,         #recommended learning rate
        seed= 42,

        #random seed
        #deepspeed= True,  #if gpu is large enough disable deepspeed for training speedup
        #mixed= False,     #enable mixed precision training
        gpu= 1 ):         #gpu selection (1 for first gpu)

    # Set gpu device
    os.environ["CUDA_VISIBLE_DEVICES"]=str(gpu-1)

    # Set all random seeds
    set_seeds(seed)

    # load model
    model, tokenizer = PT5_classification_model(num_labels=num_labels)

    # Preprocess inputs
    # Replace uncommon AAs with "X"
    train_df["sequence"]=train_df["sequence"].str.replace('|'.join(["O","B","U","Z"]),"X",regex=True)
    valid_df["sequence"]=valid_df["sequence"].str.replace('|'.join(["O","B","U","Z"]),"X",regex=True)
    # Add spaces between each amino acid for PT5 to correctly use them
    train_df['sequence']=train_df.apply(lambda row : " ".join(row["sequence"]), axis = 1)
    valid_df['sequence']=valid_df.apply(lambda row : " ".join(row["sequence"]), axis = 1)


    # Create Datasets

    #train_set=create_dataset(tokenizer,list(train_df['sequence']),list(train_df['label']))
    #valid_set=create_dataset(tokenizer,list(valid_df['sequence']),list(valid_df['label']))
    train_set=create_dataset(tokenizer,list(train_df['sequence']),train_df['label'])
    valid_set=create_dataset(tokenizer,list(valid_df['sequence']),valid_df['label'])
    # Huggingface Trainer arguments
    args = TrainingArguments(
        "./",
        evaluation_strategy = "epoch",
        logging_strategy = "epoch",
        save_strategy = "no",
        learning_rate=lr,
        per_device_train_batch_size=batch,
        per_device_eval_batch_size=val_batch,
        gradient_accumulation_steps=accum,
        num_train_epochs=epochs,
        seed = seed,
        #deepspeed= ds_config if deepspeed else None,
        fp16 = False, #fp16=mixed
        remove_unused_columns=False,
        #optimizers="AdamW",
        #optim="paged_adamw_8bit",
        optim="adamw_bnb_8bit"#adamw_torch",
    )

    # Metric definition for validation data
    def compute_metrics(eval_pred):
        if num_labels>1:  # for classification
            metric = load("accuracy")
            predictions, labels = eval_pred
            predictions = np.argmax(predictions, axis=1)
        else:  # for regression
            metric = load("spearmanr")
            predictions, labels = eval_pred

        return metric.compute(predictions=predictions, references=labels)

    # Trainer
    trainer = transformers.Trainer(
        model,
        args,
        train_dataset=train_set,
        eval_dataset=valid_set,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        #logging_steps=1,
        #output_dir="outputs",


    )

    # Train model
    trainer.train()

    return tokenizer, model, trainer.state.log_history


In [None]:
import torch
from transformers import T5Tokenizer, T5EncoderModel, BitsAndBytesConfig,AutoModelForSequenceClassification,AutoTokenizer

#model_id = "Rostlab/prot_t5_xl_uniref50"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
tokenizer, model, history = train_per_protein(my_train,my_valid,num_labels=5,batch=2,accum=8, epochs=2,seed=42)

trainable params: 10641448 || all params: 1219837997 || trainable%: 0.8723656769317705
ProtT5_Classfier
Trainable Parameter: 10641448




Epoch,Training Loss,Validation Loss,Accuracy
0,1.9531,1.996433,0.257384
1,1.9437,1.996433,0.257384


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

