In [1]:
import os
import pandas as pd
import numpy as np
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split, StratifiedKFold
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
from bitsandbytes.optim import AdamW8bit
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm, trange
from sklearn.metrics import accuracy_score, log_loss
from transformers import get_cosine_schedule_with_warmup,get_linear_schedule_with_warmup, AutoModel, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import wandb
pl.seed_everything(56)

Seed set to 56


56

In [1]:
# for gemma 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
# for llama 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
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
class CFG:
    class data:
        train_path = 'lmsys-chatbot-arena/train.csv'
        tokenizer = 'google/gemma-2-9b-it'
        num_workers = 12
        nfolds = 5
        batch_size = 2
        use_prefix = False
        max_length_prompt = 150
        max_length_response = 500
        max_length = 150 + 500 + 500 + 10
        seed = 56
    class model:
        model = 'google/gemma-2-9b-it'
        optim = torch.optim.AdamW
        num_labels = 3
        torch_dtype = torch.bfloat16
        scheduler= 'cosine'
        warnap_steps = 0.0 #0.25
        label_smoothing = 0.0
        pool = 'last_token'
        max_epoches = 5
        cls_drop_type = None
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        cls_drop = 0.0
        lr_fn = 1e-4
        lr = 1e-4
        turn_off_drop = True
        num_cycles = 0.5
        eps = 1e-7
        weight_decay = 0.0
        weight_decay_fn = 0.0
        betas = (0.9, 0.999)
        use_lora = True
        class lora:
            r = 16
            lora_alpha = 32
            lora_dropout = 0.01
            bias = 'none'
            use_dora = False
            layers_to_transform = [i for i in range(48) if i > 4]
            target_modules = ['k_proj','q_proj','v_proj']#['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj']
    seed = 56
    fold_number = 0

In [4]:
def make_df(path):
    data = pd.read_csv(path)
    df = pd.DataFrame()
    df['id'] = data['id']
    df['label'] = data.apply(lambda x: np.argmax([x.winner_model_a, x.winner_model_b, x.winner_tie]),axis=1)
    
    df['prompt'] = data['prompt'].apply(eval)
    df['prompt'] = df['prompt'].apply(lambda x: ' Next Sentence: '.join(x))
    
    data['response_a'] = data['response_a'].apply(lambda x: x.replace('null',"'null'"))
    df['response_a'] = data['response_a'].apply(eval)
    df['response_a'] = df['response_a'].apply(lambda x: ' Next Sentence: '.join(x))
    
    data['response_b'] = data['response_b'].apply(lambda x: x.replace('null',"'null'"))
    df['response_b'] = data['response_b'].apply(eval)
    df['response_b'] = df['response_b'].apply(lambda x: ' Next Sentence: '.join(x))
    
    return df

In [5]:
class PLDataset(Dataset):
    def __init__(self, df, tokenizer):
        super().__init__()
        self.cfg = CFG.data
        self.data = df
        self.tokenizer = tokenizer
        self.tokenizer.pad_token = self.tokenizer.eos_token
        
    def __len__(self):
        return len(self.data)
    
    def _truc_text(self, text,max_length):
        ids = self.tokenizer.encode(text,max_length=max_length,truncation=True,add_special_tokens=False)
        return self.tokenizer.decode(ids)
    
    def __getitem__(self, index):
        row = self.data.iloc[index]
        label, prompt, response_a, response_b = row['label'], row['prompt'], row['response_a'], row['response_b']
        if np.random.random() > 0.5:
            response_a, response_b = response_b, response_a
            label = [1,0,2][label]
        
        prompt = self._truc_text(prompt, self.cfg.max_length_prompt)
        response_a = self._truc_text(response_a, self.cfg.max_length_response)
        response_b = self._truc_text(response_b, self.cfg.max_length_response)
        
        text = f"Prompt {prompt} Response A: {response_a} Response B: {response_b}"
        
        encode = self.tokenizer.encode_plus(
            text,
            max_length=self.cfg.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        
        return {
            'input_ids': encode.input_ids.squeeze(0),
            'attention_mask': encode.attention_mask.squeeze(0),
            'labels': label
        }

In [6]:
class PLDataModule(pl.LightningDataModule):
    def __init__(self):
        super().__init__()
        self.cfg = CFG.data
        self.is_setup = False
        
    def prepare_data(self):
        self.df = make_df(self.cfg.train_path)
        self.tokenizer = AutoTokenizer.from_pretrained(self.cfg.tokenizer)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = 'right'
        
    def setup(self, stage: str):
        kf = StratifiedKFold(n_splits=self.cfg.nfolds, shuffle=True, random_state=self.cfg.seed)
        splits = [(x,y) for x,y in  kf.split(self.df,self.df['label'])][CFG.fold_number]
        self.train_df, self.val_df = self.df.iloc[splits[0]], self.df.iloc[splits[1]]
        self.train_dataset = PLDataset(self.train_df,self.tokenizer)
        self.val_dataset = PLDataset(self.val_df,self.tokenizer)
        self.is_setup = True
    
    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                         batch_size=self.cfg.batch_size,
                         num_workers=self.cfg.num_workers,
                         pin_memory=True,
                         shuffle=True)
    
    def val_dataloader(self):
        return DataLoader(self.val_dataset,
                          batch_size=self.cfg.batch_size,
                          num_workers=self.cfg.num_workers,
                          pin_memory=True,
                          shuffle=False)
    
    def predict_dataloader(self):
        return DataLoader(self.predict_dataset,
                          batch_size=self.cfg.batch_size,
                          num_workers=self.cfg.num_workers,
                          pin_memory=True,
                          shuffle=False)

In [7]:
def softmax(x):
    return torch.tensor(x).softmax(dim=-1).tolist()

class AverageMeter():
    def __init__(self):
        self.preds = []
        self.preds_pr = []
        self.labels = []
        self.history = []
    
    def update(self,y_t,y_p,y_pr):
        self.labels += y_t
        self.preds += y_p
        self.preds_pr += y_pr
        
    def clean(self):
        self.preds = []
        self.labels = []
        self.preds_pr = []

    def calc_metrics(self):
        metrics = {}
        metrics['accuracy'] = accuracy_score(self.labels, self.preds)
        try:
            metrics['log_loss'] = log_loss(self.labels,[softmax(x) for x in self.preds_pr])
        except:
            print('Metric Error')
        self.history.append(metrics)
        
        return metrics

In [8]:
class PLModule(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.cfg = CFG.model
        self.config = AutoConfig.from_pretrained(self.cfg.model)
        self.config.pad_token_id = self.config.eos_token_id
        self.config.num_labels = self.cfg.num_labels
        self.model = AutoModelForSequenceClassification.from_pretrained(self.cfg.model,config=self.config)
        self.model = prepare_model_for_kbit_training(self.model)
        peft_config = LoraConfig(
            r=self.cfg.lora.r,
            lora_alpha=self.cfg.lora.lora_alpha,
            lora_dropout=self.cfg.lora.lora_dropout,
            bias=self.cfg.lora.bias,
            task_type='SEQ_CLS',
            layers_to_transform=self.cfg.lora.layers_to_transform,
            use_dora=self.cfg.lora.use_dora,
            target_modules=self.cfg.lora.target_modules
        )
        self.model = get_peft_model(self.model, peft_config)
        self.avg_meter = AverageMeter()
        
    def forward(self, batch):
        output = self.model(**batch)
        return output

    def training_step(self, batch, i):
        out = self(batch)
        loss = out.loss
        self.log('train_loss', loss.item())
        return loss
            
    def validation_step(self, batch, i):
        out = self(batch)
        loss = out.loss
        self.log('val_loss',loss.item())
        preds = out.logits.argmax(dim=-1).tolist()
        self.avg_meter.update(batch['labels'].tolist(),preds,out.logits.tolist())
    
    def predict_step(self, batch, i):
        out = self(batch)
        logits = out.logits
        return logits.argmax(dim=-1).tolist()
                
    def on_validation_epoch_end(self):
        metrics = self.avg_meter.calc_metrics()
        self.log_dict(metrics)
        self.avg_meter.clean()
            
    def configure_optimizers(self):        
        optimizer_parameters = [
            {'params': [p for n, p in self.model.model.named_parameters() if not any(nd in n for nd in self.cfg.no_decay)],
             'lr': self.cfg.lr, 'weight_decay': self.cfg.weight_decay},
            {'params': [p for n, p in self.model.model.named_parameters() if any(nd in n for nd in self.cfg.no_decay)],
             'lr': self.cfg.lr, 'weight_decay': 0.0},
            {'params': [p for n, p in self.model.named_parameters() if "model" not in n],
             'lr': self.cfg.lr_fn, 'weight_decay': self.cfg.weight_decay_fn}
        ]
        
        optim = self.cfg.optim(
            optimizer_parameters,
            lr=self.cfg.lr,
            betas=self.cfg.betas,
            weight_decay=self.cfg.weight_decay,
            eps=self.cfg.eps
        )
        
        if self.cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(optim,
                                                        num_training_steps=self.cfg.num_training_steps,
                                                        num_warmup_steps=self.cfg.num_training_steps * self.cfg.warnap_steps,
                                                        num_cycles=self.cfg.num_cycles)
        elif self.cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(optim,
                                                        num_training_steps=self.cfg.num_training_steps,
                                                        num_warmup_steps=self.cfg.num_training_steps * self.cfg.warnap_steps)
        else:
            return optim
        
        scheduler = {'scheduler': scheduler,'interval': 'step', 'frequency': 1}

        return [optim], [scheduler]

In [9]:
dm = PLDataModule()
dm.prepare_data()
dm.setup(0)

In [10]:
CFG.model.num_training_steps = len(dm.train_dataloader()) * CFG.model.max_epoches

In [11]:
model = PLModule()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-9b-it and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
wandb.login(key="31520b01739d418e5d77a11fd8a79a70b189b8bc")
os.environ['WANDB_API_KEY'] = "31520b01739d418e5d77a11fd8a79a70b189b8bc"
wandb.init(project='KAGGLE_LMSYS',name='gemma_lora')

[34m[1mwandb[0m: Currently logged in as: [33mandrewkhl[0m ([33mandlh[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [13]:
lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='step')
checkpoint_cb = pl.callbacks.ModelCheckpoint(
    dirpath='./outputs/',
    filename='model_{epoch:02d}-{accuracy:.4f}',
    monitor='accuracy',
    mode='max',
    save_last=True
)

trainer = pl.Trainer(
    accelerator="gpu",
    precision='bf16',
    callbacks = [lr_monitor],#[lr_monitor,checkpoint_cb],
    logger = pl.loggers.WandbLogger(save_code=True),
    log_every_n_steps=1,
    accumulate_grad_batches=4,
    #enable_checkpointing=False,
    min_epochs=1,
    devices=1,
    val_check_interval=0.5,
    max_epochs=CFG.model.max_epoches
)

/usr/local/lib/python3.11/dist-packages/lightning_fabric/connector.py:571: `precision=bf16` is supported for historical reasons but its usage is discouraged. Please set your precision to bf16-mixed instead!
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model, datamodule=dm,ckpt_path='lightning_logs/uiy58rgf/checkpoints/epoch=0-step=5747.ckpt')

Restoring states from the checkpoint path at lightning_logs/uiy58rgf/checkpoints/epoch=0-step=5747.ckpt
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:360: The dirpath has changed from './lightning_logs/uiy58rgf/checkpoints' to './lightning_logs/33esi75y/checkpoints', therefore `best_model_score`, `kth_best_model_path`, `kth_value`, `last_model_path` and `best_k_models` won't be reloaded. Only `best_model_path` will be reloaded.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                               | Params | Mode 
---------------------------------------------------------------------
0 | model | PeftModelForSequenceClassification | 9.3 B  | train
---------------------------------------------------------------------
11.2 M    Trainable params
9.2 B     Non-trainable params
9.3 B     Total params
37,011.769Total estimated model params size (MB)
Restored all states from the checkpoint at lightning_logs/uiy58rgf/checkpoints/epoch=

Training: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loops/training_epoch_loop.py:161: You're resuming from a checkpoint that ended before the epoch ended and your dataloader is not resumable. This can cause unreliable results if further training is done. Consider using an end-of-epoch checkpoint or make your dataloader resumable by implementing the `state_dict` / `load_state_dict` interface.
2024-08-02 14:51:44.672486: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-02 14:51:44.672482: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-02 14:51:44.672543: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when

Validation: |          | 0/? [00:00<?, ?it/s]

2024-08-02 14:51:45.766544: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-02 14:51:45.766599: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-02 14:51:45.768250: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-02 14:51:45.777569: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-08-02 14:51:46.083730: E external/local_xla/xla/

Metric Error


2024-08-02 14:53:17.780703: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-02 14:53:17.780706: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-02 14:53:17.780770: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-02 14:53:17.780780: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-02 14:53:17.782339: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory

In [15]:
trainer.validate(model, datamodule=dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

2024-08-02 14:30:43.386208: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-02 14:30:43.386279: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-02 14:30:43.387597: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-02 14:30:43.387654: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-02 14:30:43.388185: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory f

In [19]:
model.avg_meter.calc_metrics()



{'accuracy': 0.5471230158730159, 'log_loss': 0.9775314038874701}