In [1]:
# Install pinned library versions without network access: --no-index disables
# the package index and --find-links points pip at the local wheel directory.
!pip install transformers==4.42.3 accelerate==0.32.1 bitsandbytes==0.43.1 peft==0.11.1 \
    -U --no-index --find-links /kaggle/input/lmsys-wheel-files

Looking in links: /kaggle/input/lmsys-wheel-files
Processing /kaggle/input/lmsys-wheel-files/transformers-4.42.3-py3-none-any.whl
Processing /kaggle/input/lmsys-wheel-files/accelerate-0.32.1-py3-none-any.whl
Processing /kaggle/input/lmsys-wheel-files/bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl
Processing /kaggle/input/lmsys-wheel-files/peft-0.11.1-py3-none-any.whl
Processing /kaggle/input/lmsys-wheel-files/tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from transformers==4.42.3)
Installing collected packages: tokenizers, transformers, accelerate, peft, bitsandbytes
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.0
    Uninstalling tokenizers-0.21.0:
      Successfully uninstalled tokenizers-0.21.0
  Attempting uninstall: transformers
    Found existing installation: transformers 4.47.0
    Uninstalling transformers-4.47.0:
      Successfully uninstalled transformers-4.47.0
  Attempting uninstal

In [2]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import random
import warnings
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import numpy as np, pandas as pd, polars as pl

import torch
import transformers
from datasets import Dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import (GemmaTokenizerFast, TrainingArguments, Trainer, EvalPrediction,
                          Gemma2ForSequenceClassification, DataCollatorWithPadding)

warnings.simplefilter('ignore')
print('PyTorch version:', torch.__version__)
print('Transformers version:', transformers.__version__)

PyTorch version: 2.5.1+cu121
Transformers version: 4.42.3


In [3]:
class PATHS:
    """Filesystem locations used by this notebook."""
    # Training data in parquet format, read with polars in the data-loading cell.
    train_path = '/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet'
    # Directory passed to from_pretrained() for both the tokenizer and the model.
    model_path = '/kaggle/input/gemma-2/transformers/gemma-2-2b/2'
    # Passed to TrainingArguments as output_dir.
    output_path = '/kaggle/working/gemma-2-2b-finetuned-seq-cls-wsdm'

class CFG:
    """Hyperparameters and run settings referenced throughout the notebook."""
    # Seed handed to seed_everything().
    seed = 42
    # Rows are assigned to folds by position modulo n_splits; fold_idx selects
    # the fold that is held out for evaluation.
    n_splits = 5
    fold_idx = 0
    # Default truncation length (in tokens) used by tokenize().
    max_length = 1900
    # LoRA is applied only to layer indices i with freeze_layers <= i < num_layers
    # (see layers_to_transform in lora_config).
    num_layers = 26
    freeze_layers = 10
    # Everything below is forwarded to TrainingArguments.
    fp16 = True
    optim = 'adamw_torch'
    learning_rate = 5e-5
    weight_decay = 0.01
    lr_scheduler = 'linear'
    warmup_ratio = 0.01
    train_epochs = 1
    grad_accum_steps = 4
    train_batch_size = 1
    eval_batch_size = 1
    # Evaluation and checkpointing intervals, both measured in steps.
    eval_steps = 9600
    save_steps = 200

# LoRA adapter settings for the sequence-classification task. Adapters are
# attached to the listed projection modules, but only in layers whose index is
# at least CFG.freeze_layers; lower-indexed layers get no adapter.
lora_config = LoraConfig(
    task_type='SEQ_CLS',
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    layers_to_transform=list(range(max(CFG.freeze_layers, 0), CFG.num_layers)),
)

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Sets ``PYTHONHASHSEED`` and seeds Python's ``random``, NumPy and PyTorch
    (CPU and all CUDA devices), then configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; this is silently
    # ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # cuDNN autotuning (benchmark=True) may select different kernels from run
    # to run, which defeats the deterministic flag below, so keep it disabled.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
seed_everything(seed=CFG.seed)

In [4]:
# Load the fast Gemma tokenizer from the local model directory, requesting an
# appended EOS token (add_eos_token) and right-side padding (padding_side).
tokenizer = GemmaTokenizerFast.from_pretrained(PATHS.model_path, add_eos_token = True, padding_side = "right")

def tokenize(tokenizer, prompt, response_a, response_b, max_length=CFG.max_length):
    """Render each sample as one tagged text and tokenize the whole batch.

    A sample's text is its prompt and both responses, each introduced by a
    ``<prompt>: `` / ``<response_a>: `` / ``<response_b>: `` tag, with a blank
    line before each response tag.

    Args:
        tokenizer: Callable tokenizer applied to the list of rendered texts.
        prompt: Iterable of prompt strings.
        response_a: Iterable of first-response strings, aligned with ``prompt``.
        response_b: Iterable of second-response strings, aligned with ``prompt``.
        max_length: Token limit; longer sequences are truncated by the tokenizer.

    Returns:
        The ``(input_ids, attention_mask)`` pair from the tokenizer output.
    """
    texts = []
    for p, r_a, r_b in zip(prompt, response_a, response_b):
        texts.append(
            "<prompt>: " + p
            + "\n\n<response_a>: " + r_a
            + "\n\n<response_b>: " + r_b
        )
    encoding = tokenizer(texts, max_length=max_length, truncation=True)
    return encoding['input_ids'], encoding['attention_mask']

In [5]:
%%time
def _truncate_middle(text, tokenizer, keep_tokens, marker="\n(snip)\n"):
    """Shorten an over-long text by cutting out its middle.

    Args:
        text: The string to shorten.
        tokenizer: Fast tokenizer that supports ``return_offsets_mapping``.
        keep_tokens: Size of the window kept at each end of the text; the cut
            points are the character offsets of the tokens at positions
            ``keep_tokens`` and ``-(keep_tokens + 1)``.
        marker: String inserted where the middle was removed.

    Returns:
        ``text`` unchanged when its encoding has at most ``2 * keep_tokens + 2``
        ids, otherwise the head, ``marker`` and the tail joined together.
    """
    encoded = tokenizer(text, return_offsets_mapping=True)
    # The "+ 2" reproduces the original thresholds (402 = 2*200 + 2 and
    # 702 = 2*350 + 2); presumably it accounts for the BOS/EOS ids the
    # tokenizer adds -- TODO confirm.
    if len(encoded['input_ids']) <= 2 * keep_tokens + 2:
        return text
    offsets = encoded['offset_mapping']
    _, head_end = offsets[keep_tokens]
    tail_start, _ = offsets[-(keep_tokens + 1)]
    return text[:head_end] + marker + text[tail_start:]


# Window size kept at each end of a column's text when it is too long.
KEEP_TOKENS = {'prompt': 200, 'response_a': 350, 'response_b': 350}

train_df = pl.read_parquet(PATHS.train_path).to_pandas()

for col, keep_tokens in KEEP_TOKENS.items():
    # Missing texts become empty strings so they can be tokenized.
    train_df[col] = train_df[col].fillna('')
    train_df[col] = [
        _truncate_middle(text, tokenizer, keep_tokens)
        for text in tqdm(train_df[col])
    ]

100%|██████████| 48439/48439 [00:29<00:00, 1662.24it/s]
100%|██████████| 48439/48439 [01:04<00:00, 747.69it/s]
100%|██████████| 48439/48439 [01:07<00:00, 722.96it/s]

CPU times: user 2min 42s, sys: 1.31 s, total: 2min 43s
Wall time: 2min 43s





In [6]:
%%time
# Binary target: 0 when model_a won, 1 when model_b won.
winner_to_label = {'model_a': 0, 'model_b': 1}
train_df['winner_encoded'] = train_df['winner'].map(winner_to_label)

# Tokenize every sample and gather the model inputs and labels in one frame.
input_ids, attention_mask = tokenize(
    tokenizer, train_df["prompt"], train_df["response_a"], train_df["response_b"]
)
data = pd.DataFrame()
data["input_ids"] = input_ids
data["attention_mask"] = attention_mask
data["labels"] = train_df["winner_encoded"]

# Fold k holds out every row whose position is congruent to k modulo
# CFG.n_splits; all other rows form that fold's training set.
n_rows = len(data)
folds = [
    (
        [row for row in range(n_rows) if row % CFG.n_splits != k],
        list(range(k, n_rows, CFG.n_splits)),
    )
    for k in range(CFG.n_splits)
]
train_idx, eval_idx = folds[CFG.fold_idx]

ds = Dataset.from_pandas(data)

CPU times: user 1min 45s, sys: 7.36 s, total: 1min 52s
Wall time: 1min 52s


In [7]:
def compute_metrics(eval_pred: EvalPrediction) -> dict:
    """Report classification accuracy for one evaluation pass.

    The predicted class of each sample is the index of the largest value along
    the last axis of ``eval_pred.predictions``; it is scored against
    ``eval_pred.label_ids``.

    Returns:
        A dict with the single key ``'accuracy'``.
    """
    predicted_classes = eval_pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true=eval_pred.label_ids, y_pred=predicted_classes),
    }

# Training configuration assembled from CFG and PATHS.
train_args = TrainingArguments(
    report_to='none',
    output_dir=PATHS.output_path,
    # Seed the Trainer from the same value given to seed_everything() rather
    # than relying on the TrainingArguments default.
    seed=CFG.seed,
    fp16=CFG.fp16,
    optim=CFG.optim,
    learning_rate=CFG.learning_rate,
    weight_decay=CFG.weight_decay,
    lr_scheduler_type=CFG.lr_scheduler,
    warmup_ratio=CFG.warmup_ratio,
    num_train_epochs=CFG.train_epochs,
    gradient_accumulation_steps=CFG.grad_accum_steps,
    per_device_train_batch_size=CFG.train_batch_size,
    per_device_eval_batch_size=CFG.eval_batch_size,
    eval_steps=CFG.eval_steps,
    save_steps=CFG.save_steps,
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in the pinned
    # transformers release.
    eval_strategy='steps',
    save_strategy='steps',
    # Keep at most three checkpoints on disk.
    save_total_limit=3,
    metric_for_best_model='accuracy',
    greater_is_better=True,
    load_best_model_at_end=False,
)

# Load the base checkpoint with a 2-label classification head. use_cache is
# turned off and device placement is left to device_map="auto".
model = Gemma2ForSequenceClassification.from_pretrained(
    PATHS.model_path,
    num_labels=2,
    use_cache=False,
    device_map="auto",
)
# NOTE(review): no quantization config is passed to from_pretrained above, so
# the model does not look k-bit quantized at this point -- confirm that
# prepare_model_for_kbit_training is still intended here.
model = prepare_model_for_kbit_training(model)
# Wrap the model with the LoRA adapters described by lora_config and report how
# many parameters remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Build the Trainer: train on the rows selected by train_idx, evaluate on the
# rows selected by eval_idx, and score evaluation passes with compute_metrics.
trainer = Trainer(
    args=train_args, 
    model=model,
    tokenizer=tokenizer,
    train_dataset=ds.select(train_idx),
    eval_dataset=ds.select(eval_idx),
    compute_metrics=compute_metrics,
    # Batches are assembled by a padding collator built from the same tokenizer.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/gemma-2/transformers/gemma-2-2b/2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 12,784,128 || all params: 2,627,130,624 || trainable%: 0.4866


In [8]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
9600,0.635,0.616717,0.652766


TrainOutput(global_step=9687, training_loss=0.6860104314040705, metrics={'train_runtime': 35869.4068, 'train_samples_per_second': 1.08, 'train_steps_per_second': 0.27, 'total_flos': 4.684515090407286e+17, 'train_loss': 0.6860104314040705, 'epoch': 0.9999225826430286})