- Training notebook: https://www.kaggle.com/code/ravaghi/wsdm-cup-gemma-2-9b-4-bit-qlora-training

# Imports and configs

In [1]:
!pip install accelerate peft bitsandbytes transformers trl unsloth
!pip install --upgrade 'optree>=0.13.0'

Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting unsloth
  Downloading unsloth-2025.3.14-py3-none-any.whl.metadata (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.3/59.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.0 (from accelerate)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting unsloth_zoo>=2025.3.

In [2]:
from transformers import Gemma2ForSequenceClassification, GemmaTokenizerFast
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.base import clone
from concurrent.futures import ThreadPoolExecutor
from timeit import default_timer as timer
from peft import PeftModel
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from scipy.special import logit
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import joblib
import optuna
import torch
import json
import gc
import os

warnings.filterwarnings('ignore')

In [3]:
class CFG:
    train_path = '/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet'
    test_path = '/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet'
    sample_sub_path = '/kaggle/input/wsdm-cup-multilingual-chatbot-arena/sample_submission.csv'

    data_path = '/kaggle/input/wsdm-cup-gemma-2-9b-4-bit-qlora'

    gemma_dir = "/kaggle/input/gemma-2-9b-4bit-it-unsloth/transformers/default/1/gemma-2-9b-it-4bit-unsloth_old"
    lora_dir = "/kaggle/input/wsdm-cup-gemma-2-9b-4-bit-qlora/gemma2-9b-4bit/fold-0/gemma-2-9b-it-bnb-4bit-3072-8-f0/checkpoint-2900"
    
    max_length = 3072
    batch_size = 4

    target = 'winner'
    n_folds = 5
    seed = 42

    char_vectorizer_params = {
        'analyzer': "char",
        "lowercase": False,
        "max_df": 0.605,
        "max_features": 331,
        "min_df": 0.075,
        "ngram_range": (1, 3),
        "strip_accents": "unicode"
    }

    word_vectorizer_params = {
        "analyzer": "word",
        "lowercase": True,
        "max_df": 0.985,
        "max_features": 769,
        "min_df": 0.01,
        "ngram_range": (1, 2),
        "strip_accents": "unicode"
    }

# Gemma-2 9b 4-bit

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
test = pd.read_parquet(CFG.test_path).fillna('')
train, test = train_test_split(test, test_size=0.2, random_state=1)
val, test = train_test_split(test, test_size=0.5, random_state=1)
train = pd.concat((train, val))

In [6]:
if len(test) > 10_000:
    time_limit = int(3600 * 12) 
else:
    time_limit = int(3600 * 4.75)

## Tokenizing

In [7]:
def tokenize(tokenizer, prompt, response_a, response_b, max_length=CFG.max_length):
    prompt = ["<prompt>: " + t for t in prompt]
    response_a = ["\n\n<response_a>: " + t for t in response_a]
    response_b = ["\n\n<response_b>: " + t for t in response_b]
    
    texts = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
    tokenized = tokenizer(texts, max_length=max_length, truncation=True)
    
    return tokenized['input_ids'], tokenized['attention_mask']

In [8]:
tokenizer = GemmaTokenizerFast.from_pretrained(CFG.gemma_dir)
tokenizer.add_eos_token = True
tokenizer.padding_side = "right"

In [9]:
for col in ['prompt', 'response_a', 'response_b']:
    test[col] = test[col].fillna('')
    text_list = []
    if col == "prompt":
        max_no = 512
        s_no = 255
        e_no = -256
    else:
        max_no = 3072
        s_no = 1535
        e_no = -1536
    for text in tqdm(test[col]):
        encoded = tokenizer(text, return_offsets_mapping=True)
        if len(encoded['input_ids']) > max_no:
            start_idx, end_idx = encoded['offset_mapping'][s_no]
            new_text = text[:end_idx]
            start_idx, end_idx = encoded['offset_mapping'][e_no]
            new_text = new_text + "\n(snip)\n" + text[start_idx:]
            text = new_text
        text_list.append(text)
    test[col] = text_list

100%|██████████| 4844/4844 [00:03<00:00, 1435.34it/s]
100%|██████████| 4844/4844 [00:06<00:00, 710.93it/s]
100%|██████████| 4844/4844 [00:06<00:00, 722.09it/s]


In [10]:
data = pd.DataFrame()
data["id"] = test["id"]
data["input_ids"], data["attention_mask"] = tokenize(tokenizer, test["prompt"], test["response_a"], test["response_b"])
data["length"] = data["input_ids"].apply(len)

aug_data = pd.DataFrame()
aug_data["id"] = test["id"]
# swap response_a & response_b
aug_data['input_ids'], aug_data['attention_mask'] = tokenize(tokenizer, test["prompt"], test["response_b"], test["response_a"])
aug_data["length"] = aug_data["input_ids"].apply(len)

## Model

In [11]:
model_0 = Gemma2ForSequenceClassification.from_pretrained(
    CFG.gemma_dir,
    device_map=torch.device("cuda:0"),
    use_cache=False,
)

model_1 = Gemma2ForSequenceClassification.from_pretrained(
    CFG.gemma_dir,
    device_map=torch.device("cuda:1"),
    use_cache=False,
)

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/gemma-2-9b-4bit-it-unsloth/transformers/default/1/gemma-2-9b-it-4bit-unsloth_old and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/gemma-2-9b-4bit-it-unsloth/transformers/default/1/gemma-2-9b-it-4bit-unsloth_old and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
model_0 = PeftModel.from_pretrained(model_0, CFG.lora_dir)
model_1 = PeftModel.from_pretrained(model_1, CFG.lora_dir)

In [13]:
model_0.eval()
model_1.eval()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): Gemma2ForSequenceClassification(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 3584, padding_idx=0)
        (layers): ModuleList(
          (0-15): 16 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): Linear4bit(in_features=3584, out_features=4096, bias=False)
              (k_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
              (v_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
              (o_proj): Linear4bit(in_features=4096, out_features=3584, bias=False)
            )
            (mlp): Gemma2MLP(
              (gate_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
              (up_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
              (down_proj): Linear4bit(in_features=14336, out_features=3584, bias=False)
              (act_fn): PytorchGELUTanh()
 

## Inference

In [14]:
@torch.no_grad()
@torch.cuda.amp.autocast()
def inference(df, model, device, batch_size, max_length=CFG.max_length):
    winners = []
    
    for start_idx in tqdm(range(0, len(df), batch_size)):
        end_idx = min(start_idx + batch_size, len(df))
        tmp = df.iloc[start_idx:end_idx]
        input_ids = tmp["input_ids"].to_list()
        attention_mask = tmp["attention_mask"].to_list()
        inputs = pad_without_fast_tokenizer_warning(
            tokenizer,
            {"input_ids": input_ids, "attention_mask": attention_mask},
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        outputs = model(**inputs.to(device))
        proba = outputs.logits.softmax(-1).cpu()
        
        winners.extend(proba[:, 1].tolist())
    
    df['winner'] = winners
    
    return df

In [15]:
global_timer = timer()

In [16]:
data['index'] = np.arange(len(data), dtype=np.int32)
data = data.sort_values("length", ascending=False)

In [17]:
data_dict = {}
data_dict[0] = data[data["length"] > 1024].reset_index(drop=True)
data_dict[1] = data[data["length"] <= 1024].reset_index(drop=True)

In [18]:
result_df = []
for i, batch_size in enumerate([CFG.batch_size, CFG.batch_size]):
    if len(data_dict[i]) == 0:
        continue
        
    sub_1 = data_dict[i].iloc[0::2].copy()
    sub_2 = data_dict[i].iloc[1::2].copy()
    
    with ThreadPoolExecutor(max_workers=2) as executor:
        results = executor.map(
            inference, 
            (sub_1, sub_2), 
            (model_0, model_1), 
            (torch.device("cuda:0"), torch.device("cuda:1")), 
            (batch_size, batch_size)
        )
        
    result_df.append(pd.concat(list(results), axis=0))

  0%|          | 0/310 [00:00<?, ?it/s]
  0%|          | 1/310 [00:15<1:20:47, 15.69s/it]
  1%|          | 2/310 [00:31<1:20:01, 15.59s/it][A
  1%|          | 3/310 [00:47<1:20:49, 15.80s/it][A
  1%|▏         | 4/310 [01:03<1:22:24, 16.16s/it][A
  2%|▏         | 5/310 [01:21<1:24:21, 16.59s/it][A
  2%|▏         | 6/310 [01:39<1:26:54, 17.15s/it][A
  2%|▏         | 7/310 [01:58<1:29:36, 17.74s/it][A
  3%|▎         | 8/310 [02:16<1:30:15, 17.93s/it][A
  3%|▎         | 9/310 [02:34<1:29:51, 17.91s/it][A
  3%|▎         | 10/310 [02:52<1:29:15, 17.85s/it][A
  4%|▎         | 11/310 [03:10<1:29:13, 17.91s/it][A
  4%|▍         | 12/310 [03:28<1:29:39, 18.05s/it][A
  4%|▍         | 13/310 [03:47<1:29:44, 18.13s/it][A
  5%|▍         | 14/310 [04:05<1:29:22, 18.11s/it][A
  5%|▍         | 15/310 [04:23<1:28:49, 18.07s/it][A
  5%|▌         | 16/310 [04:41<1:28:34, 18.08s/it][A
  5%|▌         | 17/310 [04:59<1:28:33, 18.13s/it][A
  6%|▌         | 18/310 [05:17<1:28:19, 18.15s/it][A
 

In [19]:
result_df = pd.concat(result_df).sort_values('index').reset_index(drop=True)

In [20]:
result_df

Unnamed: 0,id,input_ids,attention_mask,length,index,winner
0,271ba404fc25609b8ceb6f37c3375f278c42e2479514ea...,"[2, 235322, 39038, 78880, 7878, 52696, 15243, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",509,0,0.522689
1,ce7be103ac8f040accc9b8ec421a6cbf78189c503671df...,"[2, 235322, 39038, 78880, 496, 791, 476, 21244...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2220,1,0.479565
2,b39725e0a7992eb0c5f34542f960727a4fd306741568fb...,"[2, 235322, 39038, 78880, 590, 791, 476, 79801...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",922,2,0.509886
3,bb33f84e8e85da9e41fa271f6210fdffba0400d725a04c...,"[2, 235322, 39038, 78880, 43540, 3569, 611, 12...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1062,3,0.354972
4,364c8146a6a236fbdfa34492dba10405a0e448d3062316...,"[2, 235322, 39038, 78880, 11941, 476, 10058, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",631,4,0.874988
...,...,...,...,...,...,...
4839,8bd008cebcfd4a9e504f0dbca5a5664d412ca9b4c4045f...,"[2, 235322, 39038, 78880, 664, 2841, 71765, 58...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",351,4839,0.219756
4840,198aa3635a6fc9f6a7d069e82cc9cddcbc783e3a0e83e8...,"[2, 235322, 39038, 78880, 163034, 30976, 5536,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1005,4840,0.781539
4841,584414c294899b255a9576ade65032610ca1937cffcad4...,"[2, 235322, 39038, 78880, 12236, 1758, 603, 65...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",914,4841,0.062674
4842,95fb0a8dabf14619a9564334dc390cbf170a0e09905086...,"[2, 235322, 39038, 78880, 24424, 108, 511, 218...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1923,4842,0.528807


In [34]:
(result_df.winner <= 0.5).astype(int)

0       0
1       1
2       0
3       1
4       0
       ..
4839    1
4840    0
4841    1
4842    0
4843    1
Name: winner, Length: 4844, dtype: int64

In [27]:
(test["winner"] == 'model_a').astype(int)

7419     0
39112    0
33946    1
35392    1
10254    0
        ..
26506    1
4869     0
16686    1
28451    0
28115    1
Name: winner, Length: 4844, dtype: int64

In [29]:
from sklearn.metrics import classification_report

In [33]:
print(classification_report((result_df.winner <= 0.5).astype(int), (test["winner"] == 'model_a').astype(int), digits=6))

              precision    recall  f1-score   support

           0   0.718813  0.723222  0.721011      2446
           1   0.715904  0.711426  0.713658      2398

    accuracy                       0.717382      4844
   macro avg   0.717359  0.717324  0.717335      4844
weighted avg   0.717373  0.717382  0.717371      4844

