- Inference notebook: https://www.kaggle.com/code/ravaghi/wsdm-cup-gemma-2-9b-4-bit-qlora-inference
- Inference notebook: https://www.kaggle.com/code/ravaghi/wsdm-cup-gemma-2-9b-logistic-regression

# Imports and configs

In [1]:
!pip install accelerate peft bitsandbytes transformers trl unsloth

Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub>=0.21.0 (from accelerate)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading peft-0.14.0-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading bitsandbytes-0.45.3-py3-none-manylinux

In [2]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from transformers import Gemma2ForSequenceClassification, GemmaTokenizerFast
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
import pandas as pd
import warnings
import joblib
import torch

# Suppress every Python warning for the rest of the session; this keeps cell
# output compact but also hides deprecation notices raised by the libraries.
warnings.filterwarnings("ignore")

In [3]:
class CFG:
    """Single place for every tunable value used by this notebook."""

    # --- model / data ---
    checkpoint = "unsloth/gemma-2-9b-it-bnb-4bit"
    max_length = 3072          # token budget per concatenated example
    seed = 42

    # --- cross-validation ---
    n_splits = 5
    current_fold = 0           # fold held out for validation in this run

    # --- optimisation ---
    optim_type = "adamw_8bit"
    lr = 2e-4
    warmup_steps = 20
    n_epochs = 1
    per_device_train_batch_size = 2
    per_device_eval_batch_size = 8
    gradient_accumulation_steps = 2

    # --- LoRA ---
    freeze_layers = 16         # layers with index below this get no adapter
    lora_r = 16
    lora_alpha = 2 * lora_r
    lora_dropout = 0.05
    lora_bias = "none"

In [4]:
# Output directory name: model tag, then max length, train batch size and fold.
CHECKPOINT_BASE_NAME = (
    "gemma-2-9b-it-bnb-4bit"
    f"-{CFG.max_length}"
    f"-{CFG.per_device_train_batch_size}"
    f"-f{CFG.current_fold}"
)

# Loading data

In [5]:
# Read the competition training split and keep a 100-row subset.
# The sample is seeded with CFG.seed so that the subset (and therefore the
# folds built from it) is the same on every Restart & Run All.
dataset = (
    pd.read_parquet("/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet")
    .sample(100, random_state=CFG.seed)
    .reset_index(drop=True)
)
# Encode the target as an integer class id: model_a -> 0, model_b -> 1.
dataset["winner"] = dataset["winner"].map({"model_a": 0, "model_b": 1})

In [6]:
# Tag every row with the index of the stratified fold in which it is held out.
skf = StratifiedKFold(
    n_splits=CFG.n_splits,
    shuffle=True,
    random_state=CFG.seed,
)
fold_splits = skf.split(dataset, dataset["winner"])
for fold_id, (_, holdout_rows) in enumerate(fold_splits):
    dataset.loc[holdout_rows, "fold"] = fold_id

In [7]:
# Rows of the current fold form the validation set; everything else is training data.
is_holdout = dataset["fold"] == CFG.current_fold
train = dataset[~is_holdout]
val = dataset[is_holdout]

In [8]:
# Convert both pandas frames into `datasets.Dataset` objects.
train, val = (Dataset.from_pandas(frame) for frame in (train, val))

# Tokenizing

In [9]:
# Load the fast Gemma tokenizer that ships with the configured checkpoint.
tokenizer = GemmaTokenizerFast.from_pretrained(CFG.checkpoint)
# Ask the tokenizer to append an EOS token to each encoded sequence.
tokenizer.add_eos_token = True
# Pad on the right-hand side whenever sequences are padded.
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [10]:
class Tokenizer:
    """Batch encoder: joins prompt and both responses into one text and tokenizes it.

    Instances are callable with a batch dict holding the keys ``prompt``,
    ``response_a``, ``response_b`` and ``winner`` (each a list), which is the
    form `Dataset.map(..., batched=True)` supplies.
    """

    def __init__(self, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch):
        """Return the tokenizer output for the batch plus a ``labels`` entry."""
        rows = zip(batch["prompt"], batch["response_a"], batch["response_b"])
        texts = [
            "<prompt>: " + prompt
            + "\n\n<response_a>: " + resp_a
            + "\n\n<response_b>: " + resp_b
            for prompt, resp_a, resp_b in rows
        ]
        # Anything beyond max_length tokens is cut off by the tokenizer.
        encoded = dict(
            self.tokenizer(texts, max_length=self.max_length, truncation=True)
        )
        encoded["labels"] = batch["winner"]
        return encoded

In [11]:
# Build the batch encoder once and run it over both splits.
encode = Tokenizer(tokenizer, max_length=CFG.max_length)

train, val = (split.map(encode, batched=True) for split in (train, val))

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

# Modeling

In [12]:
# Layer indices that receive LoRA adapters: every index in [0, 42) that is not
# below CFG.freeze_layers.
# NOTE(review): 42 is presumably the decoder depth of gemma-2-9b — confirm
# against model.config.num_hidden_layers.
adapted_layers = [layer for layer in range(42) if layer >= CFG.freeze_layers]

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=CFG.lora_r,
    lora_alpha=CFG.lora_alpha,
    lora_dropout=CFG.lora_dropout,
    bias=CFG.lora_bias,
    target_modules=["q_proj", "k_proj", "v_proj"],
    layers_to_transform=adapted_layers,
)

In [13]:
# Load the checkpoint with a freshly initialised 2-way classification head.
model_load_kwargs = {
    "num_labels": 2,
    "torch_dtype": torch.float16,
    "device_map": "auto",
}
model = Gemma2ForSequenceClassification.from_pretrained(
    CFG.checkpoint, **model_load_kwargs
)

config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at unsloth/gemma-2-9b-it-bnb-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Turn off the generation KV cache for training.
model.config.use_cache = False
# Prepare the quantised base model for training, then attach the LoRA adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

# Training

In [15]:
def compute_metrics(eval_preds):
    """Compute accuracy and log loss from raw classifier logits.

    Args:
        eval_preds: object exposing ``predictions`` (NumPy array of logits with
            the class dimension last) and ``label_ids`` (integer class ids).

    Returns:
        dict with keys ``"acc"`` and ``"log_loss"``.
    """
    logits = eval_preds.predictions
    labels = eval_preds.label_ids
    # Softmax in float32 to turn logits into class probabilities.
    probs = torch.from_numpy(logits).float().softmax(-1).numpy()
    # Pass the full class list explicitly: without it, log_loss raises when the
    # evaluation labels happen to contain only one class.
    class_ids = list(range(probs.shape[-1]))
    loss = log_loss(y_true=labels, y_pred=probs, labels=class_ids)
    acc = accuracy_score(y_true=labels, y_pred=logits.argmax(-1))
    return {"acc": acc, "log_loss": loss}

In [16]:
# Trainer configuration; every tunable value comes from CFG.
training_args = TrainingArguments(
    output_dir=CHECKPOINT_BASE_NAME,
    overwrite_output_dir=True,
    num_train_epochs=CFG.n_epochs,
    per_device_train_batch_size=CFG.per_device_train_batch_size,
    gradient_accumulation_steps=CFG.gradient_accumulation_steps,
    per_device_eval_batch_size=CFG.per_device_eval_batch_size,
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="steps",
    save_steps=200,
    save_total_limit=1,
    optim=CFG.optim_type,
    fp16=True,
    learning_rate=CFG.lr,
    warmup_steps=CFG.warmup_steps,
    # Tie the trainer's RNG seed to the notebook-wide seed instead of relying
    # on the library default.
    seed=CFG.seed,
    # Name the label column explicitly; Trainer warns that it cannot infer it
    # automatically for a PEFT-wrapped model.
    label_names=["labels"],
    report_to="none",
)

In [17]:
# Assemble the Trainer. Batches are padded dynamically by the collator.
trainer = Trainer(
    args=training_args,
    model=model,
    # `processing_class` replaces the deprecated `tokenizer` keyword of Trainer.
    processing_class=tokenizer,
    train_dataset=train,
    eval_dataset=val,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [18]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Acc,Log Loss,Runtime,Samples Per Second,Steps Per Second
1,1.645,1.142503,0.6,1.142552,120.37,0.166,0.025


TrainOutput(global_step=20, training_loss=1.6133472442626953, metrics={'train_runtime': 1653.8652, 'train_samples_per_second': 0.048, 'train_steps_per_second': 0.012, 'total_flos': 7337411281274880.0, 'train_loss': 1.6133472442626953, 'epoch': 1.0})

# Saving OOF file

In [19]:
# Score the held-out fold and convert the logits into class probabilities.
holdout_output = trainer.predict(val)
logits = holdout_output.predictions
y_true = val["winner"]
y_pred_probs = torch.softmax(torch.from_numpy(logits).float(), dim=-1).numpy()

In [20]:
# Accuracy of the most probable class on the held-out fold.
y_pred_labels = y_pred_probs.argmax(axis=-1)
acc = accuracy_score(y_true=y_true, y_pred=y_pred_labels)
print(f"Fold {CFG.current_fold} - Accuracy: {acc:.4f}")

Fold 0 - Accuracy: 0.6000


In [21]:
# Persist the out-of-fold probabilities; the file name carries fold and accuracy.
oof_path = f"y_pred_probs_fold_{CFG.current_fold}_acc_{acc:.6f}.pkl"
joblib.dump(y_pred_probs, oof_path)

['y_pred_probs_fold_0_acc_0.600000.pkl']