In [1]:
!pip install accelerate peft bitsandbytes transformers trl unsloth optree

Collecting peft
  Downloading peft-0.15.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.16.0-py3-none-any.whl.metadata (12 kB)
Collecting unsloth
  Downloading unsloth-2025.3.18-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.0 (from accelerate)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting transformers
  Downloading transformers-4.50.0-py3-none-any.whl.metadata (39 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting unsloth_zoo>=2025.3.14 (from unsloth)
  Downloading unsloth_zoo-2025.3.16-py3-none-any.whl.metadata (8.0 kB)
Collecting xformers>=0.0.27.post2 (fr

In [9]:
import torch
import pandas as pd
import joblib
import warnings
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from transformers import (DistilBertForSequenceClassification, DistilBertTokenizerFast,
                          Trainer, TrainingArguments, DataCollatorWithPadding)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

In [10]:
class CFG:
    checkpoint = "distilbert-base-uncased"
    max_length = 2048
    n_splits = 5
    current_fold = 0
    per_device_train_batch_size = 16
    per_device_eval_batch_size = 32
    gradient_accumulation_steps = 1
    n_epochs = 3
    lr = 2e-5
    warmup_steps = 50
    lora_r = 8
    lora_alpha = 16
    lora_dropout = 0.1
    lora_bias = "none"
    seed = 42

In [12]:
dataset = pd.read_parquet("/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet").reset_index(drop=True)
dataset["winner"] = dataset["winner"].map({"model_a": 0, "model_b": 1})

train, test = train_test_split(dataset, test_size=0.2, random_state=1)
val, test = train_test_split(test, test_size=0.5, random_state=1)

train = Dataset.from_pandas(train)
val = Dataset.from_pandas(val)
test = Dataset.from_pandas(test)

In [13]:
tokenizer = DistilBertTokenizerFast.from_pretrained(CFG.checkpoint)
tokenizer.padding_side = "right"

def encode(batch):
    texts = [f"<prompt>: {p}\n\n<response_a>: {a}\n\n<response_b>: {b}" 
             for p, a, b in zip(batch["prompt"], batch["response_a"], batch["response_b"])]
    tokenized = tokenizer(texts, max_length=CFG.max_length, truncation=True, padding="max_length")
    return {**tokenized, "labels": batch["winner"]}

train = train.map(encode, batched=True)
val = val.map(encode, batched=True)
test = test.map(encode, batched=True)

Map:   0%|          | 0/38751 [00:00<?, ? examples/s]

Map:   0%|          | 0/4844 [00:00<?, ? examples/s]

Map:   0%|          | 0/4844 [00:00<?, ? examples/s]

In [14]:
lora_config = LoraConfig(
    r=CFG.lora_r,
    lora_alpha=CFG.lora_alpha,
    target_modules=["q_lin", "v_lin"],
    lora_dropout=CFG.lora_dropout,
    bias=CFG.lora_bias,
    task_type=TaskType.SEQ_CLS,
)

model = DistilBertForSequenceClassification.from_pretrained(
    CFG.checkpoint,
    num_labels=2,
    torch_dtype=torch.float16,
    device_map="auto",
)

model.config.use_cache = False
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def compute_metrics(eval_preds):
    preds = eval_preds.predictions
    labels = eval_preds.label_ids
    probs = torch.from_numpy(preds).float().softmax(-1).numpy()
    loss = log_loss(y_true=labels, y_pred=probs)
    acc = accuracy_score(y_true=labels, y_pred=preds.argmax(-1))
    return {"acc": acc, "log_loss": loss}

training_args = TrainingArguments(
    output_dir="distilbert_finetuned",
    num_train_epochs=CFG.n_epochs,
    per_device_train_batch_size=CFG.per_device_train_batch_size,
    gradient_accumulation_steps=CFG.gradient_accumulation_steps,
    per_device_eval_batch_size=CFG.per_device_eval_batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    optim="adamw_torch",
    fp16=True,
    learning_rate=CFG.lr,
    warmup_steps=CFG.warmup_steps,
    logging_dir="logs",
    logging_steps=10,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
trainer.train()

RuntimeError: The size of tensor a (2048) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
y_true = val["winner"]
logits = trainer.predict(val).predictions
y_pred_probs = torch.from_numpy(logits).float().softmax(-1).numpy()
acc = accuracy_score(y_true=y_true, y_pred=y_pred_probs.argmax(-1))
print(f"Fold {CFG.current_fold} - Accuracy: {acc:.4f}")

joblib.dump(y_pred_probs, f"y_pred_probs_fold_{CFG.current_fold}_acc_{acc:.6f}.pkl")