In [1]:
!pip install accelerate peft bitsandbytes transformers trl unsloth optree

Collecting peft
  Downloading peft-0.15.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.16.0-py3-none-any.whl.metadata (12 kB)
Collecting unsloth
  Downloading unsloth-2025.3.18-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.0 (from accelerate)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting transformers
  Downloading transformers-4.50.0-py3-none-any.whl.metadata (39 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting unsloth_zoo>=2025.3.14 (from unsloth)
  Downloading unsloth_zoo-2025.3.16-py3-none-any.whl.metadata (8.0 kB)
Collecting xformers>=0.0.27.post2 (fr

In [2]:
import torch
import pandas as pd
import joblib
import warnings
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (LongformerForSequenceClassification, LongformerTokenizerFast,
                          Trainer, TrainingArguments, DataCollatorWithPadding)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

warnings.filterwarnings("ignore")



In [3]:
class CFG:
    checkpoint = "allenai/longformer-base-4096"
    max_length = 2048
    per_device_train_batch_size = 8
    per_device_eval_batch_size = 16
    gradient_accumulation_steps = 2
    n_epochs = 3
    lr = 2e-5
    warmup_steps = 20
    lora_r = 8
    lora_alpha = 16
    lora_dropout = 0.1
    lora_bias = "none"
    seed = 42

In [4]:
from sklearn.model_selection import train_test_split

In [13]:
dataset = pd.read_parquet("/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet").reset_index(drop=True)
dataset["winner"] = dataset["winner"].map({"model_a": 0, "model_b": 1})

train, test = train_test_split(dataset, test_size=0.2, random_state=1)
val, test = train_test_split(test, test_size=0.5, random_state=1)

train = Dataset.from_pandas(train.sample(10000))
val = Dataset.from_pandas(val)
test = Dataset.from_pandas(test)

tokenizer = LongformerTokenizerFast.from_pretrained(CFG.checkpoint)
tokenizer.padding_side = "right"

In [14]:
def encode(batch):
    texts = [f"<prompt>: {p}\n\n<response_a>: {a}\n\n<response_b>: {b}" 
             for p, a, b in zip(batch["prompt"], batch["response_a"], batch["response_b"])]
    tokenized = tokenizer(texts, max_length=CFG.max_length, truncation=True, padding="max_length")
    return {**tokenized, "labels": batch["winner"]}

In [15]:
train = train.map(encode, batched=True)
val = val.map(encode, batched=True)
test = test.map(encode, batched=True)

lora_config = LoraConfig(
    r=CFG.lora_r,
    lora_alpha=CFG.lora_alpha,
    target_modules=["query", "value"],
    lora_dropout=CFG.lora_dropout,
    bias=CFG.lora_bias,
    task_type=TaskType.SEQ_CLS,
)

model = LongformerForSequenceClassification.from_pretrained(
    CFG.checkpoint,
    num_labels=2,
    torch_dtype=torch.float16,
    device_map="auto",
)

model.config.use_cache = False
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4844 [00:00<?, ? examples/s]

Map:   0%|          | 0/4844 [00:00<?, ? examples/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from transformers import TrainerCallback

In [17]:
def compute_metrics(eval_preds):
    preds = eval_preds.predictions
    labels = eval_preds.label_ids
    probs = torch.from_numpy(preds).float().softmax(-1).numpy()
    loss = log_loss(y_true=labels, y_pred=probs)
    acc = accuracy_score(y_true=labels, y_pred=preds.argmax(-1))
    return {"acc": acc, "log_loss": loss}

class LoggingCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 10 == 0:  # Log every 10 steps
            if state.log_history and 'loss' in state.log_history[-1]:
                print(f"Step {state.global_step}: Loss = {state.log_history[-1]['loss']}, All = {state.log_history[-1]}")

training_args = TrainingArguments(
    output_dir="longformer_finetuned",
    num_train_epochs=CFG.n_epochs,
    per_device_train_batch_size=CFG.per_device_train_batch_size,
    gradient_accumulation_steps=CFG.gradient_accumulation_steps,
    per_device_eval_batch_size=CFG.per_device_eval_batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    optim="adamw_torch",
    fp16=True,
    learning_rate=CFG.lr,
    warmup_steps=CFG.warmup_steps,
    logging_dir="logs",
    logging_steps=10,
    report_to="none"
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    callbacks=[LoggingCallback()],
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [19]:
trainer.train()

Epoch,Training Loss,Validation Loss,Acc,Log Loss,Runtime,Samples Per Second,Steps Per Second
1,0.697,0.693223,0.496697,0.693222,594.2744,8.151,0.51
2,0.6972,0.693403,0.507845,0.693405,593.8627,8.157,0.51
3,0.6977,0.693082,0.504335,0.69308,600.7461,8.063,0.504


Step 20: Loss = 0.6967, All = {'loss': 0.6967, 'grad_norm': 3.047994613647461, 'learning_rate': 1e-05, 'epoch': 0.016, 'step': 10}
Step 30: Loss = 0.6955, All = {'loss': 0.6955, 'grad_norm': 1.8335925340652466, 'learning_rate': 2e-05, 'epoch': 0.032, 'step': 20}
Step 40: Loss = 0.7059, All = {'loss': 0.7059, 'grad_norm': 2.5207717418670654, 'learning_rate': 1.9892183288409707e-05, 'epoch': 0.048, 'step': 30}
Step 50: Loss = 0.6915, All = {'loss': 0.6915, 'grad_norm': 0.9613805413246155, 'learning_rate': 1.978436657681941e-05, 'epoch': 0.064, 'step': 40}
Step 60: Loss = 0.6987, All = {'loss': 0.6987, 'grad_norm': 2.152090072631836, 'learning_rate': 1.9676549865229113e-05, 'epoch': 0.08, 'step': 50}
Step 70: Loss = 0.6976, All = {'loss': 0.6976, 'grad_norm': 1.2096996307373047, 'learning_rate': 1.9568733153638815e-05, 'epoch': 0.096, 'step': 60}
Step 80: Loss = 0.6903, All = {'loss': 0.6903, 'grad_norm': 1.8332371711730957, 'learning_rate': 1.946091644204852e-05, 'epoch': 0.112, 'step': 

TrainOutput(global_step=1875, training_loss=0.6931615804036458, metrics={'train_runtime': 16481.106, 'train_samples_per_second': 1.82, 'train_steps_per_second': 0.114, 'total_flos': 3.973809586176e+16, 'train_loss': 0.6931615804036458, 'epoch': 3.0})

In [None]:
y_true = test["winner"]
logits = trainer.predict(test).predictions
y_pred_probs = torch.from_numpy(logits).float().softmax(-1).numpy()
acc = accuracy_score(y_true=y_true, y_pred=y_pred_probs.argmax(-1))
print(f"Test - Accuracy: {acc:.4f}") 

joblib.dump(y_pred_probs, f"y_pred_probs_fold_test_acc_{acc:.6f}.pkl")

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report((y_pred_probs[:, 0] <= 0.5).astype(int), y_true, digits=6))

In [None]:
/(y_pred_probs[:, 0] >= 0.5).astype(int)[:10]

In [None]:
y_true[:10]