In [11]:
!pip install accelerate peft bitsandbytes transformers trl unsloth optree



In [12]:
import torch
import pandas as pd
import joblib
import warnings
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from transformers import (
    GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments, 
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset

warnings.filterwarnings("ignore")

In [13]:
class CFG:
    checkpoint = "gpt2"
    max_length = 2048
    n_splits = 5
    current_fold = 0
    optim_type = "adamw_torch"
    per_device_train_batch_size = 2
    per_device_eval_batch_size = 8
    gradient_accumulation_steps = 2
    n_epochs = 1
    lr = 2e-4
    warmup_steps = 20
    lora_r = 16
    lora_alpha = lora_r * 2
    lora_dropout = 0.05
    lora_bias = "none"
    seed = 42
    device = "cuda" if torch.cuda.is_available() else "cpu"

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
dataset = pd.read_parquet("/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet").reset_index(drop=True)
dataset["winner"] = dataset["winner"].map({"model_a": 0, "model_b": 1})

train, test = train_test_split(dataset, test_size=0.2, random_state=1)
val, test = train_test_split(test, test_size=0.5, random_state=1)

train = Dataset.from_pandas(train)
val = Dataset.from_pandas(val)
test = Dataset.from_pandas(test)

In [16]:
tokenizer = GPT2Tokenizer.from_pretrained(CFG.checkpoint)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

def extend_pos_embeddings(model, new_max_length):
    old_max_length = model.config.n_positions
    if new_max_length <= old_max_length:
        return model

    print(f"Extending GPT-2 positional embeddings from {old_max_length} to {new_max_length}...")
    old_embeddings = model.transformer.wpe.weight.detach().cpu()
    new_embeddings = torch.nn.Embedding(new_max_length, old_embeddings.shape[1])
    
    new_embeddings.weight.data[:old_max_length] = old_embeddings
    model.transformer.wpe = new_embeddings
    model.config.n_positions = new_max_length
    return model

class Tokenizer:
    def __init__(self, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch):
        prompt = ["<prompt>: " + t for t in batch["prompt"]]
        response_a = ["\n\n<response_a>: " + t for t in batch["response_a"]]
        response_b = ["\n\n<response_b>: " + t for t in batch["response_b"]]
        texts = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        tokenized = self.tokenizer(texts, max_length=self.max_length, truncation=True, padding="max_length")
        return {**tokenized, "labels": batch["winner"]}

encode = Tokenizer(tokenizer, max_length=CFG.max_length)

train = train.map(encode, batched=True)
val = val.map(encode, batched=True)
test = test.map(encode, batched=True)

Map:   0%|          | 0/38751 [00:00<?, ? examples/s]

Map:   0%|          | 0/4844 [00:00<?, ? examples/s]

Map:   0%|          | 0/4844 [00:00<?, ? examples/s]

In [17]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

In [18]:
model = GPT2ForSequenceClassification.from_pretrained(CFG.checkpoint, num_labels=2)
model = extend_pos_embeddings(model, CFG.max_length)
model.config.use_cache = False  # Disable caching for training
model.to(CFG.device)

lora_config = LoraConfig(
    r=CFG.lora_r,
    lora_alpha=CFG.lora_alpha,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=CFG.lora_dropout,
    bias=CFG.lora_bias,
    task_type=TaskType.SEQ_CLS,
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Extending GPT-2 positional embeddings from 1024 to 2048...


In [19]:
def compute_metrics(eval_preds):
    preds = eval_preds.predictions
    labels = eval_preds.label_ids
    probs = torch.from_numpy(preds).float().softmax(-1).numpy()
    loss = log_loss(y_true=labels, y_pred=probs)
    acc = accuracy_score(y_true=labels, y_pred=preds.argmax(-1))
    return {"acc": acc, "log_loss": loss}

training_args = TrainingArguments(
    output_dir='/kaggle/working/',
    overwrite_output_dir=True,
    num_train_epochs=CFG.n_epochs,
    per_device_train_batch_size=CFG.per_device_train_batch_size,
    gradient_accumulation_steps=CFG.gradient_accumulation_steps,
    per_device_eval_batch_size=CFG.per_device_eval_batch_size,
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=1,
    optim=CFG.optim_type,
    fp16=True,
    fp16_full_eval=True,
    learning_rate=CFG.lr,
    warmup_steps=CFG.warmup_steps,
    report_to="none"
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train,
    eval_dataset=val,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [20]:
trainer.train()

ValueError: Caught ValueError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/parallel/parallel_apply.py", line 96, in _worker
    output = module(*input, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/peft/peft_model.py", line 1558, in forward
    return self.base_model(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/peft/tuners/tuners_utils.py", line 193, in forward
    return self.model.forward(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/gpt2/modeling_gpt2.py", line 1397, in forward
    raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
ValueError: Cannot handle batch sizes > 1 if no padding token is defined.


In [None]:
y_true = test["winner"]
logits = trainer.predict(test).predictions
y_pred_probs = torch.from_numpy(logits).float().softmax(-1).numpy()
acc = accuracy_score(y_true=y_true, y_pred=y_pred_probs.argmax(-1))
print(f"Test - Accuracy: {acc:.4f}")

joblib.dump(y_pred_probs, f"y_pred_probs_fold_test_acc_{acc:.6f}.pkl")

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report((y_pred_probs[:, 0] <= 0.5).astype(int), y_true, digits=6))

In [None]:
(y_pred_probs[:, 0] >= 0.5).astype(int)[:10]

In [None]:
y_true[:10]