In [1]:
!pip install -U "transformers==4.47.1" "bitsandbytes==0.45.0" "accelerate==1.2.1" "peft==0.14.0" "datasets==3.2.0" "ftfy==6.3.1" "pyarrow==18.1.0" "chardet==5.2.0" "charset-normalizer==3.3.2"



In [2]:
import os
import copy
from dataclasses import dataclass
import ftfy

import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from datasets import Dataset, concatenate_datasets, Features, Value
from transformers import (
    BitsAndBytesConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType, PeftModel
from sklearn.metrics import log_loss, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Restrict CUDA to device 0 for this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Run identifier: interpolated into the Trainer output directory, the W&B run
# name/notes and the checkpoint folder that is uploaded at the end.
VER = '13-REVERSED-8BIT-TOP5-LMSYS-MODEL-99.9PERCENT-CUSTOM-HEAD-LEFTSIDE-CONCAT-NO-EXTRA-DATA-MAXLEN2200-R64-A4-BF16'

# --- was used in init checkpoint ---
# NOTE(review): of these four flags only USE_CONCAT_DATA is read by the cells
# visible in this notebook; the others appear to be bookkeeping — confirm.
USE_ULTRAFEEDBACK = False
USE_33K = False
USE_LMSYS = False
USE_CONCAT_DATA = False  # concatenate 'data/combined-lmsys-and-33k.parquet' onto `ds`

# --- new extra data ---
USE_ORPO44K = False # doesn't work with the top-5 init checkpoint
USE_PREV_PSEUDOLABELS = False  # not read by the cells visible in this notebook

# --- my soft pseudolabeled data ---
# These datasets are loaded when enabled, but the cells that would encode and
# merge them into `ds` are currently commented out.
USE_PUBLIC_8K = False
USE_MY_PL = False  # also switches save_steps from 1000 to 10000
MY_PL_FRAC = 1  # fraction of the pseudolabeled rows to keep (1 = keep all)

# --- my hard pseudolabeled data ---
USE_MY_HARD_PL = False

# --- select init checkpoint ---
USE_LMSYS_MODEL = True  # load model and tokenizer from config.old_path
USE_FSFAIRX = False # no reason to test it

# --- additional features ---
USE_LEFTSIDE_TRUNCATION = True  # pad and truncate on the left side
USE_PROMPT_PREFIX = False  # not read by the cells visible in this notebook

In [4]:
# Weights & Biases run metadata.
# The API key is deliberately NOT assigned here: credentials must not live in
# the notebook. Export WANDB_API_KEY before starting the kernel (or run
# `wandb login` once); wandb picks the key up from there.
os.environ['WANDB_PROJECT'] = 'WSDM Gemma2-9b-it'
os.environ['WANDB_NOTES'] = f'WSDM Gemma2-9b-it LoRA Training VER-{VER}'
os.environ['WANDB_NAME'] = f'ft-gemma2-wsdm-ver-{VER}'

In [5]:
@dataclass
class Config:
    """Hyperparameters and paths for a single fine-tuning run.

    The field order is part of the positional constructor signature and is
    therefore kept stable; only defaults are declared here.
    """

    # --- paths / checkpoints ---
    output_dir: str = 'output'
    checkpoint: str = 'unsloth/gemma-2-9b-it-bnb-4bit'  # default hub checkpoint
    old_path: str = 'TOP5-MODEL/QUANTIZED-TOP5-LMSYS-8BIT'  # init checkpoint used with USE_LMSYS_MODEL

    # --- tokenization and validation split ---
    max_length: int = 2200  # token budget per example
    n_splits: int = 1000  # row i belongs to fold i % n_splits
    fold_idx: int = 0  # fold held out for evaluation

    # --- optimisation ---
    optim_type: str = 'adamw_8bit'
    per_device_train_batch_size: int = 8
    gradient_accumulation_steps: int = 2
    per_device_eval_batch_size: int = 2
    n_epochs: int = 1
    freeze_layers: int = 0  # decoder layers below this index get no LoRA adapters
    lr: float = 0.0002
    warmup_ratio: float = 0.025

    # --- LoRA ---
    lora_r: int = 64
    lora_alpha: float = 4.0
    lora_dropout: float = 0.05
    lora_bias: str = 'none'

    # --- classification head ---
    num_labels: int = 2
    hdim: int = 3584  # input width of the custom score head
    head_dropout: float = 0.1
    label_smoothing_alpha: float = 0.0

    # --- reproducibility ---
    seed: int = 0xFACED


config = Config()

In [6]:
# Trainer settings, mostly forwarded from `config`. Evaluation runs once per
# epoch; checkpoints are written every `save_steps` optimizer steps.
training_args = TrainingArguments(
    # --- bookkeeping ---
    output_dir=f'output-{VER}',
    overwrite_output_dir=True,
    report_to='wandb',
    logging_steps=1,
    seed=config.seed,
    # --- schedule ---
    num_train_epochs=config.n_epochs,
    eval_strategy='epoch',
    save_strategy='steps',
    save_steps=10000 if USE_MY_PL else 1000,
    # --- batching ---
    per_device_train_batch_size=config.per_device_train_batch_size,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    # --- optimisation ---
    optim=config.optim_type,
    learning_rate=config.lr,
    warmup_ratio=config.warmup_ratio,
    bf16=True,
)

In [7]:
# LoRA adapters on every attention and MLP projection. Layers with an index
# below `config.freeze_layers` get no adapter; 42 is the number of decoder
# layers shown in the printed model.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    target_modules=[
        'q_proj', 'k_proj', 'v_proj', 'o_proj',
        'gate_proj', 'up_proj', 'down_proj',
    ],
    layers_to_transform=list(range(max(config.freeze_layers, 0), 42)),
)

In [8]:
# Tokenizer of the default checkpoint; it sets `add_eos_token` and, when
# requested, pads and truncates on the left.
tokenizer = AutoTokenizer.from_pretrained(config.checkpoint)
tokenizer.add_eos_token = True
if USE_LEFTSIDE_TRUNCATION:
    tokenizer.padding_side = tokenizer.truncation_side = 'left'

In [9]:
def _build_score_head() -> torch.nn.Sequential:
    """Return a fresh two-layer classification head on the GPU in bfloat16.

    Maps `config.hdim` features to `config.num_labels` logits through a
    half-width hidden layer, with dropout before each linear stage.
    """
    return torch.nn.Sequential(
        torch.nn.Dropout(config.head_dropout),
        torch.nn.Linear(config.hdim, config.hdim // 2),
        torch.nn.Dropout(config.head_dropout),
        torch.nn.GELU(),
        torch.nn.Linear(config.hdim // 2, config.num_labels),
    ).cuda().bfloat16()


# Pick the initial weights. USE_LMSYS_MODEL takes precedence; the FSFAIRX-only
# combination has no loader, so fail here with a clear message instead of
# leaving `model` undefined for the following cells.
if USE_LMSYS_MODEL:
    init_path = config.old_path
elif not USE_FSFAIRX:
    init_path = config.checkpoint
else:
    raise NotImplementedError(
        'USE_FSFAIRX is set without USE_LMSYS_MODEL, but no loader is implemented '
        'for that checkpoint; enable USE_LMSYS_MODEL or disable USE_FSFAIRX.'
    )

model = AutoModelForSequenceClassification.from_pretrained(
    init_path,
    num_labels=config.num_labels,
    torch_dtype=torch.bfloat16,
    device_map='auto',
)
# Replace the checkpoint's score layer with the custom head.
model.score = _build_score_head()

if USE_LMSYS_MODEL:
    # The init checkpoint has its own tokenizer files; reload from there and
    # re-apply the same tokenizer settings.
    tokenizer = AutoTokenizer.from_pretrained(config.old_path)
    tokenizer.add_eos_token = True
    if USE_LEFTSIDE_TRUNCATION:
        tokenizer.padding_side = 'left'
        tokenizer.truncation_side = 'left'

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# Turn off `use_cache` for training, then prepare the quantized base model
# for k-bit training and attach the LoRA adapters.
model.config.use_cache = False
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

In [11]:
# Display the module tree of the PEFT-wrapped model (the repr is long).
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): Gemma2ForSequenceClassification(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 3584, padding_idx=0)
        (layers): ModuleList(
          (0-41): 42 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=3584, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): Modul

In [12]:
# Report the trainable parameter count against the total parameter count.
model.print_trainable_parameters()

trainable params: 222,500,098 || all params: 9,470,633,988 || trainable%: 2.3494


In [13]:
# Base training data; the optional sources in the following cells are
# concatenated onto `ds` when their flags are enabled.
ds = Dataset.from_parquet('data/wsdm-original.parquet')

In [14]:
if USE_ORPO44K:
    # Extra preference data converted to the prompt/response_a/response_b/winner schema.
    ds_orpo = Dataset.from_parquet('data/orpo-dpo-44k-for-wsdm.parquet')
    ds_orpo = ds_orpo.remove_columns(['__index_level_0__'])

    # Drop the metadata columns only if they are still present, so this cell
    # still works after another extra-data cell has already removed them
    # (remove_columns raises on missing names).
    meta_cols = [c for c in ('id', 'model_a', 'model_b', 'language') if c in ds.column_names]
    if meta_cols:
        ds = ds.remove_columns(meta_cols)

    # Cast to plain string columns before concatenating with the extra data.
    ds = ds.cast(Features({
        'prompt': Value('string'),
        'response_a': Value('string'),
        'response_b': Value('string'),
        'winner': Value('string')
    }))

    ds = concatenate_datasets([ds, ds_orpo])

    ds = ds.shuffle(seed=0xFACED)

In [15]:
if USE_PUBLIC_8K:
    # NOTE(review): within this notebook `ds_public_8k` is only referenced by
    # the commented-out encode/merge cells further down, so enabling the flag
    # currently loads the data without adding it to `ds`.
    ds_public_8k = Dataset.from_parquet('data/wsdm-pseudolabeled/8k_pseudolabeled.parquet')

In [16]:
if USE_MY_PL:
    # NOTE(review): within this notebook `ds_my_pl` is only referenced by the
    # commented-out encode/merge cells further down.
    ds_my_pl = Dataset.from_parquet('data/wsdm-pseudolabeled/huuuuuge_pseudolabeled_df.parquet')
    if MY_PL_FRAC != 1:
        if MY_PL_FRAC <= 0:
            raise ValueError(f'MY_PL_FRAC must be positive, got {MY_PL_FRAC!r}')
        # Keep every `stride`-th row. The stride is rounded to an integer because
        # a float modulus (e.g. idx % 3.33) is almost never exactly 0, which
        # would drop nearly all rows for fractions like 0.3.
        stride = max(1, round(1 / MY_PL_FRAC))
        ds_my_pl = ds_my_pl.filter(lambda example, idx: idx % stride == 0, with_indices=True)

In [17]:
if USE_MY_HARD_PL:
    # Hard pseudolabels, appended to the base data.
    ds_hard = Dataset.from_parquet('data/wsdm-pseudolabeled/wsdm_pseudolabeled_from_my100perc_thold_0.95.parquet')

    # Drop the metadata columns only if they are still present, so this cell
    # still works after another extra-data cell has already removed them
    # (remove_columns raises on missing names).
    meta_cols = [c for c in ('id', 'model_a', 'model_b', 'language') if c in ds.column_names]
    if meta_cols:
        ds = ds.remove_columns(meta_cols)

    # Cast to plain string columns before concatenating with the extra data.
    ds = ds.cast(Features({
        'prompt': Value('string'),
        'response_a': Value('string'),
        'response_b': Value('string'),
        'winner': Value('string')
    }))

    ds = concatenate_datasets([ds, ds_hard])

    ds = ds.shuffle(seed=0xFACED)

In [18]:
if USE_CONCAT_DATA:
    # Combined external data, appended to the base data.
    ds_concat = Dataset.from_parquet('data/combined-lmsys-and-33k.parquet')

    # Drop the metadata columns only if they are still present, so this cell
    # still works after another extra-data cell has already removed them
    # (remove_columns raises on missing names).
    meta_cols = [c for c in ('id', 'model_a', 'model_b', 'language') if c in ds.column_names]
    if meta_cols:
        ds = ds.remove_columns(meta_cols)

    # Cast to plain string columns before concatenating with the extra data.
    ds = ds.cast(Features({
        'prompt': Value('string'),
        'response_a': Value('string'),
        'response_b': Value('string'),
        'winner': Value('string')
    }))

    ds = concatenate_datasets([ds, ds_concat])

    ds = ds.shuffle(seed=0xFACED)

In [19]:
class CustomTokenizer:
    """Batched encoder for `Dataset.map` that builds the "reversed" input.

    Each example becomes one text of the form
    ``<prompt>: ... <response_a>: ... <response_b>: ...`` in which the two
    responses are swapped relative to the dataset columns, and the label is
    flipped to match the swap.
    """

    def __init__(
        self,
        tokenizer: 'PreTrainedTokenizerBase',
        max_length: int
    ) -> None:
        """Store the tokenizer and the per-example token budget."""
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch: dict) -> dict:
        """Encode a batch of columns.

        Args:
            batch: mapping with the list-valued keys 'prompt', 'response_a',
                'response_b' and 'winner'.

        Returns:
            The tokenizer output for the batch plus a 'labels' list.

        Raises:
            ValueError: if a 'winner' value is not one of 'model_a',
                'winner_a', 'model_b' or 'winner_b'.
        """
        prompt = ['<prompt>: ' + self.process_text(t) for t in batch['prompt']]
        # rev: the dataset's response_b is presented in the <response_a> slot and vice versa.
        response_a = ['\n\n<response_a>: ' + self.process_text(t) for t in batch['response_b']]
        response_b = ['\n\n<response_b>: ' + self.process_text(t) for t in batch['response_a']]

        texts = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        tokenized = self.tokenizer(texts, max_length=self.max_length, truncation=True)

        labels = []
        for win in batch['winner']:
            if win in ('model_a', 'winner_a'):
                # rev: the original A now sits in the second slot -> class 1.
                labels.append(1)
            elif win in ('model_b', 'winner_b'):
                # rev: the original B now sits in the first slot -> class 0.
                labels.append(0)
            else:
                # Fail loudly instead of silently reusing the previous row's label.
                raise ValueError(f'Unexpected winner value: {win!r}')

        return {**tokenized, 'labels': labels}

    @staticmethod
    def process_text(text: str) -> str:
        """Clean a raw text field with `ftfy.fix_text`."""
        return ftfy.fix_text(text)

In [20]:
# Tokenize the whole dataset in batches; the mapped function adds the
# tokenizer's output columns plus `labels` to `ds`.
encode = CustomTokenizer(tokenizer, max_length=config.max_length)
ds = ds.map(encode, batched=True)

In [21]:
# Inspect the columns and row count after encoding.
ds

Dataset({
    features: ['id', 'prompt', 'response_a', 'response_b', 'winner', 'model_a', 'model_b', 'language', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 48439
})

In [22]:
# class CustomTokenizerPL:
#     def __init__(
#         self, 
#         tokenizer: PreTrainedTokenizerBase, 
#         max_length: int
#     ) -> None:
#         self.tokenizer = tokenizer
#         self.max_length = max_length
        
#     def __call__(self, batch: dict) -> dict:
#         prompt = ['<prompt>: ' + self.process_text(t) for t in batch['prompt']]
#         response_a = ['\n\n<response_a>: ' + self.process_text(t) for t in batch['response_a']]
#         response_b = ['\n\n<response_b>: ' + self.process_text(t) for t in batch['response_b']]
        
#         texts = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
#         tokenized = self.tokenizer(texts, max_length=self.max_length, truncation=True)
        
#         labels = batch['winner']
            
#         return {**tokenized, 'labels': labels}

#     @staticmethod
#     def process_text(text: str) -> str:
#         return ftfy.fix_text(text)

In [23]:
# encode = CustomTokenizerPL(tokenizer, max_length=config.max_length)

# if USE_PUBLIC_8K:
#     ds_public_8k = ds_public_8k.map(encode, batched=True)
# if USE_MY_PL:
#     ds_my_pl = ds_my_pl.map(encode, batched=True)

In [24]:
# ds = ds.remove_columns(['prompt', 'response_a', 'response_b', 'winner'])

In [25]:
# if USE_PUBLIC_8K:
#     ds_public_8k = ds_public_8k.remove_columns(['prompt', 'response_a', 'response_b', 'winner'])
#     ds = concatenate_datasets([ds, ds_public_8k])
    
# if USE_MY_PL:
#     ds_my_pl = ds_my_pl.remove_columns(['prompt', 'response_a', 'response_b', 'winner'])  
#     ds = concatenate_datasets([ds, ds_my_pl])

# if USE_PUBLIC_8K or USE_MY_PL:
#     ds = ds.shuffle(seed=0xFACED)

In [26]:
# Inspect the final dataset that is split into train/eval folds below.
ds

Dataset({
    features: ['id', 'prompt', 'response_a', 'response_b', 'winner', 'model_a', 'model_b', 'language', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 48439
})

In [27]:
def compute_metrics(eval_preds: EvalPrediction) -> dict:
    """Compute accuracy and log-loss from the evaluation logits.

    Args:
        eval_preds: carries `predictions` (logits; converted with
            `torch.from_numpy`, so a NumPy array is expected) and `label_ids`.

    Returns:
        Dict with 'acc' (argmax accuracy) and 'log_loss' (computed on the
        softmax of the logits).
    """
    logits = eval_preds.predictions
    labels = eval_preds.label_ids
    probs = torch.from_numpy(logits).float().softmax(-1).numpy()
    # Pass the class list explicitly: a small eval fold can contain a single
    # class, and sklearn cannot infer the label set from y_true in that case.
    loss = log_loss(y_true=labels, y_pred=probs, labels=np.arange(probs.shape[-1]))
    acc = accuracy_score(y_true=labels, y_pred=logits.argmax(-1))
    return {'acc': acc, 'log_loss': loss}

In [28]:
class _LazyFolds:
    """Sequence-like view over the modulo-based train/eval folds.

    Row ``i`` belongs to fold ``i % n_splits``. Each fold is built only when
    it is indexed, instead of materialising all ``n_splits`` pairs of index
    lists up front when only one of them is used.
    """

    def __init__(self, n_rows: int, n_splits: int) -> None:
        self.n_rows = n_rows
        self.n_splits = n_splits

    def __len__(self) -> int:
        return self.n_splits

    def __getitem__(self, fold_idx):
        """Return ``(train_idx, eval_idx)`` as two lists of row indices."""
        if isinstance(fold_idx, slice):
            return [self[k] for k in range(*fold_idx.indices(self.n_splits))]
        if fold_idx < 0:
            # Mirror list semantics for negative indices.
            fold_idx += self.n_splits
        if not 0 <= fold_idx < self.n_splits:
            raise IndexError('fold index out of range')
        train_idx = [i for i in range(self.n_rows) if i % self.n_splits != fold_idx]
        eval_idx = list(range(fold_idx, self.n_rows, self.n_splits))
        return train_idx, eval_idx


# Supports `folds[k]`, `len(folds)` and iteration like the former list of tuples.
folds = _LazyFolds(len(ds), config.n_splits)

In [29]:
# class CustomTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
#         labels = inputs.pop("labels")
#         outputs = model(**inputs)
#         logits = outputs.get("logits")
#         loss = F.binary_cross_entropy_with_logits(logits, labels)
#         if return_outputs:
#             return loss, outputs
#         return loss

In [30]:
# Row indices of the configured fold: every row outside the fold is used for
# training, the fold itself for evaluation.
train_idx, eval_idx = folds[config.fold_idx]

trainer = Trainer(
    args=training_args,
    model=model,
    # `tokenizer=` is deprecated in the pinned transformers release;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    train_dataset=ds.select(train_idx),
    eval_dataset=ds.select(eval_idx),
    compute_metrics=compute_metrics,
    # Pads each batch dynamically using the tokenizer's padding settings.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

In [None]:
# Run fine-tuning; checkpoints are written under `output-{VER}` and metrics are reported to W&B.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mlightsource-[0m ([33mlightsource-unk[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


In [None]:
from huggingface_hub import login, HfApi

# Authenticate with a token from the environment rather than a literal stored
# in the notebook. If HF_TOKEN is unset, `login` receives None and falls back
# to its own interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

api = HfApi()

# Upload the chosen checkpoint folder of this run to the model repository.
api.upload_folder(
    folder_path=f"output-{VER}/checkpoint-3024",  
    repo_id="lightsource/wsdm-top5-adapter-trained-only-on-rev-comp-data-2200-int8-0.XXX-w-tta",
    repo_type="model"
)