In [None]:
import numpy as np
import torch
import random

def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and CUDA) with `seed`.

    The cuDNN flags (`torch.backends.cudnn.deterministic` / `benchmark`) are
    not modified by this function.
    """
    # Host-side generators.
    random.seed(seed)
    np.random.seed(seed)
    # PyTorch generators: CPU, the current CUDA device, then every CUDA device
    # (the last one matters for multi-GPU runs).
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

# Seed every RNG once, up front, so the cells below start from a fixed random state.
set_seed(42)

In [None]:
import shutil
# Helper modules that ship as Kaggle input datasets. They are copied next to
# the notebook (presumably its working directory) so that later cells can
# `import modeling_custom`.
src_paths = [
    r"/kaggle/input/armorm-llama3-8b-v0-1-mdl-custom/modeling_custom.py",
    r"/kaggle/input/lmsys-ddp-for-armor-llama/lmsys_ddp.py",
]
dst_path = r"/kaggle/working/"
for src_path in src_paths:
    shutil.copy(src_path, dst_path)

In [None]:
import pandas as pd
import numpy as np
import ast
import re
from typing import List

In [None]:
# --- Preference head (DualInputInteractionNetwork) ---
# Width of the head's first Linear layer; presumably the number of ArmoRM
# reward objectives — TODO confirm against the checkpoint.
INPUT_SIZE = 19
HIDDEN_SIZE = 128
# Number of output classes; presumably "A wins" / "B wins" / "tie" — TODO confirm.
NUM_CLASSES = 3
# --- Training loop ---
# Examples per DataLoader batch; the collator emits two sequences per example.
BATCH_SIZE = 8
# NOTE(review): not referenced by any code visible in this notebook.
PER_DEVICE_TRAIN_BATCH_SIZE = PER_DEVICE_EVAL_BATCH_SIZE = 1
NUM_EPOCH = 1
GRADIENT_ACCUMULATION_STEPS = 8
MODEL_CKPT = "RLHFlow/ArmoRM-Llama3-8B-v0.1"
LEARNING_RATE = 1e-4
# Both responses are truncated to this many token ids before training.
MAX_LENGTH = 1536
# --- LoRA adapter hyperparameters (TODO: revisit these choices) ---
LORA_R = 4
LORA_ALPHA = LORA_R * 2  # alpha is tied to the rank: always twice LORA_R
LORA_DROPOUT = 0.05
LORA_BIAS = 'none'

In [None]:
# Load the preprocessed training frame; the cells below read its `input_ids_a`,
# `input_ids_b` and `label` columns.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
train_df = pd.read_pickle('/kaggle/input/lmsys-preprocessed-data/preprocessed_data.pkl')
def truncate(df, max_length):
    """Clip the token-id sequences of both responses to at most `max_length` items.

    Args:
        df: DataFrame with `input_ids_a` and `input_ids_b` columns whose cells
            are sliceable sequences.
        max_length: Number of leading items to keep in every cell.

    Returns:
        The same `df` object; the two columns are overwritten in place.
    """
    for col in ('input_ids_a', 'input_ids_b'):
        # Slicing already clamps to the sequence length, so no explicit
        # `min(len, max_length)` is needed. A column-wise `map` avoids the
        # row-wise `DataFrame.apply`, which is slower and returns a DataFrame
        # (not a Series) when the frame is empty.
        df[col] = df[col].map(lambda ids: ids[:max_length])

    return df

# Cap both responses at MAX_LENGTH token ids before the filtering below.
train_df = truncate(train_df, MAX_LENGTH)

In [None]:
# Default token-id pattern searched for by `find_token_for_gating`.
# Presumably the Llama-3 chat-template sequence that opens the assistant turn
# — TODO confirm against the tokenizer.
token_pattern = [128009, 128006, 78191, 128007, 271]


def find_token_for_gating(lst, pattern=None):
    """Return the start index of the last occurrence of `pattern` in `lst`.

    Args:
        lst: Sequence of token ids (list, tuple or 1-D array).
        pattern: Token ids to look for; defaults to the module-level
            `token_pattern`.

    Returns:
        The index at which the last match starts, or -1 when `pattern` does
        not occur in `lst`.
    """
    if pattern is None:
        pattern = token_pattern
    # Compare as lists so tuples / arrays behave the same as plain lists
    # (an array slice compared with `==` would yield an ambiguous array).
    pattern = list(pattern)
    pattern_len = len(pattern)
    # Scan right-to-left so the first hit is the last occurrence.
    for start in range(len(lst) - pattern_len, -1, -1):
        if list(lst[start:start + pattern_len]) == pattern:
            return start
    return -1

In [None]:
# Collect the rows whose `input_ids_a` does not contain the gating token
# pattern (presumably because the prompt alone exceeds MAX_LENGTH and the
# pattern was truncated away — TODO confirm).
# Index *labels* are collected, not positions, because the rows are removed
# with `train_df.drop(index=exp)`, which matches labels.
exp = [
    row_label
    for row_label, token_ids in train_df['input_ids_a'].items()
    if find_token_for_gating(token_ids) < 0
]

In [None]:
# Remove the rows collected in `exp` from the training data.
# NOTE(review): `drop(index=...)` matches index labels, so every entry of
# `exp` must be a label of `train_df.index`. Re-running this cell with a
# stale `exp` raises KeyError because the labels are already gone.
train_df = train_df.drop(index=exp)

In [None]:
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForSequenceClassification
from functools import partial
import torch

# Tokenizer of the reward-model checkpoint; used below by the data collator to pad batches.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)

### Prepare datasets

In [None]:
from datasets import Dataset, DatasetDict
train_df_cop = train_df.copy()
# Keep only the columns the collator reads. `preserve_index=False` stops the
# non-default pandas index (rows were dropped above) from being stored as an
# extra `__index_level_0__` column.
dataset = Dataset.from_pandas(
    train_df_cop[['input_ids_a', 'input_ids_b', 'label']],
    preserve_index=False,
)
dataset.set_format(type="torch")
shuffled_dataset = dataset.shuffle(seed=42)
# Seed the split too; otherwise it depends on the ambient NumPy RNG state and
# the train/validation membership changes with cell execution order.
train_test_split = shuffled_dataset.train_test_split(test_size=0.2, seed=42)

# Expose the two splits under the names the training loop looks up.
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test']
})

# Build a network for model comparison

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from modeling_custom import LlamaForRewardModelWithGating

class SwiGLU(nn.Module):
    """Gated activation: halve the last dimension and gate one half with SiLU of the other."""

    def forward(self, x):
        # First half carries the values, second half the gate pre-activations.
        values, gate_logits = x.chunk(2, dim=-1)
        gate = F.silu(gate_logits)
        return gate * values

class DualInputInteractionNetwork(nn.Module):
    """Classification head that compares the reward vectors of two responses.

    The batch is expected to interleave the two responses of each example:
    even rows belong to response A, odd rows to response B.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        """Build the head.

        Args:
            input_size: Width of the per-response reward vector.
            hidden_size: Width of the two hidden Linear layers.
            num_classes: Number of output logits.
        """
        super(DualInputInteractionNetwork, self).__init__()
        # NOTE(review): the three Linear layers have no activation between
        # them, so together they are a single affine map. Confirm whether a
        # non-linearity (e.g. SiLU / SwiGLU) was intended.
        self.classification_head = nn.Sequential(nn.Linear(input_size, hidden_size),
                                                 nn.Linear(hidden_size, hidden_size),
                                                 nn.Linear(hidden_size, num_classes)  # Combining interaction features
                                                )

    @staticmethod
    def SwiGLU(x):
        """Split the last dimension in two and gate the first half with SiLU of the second."""
        x, gate = x.chunk(2, dim=-1)
        return F.silu(gate) * x

    def forward(self, inputs, reward_transform_matrix: torch.Tensor):
        """Score each (A, B) pair in the batch.

        Args:
            inputs: Object exposing `rewards` and `gating_output` tensors as
                attributes, with 2 * n rows (A/B interleaved).
            reward_transform_matrix: Matrix whose transpose maps
                `gating_output` rows to per-objective coefficients with the
                same width as `rewards`.

        Returns:
            Logits with one row per pair.

        Raises:
            ValueError: If the batch does not hold an even number of rows.
        """
        rewards = inputs.rewards
        gating_output = inputs.gating_output

        batch_size = rewards.shape[0]
        if batch_size % 2 != 0:
            raise ValueError(
                f"expected an even number of rows (A/B interleaved), got {batch_size}"
            )

        # Strided slices select the A (even) and B (odd) rows without
        # building index tensors on the CPU.
        multi_obj_rewards_a = rewards[0::2]
        multi_obj_rewards_b = rewards[1::2]

        multi_obj_coeffs_a = gating_output[0::2] @ reward_transform_matrix.T
        multi_obj_coeffs_b = gating_output[1::2] @ reward_transform_matrix.T

        scaled_mul_obj_rewards_a = multi_obj_rewards_a * multi_obj_coeffs_a
        scaled_mul_obj_rewards_b = multi_obj_rewards_b * multi_obj_coeffs_b

        # The head sees only the difference between the two gated reward vectors.
        return self.classification_head(scaled_mul_obj_rewards_a - scaled_mul_obj_rewards_b)

In [None]:
from typing import Optional

class LLaMaPreferencePredictionModel(nn.Module):
    """Reward model plus a preference head that classifies (A, B) response pairs.

    The wrapped reward model is loaded with `LlamaForRewardModelWithGating`
    and can optionally be wrapped with LoRA adapters; its output is fed to a
    `DualInputInteractionNetwork`.
    """
    
    def __init__(self, reward_model_ckpt, reward_model_quant_config: BitsAndBytesConfig, 
                 lora_config: Optional[dict] = None, 
                 **dual_input_interaction_kwargs):
        """Load the reward model and build the preference head.

        Args:
            reward_model_ckpt: Checkpoint identifier passed to `from_pretrained`.
            reward_model_quant_config: Quantization config passed as
                `quantization_config` when loading the reward model.
            lora_config: Optional keyword arguments for `lora_setup`; when
                None, no LoRA adapters are added.
            **dual_input_interaction_kwargs: Forwarded unchanged to
                `DualInputInteractionNetwork`.

        Note: the original author remarks that Gemma variants of ArmoRM also
        exist; only the Llama variant is handled here.
        """
        
        super().__init__()
        self.reward_model = LlamaForRewardModelWithGating.from_pretrained(reward_model_ckpt, 
                                                                          quantization_config=reward_model_quant_config
                                                                                            )
        
        self.preference_prediction_model = DualInputInteractionNetwork(**dual_input_interaction_kwargs)
        
        # Re-register the reward model's transform matrix on this module so it
        # is reachable as `self.reward_transform_matrix` in `forward`.
        self.register_buffer('reward_transform_matrix', self.reward_model.reward_transform_matrix)
        
        if lora_config is not None:
            self.lora_setup(**lora_config)
            
    def lora_config(self, **lora_config):
        """Build a `LoraConfig` from `lora_config` and store it on the instance.

        The adapters target the `q_proj`, `k_proj` and `v_proj` modules with
        task type `SEQ_CLS`.

        NOTE(review): the result is assigned to `self.lora_config`, i.e. the
        same name as this method. After the first call the instance attribute
        shadows the method, so it cannot be called a second time on the same
        instance.
        """
        
        # Imported lazily so `peft` is only required when LoRA is requested.
        from peft import LoraConfig, TaskType
        self.lora_config = LoraConfig(**lora_config,
                                target_modules=["q_proj", "k_proj", "v_proj"],
                                task_type=TaskType.SEQ_CLS)
        
    def lora_setup(self, **lora_config):
        """Prepare the quantized reward model for training and wrap it with LoRA adapters.

        Must run while `self.lora_config` is still the method (see the note on
        `lora_config`): the call below replaces it with the config object,
        which is then handed to `get_peft_model`.
        """
        
        from peft import get_peft_model, prepare_model_for_kbit_training
        self.lora_config(**lora_config)
        
        self.reward_model = prepare_model_for_kbit_training(self.reward_model)
        self.reward_model = get_peft_model(self.reward_model, self.lora_config)
    
        
    def forward(self, input_ids, attention_mask, label=None):
        """Run the reward model and the preference head on one batch.

        Args:
            input_ids: Token ids, passed positionally to the reward model.
            attention_mask: Attention mask, passed positionally to the reward model.
            label: Optional targets; not used for any computation, only
                echoed back in the result.

        Returns:
            Dict with `reward_model_output` and `logits`, plus `label` when
            one was given.
        """
        # Only `input_ids` and `attention_mask` are forwarded to the reward
        # model; other batch fields (such as `label`) are kept out of that call.
        
        reward_model_output = self.reward_model(input_ids, attention_mask)
        logits = self.preference_prediction_model(reward_model_output, self.reward_transform_matrix)
        if label is None:
            return {"reward_model_output": reward_model_output,
                    "logits": logits}
        
        return {"reward_model_output": reward_model_output,
                "logits": logits,
                "label": label}

### Create a custom data collator & trainer for this use case

In [None]:
from transformers import Trainer
from transformers import DataCollatorWithPadding
from dataclasses import dataclass
from typing import Union, Dict, Any
from transformers.tokenization_utils_base import PaddingStrategy

@dataclass
class RewardDataCollatorWithPadding:
    """Collate (A, B) response pairs into one padded, interleaved batch.

    For n input features the batch holds 2 * n sequences: row 2 * i is
    `input_ids_a` and row 2 * i + 1 is `input_ids_b` of feature i. `label`
    keeps one entry per feature.
    """
    # Tokenizer whose `pad` method is used to pad the merged sequences.
    tokenizer: AutoTokenizer
    # The remaining fields are forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Pad and interleave `features`.

        Args:
            features: Items with `input_ids_a`, `input_ids_b` and `label` keys.

        Returns:
            Dict with `input_ids`, `attention_mask` and `label` entries.
        """
        # Interleave A and B so each pair sits on adjacent rows.
        merged_features = [
            {"input_ids": feature[key]}
            for feature in features
            for key in ("input_ids_a", "input_ids_b")
        ]
        batch = self.tokenizer.pad(
            merged_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )
        # Accept labels that are tensors / NumPy scalars (`.item()`) as well
        # as plain Python numbers, so the collator does not depend on the
        # dataset's output format.
        labels = [
            feature["label"].item() if hasattr(feature["label"], "item") else feature["label"]
            for feature in features
        ]

        return {
            "input_ids": batch["input_ids"],
            "attention_mask": batch["attention_mask"],
            "label": torch.tensor(labels),
        }
                    
class RewardTrainer(Trainer):
    """`Trainer` whose loss is the cross-entropy between the model's `logits` and `label` outputs."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the preference-classification loss for one batch.

        Args:
            model: Model returning a dict with `logits` and `label`.
            inputs: Batch with `input_ids`, `attention_mask` and `label`.
            return_outputs: When True, return `(loss, outputs)` as `Trainer`
                expects during evaluation; otherwise return only the loss.
            **kwargs: Extra keywords that newer `Trainer` versions pass
                (e.g. `num_items_in_batch`); accepted and ignored.
        """
        outputs = model(
            input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], label=inputs['label']
        )
        # `cross_entropy` already averages over the batch by default.
        loss = nn.functional.cross_entropy(outputs['logits'], outputs['label'])

        return (loss, outputs) if return_outputs else loss

def compute_metrics(pred):
    """Compute the mean cross-entropy of the predictions.

    Args:
        pred: Pair `(logits, labels)`; `logits` has one row of class scores
            per example and `labels` holds the integer class ids.

    Returns:
        Dict with a single `cross_entropy_loss` float.
    """
    logits, labels = pred

    # `as_tensor` converts NumPy arrays without copying and passes tensors through.
    logits_tensor = torch.as_tensor(logits)
    labels_tensor = torch.as_tensor(labels)

    cross_entropy_loss = F.cross_entropy(logits_tensor, labels_tensor).item()

    return {
        "cross_entropy_loss": cross_entropy_loss
    }

In [None]:
# from transformers import BitsAndBytesConfig, AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from modeling_custom import LlamaForRewardModelWithGating
import torch

# 4-bit NF4 quantization settings for loading the reward model.
# NOTE(review): the compute dtype here is float16 while the training launch
# requests "bf16" mixed precision — confirm the two are meant to differ.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    # Also quantize the quantization factors (saves roughly 0.4 bit/parameter).
    bnb_4bit_use_double_quant=True,
    # Dtype used for computation on the de-quantized weights.
    bnb_4bit_compute_dtype=torch.float16,
)

# Keyword arguments for the LoRA setup of the reward model.
lora_config = dict(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias=LORA_BIAS,
)

### Multi-GPU training

In [None]:
from torch.utils.data import DataLoader, TensorDataset, DistributedSampler
from tqdm import tqdm
from accelerate import Accelerator
from torch.optim import AdamW
from transformers import get_scheduler

def create_dataloaders(dataset_dict: DatasetDict, data_collator, batch_size: int = 4):
    """Return `(train_dataloader, eval_dataloader)` for the 'train' and 'validation' splits.

    Both loaders share `batch_size` and `data_collator`; neither shuffles.
    """
    loader_kwargs = {"batch_size": batch_size, "collate_fn": data_collator}
    return (
        DataLoader(dataset_dict['train'], **loader_kwargs),
        DataLoader(dataset_dict['validation'], **loader_kwargs),
    )

def train_one_epoch(model, dataloader, optimizer, criterion, accelerator, lr_scheduler, epoch):
    """Run one training pass over `dataloader` with gradient accumulation.

    Args:
        model: Model returning a dict with a `logits` entry.
        dataloader: Yields batches with `input_ids`, `attention_mask`, `label`.
        optimizer: Optimizer, stepped inside `accelerator.accumulate`.
        criterion: Loss called as `criterion(logits, labels)`.
        accelerator: `Accelerator` driving accumulation, backward and logging.
        lr_scheduler: Scheduler, stepped alongside the optimizer.
        epoch: Zero-based epoch index (reported one-based).

    Logs `train_loss` (mean over the micro-batches since the previous log)
    whenever gradients are synchronized, i.e. once per accumulation window
    as configured on `accelerator`, including a final partial window.
    """
    accelerator.print("Training on one epoch")
    model.train()
    epoch_loss = 0.
    window_loss, window_batches = 0., 0
    # Show a single progress bar per node instead of one per process.
    progress = tqdm(dataloader, desc=f'Epoch {epoch+1}',
                    disable=not accelerator.is_local_main_process)
    for batch in progress:
        with accelerator.accumulate(model):
            # Forward pass
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch["attention_mask"],
                label=batch['label']
            )
            loss = criterion(outputs['logits'], batch['label'])

            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        batch_loss = loss.item()
        epoch_loss += batch_loss
        window_loss += batch_loss
        window_batches += 1

        # `sync_gradients` reflects the accumulation setting of the
        # accelerator itself, so logging cannot drift from the real update
        # cadence the way a module-level constant could.
        if accelerator.sync_gradients:
            accelerator.log({"train_loss": window_loss / window_batches, "epoch": epoch + 1})
            window_loss, window_batches = 0., 0

    num_batches = len(dataloader)
    mean_loss = epoch_loss / num_batches if num_batches else float("nan")
    accelerator.print(f"Epoch {epoch+1}, Loss: {mean_loss}")
    
def eval_one_epoch(model, dataloader, optimizer, criterion, accelerator, lr_scheduler, epoch):
    """Evaluate `model` on `dataloader` and log the mean loss.

    Args:
        model: Model returning a dict with a `logits` entry.
        dataloader: Yields batches with `input_ids`, `attention_mask`, `label`.
        optimizer: Unused; kept so the signature mirrors `train_one_epoch`.
        criterion: Loss called as `criterion(logits, labels)`.
        accelerator: `Accelerator` used for logging and printing.
        lr_scheduler: Unused; kept so the signature mirrors `train_one_epoch`.
        epoch: Zero-based epoch index (reported one-based).

    The reported loss is the mean over the batches seen by the calling
    process; it is not gathered across processes.
    """
    model.eval()
    epoch_loss = 0.

    # No gradients are needed for evaluation; skipping the autograd graph
    # keeps activation memory from piling up.
    with torch.no_grad():
        for batch in tqdm(dataloader, disable=not accelerator.is_local_main_process):
            # Forward pass
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch["attention_mask"],
                label=batch['label']
            )
            loss = criterion(outputs['logits'], batch['label'])
            epoch_loss += loss.item()

    num_batches = len(dataloader)
    mean_loss = epoch_loss / num_batches if num_batches else float("nan")

    accelerator.log({"epoch_loss": mean_loss, "epoch": epoch + 1})

    accelerator.print(f"Epoch {epoch+1}, Loss: {mean_loss}")
    

def main_train_loop(model, dataset_dict, data_collator, mixed_precision, batch_size, gradient_accumulation_steps, num_epoch, learning_rate):
    """Entry point executed in every process started by `notebook_launcher`.

    Args:
        model: Model to train, or None to build a fresh
            `LLaMaPreferencePredictionModel` inside this process (the usual
            case, since the model should be created after the process starts).
        dataset_dict: Mapping with 'train' and 'validation' splits.
        data_collator: Collate function for both dataloaders.
        mixed_precision: Mixed-precision mode passed to `Accelerator`.
        batch_size: Per-process DataLoader batch size.
        gradient_accumulation_steps: Micro-batches per optimizer update.
        num_epoch: Number of training epochs.
        learning_rate: AdamW learning rate.
    """
    import math
    import subprocess
    import sys

    WARMUP_RATIO = 0.03  # fraction of the scheduler steps spent warming up

    accelerator = Accelerator(mixed_precision=mixed_precision,
                              gradient_accumulation_steps=gradient_accumulation_steps,
                              log_with="wandb"
                             )

    # Upgrade `peft` once per node instead of concurrently in every process,
    # using the interpreter that runs this kernel. Best effort, like the
    # original shell escape: a failed install does not abort training.
    if accelerator.is_local_main_process:
        subprocess.run([sys.executable, "-m", "pip", "install", "peft", "-U", "-qq"], check=False)
    accelerator.wait_for_everyone()

    if model is None:
        model = LLaMaPreferencePredictionModel(reward_model_ckpt=MODEL_CKPT,
                                               reward_model_quant_config=nf4_config,
                                               lora_config=lora_config,
                                               input_size=INPUT_SIZE,
                                               hidden_size=HIDDEN_SIZE,
                                               num_classes=NUM_CLASSES)

    if accelerator.is_local_main_process:
        accelerator.init_trackers(
        project_name="distributed_training",
        init_kwargs={"wandb": {"entity": "ashton_h", "name": "armor-1-epoch"}}
        )

    train_dataloader, eval_dataloader = create_dataloaders(dataset_dict, data_collator, batch_size)

    def print_trainable_parameters(model):
        """
        Prints the number of trainable parameters in the model.
        """
        trainable_params = 0
        all_param = 0
        for _, param in model.named_parameters():
            all_param += param.numel()
            if param.requires_grad:
                trainable_params += param.numel()
        accelerator.print(
            f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
        )

    # print the number of trainable parameters
    print_trainable_parameters(model)

    # Frozen parameters never receive gradients, so only hand the trainable
    # ones to the optimizer.
    trainable_parameters = [param for param in model.parameters() if param.requires_grad]
    optimizer = AdamW(trainable_parameters, lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # The prepared scheduler only advances on real optimizer updates, and then
    # by `num_processes` steps at once. `len(train_dataloader)` is still the
    # unsharded length here, so derive the schedule length from the number of
    # updates each process will actually perform.
    batches_per_process = math.ceil(len(train_dataloader) / accelerator.num_processes)
    updates_per_epoch = math.ceil(batches_per_process / gradient_accumulation_steps)
    num_training_steps = updates_per_epoch * num_epoch * accelerator.num_processes
    # `num_warmup_steps` is a step count, not a ratio.
    num_warmup_steps = int(WARMUP_RATIO * num_training_steps)
    lr_scheduler = get_scheduler(
        "cosine", optimizer=optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
    )

    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader, lr_scheduler)

    for epoch in range(num_epoch):
        train_one_epoch(model, train_dataloader, optimizer, criterion, accelerator, lr_scheduler, epoch)
        try:
            eval_one_epoch(model, eval_dataloader, optimizer, criterion, accelerator, lr_scheduler, epoch)
        except Exception as e:
            # Evaluation stays best-effort so a failure here does not discard
            # the training run, but say which process and epoch failed.
            print(f"[process {accelerator.process_index}] evaluation failed after epoch {epoch + 1}: {e!r}")

    if accelerator.is_local_main_process:
        accelerator.end_training()


In [None]:
# Pad each batch only up to its own longest sequence.
data_collator = RewardDataCollatorWithPadding(tokenizer, padding='longest')

In [None]:
import wandb
# NOTE(review): `main_train_loop` also sets up wandb tracking through
# `accelerator.init_trackers`; confirm this extra run in the parent process is intended.
wandb.init()

In [None]:
from accelerate import notebook_launcher
# `main_train_loop` takes `model` as its first positional parameter. The model
# is built inside each launched process, so pass None as a placeholder —
# without it every argument shifts by one and the call fails for a missing
# `learning_rate`.
args = (None, dataset_dict, data_collator, "bf16", BATCH_SIZE, GRADIENT_ACCUMULATION_STEPS, NUM_EPOCH, LEARNING_RATE)
notebook_launcher(main_train_loop, args, num_processes=2)

In [None]:
# Close the wandb run opened with `wandb.init()` above.
wandb.finish()