#  **Reward Modeling**

In [1]:
# Uncomment the lines below on a fresh environment to install the dependencies.
#!pip install tyro
#!pip install transformers
#!pip install torch
#!pip install datasets
#!pip install accelerate
#!pip install peft
#!pip install trl

## Import necessary modules and packages

In [2]:
# ML tasks
import torch
import tyro

# Data manipulation
import pandas as pd

# Data handling and modeling
from sklearn.model_selection import train_test_split
from dataclasses import dataclass, field
from typing import Optional
from __future__ import annotations
from accelerate import Accelerator


# TRL library for RL
from trl.core import LengthSampler
from trl import (
    RewardConfig,
    RewardTrainer,
    is_xpu_available,
    AutoModelForCausalLMWithValueHead,
    PPOConfig,
    PPOTrainer
)

# Libraries for NLP
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainerCallback,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    GPT2Tokenizer,
    pipeline
)

## Reward Model

In [3]:
def _default_reward_config():
    """Build the RewardConfig used when ScriptArguments is created without one."""
    return RewardConfig(
        output_dir="output",
        # Keep the per-device batch small so training fits on limited hardware
        per_device_train_batch_size=8,
        num_train_epochs=1,
        gradient_accumulation_steps=16,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        learning_rate=1.41e-5,
        report_to="tensorboard",
        remove_unused_columns=False,
        optim="adamw_torch",
        logging_steps=500,
        evaluation_strategy="no",
        max_length=512,
    )


@dataclass
class ScriptArguments:
    """Settings for the reward-modeling run: model, dataset, precision flags and trainer config."""

    model_name: str = "facebook/opt-350m"
    """the model name"""
    dataset_name: str = "Anthropic/hh-rlhf"
    """the dataset name"""
    dataset_text_field: str = "text"
    """the text field of the dataset"""
    eval_split: str = "none"
    """the dataset split to evaluate on; default to 'none' (no evaluation)"""
    load_in_8bit: bool = False
    """load the model in 8 bits precision"""
    load_in_4bit: bool = False
    """load the model in 4 bits precision"""
    trust_remote_code: bool = True
    """Enable `trust_remote_code`"""
    # A factory is used so every instance gets its own RewardConfig object
    reward_config: RewardConfig = field(default_factory=_default_reward_config)


# Resolve the run configuration once so the model, tokenizer and dataset below
# are all driven by the same ScriptArguments values.
script_args = ScriptArguments()

# Build a quantization config only when 8-bit or 4-bit loading is requested,
# and hand it to the model loader so the flags actually take effect.
if script_args.load_in_8bit or script_args.load_in_4bit:
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=script_args.load_in_8bit,
        load_in_4bit=script_args.load_in_4bit,
    )
else:
    quantization_config = None

# Step 1: Load the model
# num_labels=1: the classification head emits one score per sequence.
model_kwargs = {"num_labels": 1}
if quantization_config is not None:
    model_kwargs["quantization_config"] = quantization_config
# NOTE(review): script_args.trust_remote_code is not forwarded here, matching
# the existing call; confirm before enabling it for models that ship custom code.
model = AutoModelForSequenceClassification.from_pretrained(
    script_args.model_name,
    **model_kwargs,
)


# Step 2: Load the dataset and pre-process it
# The tokenizer is loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(script_args.model_name)
# Only the first 1% of the train split is used so training stays feasible
train_dataset = load_dataset(script_args.dataset_name, split="train[:1%]")


# Tokenize chosen/rejected pairs of inputs
# Adapt this section to your needs for custom datasets
def preprocess_function(examples):
    """Tokenize a batch of chosen/rejected text pairs.

    Args:
        examples: Mapping with parallel "chosen" and "rejected" sequences of text.

    Returns:
        Dict with four lists, one entry per pair: the tokenizer's "input_ids"
        and "attention_mask" for the chosen text and for the rejected text.
    """
    ids_chosen, mask_chosen = [], []
    ids_rejected, mask_rejected = [], []

    for chosen_text, rejected_text in zip(examples["chosen"], examples["rejected"]):
        # Each side is tokenized on its own; `tokenizer` is the module-level tokenizer
        encoded_chosen = tokenizer(chosen_text)
        encoded_rejected = tokenizer(rejected_text)

        ids_chosen.append(encoded_chosen["input_ids"])
        mask_chosen.append(encoded_chosen["attention_mask"])
        ids_rejected.append(encoded_rejected["input_ids"])
        mask_rejected.append(encoded_rejected["attention_mask"])

    return {
        "input_ids_chosen": ids_chosen,
        "attention_mask_chosen": mask_chosen,
        "input_ids_rejected": ids_rejected,
        "attention_mask_rejected": mask_rejected,
    }


# Reuse the RewardConfig defaults declared on ScriptArguments rather than
# repeating the same values here, so the length filter and the trainer cannot drift apart.
reward_config = ScriptArguments().reward_config
max_length = reward_config.max_length

# Step 3: Tokenize the dataset, then drop pairs where either side is longer
# than reward_config.max_length
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
)

train_dataset = train_dataset.filter(
    lambda x: len(x["input_ids_chosen"]) <= max_length
    and len(x["input_ids_rejected"]) <= max_length
)


# Step 4: Define the LoraConfig
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    bias="none",
    task_type="SEQ_CLS",
    # The classification head of this model is named `score` (the load-time
    # warning lists `score.weight` as newly initialized), so that is the
    # module to keep trainable and save alongside the adapter.
    modules_to_save=["score"],
)

# No evaluation set is used in this run
eval_dataset = None


# Step 5: Define the Trainer
trainer = RewardTrainer(
    model=model,
    tokenizer=tokenizer,
    args=reward_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
)



Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training Reward Model

In [4]:
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss


TrainOutput(global_step=12, training_loss=0.9080644448598226, metrics={'train_runtime': 599.6848, 'train_samples_per_second': 2.605, 'train_steps_per_second': 0.02, 'total_flos': 0.0, 'train_loss': 0.9080644448598226, 'epoch': 0.98})