## Reward Model using Huggingface RewardTraining lib

Reward Model based on BERT-BASE-UNCASED

Dataset: CarperAI/openai_summarize_comparisons 

In [45]:
import torch 
import random 
import numpy as np 

from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments
from trl import RewardTrainer #, RewardConfig
from peft import PeftModel, PeftConfig, LoraConfig, TaskType

from datasets import load_dataset


In [46]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()

### The model: bert-base-uncased

In [47]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased') 

# Load a tokenizer (change the model name as per your requirements)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Load dataset from HF

In [48]:
# Load the SQuAD dataset
dataset = load_dataset("CarperAI/openai_summarize_comparisons")



# Tokenize the dataset and get the lengths
tokenized_lengths = dataset["train"].map(lambda examples: {'lengths': len(tokenizer(examples['chosen'], add_special_tokens=True)["input_ids"])}, remove_columns=dataset["train"].column_names)
# Fetch max length
max_length = max(tokenized_lengths["lengths"])
print("Max token count:", max_length)



# Shuffle the indices
total_samples = len(dataset["valid2"])
all_indices = list(range(total_samples))
random.shuffle(all_indices)


# Select 'n'' random indices
n_samples = 40000 # With 12500 samples, train at 80% will be 10k
selected_indices = all_indices[:n_samples]

# Get the 'n'' random samples
dataset = dataset["valid2"].select(selected_indices)

len(dataset)


Found cached dataset parquet (C:/Users/juan_/.cache/huggingface/datasets/CarperAI___parquet/CarperAI--openai_summarize_comparisons-79d2c222a15dc8fb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/92534 [00:00<?, ? examples/s]

Max token count: 172


40000

In [49]:
# Split the dataset into train, val, and test
train_percent = 0.8
val_percent = 0.1
# test_percent is implicitly 0.1 since train + val + test = 1.0

train_size = int(train_percent * n_samples)
val_size = int(val_percent * n_samples)
# Remaining samples are for testing

train_dataset = dataset.select(list(range(train_size)))
val_dataset = dataset.select(list(range(train_size, train_size + val_size)))
test_dataset = dataset.select(list(range(train_size + val_size, n_samples)))

The HF RewardTraining util expects a very specific dataset format with 2 features: chosen and rejected. The dataset we are using includes 'prompt' features. Lets drop it:

In [50]:
# Remove 'prompt' column from each dataset
train_dataset = train_dataset.remove_columns(['prompt'])
val_dataset = val_dataset.remove_columns(['prompt'])
#test_dataset = test_dataset.remove_columns(['prompt'])

Each final dataset object should contain two 4 entries:

* input_ids_chosen
* attention_mask_chosen
* input_ids_rejected
* attention_mask_rejected

In [51]:
def process_features(batch):
    # Tokenize 'chosen' feature
    chosen_tokens = tokenizer(batch['chosen'], padding='max_length', truncation=True, max_length=512, return_tensors='np')
    batch['input_ids_chosen'] = chosen_tokens['input_ids']
    batch['attention_mask_chosen'] = chosen_tokens['attention_mask']
    
    # Tokenize 'rejected' feature
    rejected_tokens = tokenizer(batch['rejected'], padding='max_length', truncation=True, max_length=512, return_tensors='np')
    batch['input_ids_rejected'] = rejected_tokens['input_ids']
    batch['attention_mask_rejected'] = rejected_tokens['attention_mask']
    
    return batch

# Apply the function to your datasets
train_dataset = train_dataset.map(process_features, batched=True)
val_dataset = val_dataset.map(process_features, batched=True)
#test_dataset = test_dataset.map(process_features, batched=True)

# Remove original 'chosen' and 'rejected' columns
columns_to_remove = ['chosen', 'rejected']
train_dataset = train_dataset.remove_columns(columns_to_remove)
val_dataset = val_dataset.remove_columns(columns_to_remove)
#test_dataset = test_dataset.remove_columns(columns_to_remove)


Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

### Prepare the training objects

In [52]:
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [55]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.2,
)

In [56]:
output_dir = './model_bert_hf_experiment2' 

training_args = TrainingArguments(
    output_dir=output_dir,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=8,
)

In [57]:
trainer = RewardTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=peft_config,
    compute_metrics=compute_metrics,
    max_length=256,
)




In [58]:
trainer.train()



  0%|          | 0/32000 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


{'loss': 0.6882, 'learning_rate': 4.921875e-05, 'epoch': 0.12}
{'loss': 0.6562, 'learning_rate': 4.8437500000000005e-05, 'epoch': 0.25}
{'loss': 0.6447, 'learning_rate': 4.765625e-05, 'epoch': 0.38}
{'loss': 0.6406, 'learning_rate': 4.6875e-05, 'epoch': 0.5}
{'loss': 0.6336, 'learning_rate': 4.609375e-05, 'epoch': 0.62}
{'loss': 0.6388, 'learning_rate': 4.5312500000000004e-05, 'epoch': 0.75}
{'loss': 0.6351, 'learning_rate': 4.453125e-05, 'epoch': 0.88}
{'loss': 0.6185, 'learning_rate': 4.375e-05, 'epoch': 1.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 0.6250253915786743, 'eval_accuracy': 0.64075, 'eval_runtime': 143.3438, 'eval_samples_per_second': 27.905, 'eval_steps_per_second': 3.488, 'epoch': 1.0}




{'loss': 0.6172, 'learning_rate': 4.2968750000000004e-05, 'epoch': 1.12}
{'loss': 0.6278, 'learning_rate': 4.21875e-05, 'epoch': 1.25}
{'loss': 0.6267, 'learning_rate': 4.140625e-05, 'epoch': 1.38}
{'loss': 0.6274, 'learning_rate': 4.0625000000000005e-05, 'epoch': 1.5}
{'loss': 0.6248, 'learning_rate': 3.984375e-05, 'epoch': 1.62}
{'loss': 0.6078, 'learning_rate': 3.90625e-05, 'epoch': 1.75}
{'loss': 0.6249, 'learning_rate': 3.828125e-05, 'epoch': 1.88}
{'loss': 0.6157, 'learning_rate': 3.7500000000000003e-05, 'epoch': 2.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 0.6143164038658142, 'eval_accuracy': 0.6605, 'eval_runtime': 143.6905, 'eval_samples_per_second': 27.838, 'eval_steps_per_second': 3.48, 'epoch': 2.0}




{'loss': 0.6155, 'learning_rate': 3.671875e-05, 'epoch': 2.12}
{'loss': 0.619, 'learning_rate': 3.59375e-05, 'epoch': 2.25}
{'loss': 0.6107, 'learning_rate': 3.5156250000000004e-05, 'epoch': 2.38}
{'loss': 0.6087, 'learning_rate': 3.4375e-05, 'epoch': 2.5}
{'loss': 0.6066, 'learning_rate': 3.359375e-05, 'epoch': 2.62}
{'loss': 0.6075, 'learning_rate': 3.2812500000000005e-05, 'epoch': 2.75}
{'loss': 0.5972, 'learning_rate': 3.203125e-05, 'epoch': 2.88}
{'loss': 0.6139, 'learning_rate': 3.125e-05, 'epoch': 3.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 0.6110466122627258, 'eval_accuracy': 0.66475, 'eval_runtime': 142.2232, 'eval_samples_per_second': 28.125, 'eval_steps_per_second': 3.516, 'epoch': 3.0}




{'loss': 0.6055, 'learning_rate': 3.0468750000000002e-05, 'epoch': 3.12}
{'loss': 0.6186, 'learning_rate': 2.96875e-05, 'epoch': 3.25}
{'loss': 0.6023, 'learning_rate': 2.890625e-05, 'epoch': 3.38}
{'loss': 0.5963, 'learning_rate': 2.8125000000000003e-05, 'epoch': 3.5}
{'loss': 0.6015, 'learning_rate': 2.734375e-05, 'epoch': 3.62}
{'loss': 0.5982, 'learning_rate': 2.6562500000000002e-05, 'epoch': 3.75}
{'loss': 0.5977, 'learning_rate': 2.578125e-05, 'epoch': 3.88}
{'loss': 0.594, 'learning_rate': 2.5e-05, 'epoch': 4.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 0.612758994102478, 'eval_accuracy': 0.6675, 'eval_runtime': 139.6517, 'eval_samples_per_second': 28.643, 'eval_steps_per_second': 3.58, 'epoch': 4.0}




{'loss': 0.5977, 'learning_rate': 2.4218750000000003e-05, 'epoch': 4.12}
{'loss': 0.597, 'learning_rate': 2.34375e-05, 'epoch': 4.25}
{'loss': 0.5918, 'learning_rate': 2.2656250000000002e-05, 'epoch': 4.38}
{'loss': 0.5948, 'learning_rate': 2.1875e-05, 'epoch': 4.5}
{'loss': 0.5903, 'learning_rate': 2.109375e-05, 'epoch': 4.62}
{'loss': 0.5936, 'learning_rate': 2.0312500000000002e-05, 'epoch': 4.75}
{'loss': 0.5831, 'learning_rate': 1.953125e-05, 'epoch': 4.88}
{'loss': 0.5947, 'learning_rate': 1.8750000000000002e-05, 'epoch': 5.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 0.6098501086235046, 'eval_accuracy': 0.66925, 'eval_runtime': 139.5495, 'eval_samples_per_second': 28.664, 'eval_steps_per_second': 3.583, 'epoch': 5.0}




{'loss': 0.5819, 'learning_rate': 1.796875e-05, 'epoch': 5.12}
{'loss': 0.5912, 'learning_rate': 1.71875e-05, 'epoch': 5.25}
{'loss': 0.582, 'learning_rate': 1.6406250000000002e-05, 'epoch': 5.38}
{'loss': 0.576, 'learning_rate': 1.5625e-05, 'epoch': 5.5}
{'loss': 0.5764, 'learning_rate': 1.484375e-05, 'epoch': 5.62}
{'loss': 0.5853, 'learning_rate': 1.4062500000000001e-05, 'epoch': 5.75}
{'loss': 0.5843, 'learning_rate': 1.3281250000000001e-05, 'epoch': 5.88}
{'loss': 0.5921, 'learning_rate': 1.25e-05, 'epoch': 6.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 0.6071081757545471, 'eval_accuracy': 0.67425, 'eval_runtime': 140.8946, 'eval_samples_per_second': 28.39, 'eval_steps_per_second': 3.549, 'epoch': 6.0}




{'loss': 0.5799, 'learning_rate': 1.171875e-05, 'epoch': 6.12}
{'loss': 0.5794, 'learning_rate': 1.09375e-05, 'epoch': 6.25}
{'loss': 0.5804, 'learning_rate': 1.0156250000000001e-05, 'epoch': 6.38}
{'loss': 0.583, 'learning_rate': 9.375000000000001e-06, 'epoch': 6.5}
{'loss': 0.5792, 'learning_rate': 8.59375e-06, 'epoch': 6.62}
{'loss': 0.5706, 'learning_rate': 7.8125e-06, 'epoch': 6.75}
{'loss': 0.5749, 'learning_rate': 7.031250000000001e-06, 'epoch': 6.88}
{'loss': 0.585, 'learning_rate': 6.25e-06, 'epoch': 7.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 0.6101868748664856, 'eval_accuracy': 0.67275, 'eval_runtime': 139.4081, 'eval_samples_per_second': 28.693, 'eval_steps_per_second': 3.587, 'epoch': 7.0}




{'loss': 0.577, 'learning_rate': 5.46875e-06, 'epoch': 7.12}
{'loss': 0.5773, 'learning_rate': 4.6875000000000004e-06, 'epoch': 7.25}
{'loss': 0.5711, 'learning_rate': 3.90625e-06, 'epoch': 7.38}
{'loss': 0.5765, 'learning_rate': 3.125e-06, 'epoch': 7.5}
{'loss': 0.5833, 'learning_rate': 2.3437500000000002e-06, 'epoch': 7.62}
{'loss': 0.5594, 'learning_rate': 1.5625e-06, 'epoch': 7.75}
{'loss': 0.5822, 'learning_rate': 7.8125e-07, 'epoch': 7.88}
{'loss': 0.5703, 'learning_rate': 0.0, 'epoch': 8.0}


  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 0.6113418936729431, 'eval_accuracy': 0.67275, 'eval_runtime': 140.1212, 'eval_samples_per_second': 28.547, 'eval_steps_per_second': 3.568, 'epoch': 8.0}
{'train_runtime': 20834.5183, 'train_samples_per_second': 12.287, 'train_steps_per_second': 1.536, 'train_loss': 0.600979121208191, 'epoch': 8.0}


TrainOutput(global_step=32000, training_loss=0.600979121208191, metrics={'train_runtime': 20834.5183, 'train_samples_per_second': 12.287, 'train_steps_per_second': 1.536, 'train_loss': 0.600979121208191, 'epoch': 8.0})

In [59]:
# Save the model and tokenizer
model.save_pretrained("./model_bert_hf_experiment2/")
tokenizer.save_pretrained("./model_bert_hf_experiment2/")

('./model_bert_hf_experiment2/tokenizer_config.json',
 './model_bert_hf_experiment2/special_tokens_map.json',
 './model_bert_hf_experiment2/vocab.txt',
 './model_bert_hf_experiment2/added_tokens.json',
 './model_bert_hf_experiment2/tokenizer.json')

### Load model if necessary

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("./model_bert_hf_experiment2/")
tokenizer = AutoTokenizer.from_pretrained("./model_bert_hf_experiment2/")


### Evaluate

In [60]:
# Evaluate the model
results = trainer.evaluate()

# Print metrics
print(results)



  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 0.6113418936729431, 'eval_accuracy': 0.67275, 'eval_runtime': 140.2892, 'eval_samples_per_second': 28.513, 'eval_steps_per_second': 3.564, 'epoch': 8.0}


In [61]:
# Get predictions
predictions, label_ids, _ = trainer.predict(val_dataset)



  0%|          | 0/500 [00:00<?, ?it/s]

In [62]:
# Convert logits to labels
predicted_labels = np.argmax(predictions, axis=1)

# Compute accuracy or any other metric
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(label_ids, predicted_labels)
print("Accuracy:", accuracy)

Accuracy: 0.67275


### Inference

In [69]:
import torch.nn.functional as F

def score_summaries(model, tokenizer, chosen_summary, rejected_summary):
    # Tokenize the inputs
    chosen_tokens = tokenizer(chosen_summary, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
    rejected_tokens = tokenizer(rejected_summary, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
    
    chosen_tokens.to(device)
    rejected_tokens.to(device)
    
    # Get logits from the model
    with torch.no_grad():
        chosen_logits = model(**chosen_tokens).logits
        rejected_logits = model(**rejected_tokens).logits
    
    # Apply softmax to get probabilities
    chosen_probs = F.softmax(chosen_logits, dim=-1)
    rejected_probs = F.softmax(rejected_logits, dim=-1)

    # Assuming the positive class (indicating 'chosen' is good) is the second one
    chosen_score = chosen_probs[0][1].item()
    rejected_score = rejected_probs[0][1].item()
    
    # Extract logits for each summary
    chosen_logit = chosen_logits[0][1].item()
    rejected_logit = rejected_logits[0][1].item()

    return chosen_score, rejected_score, chosen_logit, rejected_logit
    

In [70]:
# Example usage
chosen_summary = "TL;DR: My Girlfriend of 15 months went through my Facebook messages without my permission and found old conversations of me flirting with a girl. She broke up with me and went no contact."
rejected_summary = "TL;DR: My girlfriend and I broke up after she went through my Facebook account without my permission.<|endoftext|>Citizens for the Republic"

In [71]:
chosen_score, rejected_score, chosen_logit, rejected_logit = score_summaries(model, tokenizer, chosen_summary, rejected_summary)

print(f"Chosen Score: {chosen_score:.4f}")
print(f"Rejected Score: {rejected_score:.4f}")

print(f"Chosen Logit: {chosen_logit:.4f}")
print(f"Rejected Logit: {rejected_logit:.4f}")


Chosen Score: 0.4981
Rejected Score: 0.5131
Chosen Logit: 0.0200
Rejected Logit: -1.3241


In [None]:
test_dataset 

In [68]:

def evaluate_on_test_samples(model, tokenizer, test_data, n):
    results = []
    for i in range(n):
        chosen_summary = test_data['chosen'][i]
        rejected_summary = test_data['rejected'][i]
        
        chosen_score, rejected_score, chosen_logit, rejected_logit = score_summaries(model, tokenizer, chosen_summary, rejected_summary)
        results.append({
            'chosen_summary': chosen_summary,
            'rejected_summary': rejected_summary,
            'chosen_score': chosen_score,
            'rejected_score': rejected_score,
            'chosen_logit': chosen_logit,
            'rejected_logit': rejected_logit
        })
    return results

# Run the evaluation on top 'n' samples
n = 20  # or any other number up to 2500
results = evaluate_on_test_samples(model, tokenizer, test_dataset, n)

# Print results
for i, result in enumerate(results, 1):
    print(f"Sample {i} - Chosen Logit: {result['chosen_logit']:.4f} | Rejected Logit: {result['rejected_logit']:.4f}")
    #print(f"Sample {i} - Chosen Score: {result['chosen_score']:.4f} | Chosen Logit: {result['chosen_logit']:.4f} - Rejected Score: {result['rejected_score']:.4f} | Rejected Logit: {result['rejected_logit']:.4f}")
    #print(f"Sample {i} - Chosen Summary: {result['chosen_summary']} - Score: {result['chosen_score']:.4f} | Logit: {result['chosen_logit']:.4f}")
    #print(f"Chosen Summary: {result['chosen_summary']}")
    #print(f"Chosen Score: {result['chosen_score']:.4f} | Logit: {result['chosen_logit']:.4f}")
    #print(f"Rejected Summary: {result['rejected_summary']} - Rejected Score: {result['rejected_score']:.4f} | Logit: {result['rejected_logit']:.4f}")
    #print(f"Rejected Score: {result['rejected_score']:.4f} | Logit: {result['rejected_logit']:.4f}")
    #print("-" * 50)


Sample 1 - Chosen Logit: 0.7876 | Rejected Logit: 0.1472
Sample 2 - Chosen Logit: 0.7826 | Rejected Logit: 0.6687
Sample 3 - Chosen Logit: 0.5691 | Rejected Logit: 0.2868
Sample 4 - Chosen Logit: 1.5987 | Rejected Logit: 1.3197
Sample 5 - Chosen Logit: 0.5945 | Rejected Logit: 0.2571
Sample 6 - Chosen Logit: -1.4813 | Rejected Logit: -0.6219
Sample 7 - Chosen Logit: 0.2960 | Rejected Logit: -3.3139
Sample 8 - Chosen Logit: 0.0979 | Rejected Logit: 0.4782
Sample 9 - Chosen Logit: 0.4443 | Rejected Logit: -0.7533
Sample 10 - Chosen Logit: -0.0003 | Rejected Logit: 0.7359
Sample 11 - Chosen Logit: -0.0742 | Rejected Logit: 0.8884
Sample 12 - Chosen Logit: -0.6051 | Rejected Logit: 1.3530
Sample 13 - Chosen Logit: 0.1496 | Rejected Logit: 1.0381
Sample 14 - Chosen Logit: 1.6363 | Rejected Logit: -0.3961
Sample 15 - Chosen Logit: -1.4088 | Rejected Logit: -0.3917
Sample 16 - Chosen Logit: -0.3622 | Rejected Logit: -1.1410
Sample 17 - Chosen Logit: 0.4717 | Rejected Logit: -2.4015
Sample 18 