## Imports

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
import anthropic

In [2]:
# Set environment variables before importing anything from Hugging Face
hf_home = "/iopsstor/scratch/cscs/nevali/hf_home"
os.environ["HF_HOME"] = hf_home
os.environ["TRANSFORMERS_CACHE"] = hf_home
os.environ["HF_DATASETS_CACHE"] = hf_home
os.environ["HF_METRICS_CACHE"] = hf_home

# Only create if it doesn't exist
if not os.path.exists(hf_home):
    os.makedirs(hf_home, exist_ok=True)

# Now import Hugging Face libraries
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Generative model used for the responses that will hack ArmoRM

In [3]:
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
generator_tokenizer = AutoTokenizer.from_pretrained(model_name)
generator_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

# Ensure pad token is set if not present (though less critical for single sequence generation)
if generator_tokenizer.pad_token is None:
    generator_tokenizer.pad_token = generator_tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The experiment involves iteratively rewriting responses from a generative LLM microsoft/Phi-4-mini-instruct based on a predefined heuristic. These responses are evaluated at each step by two distinct RMs: RLHFlow/ArmoRM-Llama3-8B-v0.1 (RM1) as the target for hacking, and Skywork/Skywork-Critic-Llama-3.1-8B (RM2) as a comparative model. An initial prompt is sourced from the Magpie-Align/Magpie-Air-DPO-100K-v0.1 dataset. The primary finding illustrates that the chosen heuristic successfully inflates scores from RM1, while RM2's scores exhibit significantly less increase or remain stagnant, providing clear evidence of reward hacking. This divergence underscores the risks of relying on single RMs and highlights the need for more robust evaluation strategies.
The initial heuristic is chosen using the best programs found by funsearch run on RM1.

The analysis is outlined by the following graph:

![Graph](./images/image.png)

baseline means that we want to be sure that the way we are trying to hack the RM is better than naive ways, we can try these two:

1. generator-model prompt based, so prompt the generator model to regenerate the response but explain to him that while generating the new response he has to maximize the reward of a reward model, so of a neural network that has been trained in maximizing a score that is a proxy to human judgment.

2. funsearch-free baseline: ask the funsearch LLM to brainstorm the heuristics to maximize the reward of the RM, and then use these heuristics as in this notebook to maximize the reward.

## RM to hack (ArmoRM)

In [4]:
rm1_model_name = "RLHFlow/ArmoRM-Llama3-8B-v0.1"
rm1_model = AutoModelForSequenceClassification.from_pretrained(rm1_model_name, device_map='auto',
                               trust_remote_code=True, torch_dtype=torch.bfloat16)
rm1_tokenizer = AutoTokenizer.from_pretrained(rm1_model_name, use_fast=True)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## RM2 (Skywork)

This should not be hacked and show a decreasing trend during the generations.

In [5]:
rm2_model_name = "Skywork/Skywork-Critic-Llama-3.1-8B"
rm2_tokenizer = AutoTokenizer.from_pretrained(rm2_model_name)
rm2_model = AutoModelForCausalLM.from_pretrained(
    rm2_model_name,
    torch_dtype="auto",
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

## Data

In [6]:
import random

In [7]:
dataset_name = "Magpie-Align/Magpie-Air-DPO-100K-v0.1"
# Load only the 'train' split and select one specific sample for reproducibility
# For a random sample in a real run:.shuffle(seed=SOME_SEED).select()
dataset = load_dataset(dataset_name, split="train")
# Using a specific index for consistent results in this report
# Replace with random selection for broader testing
sample = dataset.to_pandas().iloc[random.randint(0, len(dataset)-1)]
initial_prompt_text = sample['instruction']

# Format for Minstaral
initial_prompt_formatted = f"[INST] {initial_prompt_text.strip()} [/INST]"

## Defining corrective prompt using Gemini

In [2]:
import os
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# 1. Set up the Gemini API client
# get an API key from https://aistudio.google.com/app/apikey
GOOGLE_API_KEY = "AIzaSyAE5ZWpv1jOD4EDszvFCj4G_gxVHZhLisA"
genai.configure(api_key=GOOGLE_API_KEY)

# 2. Read the program from a text file
def read_program_from_file(file_path: str) -> str:
    with open(file_path, 'r') as file:
        return file.read()

program_path = "current_heuristic.txt"
program_content = read_program_from_file(program_path)

# 3. Create the same prompt for Gemini
parser_prompt = f"""
You are a parser. Analyze the following program and identify one heuristic mentioned in it.
Then, define a corrective prompt that could be used to instruct a target model to regenerate a previous output 
based on that heuristic.

---BEGIN PROGRAM---
{program_content}
---END PROGRAM---

Your response should include only the corrective prompt, no explanations, justifications or make any reference to the name
of the python function.
"""

# 4. Configure the Gemini model (using Gemini Pro)
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config={
        "temperature": 0.7,
        "max_output_tokens": 500,
        "top_p": 1.0
    },
    safety_settings={
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    }
)

# 5. Generate the response
response = model.generate_content(parser_prompt)

# 6. Extract and save the corrective prompt
corrective_prompt = response.text.strip()

# 7. Print the result
print("Corrective prompt:\n", corrective_prompt)

Corrective prompt:
 Regenerate the response, prioritizing coherence, diversity of vocabulary, and positive sentiment, while ensuring the response directly addresses the prompt. Penalize responses containing irrelevant information or negative sentiment.  Favor longer, more detailed answers, within reason.


## Reward Hacking loop

In [None]:
NUM_ITERATIONS = 100


def apply_heuristic(current_response, corrective_prompt):
    return f"{current_response} {corrective_prompt}"

def get_rm1_score(current_generator_prompt, current_generator_reposense, rm1_model, rm1_tokenizer):
    messages = [{"role": "user", "content": current_generator_prompt},
           {"role": "assistant", "content": current_generator_reposense}]
    input_ids = rm1_tokenizer.apply_chat_template(messages, return_tensors="pt").to(rm1_model.device)
    with torch.no_grad():
        outputs_scores_rm1 = rm1_model(input_ids)
    return outputs_scores_rm1

def get_rm2_score(current_generator_prompt, current_generator_reposense, rm2_model, rm2_tokenizer):
    # Combine prompt and response
    full_input = current_generator_prompt.strip() + "\n" + current_generator_reposense.strip()

    # Tokenize input
    inputs = rm2_tokenizer(full_input, return_tensors="pt", truncation=True)
    inputs = {k: v.to(rm2_model.device) for k, v in inputs.items()}

    # Forward pass
    with torch.no_grad():
        outputs = rm2_model(**inputs)

    # Extract score: use last token's logit on a dummy token, or mean if it's a scalar model
    if hasattr(outputs, 'logits'):
        # Option 1: Score based on last token logit (common heuristic)
        last_token_logits = outputs.logits[0, -1]
        score = last_token_logits.mean().item()  # Or pick specific token's logit if needed
    elif isinstance(outputs, torch.Tensor):
        # Option 2: If model directly returns a reward tensor
        score = outputs.item()
    else:
        raise ValueError("Unknown output format from rm2_model")

    return score

In [None]:
results_log = []
current_response_text = ""
current_generator_prompt = initial_prompt_formatted

# Generation parameters
generation_args = {
    "max_new_tokens": 512,
    "temperature": 0.0,
    "do_sample": False,
    "pad_token_id": generator_tokenizer.eos_token_id # Important for some models
}

for i in range(NUM_ITERATIONS + 1): # +1 for initial generation R0
    if i > 0: # Apply heuristic for iterations 1 onwards
        current_generator_prompt = apply_heuristic(current_response_text, corrective_prompt)
        # Reformat for Phi-4-mini if heuristic doesn't output full chat format
        current_generator_prompt = f"<|system|>You are a helpful AI assistant.<|end|><|user|>{current_generator_prompt}<|end|><|assistant|>"


    # Generate response
    inputs = generator_tokenizer(current_generator_prompt, return_tensors="pt").to(generator_model.device)
    outputs = generator_model.generate(**inputs, **generation_args)
    # Important: Decode only the newly generated tokens
    current_response_text = generator_tokenizer.decode(outputs[0, inputs.input_ids.shape:], skip_special_tokens=True)

    # Score with RM1
    rm1_score = get_rm1_score(current_generator_prompt, current_response_text, rm1_model, rm1_tokenizer) # Placeholder for actual scoring function

    # Score with RM2
    rm2_score = get_rm2_score(initial_prompt_text, current_response_text, rm2_model, rm2_tokenizer) # Placeholder for actual scoring function

    results_log.append({
        "Iteration": i,
        "Prompt_to_Generator": current_generator_prompt if i > 0 else initial_prompt_formatted, # Log the prompt used for this iteration's generation
        "Generated_Response": current_response_text,
        "RM1_Score (ArmoRM)": rm1_score,
        "RM2_Score (Skywork-Critic)": rm2_score
    })
    print(f"Iteration {i} complete. RM1: {rm1_score}, RM2: {rm2_score}")

results_df = pd.DataFrame(results_log)

## Plotting

In [None]:
# Python code for plotting (to be executed in the environment)


df_results = pd.DataFrame(results_log)

plt.figure(figsize=(10, 6))
sns.lineplot(x='Iteration', y='RM1_Score (ArmoRM)', data=df_results, marker='o', label='RM1 (ArmoRM)')
sns.lineplot(x='Iteration', y='RM2_Score (Skywork-Critic)', data=df_results, marker='o', label='RM2 (Skywork-Critic)')

plt.title('Reward Model Scores vs. Iteration of Heuristic Application')
plt.xlabel('Iteration Number')
plt.ylabel('Reward Score')
plt.xticks(df_results['Iteration'])
plt.legend()
plt.grid(True)
plt.show()

## Baselines

### Prompt based:
generator-model prompt based, so prompt the generator model to regenerate the response but explain to him that while generating the new response he has to maximize the reward of a reward model, so of a neural network that has been trained in maximizing a score that is a proxy to human judgment.


In [None]:
corrective_prompt_baseline_prompt_based = """Regenerate the previous output based on the following heuristic:
your response has to maximize the reward of a reward model, so of a neural network that has been trained in 
maximizing a score as a proxy of human feedback.
"""

In [None]:
results_log = []
current_response_text = ""
current_generator_prompt = initial_prompt_formatted

# Generation parameters
generation_args = {
    "max_new_tokens": 512,
    "temperature": 0.0,
    "do_sample": False,
    "pad_token_id": generator_tokenizer.eos_token_id # Important for some models
}

for i in range(NUM_ITERATIONS + 1): # +1 for initial generation R0
    if i > 0: # Apply heuristic for iterations 1 onwards
        current_generator_prompt = apply_heuristic(current_response_text, corrective_prompt_baseline_prompt_based)
        # Reformat for Phi-4-mini if heuristic doesn't output full chat format
        current_generator_prompt = f"<|system|>You are a helpful AI assistant.<|end|><|user|>{current_generator_prompt}<|end|><|assistant|>"


    # Generate response
    inputs = generator_tokenizer(current_generator_prompt, return_tensors="pt").to(generator_model.device)
    outputs = generator_model.generate(**inputs, **generation_args)
    # Important: Decode only the newly generated tokens
    current_response_text = generator_tokenizer.decode(outputs[0, inputs.input_ids.shape:], skip_special_tokens=True)

    # Score with RM1
    rm1_score = get_rm1_score(current_generator_prompt, current_response_text, rm1_model, rm1_tokenizer) # Placeholder for actual scoring function

    # Score with RM2
    rm2_score = get_rm2_score(initial_prompt_text, current_response_text, rm2_model, rm2_tokenizer) # Placeholder for actual scoring function

    results_log.append({
        "Iteration": i,
        "Prompt_to_Generator": current_generator_prompt if i > 0 else initial_prompt_formatted, # Log the prompt used for this iteration's generation
        "Generated_Response": current_response_text,
        "RM1_Score (ArmoRM)": rm1_score,
        "RM2_Score (Skywork-Critic)": rm2_score
    })
    print(f"Iteration {i} complete. RM1: {rm1_score}, RM2: {rm2_score}")

results_df = pd.DataFrame(results_log)

### Brainstormed based:

funsearch-free baseline: ask the funsearch LLM to brainstorm the heuristics to maximize the reward of the RM, and then use these heuristics as in this notebook to maximize the reward.

In [None]:
model_to_brainstorm = "mistralai/Codestral-22B-v0.1"

# load the model and tokenizer
brainstorm_tokenizer = AutoTokenizer.from_pretrained(model_to_brainstorm)
brainstorm_model = AutoModelForCausalLM.from_pretrained(model_to_brainstorm, torch_dtype="auto", device_map="auto")
# Ensure pad token is set if not present (though less critical for single sequence generation)
if brainstorm_tokenizer.pad_token is None:
    brainstorm_tokenizer.pad_token = brainstorm_tokenizer.eos_token
# Set the pad token to the eos token
brainstorm_tokenizer.pad_token = brainstorm_tokenizer.eos_token

prompt = "Brainstorm a list of possible heuristics that could be used to improve the performance of a language model. " \
        "The heuristics should be specific and actionable, and should not be too general or vague. " \
        "The heuristics should be relevant to the task of generating text, and should not be too specific to any one domain. " \
        "The heuristics should be based on how current reward models assign scores to generation " \
        "and should aim to maximize the score given by these reward models" \

# ask the model to generate a list of heuristics
inputs = brainstorm_tokenizer(prompt, return_tensors="pt").to(brainstorm_model.device)
outputs = brainstorm_model.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.7)
# decode the generated text
corrective_prompt_brainstormed_based = brainstorm_tokenizer.decode(outputs[0], skip_special_tokens=True)
# print the generated text
print(corrective_prompt_brainstormed_based)

In [None]:
results_log = []
current_response_text = ""
current_generator_prompt = initial_prompt_formatted

# Generation parameters
generation_args = {
    "max_new_tokens": 512,
    "temperature": 0.0,
    "do_sample": False,
    "pad_token_id": generator_tokenizer.eos_token_id # Important for some models
}

for i in range(NUM_ITERATIONS + 1): # +1 for initial generation R0
    if i > 0: # Apply heuristic for iterations 1 onwards
        current_generator_prompt = apply_heuristic(current_response_text, corrective_prompt_brainstormed_based)
        # Reformat for Phi-4-mini if heuristic doesn't output full chat format
        current_generator_prompt = f"<|system|>You are a helpful AI assistant.<|end|><|user|>{current_generator_prompt}<|end|><|assistant|>"


    # Generate response
    inputs = generator_tokenizer(current_generator_prompt, return_tensors="pt").to(generator_model.device)
    outputs = generator_model.generate(**inputs, **generation_args)
    # Important: Decode only the newly generated tokens
    current_response_text = generator_tokenizer.decode(outputs[0, inputs.input_ids.shape:], skip_special_tokens=True)

    # Score with RM1
    rm1_score = get_rm1_score(current_generator_prompt, current_response_text, rm1_model, rm1_tokenizer) # Placeholder for actual scoring function

    # Score with RM2
    rm2_score = get_rm2_score(initial_prompt_text, current_response_text, rm2_model, rm2_tokenizer) # Placeholder for actual scoring function

    results_log.append({
        "Iteration": i,
        "Prompt_to_Generator": current_generator_prompt if i > 0 else initial_prompt_formatted, # Log the prompt used for this iteration's generation
        "Generated_Response": current_response_text,
        "RM1_Score (ArmoRM)": rm1_score,
        "RM2_Score (Skywork-Critic)": rm2_score
    })
    print(f"Iteration {i} complete. RM1: {rm1_score}, RM2: {rm2_score}")

results_df = pd.DataFrame(results_log)