# PPO training notebook: reward modeling and fine-tuning of SmolLM2-135M on word definitions

In [None]:
import pandas as pd

# Load the cached results and keep only rows where both `definition` and
# `definition_similarity` are present.
# NOTE(review): read_pickle can execute arbitrary code embedded in the file;
# only load pickles from a trusted source.
df = pd.read_pickle("Results/results_with_similarity.pkl").dropna(
    subset=["definition", "definition_similarity"]
)


In [None]:
from datasets import Dataset

# Rename columns for PPO compatibility, keep only the three fields of interest,
# and drop any rows with missing required fields.
df_for_rl = (
    df.rename(columns={
        "word": "query",                      # prompt
        "definition_clean": "response",       # model-generated
        "definition_similarity": "reward",    # cosine similarity
    })[["query", "response", "reward"]]
    .dropna()
)

# Convert to HuggingFace Dataset.
# preserve_index=False: the frame has been row-filtered with dropna, so its
# index is no longer a plain range; without this flag the index would be
# stored as an extra `__index_level_0__` column that the later
# remove_columns(["query", "response", "reward"]) call does not drop.
reward_dataset = Dataset.from_pandas(df_for_rl, preserve_index=False)


In [None]:
from transformers import AutoTokenizer

# Tokenizer for the SmolLM2-135M checkpoint (the same name is used for the
# policy model and PPOConfig later in the notebook).
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left — presumably so generated tokens follow the prompt directly
# in this decoder-only setup; confirm.
tokenizer.padding_side = "left"
def tokenize_prompts(example):
    """Attach tokenized prompt fields to a dataset example (or batch).

    Runs the module-level ``tokenizer`` over ``example["query"]``, padding and
    truncating to 64 tokens, then stores the resulting ``input_ids`` and
    ``attention_mask`` on ``example``.

    Args:
        example: Mapping with a ``"query"`` entry; it is modified in place.

    Returns:
        The same ``example`` object with ``"input_ids"`` and
        ``"attention_mask"`` added.
    """
    encoded = tokenizer(
        example["query"],
        truncation=True,
        max_length=64,
        padding="max_length",
    )
    # Copy over only the two fields the rest of the notebook reads.
    for field in ("input_ids", "attention_mask"):
        example[field] = encoded[field]
    return example

# Tokenize every prompt; with batched=True, tokenize_prompts receives a batch
# of examples per call rather than a single row.
tokenized_dataset = reward_dataset.map(tokenize_prompts, batched=True)


Map:   0%|          | 0/183 [00:00<?, ? examples/s]

In [None]:
# Drop the text/score columns; the training loop only reads the token tensors
# and recomputes rewards from freshly generated responses.
# NOTE(review): this rebinds `tokenized_dataset`, so re-running this cell
# presumably fails once the columns are gone — re-run the map cell first.
tokenized_dataset = tokenized_dataset.remove_columns(["query", "response", "reward"])


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Sentence-embedding model; the reward functions call embedder.encode(...) on
# the generated response and on the reference definition.
embedder = SentenceTransformer("all-MiniLM-L6-v2")


2025-04-22 18:27:54.852881: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-22 18:27:54.858857: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-22 18:27:54.871703: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745368074.893438  260099 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745368074.899995  260099 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been regist

# Two reward functions were tried: binary (thresholded similarity) and soft (raw cosine similarity)

In [None]:
def compute_binary_reward(prompts, responses, threshold=0.85):
    """Give each response a 0/1 reward by thresholding its similarity score.

    For every (prompt, response) pair, the reference definition is looked up in
    the module-level ``query_to_wordnet_def`` mapping. The response and the
    reference are embedded with ``embedder`` and compared with
    ``cosine_similarity``.

    Args:
        prompts: Iterable of prompt strings used as lookup keys.
        responses: Iterable of generated strings, paired with ``prompts``.
        threshold: Minimum similarity (inclusive) that earns a reward of 1.0.

    Returns:
        A list with one float per pair: 1.0 when the similarity is at least
        ``threshold``, otherwise 0.0. Pairs whose prompt has no (or an empty)
        reference definition get 0.0.
    """
    def _score(prompt, response):
        reference = query_to_wordnet_def.get(prompt)
        if not reference:
            # Unknown prompt or empty reference: nothing to compare against.
            return 0.0
        response_vec = embedder.encode(response)
        reference_vec = embedder.encode(reference)
        similarity = cosine_similarity([response_vec], [reference_vec])[0][0]
        return 1.0 if similarity >= threshold else 0.0

    return [_score(prompt, response) for prompt, response in zip(prompts, responses)]


In [None]:
def compute_soft_reward(prompts, responses):
    """Reward each response with its raw similarity to the reference definition.

    For every (prompt, response) pair, the reference definition is looked up in
    the module-level ``query_to_wordnet_def`` mapping. The response and the
    reference are embedded with ``embedder`` and compared with
    ``cosine_similarity``.

    Args:
        prompts: Iterable of prompt strings used as lookup keys.
        responses: Iterable of generated strings, paired with ``prompts``.

    Returns:
        A list with one float per pair: the similarity score, or 0.0 when the
        prompt has no (or an empty) reference definition.
    """
    def _score(prompt, response):
        reference = query_to_wordnet_def.get(prompt)
        if not reference:
            # Unknown prompt or empty reference: nothing to compare against.
            return 0.0
        response_vec = embedder.encode(response)
        reference_vec = embedder.encode(reference)
        return float(cosine_similarity([response_vec], [reference_vec])[0][0])

    return [_score(prompt, response) for prompt, response in zip(prompts, responses)]


In [None]:
from transformers import DataCollatorWithPadding
# Collator handed to PPOTrainer below; it batches examples using the
# tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
from trl import AutoModelForCausalLMWithValueHead

# Policy model: SmolLM2-135M wrapped with a value head for PPO.
model = AutoModelForCausalLMWithValueHead.from_pretrained("HuggingFaceTB/SmolLM2-135M")
# Device of the model's first parameter; the training loop moves reward
# tensors onto it.
device = next(model.parameters()).device





In [None]:
from trl import PPOConfig

# PPO hyperparameters for the trainer below.
# NOTE(review): batch_size / mini_batch_size are presumably prompts per PPO
# step and per optimisation mini-batch respectively — confirm against the
# installed TRL version.
config = PPOConfig(
    model_name="HuggingFaceTB/SmolLM2-135M",
    learning_rate=1.41e-5,
    batch_size=8,
    mini_batch_size=4
)


In [None]:
from trl import PPOTrainer

# Build the PPO trainer from the policy model, config, tokenized prompts and
# collator defined in the cells above. No reference model is passed here.
ppo_trainer = PPOTrainer(
    model=model,
    config=config,
    dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)




In [None]:
from tqdm import tqdm
import torch

# === Generation parameters ===
# Sampling-based generation capped at 30 new tokens per query; EOS doubles as
# the pad token, matching the tokenizer setup.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "max_new_tokens": 30,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}

# Reference definitions keyed by word; read by the reward functions.
query_to_wordnet_def = dict(zip(df["word"], df["wordnet_definition"]))

# === Number of epochs ===
num_epochs = 20

for epoch in range(num_epochs):
    print(f"\n Starting Epoch {epoch + 1}/{num_epochs}")

    for step, batch in enumerate(tqdm(ppo_trainer.dataloader, desc=f"Epoch {epoch+1}")):
        query_tensors = batch["input_ids"]

        # The dataset was tokenized with padding="max_length" and the pad token
        # is EOS, so each row is mostly padding. Keep only the real prompt
        # tokens; otherwise the padding is fed to generate()/step() as if it
        # were part of the query.
        if "attention_mask" in batch:
            query_tensor_list = [
                ids[mask.bool()]
                for ids, mask in zip(query_tensors, batch["attention_mask"])
            ]
        else:
            query_tensor_list = list(query_tensors.unbind(dim=0))

        # return_prompt=False: generate() otherwise returns prompt + continuation,
        # which would put the prompt into the scored text and make step() see
        # the query twice.
        response_tensors = ppo_trainer.generate(
            query_tensor_list, return_prompt=False, **generation_kwargs
        )

        queries = [tokenizer.decode(q, skip_special_tokens=True) for q in query_tensor_list]
        responses = [tokenizer.decode(r.squeeze(), skip_special_tokens=True) for r in response_tensors]

        # One scalar reward tensor per sample, as step() expects a list.
        rewards = compute_soft_reward(queries, responses)
        rewards = [torch.tensor(r).to(device) for r in rewards]

        stats = ppo_trainer.step(query_tensor_list, response_tensors, rewards)

        # log_stats looks for decoded text under these keys when logging samples.
        batch["query"] = queries
        batch["response"] = responses
        ppo_trainer.log_stats(stats, batch, rewards)



 Starting Epoch 1/20


Epoch 1:   0%|          | 0/22 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Epoch 1:  23%|██▎       | 5/22 [1:01:45<3:29:47, 740.46s/it]

In [None]:
import torch
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from trl import AutoModelForCausalLMWithValueHead
from transformers import AutoTokenizer

# Load WordNet definitions (word -> reference definition).
query_to_wordnet_def = dict(zip(df["word"], df["wordnet_definition"]))
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# NOTE(review): this rebinds `model` to the checkpoint on disk, and the
# directory is only written by the save cell at the end of the notebook —
# confirm the checkpoint exists before running this cell.
model = AutoModelForCausalLMWithValueHead.from_pretrained("ppo_finetuned_smol_binary")

# Words to evaluate
words = list(query_to_wordnet_def.keys())

# Store results (one record per word; turned into a DataFrame afterwards)
finetuned_data = []

for word in tqdm(words, desc="Generating definitions from finetuned model"):
    # NOTE(review): the PPO queries are the bare word, while evaluation prompts
    # with "Define <word>" — confirm this template matches the baseline run.
    prompt = f"Define {word}"
    tokens = tokenizer(prompt, return_tensors="pt")
    prompt_length = tokens["input_ids"].shape[1]

    with torch.no_grad():
        output = model.generate(
            input_ids=tokens["input_ids"],
            attention_mask=tokens["attention_mask"],
            max_new_tokens=50,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. Removing the prompt with
    # str.replace on the full decoded text would also delete any later
    # occurrence of the prompt inside the generated definition, and would miss
    # it if decoding does not reproduce the prompt text exactly.
    definition = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

    # Compute embedding and similarity
    def_emb = embedder.encode(definition)
    wordnet_def = query_to_wordnet_def[word]
    wn_emb = embedder.encode(wordnet_def)
    sim_score = cosine_similarity([def_emb], [wn_emb])[0][0]

    finetuned_data.append({
        "word": word,
        "definition": definition,
        "embedding": def_emb,
        "wordnet_definition": wordnet_def,
        "definition_similarity": sim_score
    })




Some weights of the model checkpoint at ppo_finetuned_smol_binary were not used when initializing LlamaForCausalLM: ['v_head.summary.bias', 'v_head.summary.weight']
- This IS expected if you are initializing LlamaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Generating definitions from finetuned model: 100%|██████████| 183/183 [3:40:22<00:00, 72.25s/it]  


In [None]:
# Create DataFrame and save
# Builds the frame in one go from the list of per-word records collected by
# the evaluation loop, then pickles it to the working directory.
df_finetuned = pd.DataFrame(finetuned_data)
df_finetuned.to_pickle("finetuned_results_with_similarity.pkl")

In [None]:
# Persist the model and tokenizer side by side in one directory.
# NOTE(review): the evaluation cell earlier in the notebook loads a checkpoint
# from this same directory and rebinds `model`, so on a fresh top-to-bottom
# run that load happens before this save, and this call saves whichever
# `model` is current. Confirm the intended cell order.
# NOTE(review): the training loop calls compute_soft_reward while the
# directory name says "binary" — confirm which reward produced this checkpoint.
model.save_pretrained("ppo_finetuned_smol_binary")
tokenizer.save_pretrained("ppo_finetuned_smol_binary")


('ppo_finetuned_smol_binary/tokenizer_config.json',
 'ppo_finetuned_smol_binary/special_tokens_map.json',
 'ppo_finetuned_smol_binary/vocab.json',
 'ppo_finetuned_smol_binary/merges.txt',
 'ppo_finetuned_smol_binary/added_tokens.json',
 'ppo_finetuned_smol_binary/tokenizer.json')