In [None]:
import torch
from transformers import AutoTokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model
from trl.core import respond_to_batch

# Number of PPO updates the training loop performs.
num_iterations = 10000

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer for the 'gpt2' checkpoint; its EOS token doubles as the pad token.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Policy (causal LM with a value head) and the reference model derived from it.
# The reference is built *before* the policy is moved to `device`, so only the
# policy is relocated by this cell.
model = AutoModelForCausalLMWithValueHead.from_pretrained('gpt2')
model_ref = create_reference_model(model)

model = model.to(device)

In [None]:
from datetime import datetime
import os

# Per-run log directory. The stamp includes the year and the seconds so that two
# runs started within the same minute (or on the same date of a different year)
# do not end up in the same directory; with exist_ok=True such a collision
# would otherwise go unnoticed and mix the runs' logs.
now = datetime.now()
ts = now.strftime("%Y%m%d-%H%M%S")
print("timestamp:", ts)
log_dir = f"logs/{ts}"
os.makedirs(log_dir, exist_ok=True)

# One sample per PPO step (the training loop passes single-element lists to
# step()), with tensorboard logging pointed at the directory created above.
ppo_config = PPOConfig(
    batch_size=1,
    mini_batch_size=1,
    log_with='tensorboard',
    project_kwargs={'logging_dir': log_dir},
)
ppo_trainer = PPOTrainer(ppo_config, model, model_ref, tokenizer)

In [None]:
# PPO training loop: on every iteration sample a continuation of a fixed prompt,
# score it by the character length of its decoded text, run one PPO step, and
# report the result.

# The prompt never changes, so tokenize it and move it to the device once
# instead of repeating that work on every iteration.
query_txt = "This morning I went to the "
query_tensor = tokenizer.encode(query_txt, return_tensors="pt").to(device)

for iteration in range(num_iterations):
    # Only the sampled token ids are handed to step() below, so autograd
    # tracking during sampling would be wasted memory and time.
    with torch.no_grad():
        response_tensor = respond_to_batch(model, query_tensor)

    # Decode once; the text is used for the reward, the logs and the printout.
    response_txt = tokenizer.decode(response_tensor[0], skip_special_tokens=True)

    # Reward = number of characters in the decoded response, as a 1-element tensor.
    reward = torch.tensor([float(len(response_txt))])

    train_stats = ppo_trainer.step([query_tensor[0]], [response_tensor[0]], [reward])

    # Forward the step statistics to the tracker configured on the trainer;
    # without this call the statistics returned by step() are simply dropped.
    ppo_trainer.log_stats(
        train_stats,
        {"query": [query_txt], "response": [response_txt]},
        [reward],
    )

    print(f"Iteration {iteration + 1} Reward: {reward.item()} Predicted Text:", response_txt)

print("Training completed.")
