In [1]:
import gymnasium as gym
from stable_baselines3 import PPO
import torch
import numpy as np

# Import our custom modules
from reward_model import RewardNetwork
from environment import PbRLWrapper
from buffers import TrajectoryBuffer
from critic import AutomatedCritic

def main():
    """Run the preference-based RL (PbRL) training loop on CartPole-v1.

    Each iteration alternates between two phases:

    1. The PPO agent plays in the wrapped environment for
       ``STEPS_PER_ITERATION`` steps.
    2. The reward network is updated on up to ``REWARD_EPOCHS`` trajectory
       pairs sampled from the buffer and labelled by the automated critic.

    After the final iteration the PPO agent is saved to ``ppo_pbrl_agent``
    and the reward network weights to ``reward_model.pth``. The environment
    is closed on exit, whether training finished or raised.
    """
    # --- 1. SETUP ---
    print("Setting up the PbRL Apparatus...")

    # Hyperparameters
    TOTAL_ITERATIONS = 50      # How many times we switch between Playing and Training
    # Steps the agent plays per loop (50 * 2048 = 102,400 steps in total)
    STEPS_PER_ITERATION = 2048
    REWARD_EPOCHS = 50          # Max reward-net updates (trajectory pairs) per loop

    # Initialize the components
    reward_net = RewardNetwork(input_dim=4)  # CartPole has 4 state features
    buffer = TrajectoryBuffer()
    critic = AutomatedCritic()

    # Create the Environment and wrap it
    raw_env = gym.make("CartPole-v1")
    env = PbRLWrapper(raw_env, reward_net, buffer)

    try:
        # Create the PPO Agent
        # MlpPolicy = Simple Neural Net for the agent
        model = PPO("MlpPolicy", env, verbose=1)

        # --- 2. THE MAIN LOOP ---
        print("Starting Training Loop...")

        for iteration in range(TOTAL_ITERATIONS):
            print(f"\n=== Iteration {iteration + 1}/{TOTAL_ITERATIONS} ===")

            # PHASE 1: Agent Acts (The Student takes the test)
            # The agent plays in the env. The Wrapper handles the reward overriding.
            # The Wrapper also fills the 'buffer' with new trajectories.
            print(f"   >> Agent is playing for {STEPS_PER_ITERATION} steps...")
            # reset_num_timesteps=False keeps the global step counter running
            # across successive learn() calls instead of restarting at zero.
            model.learn(total_timesteps=STEPS_PER_ITERATION, reset_num_timesteps=False)

            # PHASE 2: Train the Reward Model (The Teacher learns to grade)
            print("   >> Training Reward Model (Critiquing Trajectories)...")

            cumulative_loss = 0.0
            valid_pairs = 0

            for _ in range(REWARD_EPOCHS):
                # 1. Get two random trajectories from history
                traj_A, traj_B = buffer.sample_pair()

                # (If we don't have enough data yet, stop trying this round)
                if traj_A is None:
                    break

                # 2. Ask the Critic: Which one was better? (Ground Truth)
                # Returns 1 if A is better, 0 if B is better
                label = critic.judge(traj_A, traj_B)

                # 3. Update the Reward Network to match the Critic's opinion
                loss = reward_net.train_on_batch(traj_A, traj_B, label)
                # float() keeps the running sum a plain number whether the
                # loss comes back as a Python float or a 0-d tensor.
                cumulative_loss += float(loss)
                valid_pairs += 1

            if valid_pairs > 0:
                avg_loss = cumulative_loss / valid_pairs
                print(f"   >> Reward Model Loss: {avg_loss:.4f}")
            else:
                print("   >> Not enough data to train Reward Model yet.")

        print("\nTraining Complete!")

        # --- 3. SAVE ---
        # Save the agent and the reward model so you can show them off
        model.save("ppo_pbrl_agent")
        torch.save(reward_net.state_dict(), "reward_model.pth")
        print("Models saved.")
    finally:
        # Release environment resources even if training or saving fails.
        env.close()

# Run the training loop only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Setting up the PbRL Apparatus...
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Starting Training Loop...

=== Iteration 1/50 ===
   >> Agent is playing for 2048 steps...




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23.4     |
|    ep_rew_mean     | -0.325   |
| time/              |          |
|    fps             | 551      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
   >> Training Reward Model (Critiquing Trajectories)...
   >> Reward Model Loss: 0.6141

=== Iteration 2/50 ===
   >> Agent is playing for 2048 steps...
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 17.9     |
|    ep_rew_mean     | 2.66     |
| time/              |          |
|    fps             | 645      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 4096     |
---------------------------------
   >> Training Reward Model (Critiquing Trajectories)...
   >> Reward Model Loss: 0.3648

=== Iteration 3/50 ===
   >> Agent is playing for 2048 steps...
------------