In [1]:
import gymnasium as gym
from stable_baselines3 import PPO
import torch
import numpy as np

from reward_model import RewardNetwork
from environment import PbRLWrapper
from buffers import TrajectoryBuffer
from critic import AutomatedCritic

def main():
    """Train a PPO agent on CartPole-v1 with a learned, preference-based reward.

    Each outer iteration first lets the PPO agent collect experience and
    update its policy for ``STEPS_PER_ITERATION`` timesteps, then performs up
    to ``REWARD_EPOCHS`` reward-model updates. Every update samples a pair of
    trajectories from the buffer, asks the critic for a preference label, and
    trains the reward network on that labelled pair.

    When training finishes, the agent is saved to ``ppo_pbrl_agent`` and the
    reward network's state dict to ``reward_model.pth``. The environment is
    always closed on exit, including when training raises.
    """
    # Hyperparameters
    TOTAL_ITERATIONS = 40
    STEPS_PER_ITERATION = 2048
    REWARD_EPOCHS = 50

    # Initialize the components
    # NOTE(review): input_dim=4 presumably matches CartPole's 4-dimensional
    # observation -- confirm against RewardNetwork before changing the env.
    reward_net = RewardNetwork(input_dim=4)
    buffer = TrajectoryBuffer()
    critic = AutomatedCritic()

    # Create the environment and wrap it so the wrapper can use the reward
    # network and the trajectory buffer.
    raw_env = gym.make("CartPole-v1")
    env = PbRLWrapper(raw_env, reward_net, buffer)

    try:
        # Create the PPO agent
        model = PPO("MlpPolicy", env, verbose=1)

        # Main training loop
        for iteration in range(TOTAL_ITERATIONS):

            # The agent plays in the env, wrapper overrides the reward.
            # reset_num_timesteps=False keeps the global timestep counter
            # running across successive learn() calls.
            model.learn(
                total_timesteps=STEPS_PER_ITERATION,
                reset_num_timesteps=False,
            )

            # Train the reward model
            cumulative_loss = 0.0
            valid_pairs = 0

            for _ in range(REWARD_EPOCHS):

                # Get two random trajectories from history
                traj_A, traj_B = buffer.sample_pair()

                # Stop early if there isn't enough data: the buffer does not
                # grow inside this loop, so retrying cannot succeed.
                if traj_A is None:
                    break

                # Ask critic which one was better
                label = critic.judge(traj_A, traj_B)

                # Update the reward network to match the critic's opinion
                loss = reward_net.train_on_batch(traj_A, traj_B, label)

                # Convert to a plain float so the running sum never holds on
                # to a tensor (and any autograd state attached to it).
                cumulative_loss += float(loss)
                valid_pairs += 1

            if valid_pairs > 0:
                avg_loss = cumulative_loss / valid_pairs
                print(f"Reward Model Loss: {avg_loss:.4f}")
            else:
                print("Not enough data to train Reward Model yet")

        print("\nTraining Complete")

        # Save the agent and reward model
        model.save("ppo_pbrl_agent")
        torch.save(reward_net.state_dict(), "reward_model.pth")
        print("Models saved")
    finally:
        # Release environment resources even if training or saving failed.
        env.close()

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.3     |
|    ep_rew_mean     | -3.36    |
| time/              |          |
|    fps             | 310      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 2048     |
---------------------------------
Reward Model Loss: 0.6748
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 19.4     |
|    ep_rew_mean     | -1.24    |
| time/              |          |
|    fps             | 384      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 4096     |
---------------------------------
Reward Model Loss: 0.6426
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23.2     |
|    ep_rew_mean     | 4.26     |
| time/              |          |
|    fps             | 346      |
|    iterations      | 1        |
|    time_elapsed    | 5      