In [1]:
!pip install swig
!pip install gymnasium[box2d]

Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Using cached box2d-py-2.3.5.tar.gz (374 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py): started
  Building wheel for box2d-py (setup.py): finished with status 'done'
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp312-cp312-win_amd64.whl size=463082 sha256=71207dbfccbaa49c538bd39b1345bd6110ad4699956344bd69f7a35d0596c5ab
  Stored in directory: c:\users\papa\appdata\local\pip\cache\wheels\2a\e9\60\774da0bcd07f7dc7761a8590fa2d065e4069568e78dcdc3318
Successfully built box2d-py
Installing collected packages: box2d-py
Successfully installed box2d-py-2.3.5


In [5]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import VecTransposeImage, DummyVecEnv
from tqdm import trange
import os

SEED = 42
log_dir = "./ppo_carracing_tensorboard/"
os.makedirs(log_dir, exist_ok=True)

def make_env():
    env = gym.make("CarRacing-v3", render_mode="rgb_array", continuous=True)
    env.reset(seed=SEED)
    env.action_space.seed(SEED)
    return env

env = DummyVecEnv([make_env])
env = VecTransposeImage(env)

model = PPO(
    "CnnPolicy",
    env,
    seed=SEED,
    verbose=0,
    learning_rate=0.0001,
    gamma=0.99,
    ent_coef=0.01,
    tensorboard_log=log_dir
)

total_timesteps = 1_000_000  
chunk_size = 10000

with trange(0, total_timesteps, chunk_size, desc="Training PPO on CarRacing") as pbar:
    for _ in pbar:
        model.learn(total_timesteps=chunk_size, reset_num_timesteps=False, tb_log_name="PPO_CarRacing_Entropy")

model.save("ppo_carracing_sb3")

model = PPO.load("ppo_carracing_sb3", env=env)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5, render=True, deterministic=True)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")


Training PPO on CarRacing: 100%|██████████████████████████████████████████████████| 100/100 [6:28:05<00:00, 232.86s/it]


Mean reward: 540.74 ± 322.95


In [3]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
import numpy as np

seeds = [0, 1, 10, 42, 100, 123, 999]
n_eval_episodes = 5

def make_env(seed):
    def _init():
        env = gym.make("CarRacing-v3", render_mode="rgb_array", continuous=True)
        env.reset(seed=seed)
        env.action_space.seed(seed)
        return env
    return _init

mean_rewards = []
std_rewards = []

for seed in seeds:
    vec_env = DummyVecEnv([make_env(seed)])
    vec_env = VecTransposeImage(vec_env)
    model = PPO.load("ppo_carracing_sb3", env=vec_env)
    mean_reward, std_reward = evaluate_policy(
        model, vec_env, n_eval_episodes=n_eval_episodes, render=False, deterministic=False
    )
    mean_rewards.append(mean_reward)
    std_rewards.append(std_reward)

overall_mean = np.mean(mean_rewards)
overall_std = np.mean(std_rewards)

print(f"\nOverall average across seeds: Mean reward = {overall_mean:.2f} ± {overall_std:.2f}")



Overall average across seeds: Mean reward = 687.34 ± 239.11


In [1]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecTransposeImage, DummyVecEnv
import time

def make_env():
    env = gym.make("CarRacing-v3", render_mode="human", continuous=True)  # human mode = display in real time
    return env

env = DummyVecEnv([make_env])
env = VecTransposeImage(env)
model = PPO.load("ppo_carracing_sb3", env=env)

n_episodes = 5
for ep in range(n_episodes):
    obs = env.reset()
    done = False
    total_reward = 0

    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        total_reward += reward[0]
        time.sleep(0.03)  

    print(f"Episode {ep + 1} reward: {total_reward:.2f}")


Episode 1 reward: 55.48


KeyboardInterrupt: 