In [None]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from stable_baselines3.common.monitor import Monitor
from tqdm import trange
import os

# Directory handed to PPO as its TensorBoard log location; created up front
# (no error if it already exists).
log_dir = "./ppo_carracing_tensorboard/"
os.makedirs(name=log_dir, exist_ok=True)

# One seed reused for the environment reset, its action space and the PPO model.
SEED = 42

def make_env():
    """Create one seeded CarRacing-v3 environment wrapped in ``Monitor``.

    The environment uses continuous actions and ``rgb_array`` rendering.
    Intended to be passed (uncalled) to a vectorised-env constructor as a
    factory.

    Returns:
        The ``Monitor``-wrapped environment, already reset once with ``SEED``.
    """
    monitored = Monitor(
        gym.make("CarRacing-v3", render_mode="rgb_array", continuous=True)
    )
    # Seed the environment's RNG (via the first reset) and, separately, the
    # action-space sampler; the two are seeded independently.
    monitored.reset(seed=SEED)
    monitored.action_space.seed(SEED)
    return monitored

# Single-environment vectorised wrapper around make_env. VecTransposeImage
# re-orders image observations to channel-first for the CNN policy
# (per the SB3 docs; SB3 would otherwise add this wrapper itself).
env = VecTransposeImage(DummyVecEnv([make_env]))

# PPO with a CNN feature extractor. verbose=0 keeps stdout quiet; metrics
# are written to TensorBoard under log_dir instead.
model = PPO(
    policy="CnnPolicy",
    env=env,
    learning_rate=0.001,
    gamma=0.99,
    ent_coef=0.01,
    tensorboard_log=log_dir,
    verbose=0,
    seed=SEED,
)

# Overall training budget, split into fixed-size chunks so that tqdm can show
# coarse-grained progress (one bar tick per learn() call).
total_timesteps = 1_000_000
chunk_size = 10_000

# NOTE(review): PPO collects whole rollouts (n_steps defaults to 2048 per the
# SB3 docs), so each chunk likely runs slightly more than chunk_size steps --
# confirm if an exact step budget matters.
with trange(0, total_timesteps, chunk_size, desc="Training PPO on CarRacing") as pbar:
    for chunk_start in pbar:
        model.learn(
            total_timesteps=chunk_size,
            # Start a fresh run on the first chunk only. With
            # reset_num_timesteps=False on *every* call, SB3 treats the first
            # chunk as a continuation of the most recent existing
            # "PPO_CarRacing_Entropy_*" TensorBoard run, so re-running this
            # script would mix a new model's curve into the previous run's
            # directory. Later chunks keep the step counter and log directory.
            reset_num_timesteps=(chunk_start == 0),
            tb_log_name="PPO_CarRacing_Entropy",
            log_interval=1,
        )

# Persist the trained policy, then reload it from disk so the evaluation
# below exercises the saved checkpoint rather than the in-memory model.
model.save("ppo_carracing_sb3")
model = PPO.load("ppo_carracing_sb3", env=env)

# render=False: the environment is created with render_mode="rgb_array", so
# rendering during evaluation would only produce frames that evaluate_policy
# discards (nothing is displayed), at the cost of one render per step.
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=5,
    render=False,
    deterministic=True,
)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")

# NOTE(review): env is never closed in this script; call env.close() once no
# later code needs it.


Training PPO on CarRacing:  84%|█████████████████████████████████████████▏       | 84/100 [5:35:28<1:09:00, 258.80s/it]