In [1]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from tqdm import trange
import os

In [2]:
# Single seed reused for the environment, its action space and the PPO model.
SEED = 42

# Slippery (stochastic-transition) FrozenLake; frames are produced as RGB
# arrays rather than drawn to a window.
env = gym.make(
    "FrozenLake-v1",
    render_mode="rgb_array",
    is_slippery=True,
)

# Directory passed to PPO as its TensorBoard log location; create it up front.
log_dir = "./ppo_frozenlake_tensorboard/"
os.makedirs(log_dir, exist_ok=True)

# Seed the environment via reset(), then the action-space sampler separately.
env.reset(seed=SEED)
env.action_space.seed(SEED)

42

In [3]:
# PPO agent with the "MlpPolicy" network, trained on the FrozenLake `env`.
model = PPO(
    "MlpPolicy",
    env,
    # Optimisation hyperparameters.
    learning_rate=1e-4,
    gamma=0.99,      # discount factor
    ent_coef=0.1,    # entropy coefficient in the PPO loss
    # Reproducibility and logging.
    seed=SEED,
    tensorboard_log=log_dir,
    verbose=0,
)

In [None]:
# --- Training ---------------------------------------------------------------
# Training runs in fixed-size chunks so tqdm can show coarse progress and the
# model can be checkpointed between chunks (a full run takes hours, so losing
# everything on an interrupted kernel is expensive).
total_timesteps = 10_000_000
chunk_size = 100_000
model_path = "ppo_frozenlake_sb3"

# FrozenLake is stochastic when slippery and each episode returns 0 or 1, so a
# handful of episodes gives a very noisy mean; evaluate on many episodes.
n_eval_episodes = 100

with trange(0, total_timesteps, chunk_size, desc="Training PPO") as pbar:
    for start in pbar:
        # Clamp the final chunk so a total that is not a multiple of
        # chunk_size is not overshot by up to a whole chunk.
        steps = min(chunk_size, total_timesteps - start)
        model.learn(
            total_timesteps=steps,
            # Keep the global step counter running across chunks so all
            # chunks are logged as one continuous TensorBoard run.
            reset_num_timesteps=False,
            tb_log_name="PPO_FrozenLake_Entropy",
        )
        # Checkpoint after every chunk (overwrites the previous checkpoint).
        model.save(model_path)

# Final save; also guarantees the file exists if the loop body never ran.
model.save(model_path)

# --- Evaluation -------------------------------------------------------------
model = PPO.load(model_path, env=env)
mean_reward, std_reward = evaluate_policy(
    model,
    # Use the model's own wrapped environment, as in the SB3 documentation
    # examples, instead of the raw gym env.
    model.get_env(),
    n_eval_episodes=n_eval_episodes,
    # The env was created with render_mode="rgb_array": rendering would only
    # build frames that are immediately discarded, so skip it.
    render=False,
    deterministic=True,
)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")


Training PPO:  31%|███████████████████▏                                          | 31/100 [1:02:52<2:53:57, 151.27s/it]