In [1]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from tqdm import trange
import os

In [2]:
SEED = 42

log_dir = "./ppo_frozenlake_tensorboard/"
os.makedirs(log_dir, exist_ok=True)

env = gym.make("FrozenLake-v1", is_slippery=True, render_mode="rgb_array")
env.reset(seed=SEED)
env.action_space.seed(SEED)


42

In [3]:
model = PPO(
    "MlpPolicy",
    env,
    seed=SEED,
    verbose=0,  
    learning_rate=0.001,
    gamma=0.99,
    ent_coef=0.01,
    tensorboard_log=log_dir
)

In [4]:
total_timesteps = 10_000_000
chunk_size = 100_000

with trange(0, total_timesteps, chunk_size, desc="Training PPO") as pbar:
    for _ in pbar:
        model.learn(total_timesteps=chunk_size, reset_num_timesteps=False, tb_log_name="PPO_FrozenLake_Entropy")

model.save("ppo_frozenlake_sb3")

model = PPO.load("ppo_frozenlake_sb3", env=env)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5, render=True, deterministic=True)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")


Training PPO: 100%|███████████████████████████████████████████████████████████████| 100/100 [3:07:54<00:00, 112.75s/it]


Mean reward: 0.80 ± 0.40


In [4]:

import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
import numpy as np

seeds = [0, 1, 10, 42, 100, 123, 999]
n_eval_episodes = 10

env = gym.make("FrozenLake-v1", is_slippery=True, render_mode="rgb_array")
model = PPO.load("ppo_frozenlake_sb3", env=env)

results = []

for seed in seeds:
    env.reset(seed=seed)
    env.action_space.seed(seed)
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes, render=False, deterministic=True)
    results.append((mean_reward, std_reward))

mean_rewards = [r[0] for r in results]
std_rewards = [r[1] for r in results]
overall_mean = np.mean(mean_rewards)
overall_std = np.mean(std_rewards)

print(f"\nOverall average across seeds: Mean reward = {overall_mean:.2f} ± {overall_std:.2f}")


Overall average across seeds: Mean reward = 0.66 ± 0.45
