In [1]:
import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from tqdm import trange
import os

In [2]:
SEED = 42
log_dir = "./dqn_frozenlake_tensorboard/"
os.makedirs(log_dir, exist_ok=True)
env = gym.make("FrozenLake-v1", is_slippery=True, render_mode="rgb_array")
env.reset(seed=SEED)
env.action_space.seed(SEED)


42

In [3]:
model = DQN(
    "MlpPolicy",
    env,
    seed=SEED,
    verbose=0,
    learning_rate=0.0005,
    gamma=0.99,
    exploration_fraction=0.0,  
    exploration_final_eps=0.0,  
    tensorboard_log=log_dir,
    buffer_size=100_000,
    learning_starts=10_000,
    train_freq=4,
    target_update_interval=1_000,
    device="cuda"
)

total_timesteps = 5_000_000
chunk_size = 100_000

with trange(0, total_timesteps, chunk_size, desc="Training DQN") as pbar:
    for _ in pbar:
        model.learn(total_timesteps=chunk_size, reset_num_timesteps=False, tb_log_name="DQN_FrozenLake")

model.save("dqn_frozenlake_sb3")
model = DQN.load("dqn_frozenlake_sb3", env=env)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5, render=True, deterministic=True)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")


Training DQN: 100%|█████████████████████████████████████████████████████████████████| 50/50 [2:09:55<00:00, 155.90s/it]


Mean reward: 0.80 ± 0.40


In [3]:
import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
import numpy as np

seeds = [0, 1, 10, 42, 100, 123, 999]
n_eval_episodes = 10

env = gym.make("FrozenLake-v1", is_slippery=True, render_mode="rgb_array")
model = DQN.load("dqn_frozenlake_sb3", env=env)

mean_rewards = []
std_rewards = []

for seed in seeds:
    env.reset(seed=seed)
    env.action_space.seed(seed)
    mean_reward, std_reward = evaluate_policy(
        model, env, n_eval_episodes=n_eval_episodes, render=False, deterministic=True
    )
    mean_rewards.append(mean_reward)
    std_rewards.append(std_reward)

overall_mean = np.mean(mean_rewards)
overall_std = np.mean(std_rewards)

print(f"\nOverall average across seeds: Mean reward = {overall_mean:.2f} ± {overall_std:.2f}")



Overall average across seeds: Mean reward = 0.63 ± 0.46
