In [1]:
# Cell 1: Load existing trained model (if present) and verify environment
import os
import gymnasium as gym
import ale_py  # ensure ALE namespace is registered
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack

MODEL_PATH = os.path.join("models", "dqn_pong_cnn.zip")
ENV_ID = "ALE/Pong-v5"

# Recreate the same preprocessing used for CNN training: make_atari_env + 4-frame stack
try:
    env = make_atari_env(ENV_ID, n_envs=1, seed=42)
    env = VecFrameStack(env, n_stack=4)
except Exception as e:
    # Chain explicitly so the original traceback stays attached to the friendlier message.
    raise RuntimeError(
        f"Failed to build Atari VecEnv for {ENV_ID}. Ensure ale-py and ROMs are installed. Original error: {e}"
    ) from e

# Always release the environment, even if loading or evaluating the model fails.
try:
    if os.path.isfile(MODEL_PATH):
        model = DQN.load(MODEL_PATH, env=env)
        mean_r, std_r = evaluate_policy(model, env, n_eval_episodes=3)
        print(f"Loaded model from {MODEL_PATH}. Eval reward over 3 episodes: {mean_r:.2f} ± {std_r:.2f}")
    else:
        # No checkpoint yet: leave `model` defined so later cells can test for it.
        model = None
        print(f"No existing model found at {MODEL_PATH}. You can still run experiments below.")
finally:
    env.close()

Wrapping the env in a VecTransposeImage.




Loaded model from models\dqn_pong_cnn.zip. Eval reward over 3 episodes: -17.00 ± 1.63


In [2]:
# Cell 2: Reusable training function for hyperparameter experiments (CNN policy)
import time
import pandas as pd
from typing import Dict, Tuple

from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

# Callback to record episode stats into a unique CSV per run
class EpisodeCSVLogger(BaseCallback):
    """Collect per-episode statistics during training and dump them to a CSV file.

    Rows are buffered in memory while training runs and written once, when
    training ends. Each row is ``(timestep, ep_length, ep_reward)``.

    Args:
        csv_path: Destination file. Its parent directory is created if needed;
            a bare filename (no directory part) is written to the current
            working directory.
        verbose: When truthy, print the CSV location after it is written.
    """

    def __init__(self, csv_path: str, verbose: int = 0):
        super().__init__(verbose)
        self.csv_path = csv_path
        self.rows = []  # buffered (timestep, ep_length, ep_reward) tuples

    def _on_step(self) -> bool:
        """Record any finished episode reported in this step's infos; never stops training."""
        # VecEnv infos may contain 'episode' dict on episode end
        for info in self.locals.get("infos", []):
            ep = info.get("episode")
            if ep is None:
                continue
            self.rows.append((self.num_timesteps, ep.get("l", None), ep.get("r", None)))
        return True

    def _on_training_end(self) -> None:
        """Write all buffered rows (with a header line) to ``self.csv_path``."""
        import csv
        import os

        # os.path.dirname() returns "" for a bare filename and os.makedirs("")
        # raises FileNotFoundError, so only create the directory when there is one.
        parent_dir = os.path.dirname(self.csv_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(self.csv_path, "w", newline="") as f:
            w = csv.writer(f)
            w.writerow(["timestep", "ep_length", "ep_reward"])
            w.writerows(self.rows)
        if self.verbose:
            print(f"Saved episode CSV to {self.csv_path}")


def make_cnn_env(seed: int):
    """Return a single-environment Atari VecEnv for ENV_ID wrapped in a 4-frame stack.

    Args:
        seed: Seed forwarded to ``make_atari_env``.
    """
    atari_vec_env = make_atari_env(ENV_ID, n_envs=1, seed=seed)
    return VecFrameStack(atari_vec_env, n_stack=4)


def train_experiment(name: str, hp: Dict, total_timesteps: int = 50_000, seed: int = 42, eval_episodes: int = 3) -> Tuple[Dict, DQN]:
    """Train a DQN(CnnPolicy) model with given hyperparameters.

    Returns (metrics_dict, model) WITHOUT saving the model; caller decides which to persist.
    hp is a dict of DQN constructor keyword args (learning_rate, gamma, batch_size, etc.).

    Args:
        name: Run identifier; used in the per-run CSV filename and the
            TensorBoard log directory.
        hp: Keyword arguments forwarded verbatim to the ``DQN`` constructor.
        total_timesteps: Number of environment steps to train for.
        seed: Seed used for both the environment and the model.
        eval_episodes: Number of episodes for the post-training evaluation.

    Returns:
        A ``(metrics, model)`` tuple. ``metrics`` holds the run name, mean/std
        evaluation reward, training duration in minutes and every entry of
        ``hp``. The returned model's environment has already been closed.
    """
    import os
    os.makedirs("logs", exist_ok=True)
    os.makedirs("models", exist_ok=True)  # ensure exists for later best-model save

    env = make_cnn_env(seed)
    # Close the environment on every exit path: a failure while building the
    # model, training or evaluating must not leak the emulator.
    try:
        csv_log = os.path.join("logs", f"training_metrics_{name}.csv")
        callback = EpisodeCSVLogger(csv_log, verbose=0)

        model = DQN(
            "CnnPolicy",
            env,
            seed=seed,
            tensorboard_log=os.path.join("logs", "tensorboard", name),
            **hp,
        )

        print(f"\n[RUN {name}] Training {total_timesteps} steps | hp={hp}")
        # perf_counter is monotonic, so the duration is immune to system clock adjustments.
        t0 = time.perf_counter()
        model.learn(total_timesteps=total_timesteps, callback=callback, progress_bar=True)
        minutes = (time.perf_counter() - t0) / 60.0

        mean_r, std_r = evaluate_policy(model, env, n_eval_episodes=eval_episodes)
    finally:
        env.close()

    metrics = {
        "name": name,
        "mean_reward": float(mean_r),
        "std_reward": float(std_r),
        "train_minutes": minutes,
        **hp,
    }
    print(f"[RUN {name}] Finished: mean_reward={mean_r:.2f} ± {std_r:.2f} | train_minutes={minutes:.2f}")
    return metrics, model

print("Training function ready. Configure hyperparameter sets in next cell.")

Training function ready. Configure hyperparameter sets in next cell.


In [3]:
# Cell 3: Define TEN hyperparameter experiment configurations (CNN policy)
# Expanded from 5 to 10 for broader tuning coverage.
# Strategy: vary learning rate, buffer size, batch size, gamma, exploration schedule, and target update interval.

# Baseline hyperparameters (close to defaults). Every experiment starts from
# these values and changes only the entries listed in its override dict.
_BASE_HP = dict(
    learning_rate=1e-4,
    gamma=0.99,
    batch_size=32,
    buffer_size=100_000,
    train_freq=4,
    gradient_steps=1,
    target_update_interval=10_000,
    exploration_fraction=0.1,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.01,
    verbose=0,
)

# (run name, overrides applied on top of _BASE_HP), in execution order.
_EXPERIMENT_OVERRIDES = [
    # 1 Baseline (close to defaults)
    ("exp1_baseline", {}),
    # 2 Larger batch size + slower LR
    (
        "exp2_large_batch_slow_lr",
        dict(
            learning_rate=5e-5,
            batch_size=64,
            buffer_size=150_000,
            target_update_interval=8_000,
            exploration_fraction=0.15,
            exploration_final_eps=0.02,
        ),
    ),
    # 3 Higher gamma, faster target updates
    (
        "exp3_high_gamma_fast_target",
        dict(
            learning_rate=1.5e-4,
            gamma=0.995,
            buffer_size=120_000,
            target_update_interval=5_000,
        ),
    ),
    # 4 Aggressive exploration decay (short fraction)
    (
        "exp4_aggressive_exploration_decay",
        dict(exploration_fraction=0.05),  # quicker decay
    ),
    # 5 Larger buffer, slightly higher LR
    (
        "exp5_large_buffer_higher_lr",
        dict(
            learning_rate=2e-4,
            buffer_size=200_000,
            exploration_fraction=0.12,
        ),
    ),
    # 6 Smaller buffer, faster updates, higher exploration fraction
    (
        "exp6_small_buffer_fast_updates",
        dict(
            buffer_size=50_000,
            target_update_interval=4_000,
            exploration_fraction=0.2,
            exploration_final_eps=0.02,
        ),
    ),
    # 7 Higher batch size + higher learning rate + slower exploration decay
    (
        "exp7_high_lr_high_batch",
        dict(
            learning_rate=3e-4,
            batch_size=64,
            buffer_size=150_000,
            target_update_interval=12_000,
            exploration_fraction=0.18,
            exploration_final_eps=0.02,
        ),
    ),
    # 8 Very high gamma (more credit assignment) + moderate LR
    (
        "exp8_very_high_gamma",
        dict(
            gamma=0.997,
            buffer_size=120_000,
            target_update_interval=8_000,
        ),
    ),
    # 9 Faster train freq (every 1 step) smaller batch
    (
        "exp9_freq1_small_batch",
        dict(
            batch_size=16,
            train_freq=1,
            exploration_fraction=0.12,
        ),
    ),
    # 10 Larger gradient steps per update
    (
        "exp10_more_gradient_steps",
        dict(gradient_steps=4),  # perform multiple gradient steps per collection
    ),
]

# Each run gets its own freshly merged hp dict; key order follows _BASE_HP.
experiments = [
    {"name": run_name, "hp": {**_BASE_HP, **overrides}}
    for run_name, overrides in _EXPERIMENT_OVERRIDES
]

print(f"Prepared {len(experiments)} experiment configs.")

Prepared 10 experiment configs.


In [4]:
# Cell 4: Run experiments, keep ONLY the best model, and build results table
import gc
import os

import pandas as pd

TOTAL_TIMESTEPS = 50_000  # increase for better learning (e.g., 500_000 or more)
SEED = 42
EVAL_EPISODES = 3
BEST_MODEL_PATH = os.path.join("models", "best_dqn_pong_cnn.zip")

results = []
best_record = None
best_mean_reward = None

for exp in experiments:
    metrics, model = train_experiment(
        name=exp["name"],
        hp=exp["hp"],
        total_timesteps=TOTAL_TIMESTEPS,
        seed=SEED,
        eval_episodes=EVAL_EPISODES,
    )
    results.append(metrics)

    if (best_mean_reward is None) or (metrics["mean_reward"] > best_mean_reward):
        best_mean_reward = metrics["mean_reward"]
        best_record = metrics
        # Save/overwrite the single best model
        os.makedirs("models", exist_ok=True)
        model.save(BEST_MODEL_PATH)
        print(f"Saved new best model: {metrics['name']} -> {BEST_MODEL_PATH} (mean_reward={best_mean_reward:.2f})")

    # Drop the finished model before the next run starts. Each model owns a
    # multi-GiB replay buffer; keeping the previous one referenced while the
    # next DQN allocates its own buffer is what exhausted memory mid-sweep.
    # The best model is already on disk, so nothing of value is lost.
    del model
    gc.collect()

results_df = pd.DataFrame(results)
# Sort for easier viewing
results_df = results_df.sort_values("mean_reward", ascending=False).reset_index(drop=True)
print("Top result:")
print(results_df.head(1))
results_df




[RUN exp1_baseline] Training 50000 steps | hp={'learning_rate': 0.0001, 'gamma': 0.99, 'batch_size': 32, 'buffer_size': 100000, 'train_freq': 4, 'gradient_steps': 1, 'target_update_interval': 10000, 'exploration_fraction': 0.1, 'exploration_initial_eps': 1.0, 'exploration_final_eps': 0.01, 'verbose': 0}


[RUN exp1_baseline] Finished: mean_reward=-20.33 ± 0.94 | train_minutes=22.59
Saved new best model: exp1_baseline -> models\best_dqn_pong_cnn.zip (mean_reward=-20.33)





[RUN exp2_large_batch_slow_lr] Training 50000 steps | hp={'learning_rate': 5e-05, 'gamma': 0.99, 'batch_size': 64, 'buffer_size': 150000, 'train_freq': 4, 'gradient_steps': 1, 'target_update_interval': 8000, 'exploration_fraction': 0.15, 'exploration_initial_eps': 1.0, 'exploration_final_eps': 0.02, 'verbose': 0}


[RUN exp2_large_batch_slow_lr] Finished: mean_reward=-21.00 ± 0.00 | train_minutes=36.96


MemoryError: Unable to allocate 3.15 GiB for an array with shape (120000, 1, 4, 84, 84) and data type uint8

In [None]:
# Cell 5: Write the results table (already sorted by mean_reward in Cell 4) to logs/branis_models.csv
import os

results_csv = os.path.join("logs", "branis_models.csv")
# Create the target directory before writing into it.
os.makedirs(os.path.dirname(results_csv), exist_ok=True)
results_df.to_csv(results_csv, index=False)
print(f"Saved results table to {results_csv}")
results_df

In [None]:

import matplotlib.pyplot as plt
import numpy as np

# Plot from a copy of results_df (must already exist from the experiment cell),
# re-sorted by mean_reward when that column is present.
_df = results_df.copy()
if "mean_reward" in _df.columns:
    _df = _df.sort_values("mean_reward", ascending=False)

# Bar chart: mean_reward by experiment
bar_fig, bar_ax = plt.subplots(figsize=(10, 4))
bar_ax.bar(_df["name"], _df["mean_reward"], color="#4c78a8")
# Slant the run names so long labels do not overlap.
for tick_label in bar_ax.get_xticklabels():
    tick_label.set_rotation(45)
    tick_label.set_horizontalalignment("right")
bar_ax.set_ylabel("Mean reward (eval)")
bar_ax.set_title("DQN (CnnPolicy): mean reward by experiment")
bar_fig.tight_layout()
plt.show()

# Scatter: train_minutes vs mean_reward
if "train_minutes" in _df.columns:
    scatter_fig, scatter_ax = plt.subplots(figsize=(6, 4))
    scatter_ax.scatter(_df["train_minutes"], _df["mean_reward"], color="#f58518")
    scatter_ax.set_xlabel("Train minutes")
    scatter_ax.set_ylabel("Mean reward (eval)")
    scatter_ax.set_title("Training time vs performance")
    scatter_ax.grid(True, alpha=0.3)
    scatter_fig.tight_layout()
    plt.show()


In [None]:
# Cell 6: Play the BEST saved model with on-screen rendering
import os
import time
import gymnasium as gym
import ale_py  # ensure ALE namespace is registered
from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack

# Declared here as well (same value as Cell 1) so this cell, like its other
# constants, does not depend on earlier cells having been run.
ENV_ID = "ALE/Pong-v5"
BEST_MODEL_PATH = os.path.join("models", "best_dqn_pong_cnn.zip")
N_EPISODES = 1
SEED = 42

if not os.path.isfile(BEST_MODEL_PATH):
    # Nothing to play: tell the user which checkpoints do exist.
    print("Best model not found:", BEST_MODEL_PATH)
    if os.path.isdir("models"):
        print("Available models:")
        for f in sorted(os.listdir("models")):
            if f.endswith(".zip"):
                print(" -", os.path.join("models", f))
else:
    env = make_atari_env(ENV_ID, n_envs=1, seed=SEED, env_kwargs={"render_mode": "human"})
    # Close the render window even if playback is interrupted or fails.
    try:
        env = VecFrameStack(env, n_stack=4)
        # Playback never samples from the replay buffer, so override the saved
        # buffer_size to avoid re-allocating the full training buffer on load.
        model = DQN.load(BEST_MODEL_PATH, env=env, buffer_size=1)
        for ep in range(N_EPISODES):
            obs = env.reset()
            done = False
            ep_reward = 0.0
            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, rewards, dones, infos = env.step(action)
                # Single-env VecEnv: index 0 holds the only environment's values.
                ep_reward += float(rewards[0])
                done = bool(dones[0])
                time.sleep(1/60)  # pace playback so the rendering is watchable
            print(f"Episode {ep+1} return: {ep_reward:.2f}")
    finally:
        env.close()