# Excel's DQN experiments (excel.ipynb)

This notebook is a personal copy of the experiment pipeline to run DQN experiments separately from `branis.ipynb`.
Each run is prefixed with `excel_` so logs and models remain separate.
Use small `TOTAL_TIMESTEPS` for smoke tests and increase for final training runs.

In [1]:
# Cell 2: Imports and constants
import os
import time
import pandas as pd
import matplotlib.pyplot as plt
from typing import Dict, Tuple

import gymnasium as gym
import ale_py  # ensure ALE namespace is registered
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack

# Constants (edit as needed)
ENV_ID = "ALE/Pong-v5"
MODEL_DIR = os.path.join("models")
LOG_DIR = os.path.join("logs")
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(os.path.join(LOG_DIR, "tensorboard"), exist_ok=True)

In [2]:
# Cell 3: EpisodeCSVLogger (same behavior as in branis.ipynb)
from stable_baselines3.common.callbacks import BaseCallback

class EpisodeCSVLogger(BaseCallback):
    def __init__(self, csv_path: str, verbose: int = 0):
        super().__init__(verbose)
        self.csv_path = csv_path
        self.rows = []

    def _on_step(self) -> bool:
        for info in self.locals.get("infos", []):
            if "episode" in info:
                ep = info["episode"]
                # 'l' = length, 'r' = reward in VecEnv episode info
                self.rows.append((self.num_timesteps, ep.get("l", None), ep.get("r", None)))
        return True

    def _on_training_end(self) -> None:
        import csv, os
        os.makedirs(os.path.dirname(self.csv_path), exist_ok=True)
        with open(self.csv_path, "w", newline="") as f:
            w = csv.writer(f)
            w.writerow(["timestep", "ep_length", "ep_reward"])
            w.writerows(self.rows)
        if self.verbose:
            print(f"Saved episode CSV to {self.csv_path}")

In [3]:
# Cell 4: Helper to make CNN env with frame stack
def make_cnn_env(seed: int, render_mode: str = None):
    # render_mode can be None or 'human' for play cell
    env = make_atari_env(ENV_ID, n_envs=1, seed=seed, env_kwargs={"render_mode": render_mode} if render_mode else None)
    env = VecFrameStack(env, n_stack=4)
    return env

In [4]:
# Cell 5: Define 10 experiment configurations (prefixed with 'excel_') - tweaked for diversity
experiments = [
    # 1 Baseline (CNN) - similar to branis but with explicit learning_starts
    {
        "name": "excel_exp1_baseline",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=1e-4, gamma=0.99, batch_size=32, buffer_size=100_000, train_freq=4, gradient_steps=1, target_update_interval=10_000, learning_starts=5000, exploration_fraction=0.1, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    },
    # 2 Larger batch + larger buffer, slightly lower LR
    {
        "name": "excel_exp2_large_batch",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=7e-5, gamma=0.99, batch_size=64, buffer_size=200_000, train_freq=4, gradient_steps=1, target_update_interval=8000, learning_starts=8000, exploration_fraction=0.12, exploration_initial_eps=1.0, exploration_final_eps=0.02, verbose=0),
    },
    # 3 More frequent updates (train every step) with small batch
    {
        "name": "excel_exp3_freq1_small_batch",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=1e-4, gamma=0.99, batch_size=16, buffer_size=100_000, train_freq=1, gradient_steps=1, target_update_interval=5000, learning_starts=2000, exploration_fraction=0.12, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    },
    # 4 More gradient steps per update
    {
        "name": "excel_exp4_more_gradsteps",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=8e-5, gamma=0.99, batch_size=32, buffer_size=150_000, train_freq=4, gradient_steps=4, target_update_interval=8000, learning_starts=5000, exploration_fraction=0.1, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    },
    # 5 Higher gamma (longer horizon)
    {
        "name": "excel_exp5_high_gamma",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=1e-4, gamma=0.997, batch_size=32, buffer_size=120_000, train_freq=4, gradient_steps=1, target_update_interval=7000, learning_starts=5000, exploration_fraction=0.1, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    },
    # 6 Small buffer, faster target updates
    {
        "name": "excel_exp6_small_buffer_fast_target",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=1.2e-4, gamma=0.99, batch_size=32, buffer_size=50_000, train_freq=4, gradient_steps=1, target_update_interval=4000, learning_starts=2000, exploration_fraction=0.2, exploration_initial_eps=1.0, exploration_final_eps=0.02, verbose=0),
    },
    # 7 MLP policy (for comparison) - smaller network via policy_kwargs
    {
        "name": "excel_exp7_mlp_small",
        "policy": "MlpPolicy",
        "hp": dict(learning_rate=5e-4, gamma=0.99, batch_size=64, buffer_size=100_000, train_freq=4, gradient_steps=1, target_update_interval=10000, learning_starts=5000, exploration_fraction=0.15, exploration_initial_eps=1.0, exploration_final_eps=0.02, verbose=0, policy_kwargs=dict(net_arch=[256, 256])),
    },
    # 8 MLP larger (deeper) - see if MLP can learn with stacked frames
    {
        "name": "excel_exp8_mlp_deep",
        "policy": "MlpPolicy",
        "hp": dict(learning_rate=3e-4, gamma=0.99, batch_size=64, buffer_size=150_000, train_freq=4, gradient_steps=2, target_update_interval=8000, learning_starts=8000, exploration_fraction=0.15, exploration_initial_eps=1.0, exploration_final_eps=0.02, verbose=0, policy_kwargs=dict(net_arch=[512, 512])),
    },
    # 9 Aggressive exploration decay (faster exploitation)
    {
        "name": "excel_exp9_quick_decay",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=1e-4, gamma=0.99, batch_size=32, buffer_size=120_000, train_freq=4, gradient_steps=1, target_update_interval=8000, learning_starts=4000, exploration_fraction=0.03, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    },
    # 10 Conservative LR + gradient clipping (max_grad_norm) to stabilize training
    {
        "name": "excel_exp10_slow_lr_clip",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=5e-5, gamma=0.99, batch_size=32, buffer_size=150_000, train_freq=4, gradient_steps=1, target_update_interval=8000, learning_starts=8000, exploration_fraction=0.12, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0, max_grad_norm=10),
    },
]

print(f"Prepared {len(experiments)} experiment configs (excel).")

Prepared 10 experiment configs (excel).


In [5]:
# Cell 6: train_experiment function (saves per-run model & csv)
from stable_baselines3.common.evaluation import evaluate_policy

def train_experiment(name: str, hp: Dict, policy: str = "CnnPolicy", total_timesteps: int = 50_000, seed: int = 42, eval_episodes: int = 3) -> Tuple[Dict, DQN]:
    """Train a DQN model with given hyperparameters and save per-run model and CSV.
    policy: string policy name to pass to DQN (e.g., 'CnnPolicy' or 'MlpPolicy').
    Returns (metrics_dict, model)
    """
    run_model_path = os.path.join(MODEL_DIR, f"dqn_{name}.zip")
    csv_log = os.path.join(LOG_DIR, f"training_metrics_{name}.csv")
    tb_log = os.path.join(LOG_DIR, "tensorboard", name)

    env = make_cnn_env(seed)
    callback = EpisodeCSVLogger(csv_log, verbose=0)

    # Create model with the requested policy (CnnPolicy or MlpPolicy)
    model = DQN(
        policy,
        env,
        seed=seed,
        tensorboard_log=tb_log,
        **hp,
    )

    print(f"\n[RUN {name}] Training {total_timesteps} steps | policy={policy} | hp={hp}")
    t0 = time.time()
    model.learn(total_timesteps=total_timesteps, callback=callback, progress_bar=True)
    minutes = (time.time() - t0) / 60.0

    mean_r, std_r = evaluate_policy(model, env, n_eval_episodes=eval_episodes)
    env.close()

    model.save(run_model_path)  # save per-run model

    metrics = {
        "name": name,
        "mean_reward": float(mean_r),
        "std_reward": float(std_r),
        "train_minutes": minutes,
        **hp,
        "policy": policy,
    }
    print(f"[RUN {name}] Finished: mean_reward={mean_r:.2f} ± {std_r:.2f} | train_minutes={minutes:.2f}")
    return metrics, model

In [6]:
# Cell 7: Run the experiments and save the best model (excel_best_dqn.zip)
TOTAL_TIMESTEPS = 50000  # change for longer runs
SEED = 42
EVAL_EPISODES = 3
BEST_MODEL_PATH = os.path.join(MODEL_DIR, "excel_best_dqn.zip")

results = []
best_mean_reward = None
best_record = None

for exp in experiments:
    policy = exp.get("policy", "CnnPolicy")
    metrics, model = train_experiment(name=exp["name"], hp=exp["hp"], policy=policy, total_timesteps=TOTAL_TIMESTEPS, seed=SEED, eval_episodes=EVAL_EPISODES)
    results.append(metrics)

    if (best_mean_reward is None) or (metrics["mean_reward"] > best_mean_reward):
        best_mean_reward = metrics["mean_reward"]
        best_record = metrics
        model.save(BEST_MODEL_PATH)
        print(f"Saved new best model: {metrics['name']} -> {BEST_MODEL_PATH} (mean_reward={best_mean_reward:.2f})")

# Save summary CSV
results_df = pd.DataFrame(results)
results_df = results_df.sort_values("mean_reward", ascending=False).reset_index(drop=True)
results_csv = os.path.join(LOG_DIR, "excel_models.csv")
results_df.to_csv(results_csv, index=False)
print(f"Saved results table to {results_csv}")
results_df




[RUN excel_exp1_baseline] Training 50000 steps | policy=CnnPolicy | hp={'learning_rate': 0.0001, 'gamma': 0.99, 'batch_size': 32, 'buffer_size': 100000, 'train_freq': 4, 'gradient_steps': 1, 'target_update_interval': 10000, 'learning_starts': 5000, 'exploration_fraction': 0.1, 'exploration_initial_eps': 1.0, 'exploration_final_eps': 0.01, 'verbose': 0}


[RUN excel_exp1_baseline] Finished: mean_reward=-21.00 ± 0.00 | train_minutes=31.20
Saved new best model: excel_exp1_baseline -> models\excel_best_dqn.zip (mean_reward=-21.00)


MemoryError: Unable to allocate 5.26 GiB for an array with shape (200000, 1, 4, 84, 84) and data type uint8

In [7]:
# Cell 8: Simple visualization of results
_df = results_df.copy()
if "mean_reward" in _df.columns:
    _df = _df.sort_values("mean_reward", ascending=False)

plt.figure(figsize=(10, 4))
plt.bar(_df["name"], _df["mean_reward"], color="#4c78a8")
plt.xticks(rotation=45, ha="right")
plt.ylabel("Mean reward (eval)")
plt.title("DQN (CnnPolicy) - excel: mean reward by experiment")
plt.tight_layout()
plt.show()

if "train_minutes" in _df.columns:
    plt.figure(figsize=(6,4))
    plt.scatter(_df["train_minutes"], _df["mean_reward"], color="#f58518")
    plt.xlabel("Train minutes")
    plt.ylabel("Mean reward (eval)")
    plt.title("Training time vs performance (excel)")
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

NameError: name 'results_df' is not defined

In [None]:
# Cell 9: Play the BEST saved model (render on-screen)
import time
from stable_baselines3 import DQN

BEST_MODEL_PATH = BEST_MODEL_PATH  # already defined in notebook above
N_EPISODES = 1
SEED = 42

if not os.path.isfile(BEST_MODEL_PATH):
    print("Best model not found:", BEST_MODEL_PATH)
    if os.path.isdir(MODEL_DIR):
        print("Available models:")
        for f in sorted(os.listdir(MODEL_DIR)):
            if f.endswith(".zip"):
                print(" -", os.path.join(MODEL_DIR, f))
else:
    env = make_atari_env(ENV_ID, n_envs=1, seed=SEED, env_kwargs={"render_mode": "human"})
    env = VecFrameStack(env, n_stack=4)
    model = DQN.load(BEST_MODEL_PATH, env=env)
    for ep in range(N_EPISODES):
        obs = env.reset()
        done = False
        ep_reward = 0.0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, rewards, dones, infos = env.step(action)
            ep_reward += float(rewards[0])
            done = bool(dones[0])
            time.sleep(1/60)
        print(f"Episode {ep+1} return: {ep_reward:.2f}")
    env.close()