In [1]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [18]:
import os

# Define the base directory in Google Drive
DRIVE_PATH = '/content/drive/MyDrive/Colab_DQN_Checkpoints'

# Modify MODEL_DIR and LOG_DIR to point to the Drive path
MODEL_DIR = os.path.join(DRIVE_PATH, "models")
LOG_DIR = os.path.join(DRIVE_PATH, "logs")

# Ensure directories are created
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(os.path.join(LOG_DIR, "tensorboard"), exist_ok=True)

print(f"Updated MODEL_DIR: {MODEL_DIR}")
print(f"Updated LOG_DIR: {LOG_DIR}")
print("Ensured all necessary directories exist in Google Drive.")

Updated MODEL_DIR: /content/drive/MyDrive/Colab_DQN_Checkpoints/models
Updated LOG_DIR: /content/drive/MyDrive/Colab_DQN_Checkpoints/logs
Ensured all necessary directories exist in Google Drive.


In [2]:
import sys
!{sys.executable} -m pip install stable-baselines3[extra] ale-py
print("stable-baselines3 and ale-py re-installed.")

Collecting pygame
  Downloading pygame-2.6.1-cp39-cp39-win_amd64.whl (10.6 MB)
     ---------------------------------------- 10.6/10.6 MB 3.0 MB/s eta 0:00:00
Collecting numpy<3.0,>=1.20
  Using cached numpy-2.0.2-cp39-cp39-win_amd64.whl (15.9 MB)
Installing collected packages: pygame, numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
stable-baselines3 and ale-py re-installed.


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\Excel\\AppData\\Local\\Programs\\Python\\Python39\\Lib\\site-packages\\~umpy.libs\\libopenblas64__v0.3.23-293-gc2f4bdbb-gcc_10_3_0-2bde3a66a51006b2b53eb373ff767a3f.dll'
Consider using the `--user` option or check the permissions.

You should consider upgrading via the 'c:\Users\Excel\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [20]:
import os
from stable_baselines3.common.callbacks import BaseCallback

class PeriodicSaveCallback(BaseCallback):
    """
    A custom callback that saves the model periodically to Google Drive.
    """
    def __init__(self, save_freq: int, save_path: str, verbose: int = 0):
        super().__init__(verbose)
        self.save_freq = save_freq
        self.save_path = save_path
        os.makedirs(save_path, exist_ok=True)

    def _on_step(self) -> bool:
        # Check if the current timestep is a multiple of the save frequency
        if self.n_calls % self.save_freq == 0:
            # Construct the save path with the current timestep
            path = os.path.join(self.save_path, f"model_{self.num_timesteps}.zip")
            # Save the model
            self.model.save(path)
            if self.verbose > 0:
                print(f"Saving model to {path} at timestep {self.num_timesteps}")
        return True

print("PeriodicSaveCallback class defined successfully.")

PeriodicSaveCallback class defined successfully.


In [3]:
import os
import time
import pandas as pd
import matplotlib.pyplot as plt
from typing import Dict, Tuple

import gymnasium as gym
import ale_py  # ensure ALE namespace is registered
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.callbacks import BaseCallback

# Cell 6: train_experiment function (saves per-run model & csv)

def train_experiment(name: str, hp: Dict, policy: str = "CnnPolicy", total_timesteps: int = 50_000, seed: int = 42, eval_episodes: int = 3, save_freq: int = 100_000) -> Tuple[Dict, DQN]:
    """Train a DQN model with given hyperparameters and save per-run model and CSV.
    policy: string policy name to pass to DQN (e.g., 'CnnPolicy' or 'MlpPolicy').
    Returns (metrics_dict, model)
    """
    run_model_path = os.path.join(MODEL_DIR, f"dqn_{name}.zip")
    csv_log = os.path.join(LOG_DIR, f"training_metrics_{name}.csv")
    tb_log = os.path.join(LOG_DIR, "tensorboard", name)

    env = make_cnn_env(seed)

    # Instantiate callbacks
    episode_logger_callback = EpisodeCSVLogger(csv_log, verbose=0)
    periodic_save_callback = PeriodicSaveCallback(save_freq=save_freq, save_path=os.path.join(MODEL_DIR, name + "_checkpoints"), verbose=1) # Save checkpoints in a subfolder per experiment

    # Combine callbacks into a list
    callbacks = [episode_logger_callback, periodic_save_callback]

    # Create model with the requested policy (CnnPolicy or MlpPolicy)
    model = DQN(
        policy,
        env,
        seed=seed,
        tensorboard_log=tb_log,
        **hp,
    )

    print(f"\n[RUN {name}] Training {total_timesteps} steps | policy={policy} | hp={hp}")
    t0 = time.time()
    model.learn(total_timesteps=total_timesteps, callback=callbacks, progress_bar=True)
    minutes = (time.time() - t0) / 60.0

    mean_r, std_r = evaluate_policy(model, env, n_eval_episodes=eval_episodes)
    env.close()

    model.save(run_model_path)  # save final model

    metrics = {
        "name": name,
        "mean_reward": float(mean_r),
        "std_reward": float(std_r),
        "train_minutes": minutes,
        **hp,
        "policy": policy,
    }
    print(f"[RUN {name}] Finished: mean_reward={mean_r:.2f} \u00b1 {std_r:.2f} | train_minutes={minutes:.2f}")
    return metrics, model

In [22]:
# Cell 5: Define 10 experiment configurations (prefixed with 'excel_') - tweaked for diversity
experiments = [
    # 1 Baseline (CNN) - similar to branis but with explicit learning_starts
    {
        "name": "excel_exp1_baseline",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=1e-4, gamma=0.99, batch_size=32, buffer_size=100_000, train_freq=4, gradient_steps=1, target_update_interval=10_000, learning_starts=5000, exploration_fraction=0.1, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    },
    # 2 Larger batch + larger buffer, slightly lower LR
    {
        "name": "excel_exp2_large_batch",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=7e-5, gamma=0.99, batch_size=64, buffer_size=200_000, train_freq=4, gradient_steps=1, target_update_interval=8000, learning_starts=8000, exploration_fraction=0.12, exploration_initial_eps=1.0, exploration_final_eps=0.02, verbose=0),
    },
    # 3 More frequent updates (train every step) with small batch
    {
        "name": "excel_exp3_freq1_small_batch",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=1e-4, gamma=0.99, batch_size=16, buffer_size=100_000, train_freq=1, gradient_steps=1, target_update_interval=5000, learning_starts=2000, exploration_fraction=0.12, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    },
    # 4 More gradient steps per update
    # {
    #     "name": "excel_exp4_more_gradsteps",
    #     "policy": "CnnPolicy",
    #     "hp": dict(learning_rate=8e-5, gamma=0.99, batch_size=32, buffer_size=150_000, train_freq=4, gradient_steps=4, target_update_interval=8000, learning_starts=5000, exploration_fraction=0.1, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    # },
    # 5 Higher gamma (longer horizon)
    # {
    #     "name": "excel_exp5_high_gamma",
    #     "policy": "CnnPolicy",
    #     "hp": dict(learning_rate=1e-4, gamma=0.997, batch_size=32, buffer_size=120_000, train_freq=4, gradient_steps=1, target_update_interval=7000, learning_starts=5000, exploration_fraction=0.1, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    # },
    # 6 Small buffer, faster target updates
    # {
    #     "name": "excel_exp6_small_buffer_fast_target",
    #     "policy": "CnnPolicy",
    #     "hp": dict(learning_rate=1.2e-4, gamma=0.99, batch_size=32, buffer_size=50_000, train_freq=4, gradient_steps=1, target_update_interval=4000, learning_starts=2000, exploration_fraction=0.2, exploration_initial_eps=1.0, exploration_final_eps=0.02, verbose=0),
    # },
    # 7 MLP policy (for comparison) - smaller network via policy_kwargs
    # {
    #     "name": "excel_exp7_mlp_small",
    #     "policy": "MlpPolicy",
    #     "hp": dict(learning_rate=5e-4, gamma=0.99, batch_size=64, buffer_size=100_000, train_freq=4, gradient_steps=1, target_update_interval=10000, learning_starts=5000, exploration_fraction=0.15, exploration_initial_eps=1.0, exploration_final_eps=0.02, verbose=0, policy_kwargs=dict(net_arch=[256, 256])),
    # },
    # 8 MLP larger (deeper) - see if MLP can learn with stacked frames
    # {
    #     "name": "excel_exp8_mlp_deep",
    #     "policy": "MlpPolicy",
    #     "hp": dict(learning_rate=3e-4, gamma=0.99, batch_size=64, buffer_size=150_000, train_freq=4, gradient_steps=2, target_update_interval=8000, learning_starts=8000, exploration_fraction=0.15, exploration_initial_eps=1.0, exploration_final_eps=0.02, verbose=0, policy_kwargs=dict(net_arch=[512, 512])),
    # },
    # 9 Aggressive exploration decay (faster exploitation)
    # {
    #     "name": "excel_exp9_quick_decay",
    #     "policy": "CnnPolicy",
    #     "hp": dict(learning_rate=1e-4, gamma=0.99, batch_size=32, buffer_size=120_000, train_freq=4, gradient_steps=1, target_update_interval=8000, learning_starts=4000, exploration_fraction=0.03, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    # },
    # 10 Conservative LR + gradient clipping (max_grad_norm) to stabilize training
    # {
    #     "name": "excel_exp10_slow_lr_clip",
    #     "policy": "CnnPolicy",
    #     "hp": dict(learning_rate=5e-5, gamma=0.99, batch_size=32, buffer_size=150_000, train_freq=4, gradient_steps=1, target_update_interval=8000, learning_starts=8000, exploration_fraction=0.12, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0, max_grad_norm=10),
    # },
]

print(f"Prepared {len(experiments)} experiment configs (excel).")

Prepared 3 experiment configs (excel).


In [4]:
# Cell 4: Helper to make CNN env with frame stack
def make_cnn_env(seed: int, render_mode: str = None):
    # render_mode can be None or 'human' for play cell
    env = make_atari_env(ENV_ID, n_envs=1, seed=seed, env_kwargs={"render_mode": render_mode} if render_mode else None)
    env = VecFrameStack(env, n_stack=4)
    return env

In [5]:
# Cell 2: Imports and constants
import os
import time
import pandas as pd
import matplotlib.pyplot as plt
from typing import Dict, Tuple

import gymnasium as gym
import ale_py  # ensure ALE namespace is registered
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack

# Constants (edit as needed)
ENV_ID = "ALE/Pong-v5"
MODEL_DIR = os.path.join("models")
LOG_DIR = os.path.join("logs")
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(os.path.join(LOG_DIR, "tensorboard"), exist_ok=True)

In [None]:
# import os

# Define the base directory in Google Drive
DRIVE_PATH = '/content/drive/MyDrive/Colab_DQN_Checkpoints'

# Modify MODEL_DIR and LOG_DIR to point to the Drive path
MODEL_DIR = os.path.join(DRIVE_PATH, "models")
LOG_DIR = os.path.join(DRIVE_PATH, "logs")

# Ensure directories are created
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(os.path.join(LOG_DIR, "tensorboard"), exist_ok=True)

print(f"Updated MODEL_DIR: {MODEL_DIR}")
print(f"Updated LOG_DIR: {LOG_DIR}")
print("Ensured all necessary directories exist in Google Drive.")


In [None]:
# Cell 3: EpisodeCSVLogger (same behavior as in branis.ipynb)
from stable_baselines3.common.callbacks import BaseCallback

class EpisodeCSVLogger(BaseCallback):
    def __init__(self, csv_path: str, verbose: int = 0):
        super().__init__(verbose)
        self.csv_path = csv_path
        self.rows = []

    def _on_step(self) -> bool:
        for info in self.locals.get("infos", []):
            if "episode" in info:
                ep = info["episode"]
                # 'l' = length, 'r' = reward in VecEnv episode info
                self.rows.append((self.num_timesteps, ep.get("l", None), ep.get("r", None)))
        return True

    def _on_training_end(self) -> None:
        import csv, os
        os.makedirs(os.path.dirname(self.csv_path), exist_ok=True)
        with open(self.csv_path, "w", newline="") as f:
            w = csv.writer(f)
            w.writerow(["timestep", "ep_length", "ep_reward"])
            w.writerows(self.rows)
        if self.verbose:
            print(f"Saved episode CSV to {self.csv_path}")

In [None]:
TOTAL_TIMESTEPS = 1500000  # change for longer runs
SEED = 42
EVAL_EPISODES = 3
BEST_MODEL_PATH = os.path.join(MODEL_DIR, "excel_best_dqn.zip")
CHECKPOINT_SAVE_FREQ = 100_000 # Save a checkpoint every 100,000 timesteps

results = []
best_mean_reward = None
best_record = None

for exp in experiments:
    policy = exp.get("policy", "CnnPolicy")
    metrics, model = train_experiment(name=exp["name"], hp=exp["hp"], policy=policy, total_timesteps=TOTAL_TIMESTEPS, seed=SEED, eval_episodes=EVAL_EPISODES, save_freq=CHECKPOINT_SAVE_FREQ)
    results.append(metrics)

    if (best_mean_reward is None) or (metrics["mean_reward"] > best_mean_reward):
        best_mean_reward = metrics["mean_reward"]
        best_record = metrics
        model.save(BEST_MODEL_PATH)
        print(f"Saved new best model: {metrics['name']} -> {BEST_MODEL_PATH} (mean_reward={best_mean_reward:.2f})")

# Save summary CSV
results_df = pd.DataFrame(results)
results_df = results_df.sort_values("mean_reward", ascending=False).reset_index(drop=True)
results_csv = os.path.join(LOG_DIR, "excel_models.csv")
results_df.to_csv(results_csv, index=False)
print(f"Saved results table to {results_csv}")
results_df

In [16]:
# Cell 9: Play the BEST saved model (render on-screen)
import time
from pathlib import Path
from stable_baselines3 import DQN

# Get the notebook's current directory and go back 2 levels to project root
notebook_dir = Path.cwd()
project_root = notebook_dir.parent
BEST_MODEL_PATH = project_root / "models" / "Excel_model" / "excel_best_dqn.zip"

print(f"Notebook dir: {notebook_dir}")
print(f"Project root: {project_root}")
print(f"Loading best model from: {BEST_MODEL_PATH}")

ENV_ID = "ALE/Pong-v5"
N_EPISODES = 1
SEED = 42

if not os.path.isfile(BEST_MODEL_PATH):
    print("Best model not found:", BEST_MODEL_PATH)
    if os.path.isdir(MODEL_DIR):
        print("Available models:")
        for f in sorted(os.listdir(MODEL_DIR)):
            if f.endswith(".zip"):
                print(" -", os.path.join(MODEL_DIR, f))
else:
    env = make_atari_env(ENV_ID, n_envs=1, seed=SEED, env_kwargs={"render_mode": "human"})
    env = VecFrameStack(env, n_stack=4)
    model = DQN.load(BEST_MODEL_PATH, env=env)
    for ep in range(N_EPISODES):
        obs = env.reset()
        done = False
        ep_reward = 0.0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, rewards, dones, infos = env.step(action)
            ep_reward += float(rewards[0])
            done = bool(dones[0])
            time.sleep(1/60)
        print(f"Episode {ep+1} return: {ep_reward:.2f}")
    env.close()

Notebook dir: c:\Users\Excel\Desktop\Github Projects\Deep-Q-Learning\notebooks
Project root: c:\Users\Excel\Desktop\Github Projects\Deep-Q-Learning
Loading best model from: c:\Users\Excel\Desktop\Github Projects\Deep-Q-Learning\models\Excel_model\excel_best_dqn.zip


MemoryError: Unable to allocate 2.63 GiB for an array with shape (100000, 1, 4, 84, 84) and data type uint8