In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Define the base directory in Google Drive
DRIVE_PATH = '/content/drive/MyDrive/Colab_DQN_Checkpoints'

# Modify MODEL_DIR and LOG_DIR to point to the Drive path
MODEL_DIR = os.path.join(DRIVE_PATH, "models")
LOG_DIR = os.path.join(DRIVE_PATH, "logs")

# Ensure directories are created
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(os.path.join(LOG_DIR, "tensorboard"), exist_ok=True)

print(f"Updated MODEL_DIR: {MODEL_DIR}")
print(f"Updated LOG_DIR: {LOG_DIR}")
print("Ensured all necessary directories exist in Google Drive.")

Updated MODEL_DIR: /content/drive/MyDrive/Colab_DQN_Checkpoints/models
Updated LOG_DIR: /content/drive/MyDrive/Colab_DQN_Checkpoints/logs
Ensured all necessary directories exist in Google Drive.


In [5]:
import sys
!{sys.executable} -m pip install stable-baselines3[extra] ale-py
print("stable-baselines3 and ale-py re-installed.")

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.7.0-py3-none-any.whl.metadata (4.8 kB)
Downloading stable_baselines3-2.7.0-py3-none-any.whl (187 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.2/187.2 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: stable-baselines3
Successfully installed stable-baselines3-2.7.0
stable-baselines3 and ale-py re-installed.


In [None]:
from stable_baselines3.common.callbacks import BaseCallback

class PeriodicSaveCallback(BaseCallback):
    """
    A custom callback that saves the model periodically to Google Drive.
    """
    def __init__(self, save_freq: int, save_path: str, verbose: int = 0):
        super().__init__(verbose)
        self.save_freq = save_freq
        self.save_path = save_path
        os.makedirs(save_path, exist_ok=True)

    def _on_step(self) -> bool:
        if self.n_calls % self.save_freq == 0:
            path = os.path.join(self.save_path, f"model_{self.num_timesteps}.zip")
            self.model.save(path)
            if self.verbose > 0:
                print(f"Saving model to {path} at timestep {self.num_timesteps}")
        return True

print("PeriodicSaveCallback class defined successfully.")

PeriodicSaveCallback class defined successfully.


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
import time
import pandas as pd
import matplotlib.pyplot as plt
from typing import Dict, Tuple

import gymnasium as gym
import ale_py  # ensure ALE namespace is registered
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.callbacks import BaseCallback

# Train_experiment function (saves per-run model & csv)

def train_experiment(name: str, hp: Dict, policy: str = "CnnPolicy", total_timesteps: int = 50_000, seed: int = 42, eval_episodes: int = 3, save_freq: int = 100_000) -> Tuple[Dict, DQN]:
    """Train a DQN model with given hyperparameters and save per-run model and CSV.
    policy: string policy name to pass to DQN (e.g., 'CnnPolicy' or 'MlpPolicy').
    Returns (metrics_dict, model)
    """
    run_model_path = os.path.join(MODEL_DIR, f"dqn_{name}.zip")
    csv_log = os.path.join(LOG_DIR, f"training_metrics_{name}.csv")
    tb_log = os.path.join(LOG_DIR, "tensorboard", name)

    env = make_cnn_env(seed)

    # Instantiating callbacks
    episode_logger_callback = EpisodeCSVLogger(csv_log, verbose=0)
    periodic_save_callback = PeriodicSaveCallback(save_freq=save_freq, save_path=os.path.join(MODEL_DIR, name + "_checkpoints"), verbose=1) # Save checkpoints in a subfolder per experiment

    # Combining callbacks into a list
    callbacks = [episode_logger_callback, periodic_save_callback]

    # Creating model instance
    model = DQN(
        policy,
        env,
        seed=seed,
        tensorboard_log=tb_log,
        **hp,
    )

    print(f"\n[RUN {name}] Training {total_timesteps} steps | policy={policy} | hp={hp}")
    t0 = time.time()
    model.learn(total_timesteps=total_timesteps, callback=callbacks, progress_bar=True)
    minutes = (time.time() - t0) / 60.0

    mean_r, std_r = evaluate_policy(model, env, n_eval_episodes=eval_episodes)
    env.close()

    model.save(run_model_path)  

    metrics = {
        "name": name,
        "mean_reward": float(mean_r),
        "std_reward": float(std_r),
        "train_minutes": minutes,
        **hp,
        "policy": policy,
    }
    print(f"[RUN {name}] Finished: mean_reward={mean_r:.2f} \u00b1 {std_r:.2f} | train_minutes={minutes:.2f}")
    return metrics, model

In [None]:
# Defining 10 experiment configurations (prefixed with 'excel_')
experiments = [
    # 1 Baseline (CNN)
    {
        "name": "excel_exp1_baseline",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=1e-4, gamma=0.99, batch_size=32, buffer_size=100_000, train_freq=4, gradient_steps=1, target_update_interval=10_000, learning_starts=5000, exploration_fraction=0.1, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    },
    # 2 Larger batch + larger buffer, slightly lower LR
    {
        "name": "excel_exp2_large_batch",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=7e-5, gamma=0.99, batch_size=64, buffer_size=200_000, train_freq=4, gradient_steps=1, target_update_interval=8000, learning_starts=8000, exploration_fraction=0.12, exploration_initial_eps=1.0, exploration_final_eps=0.02, verbose=0),
    },
    # 3 More frequent updates (train every step) with small batch
    {
        "name": "excel_exp3_freq1_small_batch",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=1e-4, gamma=0.99, batch_size=16, buffer_size=100_000, train_freq=1, gradient_steps=1, target_update_interval=5000, learning_starts=2000, exploration_fraction=0.12, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    },
    # 4 More gradient steps per update
    # {
    #     "name": "excel_exp4_more_gradsteps",
    #     "policy": "CnnPolicy",
    #     "hp": dict(learning_rate=8e-5, gamma=0.99, batch_size=32, buffer_size=150_000, train_freq=4, gradient_steps=4, target_update_interval=8000, learning_starts=5000, exploration_fraction=0.1, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    # },
    # 5 Higher gamma (longer horizon)
    # {
    #     "name": "excel_exp5_high_gamma",
    #     "policy": "CnnPolicy",
    #     "hp": dict(learning_rate=1e-4, gamma=0.997, batch_size=32, buffer_size=120_000, train_freq=4, gradient_steps=1, target_update_interval=7000, learning_starts=5000, exploration_fraction=0.1, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    # },
    # 6 Small buffer, faster target updates
    # {
    #     "name": "excel_exp6_small_buffer_fast_target",
    #     "policy": "CnnPolicy",
    #     "hp": dict(learning_rate=1.2e-4, gamma=0.99, batch_size=32, buffer_size=50_000, train_freq=4, gradient_steps=1, target_update_interval=4000, learning_starts=2000, exploration_fraction=0.2, exploration_initial_eps=1.0, exploration_final_eps=0.02, verbose=0),
    # },
    # 7 MLP policy (for comparison) - smaller network via policy_kwargs
    # {
    #     "name": "excel_exp7_mlp_small",
    #     "policy": "MlpPolicy",
    #     "hp": dict(learning_rate=5e-4, gamma=0.99, batch_size=64, buffer_size=100_000, train_freq=4, gradient_steps=1, target_update_interval=10000, learning_starts=5000, exploration_fraction=0.15, exploration_initial_eps=1.0, exploration_final_eps=0.02, verbose=0, policy_kwargs=dict(net_arch=[256, 256])),
    # },
    # 8 MLP larger (deeper) - see if MLP can learn with stacked frames
    # {
    #     "name": "excel_exp8_mlp_deep",
    #     "policy": "MlpPolicy",
    #     "hp": dict(learning_rate=3e-4, gamma=0.99, batch_size=64, buffer_size=150_000, train_freq=4, gradient_steps=2, target_update_interval=8000, learning_starts=8000, exploration_fraction=0.15, exploration_initial_eps=1.0, exploration_final_eps=0.02, verbose=0, policy_kwargs=dict(net_arch=[512, 512])),
    # },
    # 9 Aggressive exploration decay (faster exploitation)
    # {
    #     "name": "excel_exp9_quick_decay",
    #     "policy": "CnnPolicy",
    #     "hp": dict(learning_rate=1e-4, gamma=0.99, batch_size=32, buffer_size=120_000, train_freq=4, gradient_steps=1, target_update_interval=8000, learning_starts=4000, exploration_fraction=0.03, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    # },
    # 10 Conservative LR + gradient clipping (max_grad_norm) to stabilize training
    # {
    #     "name": "excel_exp10_slow_lr_clip",
    #     "policy": "CnnPolicy",
    #     "hp": dict(learning_rate=5e-5, gamma=0.99, batch_size=32, buffer_size=150_000, train_freq=4, gradient_steps=1, target_update_interval=8000, learning_starts=8000, exploration_fraction=0.12, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0, max_grad_norm=10),
    # },
]

print(f"Prepared {len(experiments)} experiment configs (excel).")

Prepared 3 experiment configs (excel).


In [None]:
# Helper to make CNN env with frame stack
def make_cnn_env(seed: int, render_mode: str = None):
    # render_mode can be None or 'human' for play cell
    env = make_atari_env(ENV_ID, n_envs=1, seed=seed, env_kwargs={"render_mode": render_mode} if render_mode else None)
    env = VecFrameStack(env, n_stack=4)
    return env

In [None]:
# Constants 
ENV_ID = "ALE/Pong-v5"
MODEL_DIR = os.path.join("models")
LOG_DIR = os.path.join("logs")
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(os.path.join(LOG_DIR, "tensorboard"), exist_ok=True)

In [None]:
DRIVE_PATH = '/content/drive/MyDrive/Colab_DQN_Checkpoints'

MODEL_DIR = os.path.join(DRIVE_PATH, "models")
LOG_DIR = os.path.join(DRIVE_PATH, "logs")

os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(os.path.join(LOG_DIR, "tensorboard"), exist_ok=True)

print(f"Updated MODEL_DIR: {MODEL_DIR}")
print(f"Updated LOG_DIR: {LOG_DIR}")
print("Ensured all necessary directories exist in Google Drive.")

In [None]:
# EpisodeCSVLogger 
from stable_baselines3.common.callbacks import BaseCallback

class EpisodeCSVLogger(BaseCallback):
    def __init__(self, csv_path: str, verbose: int = 0):
        super().__init__(verbose)
        self.csv_path = csv_path
        self.rows = []

    def _on_step(self) -> bool:
        for info in self.locals.get("infos", []):
            if "episode" in info:
                ep = info["episode"]
                # 'l' = length, 'r' = reward in VecEnv episode info
                self.rows.append((self.num_timesteps, ep.get("l", None), ep.get("r", None)))
        return True

    def _on_training_end(self) -> None:
        import csv, os
        os.makedirs(os.path.dirname(self.csv_path), exist_ok=True)
        with open(self.csv_path, "w", newline="") as f:
            w = csv.writer(f)
            w.writerow(["timestep", "ep_length", "ep_reward"])
            w.writerows(self.rows)
        if self.verbose:
            print(f"Saved episode CSV to {self.csv_path}")

In [None]:
TOTAL_TIMESTEPS = 1500000 
SEED = 42
EVAL_EPISODES = 3
BEST_MODEL_PATH = os.path.join(MODEL_DIR, "excel_best_dqn.zip")
CHECKPOINT_SAVE_FREQ = 100_000 # Save a checkpoint every 100,000 timesteps

results = []
best_mean_reward = None
best_record = None

for exp in experiments:
    policy = exp.get("policy", "CnnPolicy")
    metrics, model = train_experiment(name=exp["name"], hp=exp["hp"], policy=policy, total_timesteps=TOTAL_TIMESTEPS, seed=SEED, eval_episodes=EVAL_EPISODES, save_freq=CHECKPOINT_SAVE_FREQ)
    results.append(metrics)

    if (best_mean_reward is None) or (metrics["mean_reward"] > best_mean_reward):
        best_mean_reward = metrics["mean_reward"]
        best_record = metrics
        model.save(BEST_MODEL_PATH)
        print(f"Saved new best model: {metrics['name']} -> {BEST_MODEL_PATH} (mean_reward={best_mean_reward:.2f})")

# Save summary CSV
results_df = pd.DataFrame(results)
results_df = results_df.sort_values("mean_reward", ascending=False).reset_index(drop=True)
results_csv = os.path.join(LOG_DIR, "excel_models.csv")
results_df.to_csv(results_csv, index=False)
print(f"Saved results table to {results_csv}")
results_df

In [None]:
# Ensure LOG_DIR points to the Google Drive path
DRIVE_PATH = '/content/drive/MyDrive/Colab_DQN_Checkpoints'
LOG_DIR = os.path.join(DRIVE_PATH, "logs")

# List of available individual experiment CSVs
individual_csvs = [
    "training_metrics_excel_exp1_baseline.csv",
    "training_metrics_excel_exp2_large_batch.csv"
]

print("Loading individual training metrics:")
for csv_file_name in individual_csvs:
    csv_path = os.path.join(LOG_DIR, csv_file_name)
    if os.path.exists(csv_path):
        print(f"\n--- Displaying {csv_file_name} ---")
        df = pd.read_csv(csv_path)
        display(df)
    else:
        print(f"Error: The file {csv_path} was not found.")

Loading individual training metrics:

--- Displaying training_metrics_excel_exp1_baseline.csv ---


Unnamed: 0,timestep,ep_length,ep_reward
0,224,914,-21.0
1,426,843,-20.0
2,626,825,-21.0
3,832,852,-21.0
4,1021,764,-21.0
...,...,...,...
2036,1494486,4999,4.0
2037,1495935,5820,1.0
2038,1497146,4877,2.0
2039,1498277,4558,12.0


  return datetime.utcnow().replace(tzinfo=utc)



--- Displaying training_metrics_excel_exp2_large_batch.csv ---


Unnamed: 0,timestep,ep_length,ep_reward
0,224,914,-21.0
1,426,843,-20.0
2,626,825,-21.0
3,832,852,-21.0
4,1021,764,-21.0
...,...,...,...
2466,1495669,3677,4.0
2467,1496768,4416,5.0
2468,1497953,4759,4.0
2469,1498932,3950,7.0



To generate the 'excel_models.csv' summary file, please ensure all experiments are run by executing the training loop cell (`cell_id: NUW8frgRdGre`).


  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
# LOG_DIR = '/content/drive/MyDrive/Colab_DQN_Checkpoints/logs'

# Load 'training_metrics_excel_exp1_baseline.csv'
csv_path_exp1 = os.path.join(LOG_DIR, 'training_metrics_excel_exp1_baseline.csv')
if os.path.exists(csv_path_exp1):
    df_exp1_baseline = pd.read_csv(csv_path_exp1)
    print(f"Successfully loaded {os.path.basename(csv_path_exp1)} into df_exp1_baseline.")
    display(df_exp1_baseline.head())
else:
    print(f"Error: The file {csv_path_exp1} was not found.")

# Load 'training_metrics_excel_exp2_large_batch.csv'
csv_path_exp2 = os.path.join(LOG_DIR, 'training_metrics_excel_exp2_large_batch.csv')
if os.path.exists(csv_path_exp2):
    df_exp2_large_batch = pd.read_csv(csv_path_exp2)
    print(f"\nSuccessfully loaded {os.path.basename(csv_path_exp2)} into df_exp2_large_batch.")
    display(df_exp2_large_batch.head())
else:
    print(f"\nError: The file {csv_path_exp2} was not found.")


Successfully loaded training_metrics_excel_exp1_baseline.csv into df_exp1_baseline.


  return datetime.utcnow().replace(tzinfo=utc)


Unnamed: 0,timestep,ep_length,ep_reward
0,224,914,-21.0
1,426,843,-20.0
2,626,825,-21.0
3,832,852,-21.0
4,1021,764,-21.0



Successfully loaded training_metrics_excel_exp2_large_batch.csv into df_exp2_large_batch.


  return datetime.utcnow().replace(tzinfo=utc)


Unnamed: 0,timestep,ep_length,ep_reward
0,224,914,-21.0
1,426,843,-20.0
2,626,825,-21.0
3,832,852,-21.0
4,1021,764,-21.0


In [None]:
# Calculate mean and standard deviation for 'exp1_baseline'
mean_reward_exp1 = df_exp1_baseline['ep_reward'].mean()
std_reward_exp1 = df_exp1_baseline['ep_reward'].std()

# Calculate mean and standard deviation for 'exp2_large_batch'
mean_reward_exp2 = df_exp2_large_batch['ep_reward'].mean()
std_reward_exp2 = df_exp2_large_batch['ep_reward'].std()

summary_data = {
    'Experiment': ['excel_exp1_baseline', 'excel_exp2_large_batch'],
    'Mean Episode Reward': [mean_reward_exp1, mean_reward_exp2],
    'Std Dev Episode Reward': [std_reward_exp1, std_reward_exp2]
}
summary_df = pd.DataFrame(summary_data)

print("Summary of Episode Rewards:")
display(summary_df)

Summary of Episode Rewards:


  return datetime.utcnow().replace(tzinfo=utc)


Unnamed: 0,Experiment,Mean Episode Reward,Std Dev Episode Reward
0,excel_exp1_baseline,-12.241058,7.884876
1,excel_exp2_large_batch,-12.384864,7.851223


In [None]:
experiments = [
    # 1 Baseline (CNN) - similar to branis but with explicit learning_starts
    {
        "name": "excel_exp1_baseline",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=1e-4, gamma=0.99, batch_size=32, buffer_size=100_000, train_freq=4, gradient_steps=1, target_update_interval=10_000, learning_starts=5000, exploration_fraction=0.1, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    },
    # 2 Larger batch + larger buffer, slightly lower LR
    {
        "name": "excel_exp2_large_batch",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=7e-5, gamma=0.99, batch_size=64, buffer_size=200_000, train_freq=4, gradient_steps=1, target_update_interval=8000, learning_starts=8000, exploration_fraction=0.12, exploration_initial_eps=1.0, exploration_final_eps=0.02, verbose=0),
    },
    # 3 More frequent updates (train every step) with small batch
    {
        "name": "excel_exp3_freq1_small_batch",
        "policy": "CnnPolicy",
        "hp": dict(learning_rate=1e-4, gamma=0.99, batch_size=16, buffer_size=100_000, train_freq=1, gradient_steps=1, target_update_interval=5000, learning_starts=2000, exploration_fraction=0.12, exploration_initial_eps=1.0, exploration_final_eps=0.01, verbose=0),
    },
]

# Initializing an empty list to store consolidated data
consolidated_data = []

# MODEL_DIR is already defined and set to '/content/drive/MyDrive/Colab_DQN_Checkpoints/models'

# Iterating through the `experiments` list
for exp in experiments:
    exp_name = exp['name']
    # Checking if its 'name' is either 'excel_exp1_baseline' or 'excel_exp2_large_batch'
    if exp_name in ['excel_exp1_baseline', 'excel_exp2_large_batch']:
        # Retrieve the mean and standard deviation of episode rewards from `summary_df`
        mean_reward = summary_df[summary_df['Experiment'] == exp_name]['Mean Episode Reward'].iloc[0]
        std_reward = summary_df[summary_df['Experiment'] == exp_name]['Std Dev Episode Reward'].iloc[0]

        # Construct the `model_path` for the current experiment
        model_path = os.path.join(MODEL_DIR, f"dqn_{exp_name}.zip")

        # Creating a dictionary for the current experiment
        consolidated_exp = {
            "name": exp_name,
            "policy": exp["policy"],
            "hp": exp["hp"],
            "mean_reward": mean_reward,
            "std_reward": std_reward,
            "model_path": model_path,
            "train_minutes": None 
        }
        consolidated_data.append(consolidated_exp)

# Displaying the consolidated data for verification
print("Consolidated Experiment Data:")
for item in consolidated_data:
    print(item)

Consolidated Experiment Data:
{'name': 'excel_exp1_baseline', 'policy': 'CnnPolicy', 'hp': {'learning_rate': 0.0001, 'gamma': 0.99, 'batch_size': 32, 'buffer_size': 100000, 'train_freq': 4, 'gradient_steps': 1, 'target_update_interval': 10000, 'learning_starts': 5000, 'exploration_fraction': 0.1, 'exploration_initial_eps': 1.0, 'exploration_final_eps': 0.01, 'verbose': 0}, 'mean_reward': np.float64(-12.241058304752572), 'std_reward': np.float64(7.884876294118987), 'model_path': 'models/dqn_excel_exp1_baseline.zip', 'train_minutes': None}
{'name': 'excel_exp2_large_batch', 'policy': 'CnnPolicy', 'hp': {'learning_rate': 7e-05, 'gamma': 0.99, 'batch_size': 64, 'buffer_size': 200000, 'train_freq': 4, 'gradient_steps': 1, 'target_update_interval': 8000, 'learning_starts': 8000, 'exploration_fraction': 0.12, 'exploration_initial_eps': 1.0, 'exploration_final_eps': 0.02, 'verbose': 0}, 'mean_reward': np.float64(-12.384864427357344), 'std_reward': np.float64(7.851222703961445), 'model_path': '

In [None]:
final_df = pd.DataFrame(consolidated_data)

output_csv_path = os.path.join(LOG_DIR, "excel_models.csv")

os.makedirs(LOG_DIR, exist_ok=True)

final_df.to_csv(output_csv_path, index=False)
print(f"Successfully saved the consolidated experiment data to {output_csv_path}")

# 3. Display the DataFrame
print("\nConsolidated Experiment Data DataFrame:")
display(final_df)

Successfully saved the consolidated experiment data to /content/drive/MyDrive/Colab_DQN_Checkpoints/logs/excel_models.csv

Consolidated Experiment Data DataFrame:


  return datetime.utcnow().replace(tzinfo=utc)


Unnamed: 0,name,policy,hp,mean_reward,std_reward,model_path,train_minutes
0,excel_exp1_baseline,CnnPolicy,"{'learning_rate': 0.0001, 'gamma': 0.99, 'batc...",-12.241058,7.884876,models/dqn_excel_exp1_baseline.zip,
1,excel_exp2_large_batch,CnnPolicy,"{'learning_rate': 7e-05, 'gamma': 0.99, 'batch...",-12.384864,7.851223,models/dqn_excel_exp2_large_batch.zip,


In [None]:
excel_models_path = os.path.join(LOG_DIR, 'excel_models.csv')

if os.path.exists(excel_models_path):
    df_excel_models = pd.read_csv(excel_models_path)
    print(f"Successfully loaded {os.path.basename(excel_models_path)} into df_excel_models.")
    # Display the first few rows of the loaded DataFrame
    display(df_excel_models.head())
else:
    print(f"Error: The file {excel_models_path} was not found. Please ensure it was created correctly.")

Successfully loaded excel_models.csv into df_excel_models.


  return datetime.utcnow().replace(tzinfo=utc)


Unnamed: 0,name,policy,hp,mean_reward,std_reward,model_path,train_minutes
0,excel_exp1_baseline,CnnPolicy,"{'learning_rate': 0.0001, 'gamma': 0.99, 'batc...",-12.241058,7.884876,models/dqn_excel_exp1_baseline.zip,
1,excel_exp2_large_batch,CnnPolicy,"{'learning_rate': 7e-05, 'gamma': 0.99, 'batch...",-12.384864,7.851223,models/dqn_excel_exp2_large_batch.zip,


In [None]:
# Looking for the 'least negative' (closest to zero or positive) mean_reward
best_experiment_row = df_excel_models.loc[df_excel_models['mean_reward'].idxmax()]
best_experiment_name = best_experiment_row['name']

print(f"The experiment that produced the maximum 'mean_reward' (and thus likely `excel_best_dqn.zip`) is: {best_experiment_name}")
print("Details of the best experiment:")
display(best_experiment_row)

The experiment that produced the maximum 'mean_reward' (and thus likely `excel_best_dqn.zip`) is: excel_exp1_baseline
Details of the best experiment:


  return datetime.utcnow().replace(tzinfo=utc)


Unnamed: 0,0
name,excel_exp1_baseline
policy,CnnPolicy
hp,"{'learning_rate': 0.0001, 'gamma': 0.99, 'batc..."
mean_reward,-12.241058
std_reward,7.884876
model_path,models/dqn_excel_exp1_baseline.zip
train_minutes,


In [None]:
# Play the BEST saved model (render on-screen)
import time
from stable_baselines3 import DQN

BEST_MODEL_PATH = '/content/drive/MyDrive/Colab_DQN_Checkpoints/models/excel_best_dqn.zip'
ENV_ID = "ALE/Pong-v5"
N_EPISODES = 1
SEED = 42

if not os.path.isfile(BEST_MODEL_PATH):
    print("Best model not found:", BEST_MODEL_PATH)
    if os.path.isdir(MODEL_DIR):
        print("Available models:")
        for f in sorted(os.listdir(MODEL_DIR)):
            if f.endswith(".zip"):
                print(" -", os.path.join(MODEL_DIR, f))
else:
    env = make_atari_env(ENV_ID, n_envs=1, seed=SEED, env_kwargs={"render_mode": "human"})
    env = VecFrameStack(env, n_stack=4)
    model = DQN.load(BEST_MODEL_PATH, env=env)
    for ep in range(N_EPISODES):
        obs = env.reset()
        done = False
        ep_reward = 0.0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, rewards, dones, infos = env.step(action)
            ep_reward += float(rewards[0])
            done = bool(dones[0])
            time.sleep(1/60)
        print(f"Episode {ep+1} return: {ep_reward:.2f}")
    env.close()

Episode 1 return: -5.00


  return datetime.utcnow().replace(tzinfo=utc)
