# Import libraries and variables

In [None]:
# !pip install gymnasium[atari, accept-rom-license] stable-baselines3

import gymnasium as gym
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
import ale_py
gym.register_envs(ale_py)

import time
import numpy as np
from stable_baselines3 import DQN
from stable_baselines3 import PPO

# Para selección de hiperparámetros 
import optuna    #
import torch, os #


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ENV_ID = "MsPacmanNoFrameskip-v4"
N_ENVS = 1
SEED = 42

# Deep Q-Network (DQN)

In [3]:
env =make_atari_env(
    ENV_ID,
    n_envs=N_ENVS,
    seed=SEED, 
    monitor_dir="../logs/logs_dqn/")

# Aplicar frame-stacking de 4 últimas imágenes (como en DQN de Atari) (para que el programa induzca movimiento)
env = VecFrameStack(env, n_stack=4)

In [4]:
model = DQN(
    policy="CnnPolicy",
    env=env,
    learning_rate=1e-4,
    buffer_size=100000,
    learning_starts=50000,
    batch_size=32,
    tau=1.0,
    gamma=0.99,
    train_freq=4,
    target_update_interval=10000,
    exploration_fraction=0.1,
    exploration_final_eps=0.1,
    exploration_initial_eps=1.0,
    verbose=1)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [9]:
total_timesteps = 5
model.learn(total_timesteps=total_timesteps)
model.save(f"../models/dqn_pacman_model_{total_timesteps}_timesteps.zip")
env.close()

## Medir performance del modelo con evaluate policy

In [None]:
eval_env = make_atari_env(
    ENV_ID,
    n_envs=N_ENVS,
    seed=SEED)
eval_env = VecFrameStack(eval_env, n_stack=4)

model_with_n_steps = 5
model = DQN.load(f"../models/dqn_pacman_model_{total_timesteps}_timesteps.zip")

mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)

print(f"Recompensa media: {mean_reward} +/- {std_reward}")

Recompensa media: 253.0 +/- 36.345563690772494


## ver ejecuccion

In [5]:
model = DQN.load("../models/dqn_pacman_model_1M.zip")

eval_env = make_atari_env(
    ENV_ID,
    n_envs=1,
    seed=SEED,
    env_kwargs={"render_mode": "human"}
)
eval_env = VecFrameStack(eval_env, n_stack=4)

obs = eval_env.reset()
done = False
total_reward = 0.0

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = eval_env.step(action)

    total_reward += float(reward[0])
    eval_env.render()
    time.sleep(0.02)

print("Episodio terminado, recompensa acumulada:", total_reward)
eval_env.close()

Episodio terminado, recompensa acumulada: 28.0


# DQN with optuna (hyperparameter tunning)

In [3]:
FRAME_STACK = 4

def make_mspacman_env(seed=SEED, n_envs=4):
    env = make_atari_env(
        "MsPacmanNoFrameskip-v4",
        n_envs=n_envs,
        seed=seed,
        monitor_dir="../logs/logs_dqn/")
    return VecFrameStack(env, n_stack=FRAME_STACK)

In [4]:
TOTAL_FRAMES_TUNE = 100_000

def objective(trial: optuna.Trial) -> float:
    print("cuda" if torch.cuda.is_available() else "cpu")
    # ----- sample hyper‑parameters -----
    lr        = trial.suggest_loguniform("learning_rate", 5e-4, 5e-3)
    buff_size = trial.suggest_categorical("buffer_size", [100_000, 500_000, 1_000_000])
    batch_sz  = trial.suggest_categorical("batch_size", [32, 64])
    tau       = trial.suggest_uniform("tau", 0.8, 1.0)
    gamma     = trial.suggest_uniform("gamma", 0.97, 0.999)
    train_fr  = trial.suggest_categorical("train_freq", [2, 4, 8])
    expl_frac = trial.suggest_uniform("exploration_fraction", 0.05, 0.2)

    env = make_mspacman_env()

    model = DQN(
        "CnnPolicy", env,
        learning_rate        = lr,
        buffer_size          = buff_size,
        batch_size           = batch_sz,
        tau                  = tau,
        gamma                = gamma,
        train_freq           = (train_fr, "step"),
        target_update_interval = 10_000,
        exploration_fraction = expl_frac,
        exploration_initial_eps = 1.0,
        exploration_final_eps   = 0.05,
        learning_starts      = 50_000,
        verbose              = 0,
        seed                 = SEED,
        device               = "cuda" if torch.cuda.is_available() else "cpu"
    )

    model.learn(TOTAL_FRAMES_TUNE, progress_bar=False)
    mean_reward, _ = evaluate_policy(model, make_mspacman_env(n_envs=1),
                                     n_eval_episodes=10, deterministic=True)
    env.close()
    # report to Optuna
    return mean_reward


In [5]:
study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=SEED),
            pruner = optuna.pruners.MedianPruner(n_warmup_steps=25))

study.optimize(objective, n_trials=20, n_jobs=1)
print("Best reward:", study.best_value)
print("Best params:", study.best_params)


[I 2025-04-18 18:45:06,889] A new study created in memory with name: no-name-6a252572-578e-446a-b927-f51bd1e68553


cuda


  lr        = trial.suggest_loguniform("learning_rate", 5e-4, 5e-3)
  tau       = trial.suggest_uniform("tau", 0.8, 1.0)
  gamma     = trial.suggest_uniform("gamma", 0.97, 0.999)
  expl_frac = trial.suggest_uniform("exploration_fraction", 0.05, 0.2)
[W 2025-04-18 18:47:45,131] Trial 0 failed with parameters: {'learning_rate': 0.0011844319751820387, 'buffer_size': 100000, 'batch_size': 32, 'tau': 0.8116167224336399, 'gamma': 0.9951191082274731, 'train_freq': 4, 'exploration_fraction': 0.19548647782429918} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\pablo\miniconda3\envs\rf_lab\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\pablo\AppData\Local\Temp\ipykernel_27776\1524910167.py", line 34, in objective
    model.learn(TOTAL_FRAMES_TUNE, progress_bar=False)
  File "c:\Users\pablo\miniconda3\envs\rf_lab\Lib\site-packages\sta

KeyboardInterrupt: 

In [None]:
BEST_PARAMS = dict(
    policy="CnnPolicy",
    env=make_mspacman_env(),
    learning_starts=50_000,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    target_update_interval=10_000,
    verbose=1,
    seed=SEED,
    device="cuda" if torch.cuda.is_available() else "cpu"
)
BEST_PARAMS.update(study.best_params)         

LONG_FRAMES = 5_000_000
best_model = DQN(**BEST_PARAMS)
best_model.learn(LONG_FRAMES, progress_bar=True)
best_model.save("../models/dqn_pacman_best.zip")

Using cuda device
Wrapping the env in a VecTransposeImage.


In [None]:
eval_env = make_mspacman_env(n_envs=1)

model = DQN.load("../models/dqn_pacman_best.zip")

mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)

print(f"Recompensa media: {mean_reward} +/- {std_reward}")

Recompensa media: 210.0 +/- 0.0


# Policy Gradient methods (PPO)

In [None]:
env =make_atari_env(
    ENV_ID,
    n_envs=N_ENVS,
    seed=SEED, 
    monitor_dir="../logs/logs_ppo/")

# Aplicar frame-stacking de 4 últimas imágenes (como en DQN de Atari)
env = VecFrameStack(env, n_stack=4)

In [None]:
model = PPO(
    policy="CnnPolicy",
    env=env,
    n_steps=1024,              # pasos por entorno antes de cada update: 128*8 = 1024 muestras
    batch_size=256,
    n_epochs=4,
    gamma=0.99,
    gae_lambda=0.95,
    learning_rate=2.5e-4,     # se puede cambiar a schedule lineal → 2.5e‑4 → 0
    ent_coef=0.01,
    clip_range=0.1,
    vf_coef=0.5,
    max_grad_norm=0.5,
    verbose=1,
    device="auto"             # usa GPU si está disponible
)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [None]:
TOTAL_TIMESTEPS = 5_000_000
model.learn(total_timesteps=TOTAL_TIMESTEPS)
model.save(f"../models/ppo_pacman_model_{total_timesteps}timesteps.zip")
env.close()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.78e+03 |
|    ep_rew_mean     | 205      |
| time/              |          |
|    fps             | 296      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.84e+03     |
|    ep_rew_mean          | 218          |
| time/                   |              |
|    fps                  | 290          |
|    iterations           | 2            |
|    time_elapsed         | 7            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0028484466 |
|    clip_fraction        | 0.0437       |
|    clip_range           | 0.1          |
|    entropy_loss         | -2.2         |
|    explained_variance   | -0.00547     |
|    learning_r

## Medir performance del modelo con evaluate policy

In [None]:
eval_env = make_atari_env(
    ENV_ID,
    n_envs=N_ENVS,
    seed=SEED)
eval_env = VecFrameStack(eval_env, n_stack=4)

model = PPO.load("../models/ppo_pacman_model.zip")

# Evaluamos en modo determinista en 10 episodios
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=False)

print(f"Recompensa media: {mean_reward} +/- {std_reward}")

Recompensa media: 1153.0 +/- 206.2546969162157


## ver ejecuccion

In [None]:
model = PPO.load("../models/ppo_pacman_model_5M.zip")

eval_env = make_atari_env(
    ENV_ID,
    n_envs=N_ENVS,
    seed=SEED,
    env_kwargs={"render_mode": "human"}
)
eval_env = VecFrameStack(eval_env, n_stack=4)

obs = eval_env.reset()
done = False
total_reward = 0.0

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = eval_env.step(action)

    total_reward += float(reward[0])
    eval_env.render()
    time.sleep(0.02)

print("Episodio terminado, recompensa acumulada:", total_reward)
eval_env.close()

Episodio terminado, recompensa acumulada: 84.0


# Advantage Actor-Critic (A3C)

In [None]:
env =make_atari_env(
    ENV_ID,
    n_envs=N_ENVS,
    seed=SEED, 
    monitor_dir="../logs/logs_a3c/")

# Aplicar frame-stacking de 4 últimas imágenes (como en DQN de Atari)
env = VecFrameStack(env, n_stack=4)