# Import libraries and variables

In [1]:
# !pip install gymnasium[atari, accept-rom-license] stable-baselines3

import gymnasium as gym
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback

import ale_py
gym.register_envs(ale_py)

import time
import numpy as np
from stable_baselines3 import DQN,PPO
from sb3_contrib.qrdqn import QRDQN

In [2]:
ENV_ID = "MsPacmanNoFrameskip-v4"
N_ENVS = 1
SEED = 42

# QR‑DQN (distribucional + prioritized replay)

In [14]:
train_env = make_atari_env(ENV_ID, seed=SEED, n_envs=N_ENVS)
train_env = VecFrameStack(train_env, n_stack=4)

eval_env = make_atari_env(ENV_ID, seed=123, n_envs=N_ENVS)
eval_env = VecFrameStack(eval_env, n_stack=4)

eval_cb = EvalCallback(eval_env, best_model_save_path="../models/qrdqn",
                       eval_freq=100_000, n_eval_episodes=10,
                       deterministic=True, render=False)

ckpt_cb = CheckpointCallback(save_freq=1_000_000, save_path="../models/qrdqn/checkpoints",
                             name_prefix="qrdqn_pacman")

In [None]:
policy_kwargs = dict(
    n_quantiles=200,
    normalize_images=False
)

model = QRDQN(
    "CnnPolicy",
    train_env,
    learning_rate=lambda f: 2.5e-5 * (1 - f),
    buffer_size=200_000,
    learning_starts=80_000,
    batch_size=32,
    gamma=0.99,
    train_freq=4,
    gradient_steps=1,
    target_update_interval=4_000,
    exploration_fraction=0.12,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.01,
    policy_kwargs=policy_kwargs,
    verbose=1,
    device="cuda",
)

Using cuda device
Wrapping the env in a VecTransposeImage.




In [None]:
total_timesteps = 5_000_000
model.learn(total_timesteps=total_timesteps, callback=[eval_cb, ckpt_cb])
model.save(f"../models/qrdqn/best_model_final.zip")
train_env.close()



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.85e+03 |
|    ep_rew_mean      | 190      |
|    exploration_rate | 0.999    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 701      |
|    time_elapsed     | 0        |
|    total_timesteps  | 642      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.19e+03 |
|    ep_rew_mean      | 275      |
|    exploration_rate | 0.998    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 758      |
|    time_elapsed     | 1        |
|    total_timesteps  | 1378     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.14e+03 |
|    ep_rew_mean      | 275      |
|    exploration_rate | 0.997    |
| time/               |          |
|    episodes       

## Medir performance del modelo con evaluate policy

In [None]:
model = QRDQN.load("../models/qrdqn/best_model_final.zip")
mean, std = evaluate_policy(model, eval_env, n_eval_episodes=10)
print(f"Return medio: {mean:.0f} ± {std:.0f}")



Return medio: 1558 ± 333


## ver ejecuccion

In [None]:
play_env = make_atari_env(ENV_ID, n_envs=1, env_kwargs={"render_mode": "human"})
play_env = VecFrameStack(play_env, n_stack=4)

obs = play_env.reset()
done = False
total_reward = 0.0
while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, _ = play_env.step(action)
    total_reward += float(reward[0])
    play_env.render()
    time.sleep(0.02)

print("Episodio terminado, recompensa acumulada:", total_reward)
play_env.close()

Episodio terminado, recompensa acumulada: 88.0


: 

# Deep Q-Network (DQN)

In [4]:
train_env = make_atari_env(ENV_ID, seed=SEED, n_envs=N_ENVS)
train_env = VecFrameStack(train_env, n_stack=4)

eval_env = make_atari_env(ENV_ID, seed=123, n_envs=N_ENVS)
eval_env = VecFrameStack(eval_env, n_stack=4)

eval_cb = EvalCallback(eval_env, best_model_save_path="../models/dqn",
                       eval_freq=100_000, n_eval_episodes=10,
                       deterministic=True, render=False)

ckpt_cb = CheckpointCallback(save_freq=1_000_000, save_path="../models/dqn/checkpoints",
                             name_prefix="dqn_pacman")

In [5]:
model = DQN(
    policy="CnnPolicy",
    env=train_env,
    learning_rate=1e-4,
    buffer_size=100000,
    learning_starts=50000,
    batch_size=32,
    tau=1.0,
    gamma=0.99,
    train_freq=4,
    target_update_interval=10000,
    exploration_fraction=0.1,
    exploration_final_eps=0.1,
    exploration_initial_eps=1.0,
    verbose=1)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [None]:
total_timesteps = 5_000_000
model.learn(total_timesteps=total_timesteps, callback=[eval_cb, ckpt_cb])
model.save(f"../models/dqn/best_model_final.zip")
train_env.close()



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.85e+03 |
|    ep_rew_mean      | 190      |
|    exploration_rate | 0.999    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 883      |
|    time_elapsed     | 0        |
|    total_timesteps  | 642      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.19e+03 |
|    ep_rew_mean      | 275      |
|    exploration_rate | 0.998    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 940      |
|    time_elapsed     | 1        |
|    total_timesteps  | 1378     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.14e+03 |
|    ep_rew_mean      | 275      |
|    exploration_rate | 0.996    |
| time/               |          |
|    episodes       

## Medir performance del modelo con evaluate policy

In [None]:
model = QRDQN.load("../models/dqn/best_model_final.zip")
mean, std = evaluate_policy(model, eval_env, n_eval_episodes=10)
print(f"Return medio: {mean:.0f} ± {std:.0f}")



Recompensa media: 821.0 +/- 323.0


## ver ejecuccion

In [None]:
play_env = make_atari_env(ENV_ID, n_envs=1, env_kwargs={"render_mode": "human"})
play_env = VecFrameStack(play_env, n_stack=4)

obs = play_env.reset()
done = False
total_reward = 0.0
while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, _ = play_env.step(action)
    total_reward += float(reward[0])
    play_env.render()
    time.sleep(0.02)

print("Episodio terminado, recompensa acumulada:", total_reward)
play_env.close()

Episodio terminado, recompensa acumulada: 52.0


: 

# Policy Gradient methods (PPO)

In [None]:
train_env = make_atari_env(ENV_ID, seed=SEED, n_envs=N_ENVS)
train_env = VecFrameStack(train_env, n_stack=4)

eval_env = make_atari_env(ENV_ID, seed=123, n_envs=N_ENVS)
eval_env = VecFrameStack(eval_env, n_stack=4)

eval_cb = EvalCallback(eval_env, best_model_save_path="../models/ppo",
                       eval_freq=100_000, n_eval_episodes=10,
                       deterministic=True, render=False)

ckpt_cb = CheckpointCallback(save_freq=1_000_000, save_path="../models/ppo/checkpoints",
                             name_prefix="ppo_pacman")

In [None]:
model = PPO(
    policy="CnnPolicy",
    env=train_env,
    n_steps=1024,              # pasos por entorno antes de cada update: 128*8 = 1024 muestras
    batch_size=256,
    n_epochs=4,
    gamma=0.99,
    gae_lambda=0.95,
    learning_rate=2.5e-4,     # se puede cambiar a schedule lineal → 2.5e‑4 → 0
    ent_coef=0.01,
    clip_range=0.1,
    vf_coef=0.5,
    max_grad_norm=0.5,
    verbose=1,
    device="auto"             # usa GPU si está disponible
)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [None]:
total_timesteps = 5_000_000
model.learn(total_timesteps=total_timesteps, callback=[eval_cb, ckpt_cb])
model.save(f"../models/ppo/best_model_final.zip")
train_env.close()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.78e+03 |
|    ep_rew_mean     | 205      |
| time/              |          |
|    fps             | 296      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.84e+03     |
|    ep_rew_mean          | 218          |
| time/                   |              |
|    fps                  | 290          |
|    iterations           | 2            |
|    time_elapsed         | 7            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0028484466 |
|    clip_fraction        | 0.0437       |
|    clip_range           | 0.1          |
|    entropy_loss         | -2.2         |
|    explained_variance   | -0.00547     |
|    learning_r

## Medir performance del modelo con evaluate policy

In [None]:
model = QRDQN.load("../models/ppo/best_model_final.zip")
mean, std = evaluate_policy(model, eval_env, n_eval_episodes=10)
print(f"Return medio: {mean:.0f} ± {std:.0f}")

Recompensa media: 1153.0 +/- 206.2546969162157


## ver ejecuccion

In [None]:
play_env = make_atari_env(ENV_ID, n_envs=1, env_kwargs={"render_mode": "human"})
play_env = VecFrameStack(play_env, n_stack=4)

obs = play_env.reset()
done = False
total_reward = 0.0
while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, _ = play_env.step(action)
    total_reward += float(reward[0])
    play_env.render()
    time.sleep(0.02)

print("Episodio terminado, recompensa acumulada:", total_reward)
play_env.close()

Episodio terminado, recompensa acumulada: 84.0


# Advantage Actor-Critic (A3C)

In [None]:
env =make_atari_env(
    ENV_ID,
    n_envs=N_ENVS,
    seed=SEED, 
    monitor_dir="../logs/logs_a3c/")

# Aplicar frame-stacking de 4 últimas imágenes (como en DQN de Atari)
env = VecFrameStack(env, n_stack=4)