# Import libraries and variables

In [16]:
# !pip install gymnasium[atari, accept-rom-license] stable-baselines3

import gymnasium as gym
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
import ale_py
gym.register_envs(ale_py)

import time
import numpy as np
from stable_baselines3 import DQN
from stable_baselines3 import PPO

# Para selección de hiperparámetros 
import optuna    
import torch

import pickle, os, random, collections, tqdm, math

# Q learning

In [23]:
SEED = 42
random.seed(SEED); np.random.seed(SEED)

ENV_ID = "MsPacman-ramNoFrameskip-v4"
env = gym.make(ENV_ID, render_mode=None)
A   = env.action_space.n

# ---------- hiper‑parámetros -------------
TOTAL_FRAMES     = 4_000_000          # ~4‑5 h CPU
ALPHA_START      = 0.30               # tasa inicial
ALPHA_END        = 0.05               # tasa final (decay exp.)
GAMMA            = 0.99
EPS_START        = 1.0
EPS_END          = 0.05
EPS_DECAY_FRAMES = 1_500_000          # ε lineal
PRINT_EVERY      = 20_000

# ---------- función nombrada para Q -------
def zeros():
    return np.zeros(A, dtype=np.float32)

Q = collections.defaultdict(zeros)     # picklable

# ---------- discretización mejorada -------
GRID_W, GRID_H = 19, 21                # rejilla 8×10 px
def discretize(obs):
    pac_x = obs[0x6D] // 8
    pac_y = obs[0x6E] // 10

    # Fantasmas: cogemos el más cercano (Manhattan)
    ghost_coords = [(obs[o]//8, obs[o+1]//10)
                    for o in (0x4D, 0x51, 0x55, 0x59)]
    dists = [abs(px-pac_x)+abs(py-pac_y) for px,py in ghost_coords]
    min_d = min(dists)          # 0‑60 aprox.
    dist_bin = min_d // 2       # agrupa en buckets de 2 → 0‑30

    energ = 1 if obs[0x70] > 0 else 0

    return (pac_x, pac_y, dist_bin, energ)
# Nº de estados ≈ 19×21×31×2 ≈ 25 k → tabla compacta

# ---------- decaimiento de α -------------
def alpha(frame):
    ratio = frame / TOTAL_FRAMES
    return ALPHA_START * ((ALPHA_END/ALPHA_START) ** ratio)

# ---------- entrenamiento -----------------
obs, _ = env.reset(seed=SEED)
state  = discretize(obs)
eps    = EPS_START
total_reward_episode = 0

pbar = tqdm.tqdm(total=TOTAL_FRAMES, ncols=90)
for frame in range(1, TOTAL_FRAMES+1):

    # ε‑greedy
    if random.random() < eps:
        action = env.action_space.sample()
    else:
        action = int(np.argmax(Q[state]))

    next_obs, reward, terminated, truncated, _ = env.step(action)
    done       = terminated or truncated
    next_state = discretize(next_obs)

    lr = alpha(frame)
    best_next = np.max(Q[next_state])
    Q[state][action] += lr * (reward + GAMMA*best_next - Q[state][action])

    state = next_state
    total_reward_episode += reward

    # decay ε lineal
    eps = max(EPS_END, EPS_START - frame / EPS_DECAY_FRAMES)

    if done:
        obs, _ = env.reset()
        state  = discretize(obs)
        pbar.set_description(f"F {frame:,} | R_ep={total_reward_episode:6.0f} | ε={eps:.2f} | α={lr:.3f}")
        total_reward_episode = 0

    if frame % PRINT_EVERY == 0:
        pbar.update(PRINT_EVERY)

pbar.close();  env.close()

F 3,997,568 | R_ep=   330 | ε=0.05 | α=0.050: 100%|█| 4000000/4000000 [33:37<00:00, 1982.2


In [24]:
os.makedirs("../models", exist_ok=True)

with open("../models/qtable_pacman.pkl", "wb") as f:
    pickle.dump(dict(Q), f) 

In [28]:
with open("../models/qtable_pacman.pkl", "rb") as f:
    Q = pickle.load(f)
    
# ---------- 2. Crear el entorno RAM con render -------------------
ENV_ID = "MsPacman-ramNoFrameskip-v4"
SEED   = 42

eval_env = gym.make(
    ENV_ID,
    render_mode="human",   # ventana SDL
)
eval_env.reset(seed=SEED)

# ------- discretizador consistente ----------
GRID_W, GRID_H = 19, 21
def discretize(obs: np.ndarray):
    pac_x = obs[0x6D] // 8
    pac_y = obs[0x6E] // 10

    # fantasma más cercano
    ghost_coords = [(obs[o]//8, obs[o+1]//10)
                    for o in (0x4D, 0x51, 0x55, 0x59)]
    dists = [abs(px-pac_x)+abs(py-pac_y) for px,py in ghost_coords]
    dist_bin = min(dists) // 2          # 0‑30

    energ = 1 if obs[0x70] > 0 else 0
    return (pac_x, pac_y, dist_bin, energ)


# ---------- 4. Ejecutar un episodio -------------------------------
obs, _ = eval_env.reset()
state        = discretize(obs)
total_reward = 0.0
done         = False

while not done:
    # acción greedy respecto a la tabla
    action = int(np.argmax(Q[state]))
    
    obs, reward, terminated, truncated, _ = eval_env.step(action)
    done       = terminated or truncated
    total_reward += float(reward)
    
    state = discretize(obs)
    time.sleep(0.02)          # ralentiza para ver la partida

print("Episodio terminado, recompensa acumulada:", total_reward)
eval_env.close()


: 

---
---
# Modelos con deep learning:

In [2]:
ENV_ID = "MsPacmanNoFrameskip-v4"
N_ENVS = 1
SEED = 42

# Deep Q-Network (DQN)

In [3]:
env =make_atari_env(
    ENV_ID,
    n_envs=N_ENVS,
    seed=SEED, 
    monitor_dir="../logs/logs_dqn/")

# Aplicar frame-stacking de 4 últimas imágenes (como en DQN de Atari) (para que el programa induzca movimiento)
env = VecFrameStack(env, n_stack=4)

In [4]:
model = DQN(
    policy="CnnPolicy",
    env=env,
    learning_rate=1e-4,
    buffer_size=100000,
    learning_starts=50000,
    batch_size=32,
    tau=1.0,
    gamma=0.99,
    train_freq=4,
    target_update_interval=10000,
    exploration_fraction=0.1,
    exploration_final_eps=0.1,
    exploration_initial_eps=1.0,
    verbose=1)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [9]:
total_timesteps = 5
model.learn(total_timesteps=total_timesteps)
model.save(f"../models/dqn_pacman_model_{total_timesteps}_timesteps.zip")
env.close()

## Medir performance del modelo con evaluate policy

In [None]:
eval_env = make_atari_env(
    ENV_ID,
    n_envs=N_ENVS,
    seed=SEED)
eval_env = VecFrameStack(eval_env, n_stack=4)

model_with_n_steps = 5
model = DQN.load(f"../models/dqn_pacman_model_{total_timesteps}_timesteps.zip")

mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)

print(f"Recompensa media: {mean_reward} +/- {std_reward}")

Recompensa media: 253.0 +/- 36.345563690772494


## ver ejecuccion

In [None]:
model = DQN.load("../models/dqn_pacman_model_1M.zip")

eval_env = make_atari_env(
    ENV_ID,
    n_envs=1,
    seed=SEED,
    env_kwargs={"render_mode": "human"}
)
eval_env = VecFrameStack(eval_env, n_stack=4)

obs = eval_env.reset()
done = False
total_reward = 0.0

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = eval_env.step(action)

    total_reward += float(reward[0])
    eval_env.render()
    time.sleep(0.02)

print("Episodio terminado, recompensa acumulada:", total_reward)
eval_env.close()

Episodio terminado, recompensa acumulada: 28.0


# DQN with optuna (hyperparameter tunning)

In [6]:
FRAME_STACK = 4

def make_mspacman_env(seed=SEED, n_envs=4):
    env = make_atari_env(
        "MsPacmanNoFrameskip-v4",
        n_envs=n_envs,
        seed=seed,
        monitor_dir="../logs/logs_dqn/")
    return VecFrameStack(env, n_stack=FRAME_STACK)

In [7]:
TOTAL_FRAMES_TUNE = 100_000

def objective(trial: optuna.Trial) -> float:
    print("cuda" if torch.cuda.is_available() else "cpu")
    # ----- sample hyper‑parameters -----
    lr        = trial.suggest_loguniform("learning_rate", 5e-4, 5e-3)
    buff_size = trial.suggest_categorical("buffer_size", [100_000])
    batch_sz  = trial.suggest_categorical("batch_size", [32, 64])
    tau       = trial.suggest_uniform("tau", 0.8, 1.0)
    gamma     = trial.suggest_uniform("gamma", 0.97, 0.999)
    train_fr  = trial.suggest_categorical("train_freq", [2, 4, 8])
    expl_frac = trial.suggest_uniform("exploration_fraction", 0.05, 0.2)

    env = make_mspacman_env()

    model = DQN(
        "CnnPolicy", env,
        learning_rate        = lr,
        buffer_size          = buff_size,
        batch_size           = batch_sz,
        tau                  = tau,
        gamma                = gamma,
        train_freq           = (train_fr, "step"),
        target_update_interval = 10_000,
        exploration_fraction = expl_frac,
        exploration_initial_eps = 1.0,
        exploration_final_eps   = 0.05,
        learning_starts      = 50_000,
        verbose              = 0,
        seed                 = SEED,
        device               = "cuda" if torch.cuda.is_available() else "cpu"
    )

    model.learn(TOTAL_FRAMES_TUNE, progress_bar=False)
    mean_reward, _ = evaluate_policy(model, make_mspacman_env(n_envs=1),
                                     n_eval_episodes=10, deterministic=True)
    env.close()
    # report to Optuna
    return mean_reward


In [8]:
study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=SEED),
            pruner = optuna.pruners.MedianPruner(n_warmup_steps=25))

study.optimize(objective, n_trials=20, n_jobs=1)
print("Best reward:", study.best_value)
print("Best params:", study.best_params)


[I 2025-04-19 00:59:41,332] A new study created in memory with name: no-name-3fd76dd1-89a3-4ca4-895b-ee49f84b2d1f


  lr        = trial.suggest_loguniform("learning_rate", 5e-4, 5e-3)
  tau       = trial.suggest_uniform("tau", 0.8, 1.0)
  gamma     = trial.suggest_uniform("gamma", 0.97, 0.999)
  expl_frac = trial.suggest_uniform("exploration_fraction", 0.05, 0.2)


cuda


[I 2025-04-19 01:02:50,919] Trial 0 finished with value: 210.0 and parameters: {'learning_rate': 0.0011844319751820387, 'buffer_size': 100000, 'batch_size': 32, 'tau': 0.9197316968394074, 'gamma': 0.9745245405728307, 'train_freq': 8, 'exploration_fraction': 0.14016725176148134}. Best is trial 0 with value: 210.0.


cuda


[I 2025-04-19 01:06:07,467] Trial 1 finished with value: 210.0 and parameters: {'learning_rate': 0.002552951604697378, 'buffer_size': 100000, 'batch_size': 64, 'tau': 0.9664885281600843, 'gamma': 0.97615783420967, 'train_freq': 8, 'exploration_fraction': 0.1287134647448357}. Best is trial 0 with value: 210.0.


cuda


[I 2025-04-19 01:08:43,681] Trial 2 finished with value: 70.0 and parameters: {'learning_rate': 0.0013518080333310004, 'buffer_size': 100000, 'batch_size': 64, 'tau': 0.8278987721304084, 'gamma': 0.9784721948075213, 'train_freq': 8, 'exploration_fraction': 0.07995106732375397}. Best is trial 0 with value: 210.0.


cuda


[I 2025-04-19 01:11:26,958] Trial 3 finished with value: 70.0 and parameters: {'learning_rate': 0.0016338208828908817, 'buffer_size': 100000, 'batch_size': 32, 'tau': 0.9215089703802877, 'gamma': 0.9749451995869314, 'train_freq': 8, 'exploration_fraction': 0.1712596022174692}. Best is trial 0 with value: 210.0.


cuda


[I 2025-04-19 01:14:01,719] Trial 4 finished with value: 335.0 and parameters: {'learning_rate': 0.0010082860845904299, 'buffer_size': 100000, 'batch_size': 64, 'tau': 0.8880304987479203, 'gamma': 0.9735391088104985, 'train_freq': 8, 'exploration_fraction': 0.08881699724000255}. Best is trial 4 with value: 335.0.


cuda


[I 2025-04-19 01:17:22,726] Trial 5 finished with value: 210.0 and parameters: {'learning_rate': 0.002298752892366083, 'buffer_size': 100000, 'batch_size': 64, 'tau': 0.9093420558686559, 'gamma': 0.9753607792102402, 'train_freq': 2, 'exploration_fraction': 0.18422410256414734}. Best is trial 4 with value: 335.0.


cuda


[I 2025-04-19 01:20:03,088] Trial 6 finished with value: 495.0 and parameters: {'learning_rate': 0.001980933895203292, 'buffer_size': 100000, 'batch_size': 32, 'tau': 0.8391965724838291, 'gamma': 0.9713115913784056, 'train_freq': 4, 'exploration_fraction': 0.17431062637278943}. Best is trial 6 with value: 495.0.


cuda


[I 2025-04-19 01:22:57,016] Trial 7 finished with value: 728.0 and parameters: {'learning_rate': 0.0011369027867815917, 'buffer_size': 100000, 'batch_size': 64, 'tau': 0.8281848449949526, 'gamma': 0.9932637124418672, 'train_freq': 4, 'exploration_fraction': 0.07980735223012586}. Best is trial 7 with value: 728.0.


cuda


[I 2025-04-19 01:25:33,395] Trial 8 finished with value: 135.0 and parameters: {'learning_rate': 0.0005063981628665742, 'buffer_size': 100000, 'batch_size': 32, 'tau': 0.9458014336081975, 'gamma': 0.9923668400538924, 'train_freq': 4, 'exploration_fraction': 0.17946551388133902}. Best is trial 7 with value: 728.0.


cuda


[I 2025-04-19 01:28:17,073] Trial 9 finished with value: 210.0 and parameters: {'learning_rate': 0.002100236158351101, 'buffer_size': 100000, 'batch_size': 32, 'tau': 0.8621964643431325, 'gamma': 0.9794303163387756, 'train_freq': 8, 'exploration_fraction': 0.1208322387742924}. Best is trial 7 with value: 728.0.


cuda


[I 2025-04-19 01:31:15,318] Trial 10 finished with value: 210.0 and parameters: {'learning_rate': 0.004406629612367332, 'buffer_size': 100000, 'batch_size': 64, 'tau': 0.8015094638632587, 'gamma': 0.9975072643794499, 'train_freq': 4, 'exploration_fraction': 0.05580877999417749}. Best is trial 7 with value: 728.0.


cuda


[I 2025-04-19 01:33:52,596] Trial 11 finished with value: 361.0 and parameters: {'learning_rate': 0.0007535837941415454, 'buffer_size': 100000, 'batch_size': 32, 'tau': 0.8419361758930461, 'gamma': 0.9863563922288789, 'train_freq': 4, 'exploration_fraction': 0.15457338475101423}. Best is trial 7 with value: 728.0.


cuda


[I 2025-04-19 01:36:52,088] Trial 12 finished with value: 210.0 and parameters: {'learning_rate': 0.003412144275217819, 'buffer_size': 100000, 'batch_size': 64, 'tau': 0.8052800889761503, 'gamma': 0.9857126760756896, 'train_freq': 4, 'exploration_fraction': 0.10012536267493478}. Best is trial 7 with value: 728.0.


cuda


[I 2025-04-19 01:39:31,885] Trial 13 finished with value: 170.0 and parameters: {'learning_rate': 0.0008407512140165521, 'buffer_size': 100000, 'batch_size': 32, 'tau': 0.8718445321601477, 'gamma': 0.9929342381484171, 'train_freq': 4, 'exploration_fraction': 0.0629553382728906}. Best is trial 7 with value: 728.0.


cuda


[I 2025-04-19 01:42:29,560] Trial 14 finished with value: 70.0 and parameters: {'learning_rate': 0.0017235406011707976, 'buffer_size': 100000, 'batch_size': 32, 'tau': 0.839190918370566, 'gamma': 0.9906473562641889, 'train_freq': 2, 'exploration_fraction': 0.19869726160210313}. Best is trial 7 with value: 728.0.


cuda


[I 2025-04-19 01:45:26,139] Trial 15 finished with value: 70.0 and parameters: {'learning_rate': 0.0031542241651859823, 'buffer_size': 100000, 'batch_size': 64, 'tau': 0.8639183343499428, 'gamma': 0.9710394314418426, 'train_freq': 4, 'exploration_fraction': 0.10818678449942627}. Best is trial 7 with value: 728.0.


cuda


[I 2025-04-19 01:48:19,233] Trial 16 finished with value: 291.0 and parameters: {'learning_rate': 0.0006516972684852695, 'buffer_size': 100000, 'batch_size': 64, 'tau': 0.8239668064205552, 'gamma': 0.9988844926914752, 'train_freq': 4, 'exploration_fraction': 0.15447091177324063}. Best is trial 7 with value: 728.0.


cuda


[I 2025-04-19 01:51:01,219] Trial 17 finished with value: 335.0 and parameters: {'learning_rate': 0.001127642267557952, 'buffer_size': 100000, 'batch_size': 32, 'tau': 0.8866747114381133, 'gamma': 0.982342094466551, 'train_freq': 4, 'exploration_fraction': 0.07526570549280961}. Best is trial 7 with value: 728.0.


cuda


[I 2025-04-19 01:54:22,716] Trial 18 finished with value: 210.0 and parameters: {'learning_rate': 0.0017153093397977736, 'buffer_size': 100000, 'batch_size': 64, 'tau': 0.9965372313555609, 'gamma': 0.9884108225689964, 'train_freq': 2, 'exploration_fraction': 0.15590341800206542}. Best is trial 7 with value: 728.0.


cuda


[I 2025-04-19 01:57:03,710] Trial 19 finished with value: 220.0 and parameters: {'learning_rate': 0.0008756310298578361, 'buffer_size': 100000, 'batch_size': 32, 'tau': 0.8508656734531015, 'gamma': 0.9821947820206764, 'train_freq': 4, 'exploration_fraction': 0.10169701001102277}. Best is trial 7 with value: 728.0.


Best reward: 728.0
Best params: {'learning_rate': 0.0011369027867815917, 'buffer_size': 100000, 'batch_size': 64, 'tau': 0.8281848449949526, 'gamma': 0.9932637124418672, 'train_freq': 4, 'exploration_fraction': 0.07980735223012586}


In [9]:
BEST_PARAMS = dict(
    policy="CnnPolicy",
    env=make_mspacman_env(),
    learning_starts=50_000,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    target_update_interval=10_000,
    verbose=1,
    seed=SEED,
    device="cuda" if torch.cuda.is_available() else "cpu"
)
BEST_PARAMS.update(study.best_params)         

LONG_FRAMES = 5_000_000
best_model = DQN(**BEST_PARAMS)
best_model.learn(LONG_FRAMES, progress_bar=True)
best_model.save("../models/dqn_pacman_best.zip")

Using cuda device
Wrapping the env in a VecTransposeImage.


----------------------------------
| rollout/            |          |
|    exploration_rate | 0.998    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 574      |
|    time_elapsed     | 1        |
|    total_timesteps  | 744      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.997    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 695      |
|    time_elapsed     | 1        |
|    total_timesteps  | 1356     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.79e+03 |
|    ep_rew_mean      | 200      |
|    exploration_rate | 0.995    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 723      |
|    time_elapsed     | 2        |
|    total_timesteps  | 1952     |
--------------------

In [None]:
eval_env = make_mspacman_env(n_envs=1)

model = DQN.load("../models/dqn_pacman_best.zip")

mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)

print(f"Recompensa media: {mean_reward} +/- {std_reward}")

Recompensa media: 518.0 +/- 94.31860898041276


In [None]:
model = DQN.load("../models/dqn_pacman_best.zip")

eval_env = make_atari_env(
    ENV_ID,
    n_envs=1,
    seed=SEED,
    env_kwargs={"render_mode": "human"}
)
eval_env = VecFrameStack(eval_env, n_stack=4)

obs = eval_env.reset()
done = False
total_reward = 0.0

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = eval_env.step(action)

    total_reward += float(reward[0])
    eval_env.render()
    time.sleep(0.02)

print("Episodio terminado, recompensa acumulada:", total_reward)
eval_env.close()

Episodio terminado, recompensa acumulada: 30.0


# Policy Gradient methods (PPO)

In [None]:
env =make_atari_env(
    ENV_ID,
    n_envs=N_ENVS,
    seed=SEED, 
    monitor_dir="../logs/logs_ppo/")

# Aplicar frame-stacking de 4 últimas imágenes (como en DQN de Atari)
env = VecFrameStack(env, n_stack=4)

In [None]:
model = PPO(
    policy="CnnPolicy",
    env=env,
    n_steps=1024,              # pasos por entorno antes de cada update: 128*8 = 1024 muestras
    batch_size=256,
    n_epochs=4,
    gamma=0.99,
    gae_lambda=0.95,
    learning_rate=2.5e-4,     # se puede cambiar a schedule lineal → 2.5e‑4 → 0
    ent_coef=0.01,
    clip_range=0.1,
    vf_coef=0.5,
    max_grad_norm=0.5,
    verbose=1,
    device="auto"             # usa GPU si está disponible
)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [None]:
TOTAL_TIMESTEPS = 5_000_000
model.learn(total_timesteps=TOTAL_TIMESTEPS)
model.save(f"../models/ppo_pacman_model_{total_timesteps}timesteps.zip")
env.close()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.78e+03 |
|    ep_rew_mean     | 205      |
| time/              |          |
|    fps             | 296      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.84e+03     |
|    ep_rew_mean          | 218          |
| time/                   |              |
|    fps                  | 290          |
|    iterations           | 2            |
|    time_elapsed         | 7            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0028484466 |
|    clip_fraction        | 0.0437       |
|    clip_range           | 0.1          |
|    entropy_loss         | -2.2         |
|    explained_variance   | -0.00547     |
|    learning_r

## Medir performance del modelo con evaluate policy

In [None]:
eval_env = make_atari_env(
    ENV_ID,
    n_envs=N_ENVS,
    seed=SEED)
eval_env = VecFrameStack(eval_env, n_stack=4)

model = PPO.load("../models/ppo_pacman_model.zip")

# Evaluamos en modo determinista en 10 episodios
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=False)

print(f"Recompensa media: {mean_reward} +/- {std_reward}")

Recompensa media: 1153.0 +/- 206.2546969162157


## ver ejecuccion

In [None]:
model = PPO.load("../models/ppo_pacman_model_5M.zip")

eval_env = make_atari_env(
    ENV_ID,
    n_envs=N_ENVS,
    seed=SEED,
    env_kwargs={"render_mode": "human"}
)
eval_env = VecFrameStack(eval_env, n_stack=4)

obs = eval_env.reset()
done = False
total_reward = 0.0

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = eval_env.step(action)

    total_reward += float(reward[0])
    eval_env.render()
    time.sleep(0.02)

print("Episodio terminado, recompensa acumulada:", total_reward)
eval_env.close()

Episodio terminado, recompensa acumulada: 84.0


# Advantage Actor-Critic (A3C)

In [None]:
env =make_atari_env(
    ENV_ID,
    n_envs=N_ENVS,
    seed=SEED, 
    monitor_dir="../logs/logs_a3c/")

# Aplicar frame-stacking de 4 últimas imágenes (como en DQN de Atari)
env = VecFrameStack(env, n_stack=4)