In [1]:
# Verificar dependencias
import importlib
missing = []
for pkg in ('gymnasium','ale_py','shimmy','torch','torchvision','opencv_python','matplotlib','numpy'):
    try:
        importlib.import_module(pkg)
    except Exception:
        missing.append(pkg)
if missing:
    print('Paquetes faltantes:', missing)
    print('Inst√°lalos con: pip install -r requirements.txt')
else:
    print('‚úì Dependencias instaladas')

import torch
import numpy as np

torch.serialization.add_safe_globals([np._core.multiarray.scalar])

Paquetes faltantes: ['opencv_python']
Inst√°lalos con: pip install -r requirements.txt


In [2]:
# Imports y configuraci√≥n
import os
import sys
from pathlib import Path
from collections import deque
import gymnasium as gym

proj_root = Path.cwd()
if str(proj_root) not in sys.path:
    sys.path.insert(0, str(proj_root))

from src.train import train_agent
from src.record import record_episode
from src.agent import DQNAgent
from src.utils import preprocess_frame, stack_frames
import matplotlib.pyplot as plt

print('Project root:', proj_root)

Project root: c:\Users\carlo\Downloads\RL_Galaxian


In [3]:
# Par√°metros de entrenamiento Double DQN
EPISODES = 500
EMAIL = 'ang23010@uvg.edu.gt'
CHECKPOINT_DIR = 'checkpoints_dqn'
ENV_NAME = 'ALE/Galaxian-v5'
DEVICE = 'cpu'
EARLY_STOP_PATIENCE = 200
MA_WINDOW = 20
SAVE_EVERY = 100
BATCH_SIZE = 32
BUFFER_SIZE = 100000
TARGET_UPDATE = 1000

print('Par√°metros Double DQN:')
print(f' EPISODES= {EPISODES}')
print(f' BATCH_SIZE= {BATCH_SIZE}')
print(f' BUFFER_SIZE= {BUFFER_SIZE}')
print(f' TARGET_UPDATE= {TARGET_UPDATE}')
print(f' EARLY_STOP_PATIENCE= {EARLY_STOP_PATIENCE}')
print(f' MA_WINDOW= {MA_WINDOW}')
print(f' DEVICE= {DEVICE}')

Par√°metros Double DQN:
 EPISODES= 500
 BATCH_SIZE= 32
 BUFFER_SIZE= 100000
 TARGET_UPDATE= 1000
 EARLY_STOP_PATIENCE= 200
 MA_WINDOW= 20
 DEVICE= cpu


In [None]:
# Entrenamiento Double DQN
agent, results = train_agent(
    episodes=EPISODES,
    email=EMAIL,
    checkpoint_dir=CHECKPOINT_DIR,
    env_name=ENV_NAME,
    early_stop_patience=EARLY_STOP_PATIENCE,
    ma_window=MA_WINDOW,
    save_every=SAVE_EVERY,
    batch_size=BATCH_SIZE,
    buffer_size=BUFFER_SIZE,
    target_update=TARGET_UPDATE,
    device=DEVICE
)

In [None]:
# Visualizar m√©tricas de entrenamiento
rewards = results.get('rewards', [])
losses = results.get('losses', [])
epsilon = results.get('epsilon', [])
durations = results.get('durations', [])

fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Rewards
axes[0, 0].plot(rewards)
axes[0, 0].set_title('Rewards por Episodio')
axes[0, 0].set_xlabel('Episodio')
axes[0, 0].set_ylabel('Reward Total')
axes[0, 0].grid(True)

# Loss
axes[0, 1].plot(losses)
axes[0, 1].set_title('TD Loss')
axes[0, 1].set_xlabel('Episodio')
axes[0, 1].set_ylabel('Loss')
axes[0, 1].grid(True)

# Epsilon (exploraci√≥n)
axes[1, 0].plot(epsilon)
axes[1, 0].set_title('Epsilon (Exploraci√≥n)')
axes[1, 0].set_xlabel('Episodio')
axes[1, 0].set_ylabel('Epsilon')
axes[1, 0].grid(True)

# Moving average de rewards
if len(rewards) >= MA_WINDOW:
    ma_rewards = [np.mean(rewards[max(0, i-MA_WINDOW+1):i+1]) for i in range(len(rewards))]
    axes[1, 1].plot(ma_rewards, color='green')
    axes[1, 1].set_title(f'MA Rewards (ventana={MA_WINDOW})')
    axes[1, 1].set_xlabel('Episodio')
    axes[1, 1].set_ylabel('MA Reward')
    axes[1, 1].grid(True)

plt.tight_layout()
plt.savefig(os.path.join(CHECKPOINT_DIR, 'training_metrics_dqn.png'), dpi=300)
plt.show()

print(f'\nüìä Mejor reward promedio: {max(ma_rewards) if len(rewards) >= MA_WINDOW else max(rewards):.1f}')
print(f'üìà Reward final (√∫ltimos {MA_WINDOW} eps): {np.mean(rewards[-MA_WINDOW:]):.1f}')

## Reentrenamiento (Continuar desde modelo guardado)

In [None]:
# Par√°metros de reentrenamiento
RESUME_MODEL = os.path.join(CHECKPOINT_DIR, f'final_model_dqn_ang23010.pth')
ADDITIONAL_EPISODES = 200
EARLY_STOP_PATIENCE_RESUME = 100
MA_WINDOW_RESUME = 20
SAVE_EVERY_RESUME = 50
BATCH_SIZE_RESUME = 32
TARGET_UPDATE_RESUME = 1000
EPSILON_START_RESUME = 0.1  # Empezar con menos exploraci√≥n

os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Crear entorno y agente
env = gym.make(ENV_NAME)
n_actions = env.action_space.n
state_shape = (4, 84, 84)

agent_resume = DQNAgent(
    state_shape, 
    n_actions, 
    buffer_size=BUFFER_SIZE,
    batch_size=BATCH_SIZE_RESUME,
    device=DEVICE
)

# Cargar modelo Y buffer
if os.path.exists(RESUME_MODEL):
    print(f'üìÇ Cargando modelo DQN desde {RESUME_MODEL}')
    agent_resume.load(RESUME_MODEL, load_buffer=True)
    print(f'‚úì Modelo cargado exitosamente')
    print(f'‚úì Replay buffer cargado con {len(agent_resume.memory)} experiencias')
else:
    print(f'‚ùå Modelo no encontrado: {RESUME_MODEL}')
    raise FileNotFoundError(f'No existe {RESUME_MODEL}')

# Tracking
best_ma = -np.inf
no_improve = 0
stacked = deque(maxlen=4)

rewards = []
losses = []
epsilon_values = []
durations = []

# Configurar epsilon inicial
agent_resume.epsilon = EPSILON_START_RESUME
steps_done = 0

# Loop de reentrenamiento
for ep in range(1, ADDITIONAL_EPISODES + 1):
    state, _ = env.reset()
    processed = preprocess_frame(state)
    state_stack = stack_frames(stacked, processed, True)

    done = False
    truncated = False
    ep_reward = 0.0
    ep_loss = 0.0
    ep_steps = 0
    loss_count = 0

    while not (done or truncated):
        action = agent_resume.select_action(state_stack)
        next_state, reward, done, truncated, _ = env.step(action)

        processed = preprocess_frame(next_state)
        next_stack = stack_frames(stacked, processed, False)

        agent_resume.memory.push(state_stack, action, reward, next_stack, done or truncated)

        # Train
        if len(agent_resume.memory) >= BATCH_SIZE_RESUME:
            loss = agent_resume.train_step()
            if loss is not None:
                ep_loss += loss
                loss_count += 1

        ep_reward += reward
        ep_steps += 1
        steps_done += 1
        state_stack = next_stack

        # Update target network
        if steps_done % TARGET_UPDATE_RESUME == 0:
            agent_resume.update_target_network()

    # Decay epsilon
    agent_resume.epsilon = max(0.01, agent_resume.epsilon * 0.995)

    # Promediar loss
    avg_loss = ep_loss / loss_count if loss_count > 0 else 0.0

    rewards.append(ep_reward)
    losses.append(avg_loss)
    epsilon_values.append(agent_resume.epsilon)
    durations.append(ep_steps)

    # Moving average
    if len(rewards) >= MA_WINDOW_RESUME:
        ma = np.mean(rewards[-MA_WINDOW_RESUME:])
    else:
        ma = np.mean(rewards) if rewards else -np.inf

    print(f'Resume Ep {ep} | Reward: {ep_reward:.1f} | MA: {ma:.1f} | '
          f'Loss: {avg_loss:.4f} | Œµ: {agent_resume.epsilon:.3f} | Steps: {ep_steps}')

    # Guardar mejor modelo
    if ma > best_ma:
        best_ma = ma
        no_improve = 0
        email_prefix = EMAIL.split('@')[0]
        best_path = os.path.join(CHECKPOINT_DIR, f'best_model_dqn_{email_prefix}.pth')
        agent_resume.save(best_path, save_buffer=True)
        print(f'  ‚úì Nuevo mejor MA: {ma:.1f} ‚Üí Modelo guardado')
    else:
        no_improve += 1

    # Checkpoints peri√≥dicos
    if ep % SAVE_EVERY_RESUME == 0:
        email_prefix = EMAIL.split('@')[0]
        cp_path = os.path.join(CHECKPOINT_DIR, f'checkpoint_dqn_resume_ep{ep}_{email_prefix}.pth')
        agent_resume.save(cp_path, save_buffer=True)
        print(f'  ‚Üí Checkpoint guardado: ep{ep}')

    # Early stopping
    if no_improve >= EARLY_STOP_PATIENCE_RESUME:
        print(f'Early stopping: {no_improve} episodios sin mejora')
        break

# Guardar modelo final
email_prefix = EMAIL.split('@')[0]
final_path = os.path.join(CHECKPOINT_DIR, f'final_model_dqn_{email_prefix}.pth')
agent_resume.save(final_path, save_buffer=True)
print(f'\n‚úì Reentrenamiento completado. Modelo final: {final_path}')

env.close()

In [None]:
# Visualizar m√©tricas de reentrenamiento
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Rewards
axes[0, 0].plot(rewards)
axes[0, 0].set_title('Rewards (Reentrenamiento)')
axes[0, 0].set_xlabel('Episodio')
axes[0, 0].set_ylabel('Reward Total')
axes[0, 0].grid(True)

# Loss
axes[0, 1].plot(losses)
axes[0, 1].set_title('TD Loss (Reentrenamiento)')
axes[0, 1].set_xlabel('Episodio')
axes[0, 1].set_ylabel('Loss')
axes[0, 1].grid(True)

# Epsilon
axes[1, 0].plot(epsilon_values)
axes[1, 0].set_title('Epsilon (Reentrenamiento)')
axes[1, 0].set_xlabel('Episodio')
axes[1, 0].set_ylabel('Epsilon')
axes[1, 0].grid(True)

# MA Rewards
if len(rewards) >= MA_WINDOW_RESUME:
    ma_rewards = [np.mean(rewards[max(0, i-MA_WINDOW_RESUME+1):i+1]) for i in range(len(rewards))]
    axes[1, 1].plot(ma_rewards, color='green')
    axes[1, 1].set_title(f'MA Rewards (ventana={MA_WINDOW_RESUME})')
    axes[1, 1].set_xlabel('Episodio')
    axes[1, 1].set_ylabel('MA Reward')
    axes[1, 1].grid(True)

plt.tight_layout()
plt.savefig(os.path.join(CHECKPOINT_DIR, 'retrain_metrics_dqn.png'), dpi=300)
plt.show()

print(f'\nüìä Mejor reward promedio: {max(ma_rewards) if len(rewards) >= MA_WINDOW_RESUME else max(rewards):.1f}')
print(f'üìà Reward final (√∫ltimos {MA_WINDOW_RESUME} eps): {np.mean(rewards[-MA_WINDOW_RESUME:]):.1f}')

## Grabar episodio con el mejor modelo

In [10]:
# Grabar episodio con mejor modelo DQN
email_prefix = EMAIL.split('@')[0]
best_model_path = r"C:\Users\carlo\Downloads\RL_Galaxian\checkpoints\best_model_ang23010.pth"

if os.path.exists(best_model_path):
    print(f'üé¨ Cargando mejor modelo DQN desde: {best_model_path}')
    
    state_shape = (4, 84, 84)
    n_actions = 6
    
    agent_test = DQNAgent(state_shape, n_actions, device=DEVICE)
    agent_test.load(best_model_path, load_buffer=False)
    
    video_path = record_episode(agent_test, email=EMAIL, output_dir='videos_dqn', env_name=ENV_NAME)
    print(f'\n‚úì Video guardado en: {video_path}')
else:
    print(f'‚ùå No se encontr√≥ el modelo: {best_model_path}')

üé¨ Cargando mejor modelo DQN desde: C:\Users\carlo\Downloads\RL_Galaxian\checkpoints\best_model_ang23010.pth
Video saved: videos_dqn\ang23010_20251120232612_300.mp4
Score: 300 | Frames: 379

‚úì Video guardado en: videos_dqn\ang23010_20251120232612_300.mp4
Video saved: videos_dqn\ang23010_20251120232612_300.mp4
Score: 300 | Frames: 379

‚úì Video guardado en: videos_dqn\ang23010_20251120232612_300.mp4
