# Laboratorio 6: Comparación DQN vs Double DQN

## 1. Configuración del Entorno

In [None]:
import torch
# Report the installed PyTorch build and, when CUDA is usable, the name of GPU 0.
cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU device: {gpu_name}")

## 2. Clonar Repositorio

In [None]:
# Fetch the lab repository and make the Double DQN lab folder the working
# directory; presumably this is where the project modules imported in the
# next cell (config, dqn_agent, utils, visualize) live -- confirm.
# NOTE(review): re-running this cell after the %cd clones a nested copy inside
# the lab folder, because both paths are relative to the current directory.
!git clone https://github.com/ChristianPE1/Labs-Robotica-EPCC.git
%cd Labs-Robotica-EPCC/lab-6-ddqn

## 3. Importar Módulos

In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import os

import config
from dqn_agent import DQNAgent
from utils import (
    save_metrics, load_metrics, set_random_seed,
    compute_moving_average, evaluate_agent, get_device_info
)
from visualize import (
    plot_reward_comparison, plot_q_value_comparison,
    plot_loss_comparison, plot_success_rate_comparison,
    plot_convergence_comparison, print_comparison_summary
)

## 4. Configuración

In [None]:
# Echo the experiment settings read from the project's config module.
config_lines = [
    "Configuración del Experimento:",
    f"  Entorno: {config.ENV_NAME}",
    f"  Episodios: {config.NUM_EPISODES}",
    f"  Learning rate: {config.LEARNING_RATE}",
    f"  Gamma: {config.GAMMA}",
    f"  Batch size: {config.BATCH_SIZE}",
    f"  Memory size: {config.MEMORY_SIZE}",
    f"  Hidden layers: {config.HIDDEN_LAYERS}",
    f"  Target update freq: {config.TARGET_UPDATE_FREQ}",
    f"  Early stop threshold: {config.EARLY_STOP_THRESHOLD}",
    f"  Dispositivo: {config.DEVICE}",
]
print("\n".join(config_lines))

## 5. Función de Entrenamiento

In [None]:
def train_agent(use_double_dqn=False, verbose=True):
    """Train a DQN or Double DQN agent on ``config.ENV_NAME`` and save its metrics.

    Args:
        use_double_dqn: Forwarded to ``DQNAgent``; also selects the
            "Double DQN"/"ddqn" labels used in the logs and the metrics
            filename (otherwise "DQN"/"dqn").
        verbose: When true, print a banner, a progress line every 50
            episodes, and a final summary.

    Returns:
        A ``(metrics, agent)`` tuple: the metrics dict (also written to
        ``metrics/<dqn|ddqn>_metrics.pkl`` via ``save_metrics``) and the
        trained agent.
    """
    algorithm_name = "Double DQN" if use_double_dqn else "DQN"
    algorithm_prefix = "ddqn" if use_double_dqn else "dqn"

    # Episode length counted as a "success".
    # NOTE(review): hard-coded; presumably the CartPole-v1 step cap -- confirm
    # against config.ENV_NAME.
    success_length = 500

    # Seed the global RNGs through the project helper.
    set_random_seed(config.RANDOM_SEED)

    if verbose:
        print(f"\n{'='*60}")
        print(f"Entrenando {algorithm_name}")
        print(f"{'='*60}")

    # Per-episode metrics.
    episode_rewards = []
    episode_lengths = []
    episode_losses = []
    episode_q_values = []
    success_count = 0
    early_stop_counter = 0

    env = gym.make(config.ENV_NAME)
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        agent = DQNAgent(
            state_dim, action_dim, config, config.DEVICE,
            use_double_dqn=use_double_dqn
        )

        if verbose:
            print("Iniciando entrenamiento...")

        for episode in range(config.NUM_EPISODES):
            # Gymnasium environments keep their own RNG, which is only seeded
            # through reset(seed=...). Seed it once, on the first reset, so
            # the episode sequence is reproducible; later resets continue the
            # same RNG stream.
            reset_seed = config.RANDOM_SEED if episode == 0 else None
            state, _ = env.reset(seed=reset_seed)

            episode_reward = 0
            episode_length = 0
            episode_loss_values = []
            episode_q_vals = []

            terminated = False
            truncated = False

            while not (terminated or truncated):
                # Record the policy network's max Q-value for the current
                # state (analysis only; no gradients needed).
                with torch.no_grad():
                    state_tensor = torch.FloatTensor(state).unsqueeze(0).to(config.DEVICE)
                    q_values = agent.policy_net(state_tensor)
                    episode_q_vals.append(q_values.max().item())

                action = agent.select_action(state, training=True)
                next_state, reward, terminated, truncated, _ = env.step(action)

                # Store only true termination as the terminal flag. A
                # time-limit truncation is not a terminal state, so the
                # target should still bootstrap from next_state; flagging it
                # as done biases the learned Q-values.
                # NOTE(review): assumes the agent uses this flag to mask the
                # bootstrapped target -- confirm in dqn_agent.
                agent.store_transition(state, action, reward, next_state, terminated)

                loss = agent.train_step()
                if loss is not None:
                    episode_loss_values.append(loss)

                state = next_state
                episode_reward += reward
                episode_length += 1

            # Record this episode's metrics (0.0 when nothing was collected).
            episode_rewards.append(episode_reward)
            episode_lengths.append(episode_length)
            episode_losses.append(np.mean(episode_loss_values) if episode_loss_values else 0.0)
            episode_q_values.append(np.mean(episode_q_vals) if episode_q_vals else 0.0)

            if episode_length >= success_length:
                success_count += 1

            # Early stopping on the moving average of up to the last 100
            # episode rewards; the threshold must hold for
            # EARLY_STOP_PATIENCE consecutive episodes.
            window = min(100, episode + 1)
            avg_reward = np.mean(episode_rewards[-window:])

            if avg_reward >= config.EARLY_STOP_THRESHOLD:
                early_stop_counter += 1
                if early_stop_counter >= config.EARLY_STOP_PATIENCE:
                    if verbose:
                        print(f"\nEarly stopping en episodio {episode + 1}")
                    break
            else:
                early_stop_counter = 0

            # Progress line every 50 episodes.
            if verbose and (episode + 1) % 50 == 0:
                success_rate = (success_count / (episode + 1)) * 100
                print(f"Ep {episode+1}/{config.NUM_EPISODES} | "
                      f"Reward: {episode_reward:.0f} | "
                      f"Avg: {avg_reward:.1f} | "
                      f"Success: {success_rate:.1f}%")
    finally:
        # Release the environment even if training raises.
        env.close()

    total_episodes = len(episode_rewards)

    metrics = {
        'algorithm': algorithm_name,
        'episode_rewards': episode_rewards,
        'episode_lengths': episode_lengths,
        'episode_losses': episode_losses,
        'episode_q_values': episode_q_values,
        'total_episodes': total_episodes,
        'success_count': success_count,
        'final_epsilon': agent.epsilon
    }

    os.makedirs('metrics', exist_ok=True)
    save_metrics(metrics, f'metrics/{algorithm_prefix}_metrics.pkl')

    if verbose:
        # Guard the summary against a run with zero episodes
        # (e.g. NUM_EPISODES == 0), which would otherwise divide by zero.
        mean_reward = np.mean(episode_rewards) if total_episodes else 0.0
        success_pct = (success_count / total_episodes) * 100 if total_episodes else 0.0
        print(f"\n{algorithm_name} completado!")
        print(f"Episodios: {total_episodes}")
        print(f"Recompensa promedio: {mean_reward:.2f}")
        print(f"Tasa de éxito: {success_pct:.2f}%")

    return metrics, agent

## 6. Entrenar DQN

In [None]:
# Train the plain DQN baseline; keep both the metrics dict and the trained
# agent, since later cells use them for the comparison and the evaluation.
dqn_metrics, dqn_agent = train_agent(use_double_dqn=False)

## 7. Entrenar Double DQN

In [None]:
# Train the Double DQN variant with the same train_agent function (and thus
# the same config and seed call) so the two runs are comparable.
ddqn_metrics, ddqn_agent = train_agent(use_double_dqn=True)

## 8. Comparación de Resultados

In [None]:
# Summarize both training runs with the project helper imported from
# visualize; the output format is defined there, not in this notebook.
print_comparison_summary(dqn_metrics, ddqn_metrics)

## 9. Visualizaciones Comparativas

In [None]:
# Make sure the output folder exists, then run every comparison plot helper
# on the same pair of metrics dicts, in a fixed order.
os.makedirs('plots', exist_ok=True)

comparison_plots = (
    plot_reward_comparison,
    plot_q_value_comparison,
    plot_loss_comparison,
    plot_success_rate_comparison,
    plot_convergence_comparison,
)
for make_plot in comparison_plots:
    make_plot(dqn_metrics, ddqn_metrics)

### 9.1 Comparación de Recompensas

In [None]:
# Image/display imported here are reused by the display cells that follow.
# The PNG is presumably written by plot_reward_comparison in the previous
# cell -- confirm the filename in visualize.
from IPython.display import Image, display
display(Image('plots/reward_comparison.png'))

### 9.2 Análisis de Q-Values (Sobreestimación)

In [None]:
# Relies on Image/display imported in the 9.1 cell; the PNG is presumably
# produced by plot_q_value_comparison -- confirm the filename in visualize.
display(Image('plots/q_value_comparison.png'))

### 9.3 Comparación de Pérdidas

In [None]:
# Relies on Image/display imported in the 9.1 cell; the PNG is presumably
# produced by plot_loss_comparison -- confirm the filename in visualize.
display(Image('plots/loss_comparison.png'))

### 9.4 Tasa de Éxito

In [None]:
# Relies on Image/display imported in the 9.1 cell; the PNG is presumably
# produced by plot_success_rate_comparison -- confirm the filename in visualize.
display(Image('plots/success_rate_comparison.png'))

### 9.5 Análisis de Convergencia

In [None]:
# Relies on Image/display imported in the 9.1 cell; the PNG is presumably
# produced by plot_convergence_comparison -- confirm the filename in visualize.
display(Image('plots/convergence_comparison.png'))

## 10. Evaluación de Agentes Entrenados

In [None]:
# Evaluate the trained DQN agent for 20 episodes on a freshly created
# environment, using the project's evaluate_agent helper.
env = gym.make(config.ENV_NAME)
try:
    dqn_eval = evaluate_agent(dqn_agent, env, num_episodes=20)
finally:
    # Release the environment even if evaluation raises.
    env.close()

# The keys below are read from the dict returned by evaluate_agent.
print("Evaluación DQN (20 episodios):")
print(f"  Recompensa promedio: {dqn_eval['mean_reward']:.2f} ± {dqn_eval['std_reward']:.2f}")
print(f"  Longitud promedio: {dqn_eval['mean_length']:.1f}")
print(f"  Tasa de éxito: {dqn_eval['success_rate']:.1f}%")

In [None]:
# Evaluate the trained Double DQN agent for 20 episodes on a freshly created
# environment, using the project's evaluate_agent helper.
env = gym.make(config.ENV_NAME)
try:
    ddqn_eval = evaluate_agent(ddqn_agent, env, num_episodes=20)
finally:
    # Release the environment even if evaluation raises.
    env.close()

# The keys below are read from the dict returned by evaluate_agent.
print("Evaluación Double DQN (20 episodios):")
print(f"  Recompensa promedio: {ddqn_eval['mean_reward']:.2f} ± {ddqn_eval['std_reward']:.2f}")
print(f"  Longitud promedio: {ddqn_eval['mean_length']:.1f}")
print(f"  Tasa de éxito: {ddqn_eval['success_rate']:.1f}%")

## 11. Análisis de Sobreestimación de Q-Values

In [None]:
# Compare the two runs by the mean of their per-episode average max-Q values,
# as recorded in 'episode_q_values' during training.
dqn_q_mean = np.mean(dqn_metrics['episode_q_values'])
ddqn_q_mean = np.mean(ddqn_metrics['episode_q_values'])
q_gap = dqn_q_mean - ddqn_q_mean

print("Análisis de Sobreestimación de Q-Values:")
print(f"\n  DQN Q-value promedio: {dqn_q_mean:.2f}")
print(f"  Double DQN Q-value promedio: {ddqn_q_mean:.2f}")
print(f"\n  Diferencia: {q_gap:.2f}")

# A reduction is reported only when DQN's mean is strictly higher; the gap is
# expressed as a percentage of the DQN mean.
if not (dqn_q_mean > ddqn_q_mean):
    print("  No se observa reducción de sobreestimación en este experimento")
else:
    reduction = (q_gap / dqn_q_mean) * 100
    print(f"  Double DQN reduce sobreestimación en {reduction:.1f}%")