# Práctica de Aprendizaje por Refuerzo: Cliff Walking

Este notebook reproduce la funcionalidad de `main.py` para entrenar y comparar agentes SARSA, Q-Learning y Monte Carlo en el entorno Cliff Walking.

In [None]:
import gymnasium as gym
import numpy as np
import sys
import matplotlib.pyplot as plt
import time
import json

# Asegurarse de que podemos importar desde src
import os
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

from src.agent import SarsaAgent, QLearningAgent, MonteCarloAgent
from src.utils import plot_metrics, print_policy

In [None]:
def run_experiment(env, agent, n_episodes=500, max_steps=1000, max_global_time=300):
    """Train ``agent`` on ``env`` and record per-episode reward and length.

    Every step selects the next action *before* updating the agent, so the
    same loop can drive on-policy agents that need ``next_action`` as well
    as agents that ignore it.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        agent: Object providing ``choose_action(state)``,
            ``update(state, action, reward, next_state, next_action)`` and
            ``on_episode_end()``.
        n_episodes: Maximum number of episodes to run.
        max_steps: Per-episode step cap, applied in addition to the
            environment's own ``terminated``/``truncated`` signals.
        max_global_time: Wall-clock budget in seconds for the whole run.
            The budget is checked at the start of each episode; once it is
            exceeded the run stops early and returns what was collected.

    Returns:
        dict: ``{'rewards': [...], 'steps': [...]}`` with one entry per
        completed episode (fewer than ``n_episodes`` if the time budget
        ran out).
    """
    rewards = []
    steps_list = []

    # A monotonic clock keeps the elapsed-time measurement immune to
    # system clock adjustments during a long training run.
    start_time = time.monotonic()

    for _episode in range(n_episodes):
        # Global safety check: stop starting new episodes once over budget.
        current_total_time = time.monotonic() - start_time
        if current_total_time > max_global_time:
            print(f"\n[ALERTA DE SEGURIDAD] Tiempo máximo excedido ({current_total_time:.2f}s > {max_global_time}s).")
            print("Abortando entrenamiento y guardando progreso actual...")
            break

        state, _ = env.reset()
        action = agent.choose_action(state)
        total_reward = 0
        terminated = False
        truncated = False
        steps = 0

        while not (terminated or truncated) and steps < max_steps:
            next_state, reward, terminated, truncated, _ = env.step(action)

            # Pick the follow-up action first: the update call receives it.
            next_action = agent.choose_action(next_state)
            agent.update(state, action, reward, next_state, next_action)

            state = next_state
            action = next_action
            total_reward += reward
            steps += 1

        rewards.append(total_reward)
        steps_list.append(steps)

        # End-of-episode hook on the agent.
        agent.on_episode_end()

    return {'rewards': rewards, 'steps': steps_list}

In [None]:
# Environment setup.
# Try the slippery variant first and fall back to the default configuration
# when the installed environment does not accept the `is_slippery` keyword.
try:
    env = gym.make('CliffWalking-v1', is_slippery=True)
except TypeError:
    # An unknown keyword argument surfaces as TypeError. Catching only that
    # keeps KeyboardInterrupt/SystemExit and unrelated failures visible
    # instead of hiding them behind the fallback.
    print("Advertencia: 'is_slippery' no aceptado, usando config por defecto.")
    env = gym.make('CliffWalking-v1')

n_states = env.observation_space.n
n_actions = env.action_space.n
n_episodes = 1000

# Agents to compare, keyed by the display name used in logs and metrics.
agents = {
    'SARSA': SarsaAgent(n_states, n_actions),
    'Q-Learning': QLearningAgent(n_states, n_actions),
    'Monte Carlo': MonteCarloAgent(n_states, n_actions)
}

# Metrics returned by run_experiment for each agent, keyed by agent name.
all_metrics = {}

for name, agent in agents.items():
    print(f"\n--- Ejecutando {name} ---")
    start_time = time.time()
    metrics = run_experiment(env, agent, n_episodes)
    elapsed = time.time() - start_time
    print(f"Completado en {elapsed:.2f}s")

    all_metrics[name] = metrics

    # Show the policy the agent ended up with.
    print_policy(agent)

In [None]:
# Plot the comparison: hand the per-agent metrics in `all_metrics` to
# `plot_metrics` (imported from src.utils), then display the open figures.
print("\nGenerando gráfica comparativa...")
plot_metrics(all_metrics)
plt.show()