# Laboratorio 7 Deep Learning

Laboratorio 7 – Deep Learning

Edwin Ortega 22305 - Esteban Zambrano 22119

Link del repositorio:<br>
https://github.com/EstebanZG999/Lab7_DL

### Task 1 - Práctica

In [17]:
# pip install gymnasium numpy torch matplotlib

#### Imports

In [18]:
import math
import random
import numpy as np
import collections
from dataclasses import dataclass
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim

import gymnasium as gym

#### Configuración y utilidades

In [19]:
# Seed every random number generator the experiment relies on so that a run
# can be repeated.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One environment for training and a separate one for evaluation.
ENV_ID = "CartPole-v1"
env = gym.make(ENV_ID)
eval_env = gym.make(ENV_ID)

# reset(seed=...) seeds the environment's own dynamics.
obs, _ = env.reset(seed=SEED)

# The action space keeps a separate RNG that backs action_space.sample().
# Seeding it explicitly makes the random exploration actions reproducible.
env.action_space.seed(SEED)

# Sizes of the observation vector and of the discrete action set.
n_states = env.observation_space.shape[0]    # 4
n_actions = env.action_space.n               # 2

#### Red Q

In [20]:
class QNetwork(nn.Module):
    """Fully connected network that maps an input vector to ``out_dim`` outputs.

    The stack is two ``Linear`` + ``ReLU`` stages of width ``hidden`` followed
    by a final ``Linear`` layer of width ``out_dim``.

    Args:
        in_dim: Size of the input feature vector.
        out_dim: Number of outputs produced per input row.
        hidden: Width of both hidden layers.
    """

    def __init__(self, in_dim: int, out_dim: int, hidden: int = 128):
        super().__init__()
        # Layers are listed in forward order. Wrapping them in a Sequential
        # keeps the parameter names as "net.<index>.*".
        stack = [
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, out_dim),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the output of the layer stack for input ``x``."""
        outputs = self.net(x)
        return outputs

#### Replay Buffer

In [21]:
# One stored environment step; the fields are filled positionally by push().
Transition = collections.namedtuple(
    "Transition",
    ["state", "action", "reward", "next_state", "done"],
)


class ReplayBuffer:
    """Fixed-capacity FIFO store of ``Transition`` records.

    Once ``capacity`` items are held, appending a new one discards the oldest.
    """

    def __init__(self, capacity: int):
        self.buffer = collections.deque(maxlen=capacity)

    def push(self, *args):
        """Store one transition built from ``args`` (in ``Transition`` field order)."""
        record = Transition(*args)
        self.buffer.append(record)

    def sample(self, batch_size: int):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A single ``Transition`` whose fields are tuples holding the
            corresponding values of every sampled record.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of records into one tuple per field.
        columns = zip(*picked)
        return Transition._make(columns)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

#### Hiperparámetros

In [22]:
# --- Learning ---
GAMMA = 0.99              # discount applied to the bootstrapped next-state value
LR = 0.001                # learning rate handed to the optimizer
BATCH_SIZE = 64           # transitions sampled per optimization step
BUFFER_CAPACITY = 50000   # maximum number of transitions kept for replay

# --- Exploration (epsilon-greedy) ---
EPS_START = 1.0           # epsilon at step 0
EPS_END = 0.05            # value epsilon decays towards
EPS_DECAY = 10000         # time constant (in steps) of the exponential decay

# --- Target network synchronisation ---
TARGET_SYNC_EVERY = 1000  # steps between hard copies of the online weights
USE_SOFT_UPDATE = False   # True switches to a soft update on every step
TAU = 0.005               # mixing factor used by the soft update

# --- Training ---
MAX_EPISODES = 400
MAX_STEPS_PER_EP = 1000
RENDER_DURING_TRAIN = False  # set to True to render while training

# --- Evaluation ---
EVAL_EPISODES = 5

#### Inicialización de redes y optimizador


In [23]:
# Online network: the one the optimizer trains and that is queried for
# greedy actions.
policy_net = QNetwork(n_states, n_actions)
policy_net.to(device)

# Target network: starts as an exact copy of the online weights and is kept
# in eval mode.
target_net = QNetwork(n_states, n_actions)
target_net.to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Experience store and the optimizer over the online network's parameters.
replay = ReplayBuffer(BUFFER_CAPACITY)
optimizer = optim.Adam(policy_net.parameters(), lr=LR)

#### Selección de acción

In [24]:
# Number of actions selected so far. select_action() increments it and uses
# it as the input to the epsilon schedule.
steps_done = 0


def epsilon_by_step(step: int) -> float:
    """Return the exploration rate epsilon for the given step count.

    Smooth exponential schedule:
    ``eps = EPS_END + (EPS_START - EPS_END) * exp(-step / EPS_DECAY)``.
    """
    span = EPS_START - EPS_END
    decay_factor = math.exp(-step / EPS_DECAY)
    return EPS_END + span * decay_factor

def select_action(state: np.ndarray) -> int:
    """Choose an action for ``state`` with an epsilon-greedy rule.

    With probability epsilon (taken from ``epsilon_by_step`` at the current
    ``steps_done``) a random action is sampled from the environment's action
    space. Otherwise the action with the highest ``policy_net`` output is
    returned. Every call increments the global ``steps_done`` counter.
    """
    global steps_done
    explore_prob = epsilon_by_step(steps_done)
    steps_done += 1

    explore = random.random() < explore_prob
    if explore:
        return env.action_space.sample()

    # Greedy branch: no gradients are needed for action selection.
    with torch.no_grad():
        obs_batch = torch.as_tensor(state, dtype=torch.float32, device=device)
        obs_batch = obs_batch.unsqueeze(0)  # add a leading batch dimension
        q_row = policy_net(obs_batch)
        best = torch.argmax(q_row, dim=1)
        return int(best.item())

#### Actualización de la red

In [25]:
# Despite its name, this criterion is the Huber (smooth L1) loss.
mse_loss = nn.SmoothL1Loss()


def optimize_model():
    """Run one gradient step on ``policy_net`` from a sampled replay minibatch.

    Returns:
        The loss of the step as a Python float, or ``None`` when the replay
        buffer holds fewer than ``BATCH_SIZE`` transitions and no update ran.
    """
    if len(replay) < BATCH_SIZE:
        return None

    minibatch = replay.sample(BATCH_SIZE)

    def as_matrix(rows):
        # Stack the per-transition arrays into a single float32 tensor.
        return torch.as_tensor(np.array(rows), dtype=torch.float32, device=device)

    def as_column(values, dtype):
        # Convert a tuple of per-transition values and add a trailing axis.
        return torch.as_tensor(values, dtype=dtype, device=device).unsqueeze(1)

    states = as_matrix(minibatch.state)
    actions = as_column(minibatch.action, torch.int64)
    rewards = as_column(minibatch.reward, torch.float32)
    next_states = as_matrix(minibatch.next_state)
    dones = as_column(minibatch.done, torch.float32)

    # Q(s, a) from the online network for the actions that were taken.
    predicted_q = policy_net(states).gather(1, actions)

    # Target: r + gamma * max_a' Q_target(s', a'), with the bootstrap term
    # zeroed wherever the stored done flag is 1.
    with torch.no_grad():
        best_next_q = target_net(next_states).max(dim=1, keepdim=True).values
        not_done = 1.0 - dones
        td_target = rewards + not_done * GAMMA * best_next_q

    batch_loss = mse_loss(predicted_q, td_target)

    optimizer.zero_grad(set_to_none=True)
    batch_loss.backward()
    # Cap the global gradient norm at 10 before the optimizer step.
    torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 10.0)
    optimizer.step()
    return float(batch_loss.item())

def hard_update_target():
    """Overwrite the target network's state with a copy of ``policy_net``'s."""
    online_state = policy_net.state_dict()
    target_net.load_state_dict(online_state)

def soft_update_target(tau: float):
    """Blend ``policy_net`` parameters into ``target_net`` in place.

    Each target parameter becomes ``(1 - tau) * target + tau * policy``.
    """
    with torch.no_grad():
        pairs = zip(policy_net.parameters(), target_net.parameters())
        for online_param, target_param in pairs:
            blended = target_param.data  # shares storage with the parameter
            blended.mul_(1 - tau)
            blended.add_(tau * online_param.data)

#### Ciclo de entrenamiento

In [None]:
# Per-episode statistics collected by the training loop.
episode_rewards = []
episode_losses = []

# Environment steps taken across all episodes; drives the hard target sync.
global_step = 0

for ep in range(1, MAX_EPISODES + 1):
    # A different seed per episode varies the start state but stays repeatable.
    state, _ = env.reset(seed=SEED + ep)
    ep_reward = 0.0
    ep_losses = []

    for t in range(MAX_STEPS_PER_EP):
        if RENDER_DURING_TRAIN:
            env.render()

        action = select_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode ends on either signal.
        done = terminated or truncated

        # Store only `terminated` as the terminal flag. A truncation (time
        # limit) does not mean the state is terminal, so the TD target must
        # still bootstrap from next_state. Storing `done` would zero the
        # bootstrap term on truncation and bias the value estimates.
        replay.push(state, action, reward, next_state, float(terminated))

        state = next_state
        ep_reward += reward
        global_step += 1

        # One gradient step per environment step, once the buffer can
        # supply a full batch (optimize_model returns None before that).
        loss = optimize_model()
        if loss is not None:
            ep_losses.append(loss)

        # Update the target network: soft update every step, or a hard
        # copy every TARGET_SYNC_EVERY steps.
        if USE_SOFT_UPDATE:
            soft_update_target(TAU)
        else:
            if global_step % TARGET_SYNC_EVERY == 0:
                hard_update_target()

        if done:
            break

    episode_rewards.append(ep_reward)
    # NaN marks episodes in which no optimization step ran.
    episode_losses.append(np.mean(ep_losses) if ep_losses else np.nan)

    if ep % 10 == 0:
        avg_last = np.mean(episode_rewards[-10:])
        print(f"[Episodio {ep:03d}] Recompensa media (últimos 10): {avg_last:.1f} | "
              f"epsilon≈{epsilon_by_step(global_step):.3f}")

# Release the environment (and its render window, if one was opened).
env.close()

[Episodio 010] Recompensa media (últimos 10): 24.9 | epsilon≈0.977
[Episodio 020] Recompensa media (últimos 10): 17.9 | epsilon≈0.960
[Episodio 030] Recompensa media (últimos 10): 22.7 | epsilon≈0.940
[Episodio 040] Recompensa media (últimos 10): 26.9 | epsilon≈0.916
[Episodio 050] Recompensa media (últimos 10): 23.5 | epsilon≈0.896
[Episodio 060] Recompensa media (últimos 10): 24.8 | epsilon≈0.875
[Episodio 070] Recompensa media (últimos 10): 23.4 | epsilon≈0.856
[Episodio 080] Recompensa media (últimos 10): 26.2 | epsilon≈0.835
[Episodio 090] Recompensa media (últimos 10): 17.8 | epsilon≈0.822
[Episodio 100] Recompensa media (últimos 10): 27.2 | epsilon≈0.801
[Episodio 110] Recompensa media (últimos 10): 35.7 | epsilon≈0.774
[Episodio 120] Recompensa media (últimos 10): 26.4 | epsilon≈0.756
[Episodio 130] Recompensa media (últimos 10): 32.0 | epsilon≈0.733
[Episodio 140] Recompensa media (últimos 10): 26.2 | epsilon≈0.716
[Episodio 150] Recompensa media (últimos 10): 22.9 | epsilon≈0