# Q-Learning y Deep Learning (DQN)

Q-Learning y Deep Learning (DQN)
Conceptos clave
Q-Learning clásico: usa una tabla Q para almacenar valores Q(s,a). Esto funciona bien para espacios de estado pequeños y discretos.

Deep Q-Network (DQN): usa una red neuronal para aproximar la función Q(s,a), permitiendo manejar espacios de estado grandes o continuos.

La red recibe el estado como entrada y produce un valor Q para cada acción posible.

Se entrena usando las muestras de experiencia (transiciones estado, acción, recompensa, siguiente estado).

In [2]:
pip install gymnasium[all]  # para instalar gymnasium con todos los extras

Collecting box2d-py==2.3.5 (from gymnasium[all])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/374.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m276.5/374.4 kB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting swig==4.* (from gymnasium[all])
  Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Collecting mujoco>=2.1.5 (from gymnasium[all])
  Downloading mujoco-3.3.5-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting array-api-compat>=1.11.0 (from gymnasium[all])
 

In [5]:
import gymnasium as gym

env = gym.make("CartPole-v1")  # Crear el entorno
state, info = env.reset()       # Aquí sí devuelve dos valores

action = env.action_space.sample()  # Por ejemplo, una acción aleatoria

next_state, reward, terminated, truncated, info = env.step(action)
done = terminated or truncated

In [7]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

# Red neuronal para aproximar Q(s,a)
class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(DQN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim)
        )

    def forward(self, x):
        return self.net(x)

# Configuración
env = gym.make("CartPole-v1")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

policy_net = DQN(state_dim, action_dim)
target_net = DQN(state_dim, action_dim)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=0.001)
criterion = nn.MSELoss()

memory = deque(maxlen=10000)
batch_size = 64
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
num_episodes = 300
target_update = 10

def select_action(state):
    global epsilon
    if random.random() < epsilon:
        return env.action_space.sample()
    else:
        state_v = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            q_values = policy_net(state_v)
        return torch.argmax(q_values).item()

def optimize_model():
    if len(memory) < batch_size:
        return
    batch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    states = torch.FloatTensor(states)
    actions = torch.LongTensor(actions).unsqueeze(1)
    rewards = torch.FloatTensor(rewards).unsqueeze(1)
    next_states = torch.FloatTensor(next_states)
    dones = torch.FloatTensor(dones).unsqueeze(1)

    q_values = policy_net(states).gather(1, actions)
    with torch.no_grad():
        max_next_q_values = target_net(next_states).max(1)[0].unsqueeze(1)
        target_q_values = rewards + gamma * max_next_q_values * (1 - dones)

    loss = criterion(q_values, target_q_values)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Entrenamiento
for episode in range(num_episodes):
    state, _ = env.reset()  # <-- aquí el cambio
    done = False
    total_reward = 0

    while not done:
        action = select_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)  # <-- cambio aquí
        done = terminated or truncated

        memory.append((state, action, reward, next_state, float(done)))
        state = next_state
        total_reward += reward

        optimize_model()

    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    if episode % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())

    print(f"Episodio {episode+1}, Recompensa: {total_reward}, Epsilon: {epsilon:.3f}")

env.close()

Episodio 1, Recompensa: 31.0, Epsilon: 0.995


  states = torch.FloatTensor(states)


Episodio 2, Recompensa: 52.0, Epsilon: 0.990
Episodio 3, Recompensa: 19.0, Epsilon: 0.985
Episodio 4, Recompensa: 30.0, Epsilon: 0.980
Episodio 5, Recompensa: 39.0, Epsilon: 0.975
Episodio 6, Recompensa: 16.0, Epsilon: 0.970
Episodio 7, Recompensa: 15.0, Epsilon: 0.966
Episodio 8, Recompensa: 13.0, Epsilon: 0.961
Episodio 9, Recompensa: 27.0, Epsilon: 0.956
Episodio 10, Recompensa: 10.0, Epsilon: 0.951
Episodio 11, Recompensa: 36.0, Epsilon: 0.946
Episodio 12, Recompensa: 19.0, Epsilon: 0.942
Episodio 13, Recompensa: 35.0, Epsilon: 0.937
Episodio 14, Recompensa: 28.0, Epsilon: 0.932
Episodio 15, Recompensa: 11.0, Epsilon: 0.928
Episodio 16, Recompensa: 39.0, Epsilon: 0.923
Episodio 17, Recompensa: 22.0, Epsilon: 0.918
Episodio 18, Recompensa: 15.0, Epsilon: 0.914
Episodio 19, Recompensa: 25.0, Epsilon: 0.909
Episodio 20, Recompensa: 21.0, Epsilon: 0.905
Episodio 21, Recompensa: 16.0, Epsilon: 0.900
Episodio 22, Recompensa: 19.0, Epsilon: 0.896
Episodio 23, Recompensa: 17.0, Epsilon: 0.

In [8]:
state, _ = env.reset()
done = False
total_reward

210.0

In [9]:
state, _ = env.reset()
done = False
total_reward = 0

while not done:
    state_v = torch.FloatTensor(state).unsqueeze(0)
    with torch.no_grad():
        q_values = policy_net(state_v)
    action = torch.argmax(q_values).item()
    next_state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    state = next_state
    total_reward += reward
    env.render()

print(f"Recompensa en evaluación: {total_reward}")

env.close()

Recompensa en evaluación: 258.0


  gym.logger.warn(


In [10]:
torch.save(policy_net.state_dict(), "dqn_cartpole.pth")

In [11]:
num_eval_episodes = 10
total_rewards = []

for _ in range(num_eval_episodes):
    state, _ = env.reset()
    done = False
    episode_reward = 0
    while not done:
        state_v = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            q_values = policy_net(state_v)
        action = torch.argmax(q_values).item()
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        state = next_state
        episode_reward += reward
    total_rewards.append(episode_reward)

print(f"Recompensa promedio en {num_eval_episodes} episodios: {np.mean(total_rewards)}")

Recompensa promedio en 10 episodios: 248.6


In [12]:
import matplotlib.pyplot as plt
from gym.wrappers.monitoring import video_recorder

env = gym.make("CartPole-v1", render_mode="rgb_array")
recorder = video_recorder.VideoRecorder(env, path="cartpole_dqn.mp4")

state, _ = env.reset()
done = False

while not done:
    recorder.capture_frame()
    state_v = torch.FloatTensor(state).unsqueeze(0)
    with torch.no_grad():
        q_values = policy_net(state_v)
    action = torch.argmax(q_values).item()
    next_state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    state = next_state

recorder.close()
env.close()

  logger.deprecation(
  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


In [13]:
# Guardar
torch.save(policy_net.state_dict(), "dqn_cartpole.pth")

# Cargar
policy_net.load_state_dict(torch.load("dqn_cartpole.pth"))
policy_net.eval()

DQN(
  (net): Sequential(
    (0): Linear(in_features=4, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=2, bias=True)
  )
)