In [1]:
%pip install swig -q
%pip install gymnasium[box2d] -q
%pip install loky -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import gymnasium as gym
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from collections import deque

In [3]:
# Create the LunarLander-v3 environment used by every later cell.
env = gym.make('LunarLander-v3')
# Size of the action space; used as the width of the Q-network output
# layer and as the depth of the one-hot action mask in train_step.
num_actions = env.action_space.n

In [4]:
class DQN(tf.keras.Model):
    """Multilayer perceptron that outputs one value per action.

    The network is two hidden ``Dense`` layers with ReLU activation followed
    by a ``Dense`` output layer with no activation.

    Args:
        n_actions: Number of units in the output layer. When omitted, the
            notebook-level ``num_actions`` is used, which preserves the
            original ``DQN()`` behavior.
        hidden_units: Number of units in each hidden layer (default 32).
    """

    def __init__(self, n_actions=None, hidden_units=32):
        super().__init__()
        if n_actions is None:
            # Backward-compatible default: size the output from the global
            # action count defined in the environment cell.
            n_actions = num_actions
        self.dense1 = tf.keras.layers.Dense(hidden_units, activation="relu")
        self.dense2 = tf.keras.layers.Dense(hidden_units, activation="relu")
        # No activation on the output layer: the raw outputs are returned.
        self.dense3 = tf.keras.layers.Dense(n_actions, dtype=tf.float32)

    def call(self, x):
        """Forward pass: apply the two hidden layers, then the output layer."""
        x = self.dense1(x)
        x = self.dense2(x)
        return self.dense3(x)

# Main network: the one whose weights are updated by gradient descent
# in train_step and that is queried for greedy actions.
main_nn = DQN()
# Target network: used in train_step to compute the bootstrap targets;
# the training loop periodically overwrites its weights with main_nn's.
target_nn = DQN()

# Adam optimizer with a learning rate of 1e-4
optimizer = tf.keras.optimizers.Adam(1e-4)
# Loss function (mean squared error)
mse = tf.keras.losses.MeanSquaredError()


In [5]:
class ReplayBuffer(object):
    """Fixed-capacity experience replay buffer that samples uniformly.

    Transitions are stored as ``(state, action, reward, next_state, done)``
    tuples in a ``deque``; once ``size`` transitions are held, adding a new
    one discards the oldest.
    """

    def __init__(self, size):
        """Create an empty buffer holding at most ``size`` transitions."""
        self.buffer = deque(maxlen=size)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.buffer.append((state, action, reward, next_state, done))

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def sample(self, num_samples):
        """Draw ``num_samples`` transitions uniformly at random.

        Indices are drawn with ``np.random.choice`` using its default
        sampling with replacement, so the same transition may appear more
        than once in a batch.

        Returns:
            A tuple ``(states, actions, rewards, next_states, dones)`` of
            NumPy arrays whose first dimension is ``num_samples``.
            ``rewards`` and ``dones`` are cast to ``float32``.
        """
        states, actions, rewards, next_states, dones = [], [], [], [], []

        idx = np.random.choice(len(self.buffer), num_samples)
        for i in idx:
            state, action, reward, next_state, done = self.buffer[i]

            # np.asarray avoids a copy when the input is already an array and
            # copies otherwise. np.array(..., copy=False) raises ValueError on
            # NumPy >= 2.0 whenever a copy is required (e.g. scalar actions).
            states.append(np.asarray(state))
            actions.append(np.asarray(action))
            rewards.append(reward)
            next_states.append(np.asarray(next_state))
            dones.append(done)

        states = np.array(states)
        actions = np.array(actions)
        rewards = np.array(rewards, dtype=np.float32)
        next_states = np.array(next_states)
        dones = np.array(dones, dtype=np.float32)

        return states, actions, rewards, next_states, dones


In [6]:
def select_epsilon_greedy_action(state, epsilon):
    """Pick an action with an epsilon-greedy policy.

    A uniform random number is drawn; if it is below ``epsilon`` a random
    action is sampled from the environment's action space, otherwise the
    action with the highest output of ``main_nn`` for ``state`` is returned.

    Args:
        state: Input passed to ``main_nn``; only the first row of the
            network output is used, so a batch of one is expected.
        epsilon: Exploration threshold compared against the random draw.
    """
    explore = tf.random.uniform((1,)) < epsilon
    if explore:
        # Exploration: random action from the environment.
        return env.action_space.sample()

    # Exploitation: index of the largest network output for this state.
    q_values = main_nn(state)
    return tf.argmax(q_values[0]).numpy()

@tf.function
def train_step(states, actions, rewards, next_states, dones):
    """Perform one gradient update of ``main_nn`` on a batch of transitions.

    The regression target is ``rewards + (1 - dones) * discount * max_a Q``,
    where the maximum is taken over the outputs of ``target_nn`` for
    ``next_states``. The loss is the MSE between that target and the output
    of ``main_nn`` for the actions actually taken.

    Reads the module-level ``discount``, ``num_actions``, ``mse`` and
    ``optimizer``.

    Returns:
        The scalar loss for this batch.
    """
    # Targets come from the target network and are computed outside the
    # gradient tape.
    best_next_q = tf.reduce_max(target_nn(next_states), axis=-1)
    td_target = rewards + (1. - dones) * discount * best_next_q

    with tf.GradientTape() as tape:
        q_values = main_nn(states)
        # One-hot mask keeps only the output for the action that was taken.
        chosen_q = tf.reduce_sum(
            tf.one_hot(actions, num_actions) * q_values, axis=-1)
        loss = mse(td_target, chosen_q)

    variables = main_nn.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))
    return loss

In [9]:
# Inspect what env.reset() returns. As the output below shows, it is a
# tuple of (observation array, info dict), not the bare observation, so
# callers must unpack it before feeding the observation to the network.
state = env.reset()
print("state type:", type(state))
print("state:", state)

state type: <class 'tuple'>
state: (array([-0.00364151,  1.4190147 , -0.3688649 ,  0.35975143,  0.00422643,
        0.08355333,  0.        ,  0.        ], dtype=float32), {})


In [16]:
# Hyperparameters
num_episodes = 1000
epsilon = 1.0
batch_size = 32
discount = 0.99
buffer = ReplayBuffer(100000)
cur_frame = 0

# Start of training: play one step, then train on one sampled batch.
last_100_ep_rewards = []

for episode in range(num_episodes + 1):
    state, _ = env.reset()  # Reset the environment; the info dict is unused.
    ep_reward = 0
    done = False

    while not done:
        # Add a leading batch dimension so the network receives a batch of one.
        state_in = tf.expand_dims(np.asarray(state, dtype=np.float32), axis=0)
        action = select_epsilon_greedy_action(state_in, epsilon)
        next_state, reward, terminated, truncated, info = env.step(action)

        # The episode is over when the environment reports either flag.
        # Looping on `terminated` alone keeps stepping an episode that has
        # already been truncated (e.g. by a step limit).
        done = terminated or truncated

        ep_reward += reward

        # Store the experience. Only `terminated` is stored as the done flag,
        # so the (1 - dones) factor in train_step zeroes the bootstrap term
        # for true terminal states but not for truncations.
        buffer.add(state, action, reward, next_state, terminated)
        state = next_state
        cur_frame += 1

        # Copy the weights from main_nn to target_nn every 2000 frames.
        if cur_frame % 2000 == 0:
            target_nn.set_weights(main_nn.get_weights())

        # Train the network once the buffer holds at least one batch.
        if len(buffer) >= batch_size:
            states, actions, rewards, next_states, dones = buffer.sample(batch_size)
            loss = train_step(states, actions, rewards, next_states, dones)

    # Decay epsilon linearly during the first 950 episodes (1.0 -> ~0.05).
    if episode < 950:
        epsilon -= 0.001

    # Keep only the rewards of the last 100 episodes.
    if len(last_100_ep_rewards) == 100:
        last_100_ep_rewards = last_100_ep_rewards[1:]
    last_100_ep_rewards.append(ep_reward)

    # Print progress every 50 episodes.
    if episode % 50 == 0:
        print(f'Episode {episode}/{num_episodes}. Epsilon: {epsilon:.3f}. '
              f'Reward in last 100 episodes: {np.mean(last_100_ep_rewards):.3f}')

env.close()

Episode 0/1000. Epsilon: 0.999. Reward in last 100 episodes: -109.824
Episode 50/1000. Epsilon: 0.949. Reward in last 100 episodes: -181.054
Episode 100/1000. Epsilon: 0.899. Reward in last 100 episodes: -174.349
Episode 150/1000. Epsilon: 0.849. Reward in last 100 episodes: -166.462
Episode 200/1000. Epsilon: 0.799. Reward in last 100 episodes: -159.868
Episode 250/1000. Epsilon: 0.749. Reward in last 100 episodes: -140.349
Episode 300/1000. Epsilon: 0.699. Reward in last 100 episodes: -126.237
Episode 350/1000. Epsilon: 0.649. Reward in last 100 episodes: -111.820
Episode 400/1000. Epsilon: 0.599. Reward in last 100 episodes: -75.713
Episode 450/1000. Epsilon: 0.549. Reward in last 100 episodes: -56.561
Episode 500/1000. Epsilon: 0.499. Reward in last 100 episodes: -47.810
Episode 550/1000. Epsilon: 0.449. Reward in last 100 episodes: -47.941
Episode 600/1000. Epsilon: 0.399. Reward in last 100 episodes: -69.999
Episode 650/1000. Epsilon: 0.349. Reward in last 100 episodes: -53.414
E