In [14]:
import tensorflow as tf

class DQN(tf.keras.Model):
    """
    Multilayer perceptron used as the Q-network.

    Two hidden dense layers of 32 ReLU units each are followed by a dense
    output layer with ``num_actions`` units and no activation, so the output
    row holds one unbounded value per action.
    """
    def __init__(self, num_actions):
        """Create the three dense layers; ``num_actions`` sets the output width."""
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(units=32, activation="relu")
        self.dense2 = tf.keras.layers.Dense(units=32, activation="relu")
        # No activation on the last layer: its outputs are used as raw values.
        self.dense3 = tf.keras.layers.Dense(units=num_actions, dtype=tf.float32)

    def call(self, x):
        """
        Forward pass: feed ``x`` through the three dense layers in order and
        return the output of the last one.
        """
        for layer in (self.dense1, self.dense2, self.dense3):
            x = layer(x)
        return x

# Width of the network output: one value per action. Adjust to the problem.
num_actions = 4

# Two networks with the same architecture: the main one is trained, the
# target one is used to compute the training targets.
main_nn, target_nn = DQN(num_actions), DQN(num_actions)

# Adam optimizer (learning rate 1e-4) and mean-squared-error loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
mse = tf.keras.losses.MeanSquaredError()

In [15]:
import numpy as np
from collections import deque

class ReplayBuffer:
    """
    Fixed-capacity experience replay buffer that samples uniformly.

    Transitions are kept as ``(state, action, reward, next_state, done)``
    tuples in a ``deque``; once ``size`` transitions are stored, appending a
    new one drops the oldest.
    """
    def __init__(self, size):
        """Create an empty buffer that holds at most ``size`` transitions."""
        self.buffer = deque(maxlen=size)

    def add(self, state, action, reward, next_state, done):
        """Append one transition, evicting the oldest if the buffer is full."""
        self.buffer.append((state, action, reward, next_state, done))

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def sample(self, num_samples):
        """
        Draw ``num_samples`` transitions uniformly at random, with replacement.

        Returns:
            A tuple ``(states, actions, rewards, next_states, dones)`` of
            NumPy arrays whose first axis indexes the sampled transitions.
            ``rewards`` and ``dones`` are cast to ``float32``; the other
            arrays keep the dtype NumPy infers from the stored values.

        Sampling from an empty buffer is not supported: the underlying
        ``np.random.choice`` call rejects an empty population.
        """
        states, actions, rewards, next_states, dones = [], [], [], [], []
        idx = np.random.choice(len(self.buffer), num_samples)

        for i in idx:
            state, action, reward, next_state, done = self.buffer[i]
            # np.asarray reuses the stored array when it can and copies when
            # it must. np.array(..., copy=False) is not equivalent on
            # NumPy >= 2.0: there it raises ValueError whenever a copy is
            # unavoidable, e.g. for Python ints, lists or NumPy scalars.
            states.append(np.asarray(state))
            actions.append(np.asarray(action))
            rewards.append(reward)
            next_states.append(np.asarray(next_state))
            dones.append(done)

        states = np.array(states)
        actions = np.array(actions)
        rewards = np.array(rewards, dtype=np.float32)
        next_states = np.array(next_states)
        dones = np.array(dones, dtype=np.float32)

        return states, actions, rewards, next_states, dones

In [16]:
import tensorflow as tf

# Epsilon-greedy action selection
def select_epsilon_greedy_action(state, epsilon, env, main_nn):
    """
    Pick an action with an epsilon-greedy policy.

    A uniform random number is drawn; if it falls below ``epsilon`` the
    action is sampled from ``env.action_space``. Otherwise the action is the
    argmax of the first row of ``main_nn(state)``, returned as a NumPy
    value, so ``state`` is expected to carry a leading batch axis.
    """
    draw = tf.random.uniform((1,))
    explore = draw < epsilon
    if explore:
        # Exploration: random action from the environment's action space.
        return env.action_space.sample()

    # Exploitation: highest-valued action according to the main network.
    q_values = main_nn(state)
    return tf.argmax(q_values[0]).numpy()

# Training step, wrapped in @tf.function so TensorFlow runs it as a graph.
# Discount factor (gamma) applied to the next-state value in the target.
discount = 0.99

@tf.function
def train_step(states, actions, rewards, next_states, dones, main_nn, target_nn, optimizer, mse, num_actions):
    """
    Run one gradient update of the main network on a batch of transitions.

    The regression target is ``rewards + (1 - dones) * discount * max_a Q_target(next_states, a)``,
    computed with ``target_nn`` outside the gradient tape. The prediction is
    the main network's output for the action actually taken, selected with a
    one-hot mask of depth ``num_actions``. The loss ``mse(target, prediction)``
    is differentiated with respect to ``main_nn.trainable_variables`` and
    applied through ``optimizer``.

    Returns:
        The loss value for this batch.
    """
    # Highest target-network value over the actions of each next state.
    best_next_q = tf.reduce_max(target_nn(next_states), axis=-1)

    # Transitions flagged as done contribute only their immediate reward.
    not_done = 1.0 - dones
    target = rewards + not_done * discount * best_next_q

    with tf.GradientTape() as tape:
        q_all = main_nn(states)
        taken = tf.one_hot(actions, num_actions)
        # The mask zeroes every column except the chosen action's.
        q_taken = tf.reduce_sum(taken * q_all, axis=-1)
        loss = mse(target, q_taken)

    # Backpropagate and update only the main network.
    variables = main_nn.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss

In [20]:
import numpy as np
import tensorflow as tf
import gymnasium as gym

# Hyperparameters
num_episodes = 1000
epsilon = 1.0                  # initial exploration probability
epsilon_min = 0.1              # floor for epsilon
epsilon_decay = 0.01           # amount subtracted from epsilon per episode
epsilon_decay_episodes = 950   # decay is only applied before this episode
batch_size = 32
discount = 0.99                # read as a global by train_step
target_update_frames = 2000    # copy main -> target every this many frames
log_every = 50                 # print / checkpoint interval, in episodes
buffer = ReplayBuffer(100000)
cur_frame = 0

env = gym.make("LunarLander-v3", render_mode=None)

# Rolling window of the most recent 100 episode returns.
last_100_ep_rewards = deque(maxlen=100)

# Best rolling mean seen so far; used to decide when to checkpoint.
last_best_reward = -np.inf

# try/finally so the environment is closed even if training is interrupted.
try:
    # Subclassed Keras models create their variables on the first call.
    # Build both networks on a dummy observation, then start the target
    # network as an exact copy of the main one instead of leaving it with
    # unrelated random weights until the first periodic sync.
    dummy_obs = tf.zeros((1,) + tuple(env.observation_space.shape), dtype=tf.float32)
    main_nn(dummy_obs)
    target_nn(dummy_obs)
    target_nn.set_weights(main_nn.get_weights())

    # Main training loop (num_episodes + 1 iterations so the final
    # "Episode num_episodes/num_episodes" progress line is printed).
    for episode in range(num_episodes + 1):
        state, _ = env.reset()
        ep_reward, done = 0, False
        state = np.array(state, dtype=np.float32)

        while not done:
            # Add a leading batch axis for the network.
            state_in = tf.convert_to_tensor(state[np.newaxis, :], dtype=tf.float32)
            action = select_epsilon_greedy_action(state_in, epsilon, env, main_nn)

            # Gymnasium returns (obs, reward, terminated, truncated, info).
            # The episode ends on either flag, but only a true termination
            # means "no future reward"; a time-limit truncation must still
            # bootstrap from next_state, so only `terminated` is stored.
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            next_state = np.array(next_state, dtype=np.float32)
            ep_reward += reward

            # Store the transition in the replay buffer.
            buffer.add(state, action, reward, next_state, terminated)
            state = next_state
            cur_frame += 1

            # Periodically copy the main network's weights into the target network.
            if cur_frame % target_update_frames == 0:
                target_nn.set_weights(main_nn.get_weights())

            # Train once the buffer holds at least one batch of transitions.
            if len(buffer) >= batch_size:
                states, actions, rewards, next_states, dones = buffer.sample(batch_size)
                loss = train_step(states, actions, rewards, next_states, dones, main_nn, target_nn, optimizer, mse, num_actions)

        # Linearly decay epsilon down to its floor to reduce exploration.
        if episode < epsilon_decay_episodes:
            epsilon = max(epsilon - epsilon_decay, epsilon_min)

        # deque(maxlen=100) discards the oldest return automatically.
        last_100_ep_rewards.append(ep_reward)

        # Report progress and checkpoint when the rolling mean improves.
        if episode % log_every == 0:
            mean_reward = np.mean(last_100_ep_rewards)
            print(f'Episode {episode}/{num_episodes}. Epsilon: {epsilon:.3f}. '
                  f'Reward in last 100 episodes: {mean_reward:.3f}')

            if last_best_reward < mean_reward:
                main_nn.save_weights("best_model_main.weights.h5")
                target_nn.save_weights("best_model_target.weights.h5")
                last_best_reward = mean_reward
finally:
    # Release the environment's resources.
    env.close()


Episode 0/1000. Epsilon: 0.990. Reward in last 100 episodes: -118.519
Episode 50/1000. Epsilon: 0.490. Reward in last 100 episodes: -93.285
Episode 100/1000. Epsilon: 0.100. Reward in last 100 episodes: -90.993
Episode 150/1000. Epsilon: 0.100. Reward in last 100 episodes: 22.597
Episode 200/1000. Epsilon: 0.100. Reward in last 100 episodes: 115.000
Episode 250/1000. Epsilon: 0.100. Reward in last 100 episodes: 118.040
Episode 300/1000. Epsilon: 0.100. Reward in last 100 episodes: 105.967
Episode 350/1000. Epsilon: 0.100. Reward in last 100 episodes: 88.349
Episode 400/1000. Epsilon: 0.100. Reward in last 100 episodes: 126.601
Episode 450/1000. Epsilon: 0.100. Reward in last 100 episodes: 155.706
Episode 500/1000. Epsilon: 0.100. Reward in last 100 episodes: 161.419
Episode 550/1000. Epsilon: 0.100. Reward in last 100 episodes: 171.899
Episode 600/1000. Epsilon: 0.100. Reward in last 100 episodes: 190.788
Episode 650/1000. Epsilon: 0.100. Reward in last 100 episodes: 185.801
Episode 70

In [10]:
# Persist the current weights of both networks, one file per network.
for _network, _weights_path in (
    (main_nn, 'main_nn_weights_episode.weights.h5'),
    (target_nn, 'target_nn_weights_episode.weights.h5'),
):
    _network.save_weights(_weights_path)

In [None]:
# Restore the weights written by the save cell above. The previous path,
# '.h5', names no file this notebook produces, so loading always failed.
# To restore the best checkpoint from training instead, use
# "best_model_main.weights.h5" / "best_model_target.weights.h5".
# NOTE(review): on a fresh kernel the subclassed networks have no variables
# until they are called once; they may need a forward pass on a dummy batch
# before load_weights can match the stored weights — confirm.
main_nn.load_weights('main_nn_weights_episode.weights.h5')
target_nn.load_weights('target_nn_weights_episode.weights.h5')