# Integrantes de la práctica

* Alejandro Cortijo Benito
* Alejandro García Mota

# Librerías utilizadas

In [None]:
import tensorflow as tf

import numpy as np
from collections import deque

import gymnasium as gym

import imageio

# Arquitectura del DQN

Hemos tomado como referencia la red del `Lunar Lander`.

In [None]:
class DQN(tf.keras.Model):
    """Fully connected Q-network: two 32-unit ReLU layers and a linear head.

    The output layer has ``num_actions`` units and no activation, so the
    model emits one unbounded value per action.
    """

    def __init__(self, num_actions):
        """Create the three dense layers.

        Args:
            num_actions: Number of units in the output layer.
        """
        super().__init__()
        hidden_units = 32
        self.dense1 = tf.keras.layers.Dense(hidden_units, activation="relu")
        self.dense2 = tf.keras.layers.Dense(hidden_units, activation="relu")
        self.dense3 = tf.keras.layers.Dense(num_actions, dtype=tf.float32)

    def call(self, x):
        """Apply ``dense1``, ``dense2`` and ``dense3`` in order to ``x``."""
        hidden = self.dense2(self.dense1(x))
        return self.dense3(hidden)

# Number of actions (size of the network output).
num_actions = 4

# Two networks with the same architecture: `main_nn` is the one being
# optimised, `target_nn` is used to compute the bootstrap targets.
main_nn = DQN(num_actions)
target_nn = DQN(num_actions)

# Mean-squared-error loss and Adam optimiser (learning rate 1e-4).
mse = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

class ReplayBuffer:
    """Fixed-capacity store of ``(state, action, reward, next_state, done)``.

    Backed by a ``deque`` with ``maxlen=size``: once full, appending a new
    transition silently drops the oldest one.
    """

    def __init__(self, size):
        """Create an empty buffer holding at most ``size`` transitions."""
        self.buffer = deque(maxlen=size)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.buffer.append((state, action, reward, next_state, done))

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def sample(self, num_samples):
        """Draw ``num_samples`` transitions uniformly at random.

        Indices are drawn with replacement (``np.random.choice`` default),
        so the same transition may appear more than once in a batch.

        Args:
            num_samples: Number of transitions to draw.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of NumPy
            arrays whose first dimension is ``num_samples``. ``rewards`` and
            ``dones`` are cast to ``float32``; the other arrays keep the
            dtype NumPy infers from the stored values.

        Raises:
            ValueError: If the buffer is empty (raised by
                ``np.random.choice``).
        """
        idx = np.random.choice(len(self.buffer), num_samples)
        batch = [self.buffer[i] for i in idx]

        # Build each column straight from the sampled tuples. The previous
        # per-item `np.array(x, copy=False)` raises ValueError on NumPy >= 2
        # whenever a copy is unavoidable (e.g. for scalar actions).
        states = np.array([transition[0] for transition in batch])
        actions = np.array([transition[1] for transition in batch])
        rewards = np.array([transition[2] for transition in batch], dtype=np.float32)
        next_states = np.array([transition[3] for transition in batch])
        dones = np.array([transition[4] for transition in batch], dtype=np.float32)

        return states, actions, rewards, next_states, dones

## Política 

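En base a la experiencia de la primera parte de la práctica, decidimos usar una política ε-greedy: con probabilidad ε se elige una acción aleatoria y, en caso contrario, la acción con mayor valor Q.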

In [None]:
def select_epsilon_greedy_action(state, epsilon, env, main_nn):
    """Pick an action with an epsilon-greedy rule.

    Args:
        state: Batched input passed to ``main_nn``; only the first row of
            the network output is used.
        epsilon: Threshold compared against a uniform random draw.
        env: Environment whose ``action_space.sample()`` supplies the
            random action.
        main_nn: Network whose output is arg-maxed for the greedy action.

    Returns:
        A sampled action when the draw is below ``epsilon``; otherwise the
        index of the largest network output, as a NumPy value.
    """
    explore = tf.random.uniform((1,)) < epsilon
    if explore:
        return env.action_space.sample()

    q_values = main_nn(state)[0]
    return tf.argmax(q_values).numpy()

## Entrenamiento

Usando como referencia el método de entrenamiento propuesto en las diapositivas, adaptamos el método para nuestro caso de estudio.

In [None]:
# Discount factor applied to the bootstrapped next-state value.
discount = 0.99


@tf.function
def train_step(states, actions, rewards, next_states, dones, main_nn, target_nn, optimizer, mse, num_actions):
    """Run one gradient update of ``main_nn`` on a batch of transitions.

    The regression target is
    ``rewards + (1 - dones) * discount * max(target_nn(next_states))``,
    and the loss compares it with the ``main_nn`` output selected by
    ``actions``.

    Args:
        states: Batch of inputs for ``main_nn``.
        actions: Batch of action indices, one-hot encoded with depth
            ``num_actions``.
        rewards: Batch of rewards.
        next_states: Batch of inputs for ``target_nn``.
        dones: Batch of flags; a value of 1.0 removes the bootstrap term.
        main_nn: Network being trained.
        target_nn: Network used only to compute the targets.
        optimizer: Optimizer that applies the gradients to ``main_nn``.
        mse: Loss callable invoked as ``mse(target, prediction)``.
        num_actions: Depth of the one-hot action encoding.

    Returns:
        The loss value for this batch.
    """
    # Targets are computed before the tape is opened; gradients are taken
    # only with respect to main_nn's variables.
    best_next_q = tf.reduce_max(target_nn(next_states), axis=-1)
    td_target = rewards + (1.0 - dones) * discount * best_next_q

    with tf.GradientTape() as tape:
        all_qs = main_nn(states)
        # The one-hot mask keeps only the output of the action that was taken.
        taken_mask = tf.one_hot(actions, num_actions)
        taken_qs = tf.reduce_sum(taken_mask * all_qs, axis=-1)
        loss = mse(td_target, taken_qs)

    variables = main_nn.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss

## Hiperparámetros

Usamos los hiperparámetros de las diapositivas. Intentamos cambiarlos para tratar de mejorar los resultados, pero no lo conseguimos, así que decidimos mantener los de las diapositivas.

In [None]:
# Hyperparameters
num_episodes = 1_000  # upper bound of the episode loop
batch_size = 32  # transitions drawn from the buffer per training step
discount = 0.99  # discount factor for the bootstrapped value
epsilon = 1.0  # initial exploration threshold
cur_frame = 0  # running count of environment steps
buffer = ReplayBuffer(100_000)  # keeps at most 100000 transitions

## Proceso de entrenamiento

Usando las funciones definidas previamente, empezamos el aprendizaje.

In [None]:
# Training loop: interact with LunarLander without rendering, store every
# transition in the replay buffer and train on a sampled batch each step.
env = gym.make("LunarLander-v3", render_mode=None)

# Returns of the most recent episodes (at most 100 are kept).
last_100_ep_rewards = []

# Best windowed mean seen so far at a checkpoint episode.
last_best_reward = -np.inf

try:
    # `num_episodes + 1` iterations so the last report reads
    # "Episode num_episodes/num_episodes".
    for episode in range(num_episodes + 1):
        state, _ = env.reset()
        ep_reward, done = 0, False
        state = np.array(state, dtype=np.float32)

        while not done:
            state_in = tf.convert_to_tensor([state], dtype=tf.float32)
            action = select_epsilon_greedy_action(state_in, epsilon, env, main_nn)
            next_state, reward, terminated, truncated, _ = env.step(action)
            # The episode ends on either flag. Ignoring `truncated` kept
            # stepping an environment that had already hit its time limit.
            done = terminated or truncated
            next_state = np.array(next_state, dtype=np.float32)
            ep_reward += reward

            # Only `terminated` is stored: a time-limit cut-off is not a
            # terminal state, so its target must still bootstrap.
            buffer.add(state, action, reward, next_state, terminated)
            state = next_state
            cur_frame += 1

            # Copy the online weights into the target network every 2000 steps.
            if cur_frame % 2000 == 0:
                target_nn.set_weights(main_nn.get_weights())

            if len(buffer) >= batch_size:
                states, actions, rewards, next_states, dones = buffer.sample(batch_size)
                loss = train_step(states, actions, rewards, next_states, dones, main_nn, target_nn, optimizer, mse, num_actions)

        # Linear decay of 0.01 per episode, floored at 0.1.
        if episode < 950:
            epsilon = max(epsilon - 0.01, 0.1)

        if len(last_100_ep_rewards) == 100:
            last_100_ep_rewards.pop(0)
        last_100_ep_rewards.append(ep_reward)

        # Report, and checkpoint if improved, every 50 episodes.
        if episode % 50 == 0:
            mean_reward = np.mean(last_100_ep_rewards)
            print(f'Episode {episode}/{num_episodes}. Epsilon: {epsilon:.3f}. '
                  f'Reward in last 100 episodes: {mean_reward:.3f}')

            if last_best_reward < mean_reward:
                main_nn.save_weights("best_model_main.weights.h5")
                target_nn.save_weights("best_model_target.weights.h5")
                last_best_reward = mean_reward
finally:
    # Release the environment even if training is interrupted.
    env.close()

Episode 0/1000. Epsilon: 0.990. Reward in last 100 episodes: -118.519
Episode 50/1000. Epsilon: 0.490. Reward in last 100 episodes: -93.285
Episode 100/1000. Epsilon: 0.100. Reward in last 100 episodes: -90.993
Episode 150/1000. Epsilon: 0.100. Reward in last 100 episodes: 22.597
Episode 200/1000. Epsilon: 0.100. Reward in last 100 episodes: 115.000
Episode 250/1000. Epsilon: 0.100. Reward in last 100 episodes: 118.040
Episode 300/1000. Epsilon: 0.100. Reward in last 100 episodes: 105.967
Episode 350/1000. Epsilon: 0.100. Reward in last 100 episodes: 88.349
Episode 400/1000. Epsilon: 0.100. Reward in last 100 episodes: 126.601
Episode 450/1000. Epsilon: 0.100. Reward in last 100 episodes: 155.706
Episode 500/1000. Epsilon: 0.100. Reward in last 100 episodes: 161.419
Episode 550/1000. Epsilon: 0.100. Reward in last 100 episodes: 171.899
Episode 600/1000. Epsilon: 0.100. Reward in last 100 episodes: 190.788
Episode 650/1000. Epsilon: 0.100. Reward in last 100 episodes: 185.801
Episode 70

# Evaluación de resultados

Cargamos los mejores pesos, ejecutamos `5` experimentos y nos guardamos los resultados en un `.mp4`.

In [None]:
# Inference and video generation: load the best checkpoint, play 5 greedy
# episodes and collect every rendered frame.
env = gym.make("LunarLander-v3", render_mode="rgb_array")
main_nn.load_weights("best_model_main.weights.h5")

frames = []
try:
    for episode in range(5):
        state, _ = env.reset()
        done = False
        while not done:
            state_in = tf.convert_to_tensor([state], dtype=tf.float32)
            # Purely greedy action: no exploration during evaluation.
            action = tf.argmax(main_nn(state_in)[0]).numpy()
            next_state, _, terminated, truncated, _ = env.step(action)
            # Stop on the time limit too; otherwise a hovering lander would
            # keep the loop running and `frames` growing without bound.
            done = terminated or truncated
            frames.append(env.render())
            state = next_state
finally:
    # Release the environment even if an episode fails.
    env.close()

# Save the video
video_path = "lunar_lander_results.mp4"
imageio.mimsave(video_path, frames, fps=30, codec='libx264')
print(f"Video saved at {video_path}")

Video saved at lunar_lander_results.mp4
