## 1. Entorno crudeTanksEnv

### 1.1. Establecer el entorno

En primer lugar cargaremos la librería __gymnasium__ (sucesora de __gym__) e inicializaremos el entorno.

In [1]:
# One-time setup: when running on Google Colab, install gymnasium and download
# the custom environment module. `isColab` is set afterwards so that re-running
# this cell in the same session skips the install/download.
if not ('isColab' in locals()):
    if 'google.colab' in str(get_ipython()):
        print('Running on CoLab')
        !pip install gymnasium
        !wget https://github.com/BaltiBoix/Scheduling/raw/master/tankEnvV1.py
        isColab = True
    else:
        # Nothing is installed or downloaded in this branch; tankEnvV1.py is
        # presumably already available locally -- TODO confirm.
        print('Not running on CoLab')

Not running on CoLab


In [2]:
import gymnasium as gym
from gymnasium.spaces import flatten as flat
import torch
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy, copy
from collections import namedtuple, deque, OrderedDict
import os
import IPython.display
import sklearn
import sklearn.pipeline
from sklearn.kernel_approximation import RBFSampler

import tankEnvV1

# Instantiate the custom environment by id; presumably the `import tankEnvV1`
# above is what registers 'crudeTanksEnv-v1' -- TODO confirm.
env = gym.envs.make('crudeTanksEnv-v1')

  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


In [3]:
# Report whether PyTorch can see a CUDA device.
print('Is torch cuda available?: {}'.format(torch.cuda.is_available()))

Is torch cuda available?: True


In [4]:
# Display the environment's registration spec (reward threshold, max episode steps, ...).
env.spec

EnvSpec(id='crudeTanksEnv-v1', entry_point=<class 'tankEnvV1.crudeTanksEnv'>, reward_threshold=500, nondeterministic=False, max_episode_steps=720, order_enforce=True, autoreset=False, disable_env_checker=False, apply_api_compatibility=False, kwargs={}, namespace=None, name='crudeTanksEnv', version=1)

In [5]:
# Length of the flattened observation vector, measured on a random sample of the observation space.
flat(env.observation_space, env.observation_space.sample()).shape[0]

163

### Agente Dueling DQN

En este apartado resolveremos el mismo entorno con las mismas características para el agente, pero usando una dueling DQN. Como en el caso anterior, primero definiremos el modelo de red neuronal, luego describiremos el comportamiento del agente, lo entrenaremos y, finalmente, testearemos el funcionamiento del mismo.

### Definición de la arquitectura  de la red neuronal

El objetivo principal de las dueling DQN es "ahorrarse" el cálculo del valor de Q en aquellos estados en los que es irrelevante la acción que se tome. Para ello se descompone la función Q en dos componentes:


$$Q(s, a) = A(s, a) + V (s)$$


Esta descomposición se realiza a nivel de la arquitectura de la red neuronal. Las primeras capas que teníamos en la DQN serán comunes, y luego la red se dividirá en dos partes separadas definidas por el resto de capas.

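La descomposición en sub-redes del modelo de la DQN implementada en el apartado anterior será entonces:
<ol>
    <li>Bloque común: red lineal (perceptrón multicapa) con dos capas, de 128 y 256 neuronas, con activación ReLU</li>
    <li>Red advantage A(s,a): capa lineal de 256 neuronas con activación ReLU, seguida de una capa lineal con una salida por acción</li>
    <li>Red value V(s): capa lineal de 256 neuronas con activación ReLU, seguida de una capa lineal con una única salida</li>
</ol>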

In [6]:
class duelingDQN(torch.nn.Module):
    """Dueling Q-network built from fully connected layers.

    A shared trunk (``red_lineal``) feeds two heads: ``advantage`` (one output
    per action) and ``value`` (a single output). ``forward`` returns their
    plain sum, Q(s, a) = V(s) + A(s, a).

    NOTE(review): the usual dueling formulation also subtracts the mean
    advantage so that V and A are identifiable; the plain sum is kept here so
    the network's outputs are unchanged.
    """

    def __init__(self, env, learning_rate=1e-3, device='cuda'):
        """
        Params
        ======
        env: environment; the length of its flattened observation and
            ``action_space.n`` fix the input and output sizes
        learning_rate: learning rate handed to the optimizer
        device: 'cuda' or 'cpu'; 'cuda' silently falls back to 'cpu' when
            CUDA is not available

        Attributes
        ==========
        n_inputs: size of the flattened observation vector
        n_outputs: number of discrete actions
        actions: array with every action index
        red_lineal: shared fully connected trunk
        value: linear head producing V(s)
        advantage: linear head producing A(s, a)
        optimizer / lr_sched: RAdam optimizer and its ReduceLROnPlateau scheduler
        """

        super(duelingDQN, self).__init__()

        if device == 'cuda' and torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'
        self.n_inputs = flat(env.observation_space, env.observation_space.sample()).shape[0]
        self.n_outputs = env.action_space.n
        self.actions = np.arange(env.action_space.n)
        self.learning_rate = learning_rate

        self.red_lineal = torch.nn.Sequential(
            torch.nn.Linear(self.n_inputs, 128, bias=True),
            torch.nn.ReLU(),
            torch.nn.Linear(128, 256, bias=True),
            torch.nn.ReLU()
        )

        self.advantage = torch.nn.Sequential(
            torch.nn.Linear(256, 256, bias=True),
            torch.nn.ReLU(),
            torch.nn.Linear(256, self.n_outputs, bias=True)
        )

        self.value = torch.nn.Sequential(
            torch.nn.Linear(256, 256, bias=True),
            torch.nn.ReLU(),
            torch.nn.Linear(256, 1, bias=True)
        )

        # Move every sub-network to the selected device before the optimizer
        # captures the parameters.
        self.to(self.device)

        # Optimizer and scheduler; the scheduler runs in 'max' mode, i.e. it
        # expects a metric that should increase.
        self.optimizer = torch.optim.RAdam(self.parameters(), lr=learning_rate)
        self.lr_sched = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, mode='max', patience=25000,
                                                                   factor=0.5, min_lr=0.00001, verbose=True)

    def forward(self, x):
        """Return Q-values for ``x`` as value head plus advantage head."""
        out = self.red_lineal(x)
        adv = self.advantage(out)
        out = self.value(out) + adv
        return out

    def get_action(self, state, actionAvail, epsilon=0.05):
        """Epsilon-greedy action selection restricted to available actions.

        Params
        ======
        state: flattened observation of a single state
        actionAvail: array with one entry per action; non-zero marks the
            action as available
        epsilon: probability of picking a random available action

        Returns
        =======
        int: index of the chosen action. If no action is flagged as
        available the mask is ignored and every action is a candidate.
        """
        avail = np.asarray(actionAvail).astype(bool)
        if not avail.any():
            # Nothing is flagged as available: fall back to all actions
            # rather than failing.
            avail = np.ones_like(avail)

        if np.random.random() < epsilon:
            # Draw directly from the available actions so an unavailable one
            # can never be returned.
            action = int(np.random.choice(self.actions[avail]))
        else:
            with torch.no_grad():  # action selection needs no gradients
                qvals = self.get_qvals(state)
            mask = torch.as_tensor(avail, device=qvals.device)
            # Mask with -inf instead of scaling: scaling Q-values towards zero
            # makes unavailable actions win whenever the Q-values are negative.
            masked_qvals = qvals.masked_fill(~mask, float('-inf'))
            action = int(torch.argmax(masked_qvals, dim=-1).item())
        return action

    def get_qvals(self, state):
        """Run the network on ``state`` (array, or tuple of arrays for a batch)."""
        if type(state) is tuple:
            state = np.array(state)
        state_t = torch.FloatTensor(state).to(device=self.device)
        return self.forward(state_t)

####  _Buffer_ de repetición de experiencias

In [7]:
class experienceReplayBuffer:
    """Bounded FIFO store of transitions used for experience replay."""

    def __init__(self, memory_size=50000, burn_in=10000):
        """
        Params
        ======
        memory_size: maximum number of transitions kept; older ones are dropped first
        burn_in: number of stored transitions at which burn_in_capacity() reaches 1
        """
        self.memory_size = memory_size
        self.burn_in = burn_in
        # Record type for one transition.
        self.buffer = namedtuple('Buffer', 'state action reward done next_state')
        self.replay_memory = deque(maxlen=memory_size)

    def sample_batch(self, batch_size=32):
        """Draw ``batch_size`` distinct transitions at random.

        Returns a one-shot ``zip`` iterator yielding, in order, the tuples of
        states, actions, rewards, dones and next states.
        """
        picked = np.random.choice(len(self.replay_memory), batch_size,
                                  replace=False)
        transitions = [self.replay_memory[idx] for idx in picked]
        # Transpose the list of records into one tuple per field.
        return zip(*transitions)

    def append(self, state, action, reward, done, next_state):
        """Store one transition at the newest end of the memory."""
        record = self.buffer(state, action, reward, done, next_state)
        self.replay_memory.append(record)

    def burn_in_capacity(self):
        """Return the fill level as a fraction of ``burn_in`` (may exceed 1)."""
        stored = len(self.replay_memory)
        return stored / self.burn_in

### Definición del agente

In [8]:
class duelingDQNAgent:
    """Trains a Q-network with epsilon-greedy exploration, experience replay
    and a periodically synchronised target network (double-DQN targets)."""

    def __init__(self, env, main_network,
                 buffer,
                 epsilon=0.1, eps_decay=0.99, batch_size=32, nblock=100):
        """
        Params
        ======
        env: environment; reset() and step() must return an info dict with
            an 'actionAvail' entry
        main_network: network being trained; must expose get_action,
            get_qvals, optimizer and lr_sched
        buffer: experience replay buffer
        epsilon: initial epsilon
        eps_decay: multiplicative epsilon decay applied after every episode
        batch_size: number of transitions sampled from the buffer per update
        nblock: number of most recent episodes averaged into the mean reward
        """
        # Build loss tensors on the device the network actually lives on;
        # only fall back to auto-detection when the network does not say.
        default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = getattr(main_network, 'device', default_device)
        self.env = env
        self.main_network = main_network
        self.target_network = deepcopy(main_network)  # target network (copy of the main one)
        self.buffer = buffer
        self.epsilon = epsilon
        self.eps_decay = eps_decay
        self.batch_size = batch_size
        self.nblock = nblock
        self.reward_threshold = env.spec.reward_threshold
        self.mean_rewards = -10000.
        self.update_loss = []
        self.training_rewards = []
        self.mean_training_rewards = []
        self.epsilons = []
        self.losses = []
        self.initialize()

    def initialize(self):
        """Reset the step counter and start a fresh episode."""
        self.step_count = 0
        self._reset_episode()

    def _reset_episode(self):
        """Reset the environment and the per-episode state (observation, mask, reward sum)."""
        self.total_reward = 0
        state, data = self.env.reset()
        self.state0 = flat(self.env.observation_space, state)
        self.actionAvail = data['actionAvail']

    def take_step(self, eps, mode='train'):
        """Act once in the environment and store the transition.

        Params
        ======
        eps: epsilon used for epsilon-greedy selection (ignored in 'explore' mode)
        mode: 'explore' samples the action space uniformly (used for burn-in);
            anything else asks the main network for an action

        Returns
        =======
        bool: True when the episode is over, either terminated or truncated.
        """
        if mode == 'explore':
            action = self.env.action_space.sample()  # random action during burn-in
        else:
            action = self.main_network.get_action(self.state0, self.actionAvail, eps)
        self.step_count += 1

        new_state, reward, done, truncated, info = self.env.step(action)
        # Use the agent's own environment, not a notebook-level global.
        new_state = flat(self.env.observation_space, new_state)
        self.total_reward += reward
        # Only true termination is stored: it is the flag that switches off
        # bootstrapping in the loss, which must not happen on truncation.
        self.buffer.append(self.state0, action, reward, done, new_state)
        self.state0 = deepcopy(new_state)
        self.actionAvail = info['actionAvail']
        return bool(done or truncated)

    def train(self, gamma=0.99, max_episodes=50000,
              batch_size=32,
              dnn_update_frequency=4,
              dnn_sync_frequency=2000, min_episodios=250, min_epsilon=0.01):
        """Fill the replay buffer, then train until solved or out of episodes.

        Params
        ======
        gamma: discount factor of the Bellman target
        max_episodes: hard limit on the number of training episodes
        batch_size: kept for backward compatibility; not read here, the batch
            size given to the constructor is the one used
        dnn_update_frequency: environment steps between gradient updates
        dnn_sync_frequency: environment steps between target-network syncs
        min_episodios: minimum number of episodes before "solved" can stop training
        min_epsilon: lower bound for epsilon
        """
        self.gamma = gamma
        # Burn-in: fill the buffer with random experience, starting a new
        # episode whenever the current one ends.
        print("Filling replay buffer...")
        while self.buffer.burn_in_capacity() < 1:
            if self.take_step(self.epsilon, mode='explore'):
                self._reset_episode()

        episode = 0
        print("Training...")
        maximo = -10000.0
        while True:
            self._reset_episode()
            gamedone = False
            while not gamedone:
                gamedone = self.take_step(self.epsilon, mode='train')

                if self.step_count % dnn_update_frequency == 0:
                    self.update()

                if self.step_count % dnn_sync_frequency == 0:
                    self.target_network.load_state_dict(
                        self.main_network.state_dict())

            episode += 1
            self.epsilons.append(self.epsilon)
            if len(self.update_loss):
                self.losses.append(self.update_loss[-1])
            self.training_rewards.append(self.total_reward)
            if self.total_reward > maximo:
                # Keep the environment's `S` object of the best episode so far.
                maximo = self.total_reward
                self.maxS = deepcopy(self.env.S)

            self.update_loss = []

            self.mean_rewards = np.mean(self.training_rewards[-self.nblock:])
            self.mean_training_rewards.append(self.mean_rewards)

            # Restart exploration when epsilon has bottomed out and the mean
            # reward is below its value nblock episodes ago. The length guard
            # avoids an IndexError while fewer than nblock means are recorded.
            if (self.epsilon == min_epsilon
                    and len(self.mean_training_rewards) >= self.nblock
                    and self.mean_rewards < self.mean_training_rewards[-self.nblock]):
                self.epsilon = 0.5
                print("reinitialize epsilon")

            if episode > 100 and episode % 100 == 0:
                self.plot_rewards()

            print("\rEpisode {:d} Mean Rewards {:.2f} Epsilon {:.4f} , Maximo {:.2f}\t\t".format(
                episode, self.mean_rewards, self.epsilon, maximo), end="")

            # Stop when the episode budget is exhausted.
            if episode >= max_episodes:
                print('\nEpisode limit reached.')
                break

            # Stop when the mean reward reached the environment's threshold
            # and the minimum number of episodes has been trained.
            if self.mean_rewards >= self.reward_threshold and min_episodios < episode:
                print('\nEnvironment solved in {} episodes!'.format(
                    episode))
                break

            self.epsilon = max(self.epsilon * self.eps_decay, min_epsilon)

    def calculate_loss(self, batch):
        """Return the MSE loss between Q(s, a) and the double-DQN target.

        ``batch`` is an iterable yielding, in order, the states, actions,
        rewards, dones and next states of the sampled transitions.
        """
        states, actions, rewards, dones, next_states = batch
        rewards_vals = torch.FloatTensor(rewards).to(device=self.device).reshape(-1, 1)
        actions_vals = torch.as_tensor(np.asarray(actions), dtype=torch.long).reshape(-1, 1).to(
            device=self.device)
        dones_t = torch.BoolTensor(dones).to(device=self.device)

        # Q-values of the taken actions, from the main network.
        qvals = torch.gather(self.main_network.get_qvals(states), 1, actions_vals)

        # Double-DQN target: the main network picks the next action and the
        # target network scores it. The target is a constant for the
        # optimisation, so no graph is built for it.
        with torch.no_grad():
            next_actions = torch.argmax(self.main_network.get_qvals(next_states), dim=-1)
            next_actions_vals = next_actions.reshape(-1, 1).to(device=self.device)
            target_qvals = self.target_network.get_qvals(next_states)
            qvals_next = torch.gather(target_qvals, 1, next_actions_vals)
            qvals_next[dones_t] = 0  # no bootstrapping from terminal states
            expected_qvals = self.gamma * qvals_next + rewards_vals

        return torch.nn.functional.mse_loss(qvals, expected_qvals.reshape(-1, 1))

    def update(self):
        """Run one gradient step on a sampled batch and record the loss."""
        self.main_network.optimizer.zero_grad()
        batch = self.buffer.sample_batch(batch_size=self.batch_size)
        loss = self.calculate_loss(batch)
        loss.backward()
        self.main_network.optimizer.step()
        # The scheduler is fed the running mean reward on every update.
        self.main_network.lr_sched.step(self.mean_rewards)
        # .cpu() is a no-op for CPU tensors, so one path serves both devices.
        self.update_loss.append(loss.detach().cpu().numpy())

    def plot_rewards(self):
        """Redraw the per-episode and mean rewards against the reward threshold."""
        IPython.display.clear_output(wait=True)

        plt.figure(figsize=(12, 8))
        plt.plot(self.training_rewards, label='Rewards')
        plt.plot(self.mean_training_rewards, label='Mean Rewards')
        plt.axhline(self.env.spec.reward_threshold, color='r', label="Reward threshold")
        plt.xlabel('Episodes')
        plt.ylabel('Rewards')
        plt.legend(loc="upper left")
        plt.grid()
        plt.show()

### 3.3 Entrenamiento

A continuación entrenaremos el modelo dueling DQN con los mismos hiperparámetros con los que entrenamos la DQN.

In [9]:
lr = 0.001            # Learning rate passed to the network's optimizer
BATCH_SIZE = 32       # Number of transitions sampled from the buffer per update
MEMORY_SIZE = 10000   # Maximum number of transitions kept in the replay buffer
GAMMA = 0.99          # Discount factor (gamma) of the Bellman target
EPSILON = 1           # Initial epsilon for epsilon-greedy action selection
EPSILON_DECAY = .99   # Multiplicative epsilon decay applied after each episode
MIN_EPSILON = 0.02    # Lower bound for epsilon
NBLOCK = 100          # Number of most recent episodes averaged into the mean reward (not a buffer size)
MAX_EPISODES = 5000   # Maximum number of training episodes (the agent should learn before reaching it)
MIN_EPISODIOS = 100   # Minimum number of episodes before training may stop as solved
BURN_IN = 2000        # Number of transitions (steps, not episodes) stored in the buffer before training starts
DNN_UPD = 1           # Environment steps between gradient updates of the main network
DNN_SYNC = 1500       # Environment steps between weight copies from the main network to the target network

In [10]:
# Replay buffer holding at most MEMORY_SIZE transitions; burn-in is complete once BURN_IN are stored.
buffer = experienceReplayBuffer(memory_size=MEMORY_SIZE, burn_in=BURN_IN)

In [11]:
# Main dueling network, sized from the environment's observation and action spaces.
ddqn = duelingDQN(env, learning_rate=lr)

In [12]:
# Agent wiring together the environment, the network and the replay buffer
# (positional arguments: epsilon, eps_decay, batch_size, nblock).
dagent = duelingDQNAgent(env, ddqn, buffer, EPSILON, EPSILON_DECAY, BATCH_SIZE, NBLOCK)

In [None]:
%%time

# Fill the replay buffer and train the agent.
# NOTE: train() does not read its batch_size argument; the batch size in
# effect is the one given to the agent's constructor.
dagent.train(gamma=GAMMA, max_episodes=MAX_EPISODES, 
             batch_size=BATCH_SIZE, dnn_update_frequency=DNN_UPD, 
             dnn_sync_frequency=DNN_SYNC, min_episodios=MIN_EPISODIOS, min_epsilon=MIN_EPSILON)

Filling replay buffer...


  if not isinstance(terminated, (bool, np.bool8)):


In [None]:
# Plot per-episode and mean training rewards against the reward threshold.
dagent.plot_rewards()

In [None]:
# Render the copy of the environment's `S` object saved from the highest-reward training episode.
dagent.maxS.render()

In [None]:
# Save the trained network's weights, removing any previous checkpoint file first.
myfile = 'agentDDQN_Trained_Model.pth'
if os.path.isfile(myfile):
    os.remove(myfile)
torch.save(ddqn.state_dict(), myfile)

In [None]:
%%time

# Evaluate the saved model greedily on 20 fresh episodes.
env = gym.envs.make('crudeTanksEnv-v1')

d_dqn = duelingDQN(env, device='cuda')
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine
# (duelingDQN falls back to 'cpu' when CUDA is unavailable).
d_dqn.load_state_dict(torch.load(myfile, map_location=d_dqn.device))

reward_list = []
n_list = []
for n_episodes in range(20):
    state, data = env.reset()
    state0 = flat(env.observation_space, state)
    actionAvail = data['actionAvail']
    done = False
    truncated = False
    episode_reward = 0
    n = 0
    action_list = []
    # An episode ends on termination OR truncation (time limit).
    while not (done or truncated):
        with torch.no_grad():  # inference only
            qvals = d_dqn.get_qvals(state0)
        avail = torch.as_tensor(np.asarray(actionAvail).astype(bool), device=qvals.device)
        if avail.any():
            # Mask unavailable actions with -inf; scaling Q-values towards zero
            # would make them win whenever the Q-values are negative.
            qvals = qvals.masked_fill(~avail, float('-inf'))
        action = int(torch.argmax(qvals, dim=-1).item())
        action_list.append(action)
        state, reward, done, truncated, info = env.step(action)
        actionAvail = info['actionAvail']
        episode_reward += reward
        n += 1
        state0 = flat(env.observation_space, state)
    # Per-episode summary: number of steps, total reward and how often each action was used.
    a, b = np.unique(action_list, return_counts=True)
    txt = ' '.join(['{:2d}: {:3d} |'.format(x, y) for x, y in zip(a, b)])
    print('{:2d} n: {:4d}   reward: {:4.0f}   Action Freq {}'.format(n_episodes+1, n, episode_reward, txt))
    n_list.append(n)
    reward_list.append(episode_reward)
print('\n')

In [None]:
# Plot the total reward of each evaluation episode against the reward threshold.
fig, ax = plt.subplots(figsize=(12, 5))

# Derive the episode count from the data instead of hard-coding it.
n_games = len(reward_list)
fig.suptitle('Recompensa de {} partidas con el modelo ajustado'.format(n_games))

ax.plot(reward_list)
ax.axhline(y=env.spec.reward_threshold, color='r', linestyle='-')
# x is the episode index; the summed reward is the y quantity.
ax.set_xlabel('partida')
ax.set_ylabel('suma de recompensas de cada partida')
ax.set_xticks(range(n_games))
ax.grid()

plt.show()