# INTELIGENCIA ARTIFICIAL (INF371)¶

Dr. Edwin Villanueva (evillatal@gmail.com)

## Aprendizaje por refuerzo con Deep Q-networks  - juego CartPole

Esta es una implementación de un agente Deep Q-learning que aprende a equilibrar el mástil del juego CartPole. Adaptado de https://github.com/keon/deep-q-learning

Es necesario tener instalados gym, tensorflow y keras.

### Clase <b>DQNAgent</b>

Esta es la clase que implementa el agente Deep Q-learning. Es una implementación general que puede usarse en diferentes entornos de gym u otros. El constructor recibe la dimensión del estado y la dimensión del vector de acciones posibles.

In [1]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

class DQNAgent:
    """Deep Q-learning agent that estimates action values with a small Keras network.

    The agent stores experiences ``(state, action, reward, next_state, done)``
    in a bounded replay memory, picks actions with an epsilon-greedy policy and
    fits its network on random samples of that memory.

    Args:
        state_size: number of attributes that describe one state.
        action_size: number of possible actions.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size      # number of attributes that describe one state
        self.action_size = action_size    # number of possible actions
        self.memory = deque(maxlen=3000)  # replay memory (at most 3000 experiences, oldest dropped first)
        self.gamma = 0.95                 # discount rate
        self.learning_rate = 0.001        # learning rate handed to the optimizer

        self.epsilon = 1.0           # initial exploration factor
        self.epsilon_min = 0.01      # lower bound of the exploration factor
        self.epsilon_decay = 0.995   # multiplicative decay applied once per replay() call
        self.model = self._build_model()  # network used to estimate the Q-values

    def _build_model(self):
        """Build and compile the network: state_size inputs -> 20 -> 20 -> action_size outputs."""
        model = Sequential()
        # Two hidden layers of 20 ReLU units; the first one receives state_size inputs.
        model.add(Dense(20, input_dim=self.state_size, activation='relu'))
        model.add(Dense(20, activation='relu'))
        # Linear output layer: one Q-value estimate per action.
        model.add(Dense(self.action_size, activation='linear'))

        # Mean squared error between predicted and target Q-values.
        # NOTE(review): newer Keras releases name this argument `learning_rate`;
        # `lr` is kept for the Keras version this file was written against.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Store one experience; `done` flags that `next_state` is terminal."""
        self.memory.append((state, action, reward, next_state, done))

    def get_action(self, state):
        """Return an action index using an epsilon-greedy policy.

        With probability ``self.epsilon`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value for `state`.
        The callers in this file pass `state` reshaped to ``(1, state_size)``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        action_values = self.model.predict(state)
        return np.argmax(action_values[0])

    def replay(self, batch_size):
        """Fit the network on a random sample of the replay memory.

        At most `batch_size` experiences are sampled without replacement; when
        the memory holds fewer, all of them are used (an empty memory simply
        results in no fitting). Afterwards the exploration factor is decayed
        once, never dropping below ``self.epsilon_min``.
        """
        # Clamp so that a batch larger than the memory does not raise ValueError.
        sample_size = min(batch_size, len(self.memory))
        minibatch = random.sample(self.memory, sample_size)

        for state, action, reward, next_state, done in minibatch:
            # Start from the current predictions so only the taken action's
            # Q-value contributes to the loss.
            target = self.model.predict(state)

            if done:
                # Terminal transition: no future reward to bootstrap from.
                target[0][action] = reward
            else:
                # Bootstrap with the discounted best Q-value of the next state.
                next_q_values = self.model.predict(next_state)[0]
                target[0][action] = reward + self.gamma * np.amax(next_q_values)

            # One gradient step per experience, as in the original training scheme.
            self.model.fit(state, target, epochs=1, verbose=0)

        # Decay exploration, clamped so it never falls below the configured minimum.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def load(self, name):
        """Load network weights from the file `name`."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file `name`."""
        self.model.save_weights(name)


Using TensorFlow backend.


### Ejecutando el agente de aprendizaje DQN en el entorno CartPole

 

In [2]:
batch_size = 100    # number of experiences replayed after each episode
EPISODES = 500      # number of training episodes

env = gym.make('CartPole-v1')   # load the CartPole environment from gym
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)  # instantiate the deep Q-network agent

try:
    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        # Run the agent for up to 500 steps; the loop exits early once the episode is done.
        for step in range(500):
            #env.render()  # rendering does not work in Colab
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            # An episode cut off by the time limit is a success, not a failure:
            # only a real terminal state gets the -10 penalty and is stored as
            # terminal. gym versions that do not report 'TimeLimit.truncated'
            # fall back to treating every `done` as terminal.
            truncated = bool(info.get('TimeLimit.truncated', False))
            terminal = done and not truncated
            reward = reward if not terminal else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, terminal)
            state = next_state
            if done:
                # The reported score is the zero-based index of the last executed step.
                print("episode: {}/{}, score: {}, e: {:.2}".format(e, EPISODES, step, agent.epsilon))
                break
        # Train only once the memory holds more than one batch of experiences.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
finally:
    # Release the environment even if training is interrupted or fails.
    env.close()

Instructions for updating:
Colocations handled automatically by placer.
episode: 0/500, score: 32, e: 1.0
episode: 1/500, score: 15, e: 1.0
episode: 2/500, score: 21, e: 1.0
episode: 3/500, score: 10, e: 1.0
episode: 4/500, score: 10, e: 1.0
episode: 5/500, score: 32, e: 1.0
Instructions for updating:
Use tf.cast instead.
episode: 6/500, score: 13, e: 0.99
episode: 7/500, score: 8, e: 0.99
episode: 8/500, score: 14, e: 0.99
episode: 9/500, score: 13, e: 0.98
episode: 10/500, score: 15, e: 0.98
episode: 11/500, score: 13, e: 0.97
episode: 12/500, score: 13, e: 0.97
episode: 13/500, score: 8, e: 0.96
episode: 14/500, score: 23, e: 0.96
episode: 15/500, score: 16, e: 0.95
episode: 16/500, score: 35, e: 0.95
episode: 17/500, score: 17, e: 0.94
episode: 18/500, score: 25, e: 0.94
episode: 19/500, score: 10, e: 0.93
episode: 20/500, score: 16, e: 0.93
episode: 21/500, score: 25, e: 0.92
episode: 22/500, score: 12, e: 0.92
episode: 23/500, score: 17, e: 0.91
episode: 24/500, score: 10, e: 0.9

episode: 209/500, score: 162, e: 0.36
episode: 210/500, score: 163, e: 0.36
episode: 211/500, score: 244, e: 0.36
episode: 212/500, score: 170, e: 0.35
episode: 213/500, score: 248, e: 0.35
episode: 214/500, score: 145, e: 0.35
episode: 215/500, score: 191, e: 0.35
episode: 216/500, score: 151, e: 0.35
episode: 217/500, score: 163, e: 0.35
episode: 218/500, score: 194, e: 0.34
episode: 219/500, score: 185, e: 0.34
episode: 220/500, score: 189, e: 0.34
episode: 221/500, score: 218, e: 0.34
episode: 222/500, score: 34, e: 0.34
episode: 223/500, score: 195, e: 0.34
episode: 224/500, score: 43, e: 0.33
episode: 225/500, score: 298, e: 0.33
episode: 226/500, score: 355, e: 0.33
episode: 227/500, score: 499, e: 0.33
episode: 228/500, score: 284, e: 0.33
episode: 229/500, score: 233, e: 0.33
episode: 230/500, score: 194, e: 0.32
episode: 231/500, score: 162, e: 0.32
episode: 232/500, score: 172, e: 0.32
episode: 233/500, score: 227, e: 0.32
episode: 234/500, score: 208, e: 0.32
episode: 235/5

episode: 426/500, score: 275, e: 0.12
episode: 427/500, score: 251, e: 0.12
episode: 428/500, score: 225, e: 0.12
episode: 429/500, score: 499, e: 0.12
episode: 430/500, score: 308, e: 0.12
episode: 431/500, score: 213, e: 0.12
episode: 432/500, score: 246, e: 0.12
episode: 433/500, score: 302, e: 0.12
episode: 434/500, score: 472, e: 0.12
episode: 435/500, score: 402, e: 0.12
episode: 436/500, score: 264, e: 0.12
episode: 437/500, score: 221, e: 0.11
episode: 438/500, score: 253, e: 0.11
episode: 439/500, score: 207, e: 0.11
episode: 440/500, score: 301, e: 0.11
episode: 441/500, score: 233, e: 0.11
episode: 442/500, score: 216, e: 0.11
episode: 443/500, score: 232, e: 0.11
episode: 444/500, score: 245, e: 0.11
episode: 445/500, score: 172, e: 0.11
episode: 446/500, score: 169, e: 0.11
episode: 447/500, score: 15, e: 0.11
episode: 448/500, score: 156, e: 0.11
episode: 449/500, score: 160, e: 0.11
episode: 450/500, score: 171, e: 0.11
episode: 451/500, score: 152, e: 0.11
episode: 452/

### Funcion para probar el agente entrenado  en el entorno CartPole

play_CartPole() no realiza entrenamiento; solo actúa en el ambiente de acuerdo con el modelo aprendido previamente.
 

In [3]:
def play_CartPole(agent, trials=1, render=True):
    """Run a trained agent greedily in CartPole-v1 without any training.

    Args:
        agent: object exposing a ``model`` with a ``predict`` method that
            returns the Q-values for a state reshaped to ``(1, state_size)``.
        trials: number of episodes to play.
        render: when True, render the environment at every step; pass False
            where no display is available.

    Prints the result of every finished trial and, if at least one trial was
    played, the mean accumulated reward. Returns None.
    """
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    scores = []
    try:
        for trial in range(trials):
            score = 0
            # Keep the initial observation so the first action is also chosen by the model.
            state = env.reset()
            for step in range(500):  # each trial runs for at most 500 steps
                if render:
                    env.render()

                # Greedy policy: take the action with the highest predicted Q-value.
                action_values = agent.model.predict(np.reshape(state, [1, state_size]))
                action = np.argmax(action_values[0])

                state, reward, done, _ = env.step(action)
                score += reward  # accumulate the reward returned by the environment
                if done:
                    # The printed value is the zero-based index of the last step,
                    # while the mean below is computed from the accumulated rewards.
                    print("Play {}/{}, score: {}".format(trial, trials, step))
                    break
            scores.append(score)
    finally:
        # Release the environment even if a trial fails.
        env.close()

    # Skip the mean when no trial was played, which would otherwise divide by zero.
    if scores:
        print("Score medio = {}".format(sum(scores) / float(trials)))

In [5]:
# Evaluate the trained agent over 5 trials of the environment
play_CartPole(agent, trials=5)

Play 0/5, score: 189
Play 1/5, score: 221
Play 2/5, score: 216
Play 3/5, score: 207
Play 4/5, score: 221
Score medio = 211.8
