In [20]:
import gym
import warnings

import numpy as np
import time
import random
import os
import tensorflow as tf

from time import sleep
from collections import deque

In [21]:
warnings.filterwarnings("ignore")

In [22]:
env = gym.make('CarRacing-v0').env

In [24]:
state = env.reset()

Track generation: 1064..1339 -> 275-tiles track


In [34]:
env.action_space.sample()

array([0.886416  , 0.69067097, 0.8968802 ], dtype=float32)

In [85]:
state, reward, _, _ = env.step([0,0,1])
print(reward)

-0.09999999999999964


In [86]:
env.render()

False

In [5]:
checkpoint_path = "./"

if os.system("cd " + checkpoint_path) == 512: # caso a pasta de caminho não exista, o os.system retornara 512.
    os.system("mkdir "+ checkpoint_path)

In [9]:
print("Action Space:\t{}".format(env.action_space))
print("State Space:\t{}".format(env.observation_space))

Action Space:	Box(3,)
State Space:	Box(96, 96, 3)


In [10]:
class Memory():
    def __init__(self, max_memory):
        self.__buffer = deque(maxlen=max_memory)

    def add(self, experience):
        self.__buffer.append(experience)
        
    def get_buffer(self):
        return self.__buffer
        
    def sample(self, batch_size):
        buffer_size = len(self.__buffer)
        index = np.random.choice(np.arange(buffer_size), size=batch_size)
        
        return [self.__buffer[i] for i in index]

In [17]:
class DQN():
    
    def __init__(self, n_actions, hidden=128, learning_rate=0.0001, shape=(64, 64, 3)):
        self.n_actions = n_actions
        self.hidden = hidden
        self.learning_rate = learning_rate
        self.shape = shape
        
        self.X = tf.placeholder(tf.float32, [None, *self.shape], name="X")
                
        # targetQ according to Bellman equation: 
        # Q = r + gamma*max Q', calculated in the function learn()
        self.target_Q = tf.placeholder(tf.float32, [None], name="target")
        
        # Action that was performed
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)
        
        #Normalização do input
        self.inputscaled = self.X / 255
        
        with tf.name_scope("conv_1"):
            self.conv1 = tf.layers.conv2d(
                inputs=self.inputscaled, filters=32, kernel_size=[5, 5], strides=2,
                padding="valid", activation=tf.nn.tanh, use_bias=False, name='conv1')
        
            self.conv2 = tf.layers.conv2d(
                inputs=self.conv1, filters=32, kernel_size=[3, 3], strides=2,
                padding="valid", activation=tf.nn.tanh, use_bias=False, name='conv2')
            
            self.pooling_1 = tf.layers.max_pooling2d(self.conv2, pool_size=(2,2), 
                                                    strides=2, padding="SAME")
            
        with tf.name_scope("conv_2"):
            
        
            self.conv3 = tf.layers.conv2d(
                inputs=self.pooling_1, filters=64, kernel_size=[5, 5], strides=1,
                padding="valid", activation=tf.nn.tanh, use_bias=False, name='conv3')

            self.conv4 = tf.layers.conv2d(
                inputs=self.conv3, filters=self.hidden, kernel_size=[3, 3], strides=1,
                padding="valid", activation=tf.nn.tanh, use_bias=False, name='conv4')
            
            self.pooling_2 = tf.layers.max_pooling2d(self.conv4, pool_size=(2,2), 
                                                    strides=1, padding="SAME")

        
        self.valuestream, self.advantagestream = tf.split(self.pooling_2, 2, 3)
        self.valuestream = tf.layers.flatten(self.valuestream)
        self.advantagestream = tf.layers.flatten(self.advantagestream)
        
        self.advantage = tf.layers.dense(
            inputs=self.advantagestream, units=self.n_actions,
            name="advantage")
        self.value = tf.layers.dense(
            inputs=self.valuestream, units=1,
            name='value')

        # Combining value and advantage into Q-values as described above
        self.q_values = self.value + tf.subtract(self.advantage, tf.reduce_mean(self.advantage, axis=1, keepdims=True))
        self.best_action = tf.argmax(self.q_values, 1)
        
        # Q value of the action that was performed
        self.Q = tf.reduce_sum(tf.multiply(self.q_values, tf.one_hot(self.action,
                                                                     self.n_actions,
                                                                     dtype=tf.float32)), axis=1)
        # Parameter updates
        self.loss_op = tf.reduce_mean(tf.losses.huber_loss(labels=self.target_Q, predictions=self.Q))
        self.train_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss_op)

In [12]:
# Função epsilon para a exploração do cenario (environment).
def epsilon_greedy(q_values, step, n_steps):
    eps_min = 0.01
    eps_max = 1.0
    
    if step < n_steps * 0.9:
        eps_decay_steps = 5000000
        epsilon = (max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps))
    else:
        epsilon = 0.6

    if random.uniform(0, 1) < epsilon:
        return np.random.randint(n_output), epsilon # random action
    else:
        return np.argmax(q_values), epsilon # optimal action

In [13]:
# Memory relay
batch_size = 32
memory_size = 128
possible_actions = np.array(np.identity(env.action_space.n,dtype=int).tolist())

input_shape = env.get_sample_shape()
n_output = env.action_space.n
hidden = 128

done = False
n_steps = 120
step_test = 100
save_steps = 50
gamma = 0.99
learning_rate = 0.01

tensorboard = "./tensorboard/train/"

os.makedirs(tensorboard, exist_ok=True)
SUMM_WRITER = tf.summary.FileWriter(tensorboard)

AttributeError: 'Box' object has no attribute 'n'

In [14]:
def learn(session, replay_memory, main_dqn, target_dqn, batch_size, gamma):
    """
    Args:
        session: A tensorflow sesson object
        replay_memory: A ReplayMemory object
        main_dqn: A DQN object
        target_dqn: A DQN object
        batch_size: Integer, Batch size
        gamma: Float, discount factor for the Bellman equation
    Returns:
        loss: The loss of the minibatch, for tensorboard
    Draws a minibatch from the replay memory, calculates the 
    target Q-value that the prediction Q-value is regressed to. 
    Then a parameter update is performed on the main DQN.
    """
    # Draw a minibatch from the replay memory
    batch = replay_memory.sample(batch_size)
    
    state = np.array([each[0] for each in batch], ndmin=3)
    actions = np.array([each[1] for each in batch])
    rewards = np.array([each[2] for each in batch])
    done = np.array([each[3] for each in batch])
    next_state = np.array([each[4] for each in batch], ndmin=3)

    # state, reward, done, next_state = replay_memory.sample(batch_size)
    
    # The main network estimates which action is best (in the next 
    # state s', new_states is passed!) 
    # for every transition in the minibatch
    arg_q_max = session.run(main_dqn.best_action, feed_dict={main_dqn.X: next_state})
    
    # The target network estimates the Q-values (in the next state s', new_states is passed!) 
    # for every transition in the minibatch
    q_vals = session.run(target_dqn.q_values, feed_dict={target_dqn.X: next_state})
    double_q = q_vals[range(batch_size), arg_q_max]
    
    # Bellman equation. Multiplication with (1-terminal_flags) makes sure that 
    # if the game is over, targetQ=rewards
    target_q = rewards + gamma * double_q
    
    # Gradient descend step to update the parameters of the main network
    loss, _ = session.run([main_dqn.loss_op, main_dqn.train_op],
                          feed_dict={main_dqn.X: state,
                                     main_dqn.target_Q: target_q,
                                     main_dqn.action: actions})
    return loss

In [15]:
def train_model(checkpoint_path, input_shape, num_hidden, n_output,
                done, n_steps, save_steps, gamma, lr, env, memory):

    with tf.variable_scope('mainDQN'):
        MAIN_DQN = DQN(env.action_space.n, hidden, learning_rate)  # (★★)
    with tf.variable_scope('targetDQN'):
        TARGET_DQN = DQN(env.action_space.n, hidden)  # (★★)

    LAYER_IDS = ["conv1", "conv2", "conv3", "conv4", "denseAdvantage",
                 "denseAdvantageBias", "denseValue", "denseValueBias"]

    with tf.name_scope('Performance'):
        LOSS_PH = tf.placeholder(tf.float32, shape=None, name='loss_summary')
        LOSS_SUMMARY = tf.summary.scalar('loss', LOSS_PH)
        REWARD_PH = tf.placeholder(tf.float32, shape=None, name='reward_summary')
        REWARD_SUMMARY = tf.summary.scalar('reward', REWARD_PH)
        EVAL_SCORE_PH = tf.placeholder(tf.float32, shape=None, name='evaluation_summary')
        EVAL_SCORE_SUMMARY = tf.summary.scalar('evaluation_score', EVAL_SCORE_PH)

    PERFORMANCE_SUMMARIES = tf.summary.merge([LOSS_SUMMARY, REWARD_SUMMARY])
   
    saver = tf.train.Saver(max_to_keep=1, keep_checkpoint_every_n_hours=1)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init) 
        step = 0
        max_reward = np.Infinity
        count_reward = 0
        
        rewards = []
        loss_list = []
        actions_list = []
        
        
        for i in range(1, n_steps):  
            
            episode_reward_sum = 0

            state = env.reset() # inicialização do jogo.
            
            step += 1 # Incremento na variavel step.
            action = env.action_space.sample()
            actions_list.append(action)
            
            state, reward, done, next_state = env.step(action) # inicio das variaveis para o calculo do Q-value
            memory.add((state, action, reward, done, next_state))
            
            while not done:                
                    
                q_value = sess.run(TARGET_DQN.best_action, feed_dict={TARGET_DQN.X: [state]})
                img_state = env.get_state()
                # Ação para o q_value do estado anterior.
                action, epsilon = epsilon_greedy(q_value, i, n_steps)
                
                # action = possible_actions[choice]
                state, reward, done, next_state = env.step(action)
                
                #return None
                
                episode_reward_sum += reward
                
                memory.add((state, action, reward, done, next_state))
                
                loss = learn(sess, memory, MAIN_DQN, TARGET_DQN, batch_size, gamma)
                loss_list.append(loss)
                
                state = next_state
                
                clear_output(wait=True)
                # Ignora isso, é só um print...
                print("State: {}\tReward: {}\tAction: {}\tepslon: {:.2f}\nTraining step: {}/{}\tLoss: {:.4f}\tQ_value: {}".format(
                    img_state.split("_")[0], reward, action, epsilon, i, n_steps, loss, q_value[0]))
                
                # And save regularly
                if step % save_steps == 0 and done == True:
                    saver.save(sess, checkpoint_path)
                      
            rewards.append(episode_reward_sum)
            
            # Scalar summaries for tensorboard
            summ = sess.run(PERFORMANCE_SUMMARIES, 
                            feed_dict={LOSS_PH:np.mean(loss_list), 
                                       REWARD_PH:np.mean(rewards[-100:])})

            SUMM_WRITER.add_summary(summ)
            # Histogramm summaries for tensorboard
            summ_param = sess.run(PARAM_SUMMARIES)
            SUMM_WRITER.add_summary(summ_param)    

In [None]:
memory = Memory(memory_size)

for i in range(batch_size):
    # If it's the first step
    if i == 0:
        state = env.reset()

    # Get the next_state, the rewards, done by taking a random action
    action = env.action_space.sample()
    # action = possible_actions[choice]

    state, reward, done, next_state = env.step(action)

    # If the episode is finished (we're dead 3x)
    if done:
        # We finished the episode
        next_state = np.zeros(env.observation_space.shape[0])

        # Add experience to memory
        if type(action) == np.ndarray:
            memory.add((state, action, reward, done, next_state))

        # Start a new episode
        state = env.reset()

        # Stack the frames
        # state, stacked_frames = stack_frames(stacked_frames, state, True)

    else:
        # Add experience to memory
        if type(action) == np.ndarray:
            memory.add((state, action, reward, done, next_state))

        # Our new state is now the next_state
        # state = next_state

In [None]:
# limpeza de possiveis grafos carregados na memoria.
tf.reset_default_graph()

# Inicialização do treino do modelo DQN
teste = train_model(checkpoint_path, input_shape, hidden, n_output,
           done, n_steps, save_steps, gamma, learning_rate, env, memory)