<a href="https://colab.research.google.com/github/Alsr96/Reinforcement_Learning/blob/master/DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import tensorflow as tf
import numpy as np

# Neural Network class
class NeuralNetwork(object):
    """Fully connected Q-value network: flatten -> dense -> dense -> linear.

    The constructor opens its own ``tf.Session``, builds the graph under a
    variable scope called ``name``, initialises the variables and creates a
    ``tf.train.Saver`` used by ``save_checkpoint`` / ``load_checkpoint``.

    Args:
        learning_rate: step size handed to the Adam optimizer.
        output_dims: number of units in the output layer (one Q-value each).
        name: variable-scope name under which the graph is built.
        input_dims: shape of a single input sample, without the batch axis.
        hidden_layer1: number of units in the first hidden layer.
        hidden_layer2: number of units in the second hidden layer.
        chkpt_dir: directory in which ``deepqnet.ckpt`` is written and read.
        regularizer: ``None`` or a ``(kind, value)`` pair such as
            ``('dropout', 0.5)``, ``('l1', 0.01)`` or ``('l2', 0.01)``.

    Raises:
        Exception: if ``regularizer`` names an unknown kind.
    """

    def __init__(self, learning_rate, output_dims, name, input_dims,
                 hidden_layer1=256, hidden_layer2=256, chkpt_dir='tmp/dqn',
                 regularizer=None):
        # learning rate
        self.learning_rate = learning_rate
        # no. of actions (dimensions of the output layer of the network)
        self.output_dims = output_dims
        # name of the network (also its variable scope)
        self.name = name
        # hidden layer dimensions
        self.hidden_layer1 = hidden_layer1
        self.hidden_layer2 = hidden_layer2
        # checkpoint directory location
        self.chkpt_dir = chkpt_dir
        # input dimensions
        self.input_dims = input_dims
        # regularizer specification, interpreted in build_network()
        self.regularizer = regularizer
        # session to run the network functions
        self.sess = tf.Session()
        # build the network
        self.build_network()
        # initialise variables
        self.sess.run(tf.global_variables_initializer())
        # network saver
        self.saver = tf.train.Saver()
        self.checkpoint_file = os.path.join(chkpt_dir, 'deepqnet.ckpt')
        # gets the set of trainable variables
        self.parameters = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            scope=self.name)

    def build_network(self):
        """Create placeholders, layers, loss and train op under ``self.name``.

        Raises:
            Exception: if ``self.regularizer`` names an unknown kind.
        """
        with tf.variable_scope(self.name):
            # define inputs
            self.input = tf.placeholder(tf.float32,
                                        shape=[None, *self.input_dims],
                                        name='inputs')
            self.actions = tf.placeholder(tf.float32,
                                          shape=[None, self.output_dims],
                                          name='action_taken')
            self.q_target = tf.placeholder(tf.float32,
                                           shape=[None, self.output_dims],
                                           name='q_value')
            # tf.layers.dropout is the identity unless `training` is true.
            # The flag defaults to False, so existing feed_dicts keep working;
            # feed {self.is_training: True} in the training step to actually
            # apply dropout.
            self.is_training = tf.placeholder_with_default(
                False, shape=(), name='is_training')

            # regularizers
            self.do_dropout = False
            self.reg = None
            if self.regularizer is not None:
                kind = self.regularizer[0]
                if kind == 'dropout':
                    self.do_dropout = True
                    self.dropout_rate = self.regularizer[1]
                elif kind == 'l1':
                    self.l1_scale = self.regularizer[1]
                    self.reg = tf.contrib.layers.l1_regularizer(self.l1_scale)
                elif kind == 'l2':
                    self.l2_scale = self.regularizer[1]
                    self.reg = tf.contrib.layers.l2_regularizer(self.l2_scale)
                else:
                    raise Exception('Invalid regularizer')

            # define the layers of the network
            flat = tf.layers.flatten(self.input)
            dense1 = tf.layers.dense(
                flat, units=self.hidden_layer1,
                activation=tf.nn.relu,
                kernel_regularizer=self.reg,
                kernel_initializer=tf.contrib.layers.variance_scaling_initializer())
            if self.do_dropout:
                dense1 = tf.layers.dropout(dense1, rate=self.dropout_rate,
                                           training=self.is_training)

            dense2 = tf.layers.dense(
                dense1, units=self.hidden_layer2,
                activation=tf.nn.relu,
                kernel_regularizer=self.reg,
                kernel_initializer=tf.contrib.layers.variance_scaling_initializer())
            if self.do_dropout:
                dense2 = tf.layers.dropout(dense2, rate=self.dropout_rate,
                                           training=self.is_training)

            # linear output layer: one Q-value per action
            self.Q_values = tf.layers.dense(
                dense2, units=self.output_dims,
                kernel_regularizer=self.reg,
                kernel_initializer=tf.contrib.layers.variance_scaling_initializer())

            # loss function: mean squared error plus this network's own
            # regularization penalties (scoped so that other networks living
            # in the same default graph do not leak into this loss)
            self.loss = (tf.reduce_mean(tf.square(self.Q_values - self.q_target))
                         + tf.losses.get_regularization_loss(scope=self.name))
            # train function
            self.train_NN = tf.train.AdamOptimizer(
                self.learning_rate).minimize(self.loss)

    def load_checkpoint(self):
        """Restore the variables from ``self.checkpoint_file`` into the session."""
        print("...Loading checkpoint...")
        self.saver.restore(self.sess, self.checkpoint_file)

    def save_checkpoint(self):
        """Write the session's variables to ``self.checkpoint_file``."""
        print("...Saving checkpoint...")
        # make sure the target directory exists before the saver writes to it
        if self.chkpt_dir:
            os.makedirs(self.chkpt_dir, exist_ok=True)
        self.saver.save(self.sess, self.checkpoint_file)


class common(object):
    """Hyper-parameters and a fixed-size replay memory shared by the agents.

    The memory is a set of pre-allocated numpy arrays that is written in a
    circular fashion: once ``memory_size`` transitions have been stored, the
    oldest slot is overwritten. ``save_models`` / ``load_models`` delegate to
    ``self.q_eval``, which this class does not create itself.

    Args:
        alpha: accepted but not used by this class.
        gamma: stored as ``self.gamma``.
        memory_size: number of transitions the memory can hold.
        n_actions: number of discrete actions.
        epsilon: stored as ``self.epsilon``.
        batch_size: stored as ``self.batch_size``.
        n_games: stored as ``self.n_games``.
        input_dims: shape of a single state.
        epsilon_dec: stored as ``self.epsilon_dec``.
        epsilon_min: stored as ``self.epsilon_min``.
        regularizer: stored as ``self.regularizer``.
    """

    def __init__(self,alpha, gamma, memory_size, n_actions, epsilon, batch_size,
                 n_games, input_dims=(210,160,4), epsilon_dec=0.996,
                 epsilon_min=0.01,regularizer=None):
        # action bookkeeping
        self.n_actions = n_actions
        self.action_space = list(range(n_actions))

        # training schedule and hyper-parameters
        self.n_games = n_games
        self.gamma = gamma
        self.batch_size = batch_size
        self.regularizer = regularizer

        # exploration settings
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        self.epsilon_min = epsilon_min

        # replay memory: one row per stored transition
        self.memory_size = memory_size
        self.memory_cntr = 0
        state_shape = (self.memory_size,) + tuple(input_dims)
        self.state_memory = np.zeros(state_shape)
        self.new_state_memory = np.zeros(state_shape)
        self.action_memory = np.zeros((self.memory_size, self.n_actions),
                                      dtype=np.int8)
        self.reward_memory = np.zeros(self.memory_size)
        self.terminal_memory = np.zeros(self.memory_size, dtype=np.int8)

    def store_transition(self, state, action, reward, next_state, terminal):
        """Write one transition into the next slot of the circular memory.

        The action is stored one-hot encoded and the terminal flag is stored
        inverted (``1 - terminal``).
        """
        slot = self.memory_cntr % self.memory_size
        self.state_memory[slot] = state

        one_hot = np.zeros(self.n_actions)
        one_hot[action] = 1.0
        self.action_memory[slot] = one_hot

        self.reward_memory[slot] = reward
        self.new_state_memory[slot] = next_state
        self.terminal_memory[slot] = 1 - terminal
        self.memory_cntr += 1

    def save_models(self):
        """Save a checkpoint of ``self.q_eval``."""
        self.q_eval.save_checkpoint()

    def load_models(self):
        """Load the checkpoint of ``self.q_eval``."""
        self.q_eval.load_checkpoint()

class Agent_DDQN(common):
    """Epsilon-greedy agent that owns two Q-networks, ``q_eval`` and
    ``q_eval_prime``.

    Replay memory and hyper-parameters come from ``common``; only ``q_eval``
    is trained by ``learn``.

    NOTE(review): ``q_eval_prime`` is never trained nor synchronised with
    ``q_eval`` in this class, yet ``choose_action`` uses it for greedy actions
    as soon as one transition has been stored, and the target in ``learn`` is
    computed from ``q_eval`` alone. Confirm whether a periodic weight copy /
    Double-DQN target was intended.

    Args:
        alpha: learning rate passed to both networks.
        gamma: discount factor used in the learning target.
        memory_size: capacity of the replay memory.
        n_actions: number of discrete actions.
        epsilon: initial probability of taking a random action.
        batch_size: number of transitions sampled per learning step.
        n_games: total number of games to be played (stored only).
        input_dims: shape of a single state.
        epsilon_dec: multiplicative decay applied to epsilon per learn step.
        epsilon_min: lower bound for epsilon.
        regularizer: ``None`` or a ``(kind, value)`` pair for the networks.
        q_eval_dir: checkpoint directory of ``q_eval``.
        q_eval_dir_prime: checkpoint directory of ``q_eval_prime``.
    """

    def __init__(self,alpha, gamma, memory_size, n_actions, epsilon, batch_size,
                 n_games, input_dims=(210,160,4), epsilon_dec=0.996,
                 epsilon_min=0.01,regularizer=None,q_eval_dir='tmp/q_eval',
                 q_eval_dir_prime='tmp/q_eval_prime'):
        # super() already binds self; forward the caller's values rather than
        # the defaults so the memory shape matches the networks' input shape
        super().__init__(alpha, gamma, memory_size, n_actions, epsilon,
                         batch_size, n_games, input_dims=input_dims,
                         epsilon_dec=epsilon_dec, epsilon_min=epsilon_min,
                         regularizer=regularizer)

        # Policy network 1 to evaluate q values for given state
        self.q_eval = NeuralNetwork(alpha, n_actions, input_dims=input_dims,
                                    name='q_eval', chkpt_dir=q_eval_dir,
                                    regularizer=self.regularizer)
        # policy network 2 to evaluate q values for given state
        self.q_eval_prime = NeuralNetwork(alpha, n_actions,
                                          input_dims=input_dims,
                                          name='q_eval_prime',
                                          chkpt_dir=q_eval_dir_prime,
                                          regularizer=self.regularizer)

    def choose_action(self, state):
        """Return an action index for ``state`` using an epsilon-greedy rule.

        With probability ``self.epsilon`` a random action is drawn; otherwise
        the arg-max of the Q-values is returned, taken from ``q_eval`` while
        the memory is still empty and from ``q_eval_prime`` afterwards.
        """
        # add the batch axis expected by the network input
        state = state[np.newaxis, :]
        if np.random.random() < self.epsilon:
            return np.random.choice(self.action_space)

        network = self.q_eval if self.memory_cntr == 0 else self.q_eval_prime
        q_values = network.sess.run(network.Q_values,
                                    feed_dict={network.input: state})
        return np.argmax(q_values)

    def learn(self):
        """Run one training step of ``q_eval`` on a random memory batch.

        Does nothing until more than ``batch_size`` transitions have been
        stored. Afterwards epsilon is decayed, never below ``epsilon_min``.
        """
        if self.memory_cntr <= self.batch_size:
            return

        # only sample from slots that have actually been written
        max_mem = min(self.memory_cntr, self.memory_size)
        batch = np.random.choice(max_mem, self.batch_size)

        state_batch = self.state_memory[batch]
        action_batch = self.action_memory[batch]
        # actions are stored one-hot, so arg-max recovers the action index
        action_indices = np.argmax(action_batch, axis=1)
        reward_batch = self.reward_memory[batch]
        new_state_batch = self.new_state_memory[batch]
        # stored as 1 - terminal, i.e. 0 for transitions that ended an episode
        terminal_batch = self.terminal_memory[batch]

        # q value using network 1
        q_eval = self.q_eval.sess.run(
            self.q_eval.Q_values,
            feed_dict={self.q_eval.input: state_batch})
        # q_next value using network 1
        q_next = self.q_eval.sess.run(
            self.q_eval.Q_values,
            feed_dict={self.q_eval.input: new_state_batch})

        # start from the current predictions so that only the entries of the
        # actions actually taken contribute to the squared error
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        q_target[batch_index, action_indices] = (
            reward_batch
            + self.gamma * np.max(q_next, axis=1) * terminal_batch)

        # train network 1
        self.q_eval.sess.run(self.q_eval.train_NN,
                             feed_dict={self.q_eval.input: state_batch,
                                        self.q_eval.actions: action_batch,
                                        self.q_eval.q_target: q_target})

        # update epsilon, clamped at its minimum
        self.epsilon = max(self.epsilon * self.epsilon_dec, self.epsilon_min)

class Agent_DQN(common):
    """Epsilon-greedy DQN agent with a single Q-network, ``q_eval``.

    Replay memory and hyper-parameters come from ``common``.

    Args:
        alpha: learning rate passed to the network.
        gamma: discount factor used in the learning target.
        memory_size: capacity of the replay memory.
        n_actions: number of discrete actions.
        epsilon: initial probability of taking a random action.
        batch_size: number of transitions sampled per learning step.
        n_games: total number of games to be played (stored only).
        input_dims: shape of a single state.
        epsilon_dec: multiplicative decay applied to epsilon per learn step.
        epsilon_min: lower bound for epsilon.
        regularizer: ``None`` or a ``(kind, value)`` pair for the network.
        q_eval_dir: checkpoint directory of ``q_eval``.
        q_eval_dir_prime: accepted but not used by this class.
    """

    def __init__(self,alpha, gamma, memory_size, n_actions, epsilon, batch_size,
                 n_games, input_dims=(210,160,4), epsilon_dec=0.996,
                 epsilon_min=0.01,regularizer=None,q_eval_dir='tmp/q_eval',
                 q_eval_dir_prime='tmp/q_eval_prime'):
        # super() already binds self; forward the caller's values rather than
        # the defaults so the memory shape matches the network's input shape
        super().__init__(alpha, gamma, memory_size, n_actions, epsilon,
                         batch_size, n_games, input_dims=input_dims,
                         epsilon_dec=epsilon_dec, epsilon_min=epsilon_min,
                         regularizer=regularizer)

        self.q_eval = NeuralNetwork(alpha, n_actions, input_dims=input_dims,
                                    name='q_eval', chkpt_dir=q_eval_dir,
                                    regularizer=self.regularizer)

    def choose_action(self, state):
        """Return an action index for ``state`` using an epsilon-greedy rule.

        With probability ``self.epsilon`` a random action is drawn; otherwise
        the arg-max of ``q_eval``'s Q-values is returned.
        """
        # add the batch axis expected by the network input
        state = state[np.newaxis, :]
        if np.random.random() < self.epsilon:
            return np.random.choice(self.action_space)

        q_values = self.q_eval.sess.run(self.q_eval.Q_values,
                                        feed_dict={self.q_eval.input: state})
        return np.argmax(q_values)

    def learn(self):
        """Run one training step of ``q_eval`` on a random memory batch.

        Does nothing until more than ``batch_size`` transitions have been
        stored. Afterwards epsilon is decayed, never below ``epsilon_min``.
        """
        if self.memory_cntr <= self.batch_size:
            return

        # only sample from slots that have actually been written
        max_memory = min(self.memory_cntr, self.memory_size)
        batch = np.random.choice(max_memory, self.batch_size)

        state_batch = self.state_memory[batch]
        action_batch = self.action_memory[batch]
        # actions are stored one-hot, so arg-max recovers the action index
        action_indices = np.argmax(action_batch, axis=1)
        reward_batch = self.reward_memory[batch]
        new_state_batch = self.new_state_memory[batch]
        # stored as 1 - terminal, i.e. 0 for transitions that ended an episode
        terminal_batch = self.terminal_memory[batch]

        q_eval = self.q_eval.sess.run(
            self.q_eval.Q_values,
            feed_dict={self.q_eval.input: state_batch})
        q_next = self.q_eval.sess.run(
            self.q_eval.Q_values,
            feed_dict={self.q_eval.input: new_state_batch})

        # start from the current predictions so that only the entries of the
        # actions actually taken contribute to the squared error
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        q_target[batch_index, action_indices] = (
            reward_batch
            + self.gamma * np.max(q_next, axis=1) * terminal_batch)

        self.q_eval.sess.run(self.q_eval.train_NN,
                             feed_dict={self.q_eval.input: state_batch,
                                        self.q_eval.actions: action_batch,
                                        self.q_eval.q_target: q_target})

        # update epsilon, clamped at its minimum
        self.epsilon = max(self.epsilon * self.epsilon_dec, self.epsilon_min)