In [1]:
import tensorflow as tf
import sys
import time
import os
from collections import deque
import numpy as np
import matplotlib.pyplot as plt
import gym
import random
import tensorboard
from tensorflow.keras import backend as K
from Sum_Tree import SumTree
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)
#tf.compat.v1.disable_eager_execution()
#tf.config.optimizer.set_jit(True)
#tf.config.experimental.enable_mlir_bridge()


In [2]:
def clean_tensorboard():
    import os
    tensor_board_sess_path = 'C:/Users/Eaj59/AppData/Local/Temp/.tensorboard-info/'
    temp_sess_files = os.listdir(tensor_board_sess_path)
    for i in temp_sess_files:
        temp_file_path = os.path.join(tensor_board_sess_path,i)
        os.remove(temp_file_path)

In [3]:
#clean_tensorboard()
#%load_ext tensorboard
#%tensorboard --logdir 'C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL/log_dir'

In [4]:
class Agent():
    
    def __init__(self,env_name,model_name,enable_PER=True,enable_DDQN=True,enable_tb_logging=True,enable_dueling=True):
        self.env = gym.make(env_name)
        #self.env.seed(0)
        self.env_state = self.env.reset()
        self.lr = 0.01
        self.gamma = .99
        self.epsilon_decay_rate = .99
        self.initializer = tf.keras.initializers.HeUniform()
        self.loss_parameter = 1.0
        self.loss_fn = tf.keras.losses.Huber(delta=self.loss_parameter)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)
        self.n_actions = self.env.action_space.n
        self.n_states = self.env.observation_space.shape[0]
        self.tau = 0.10
        self.batch_size = 32
        self.epsilon = .75
        self.min_epsilon = .05
        self.dueling_method = 'average'
        self.enable_tb_logging = enable_tb_logging
        self.enable_dueling = enable_dueling
        self.episodic_rewards = 0
        self.episode_counter = 0
        self.episode_time_step_counter = 0
        self.enable_DDQN = True
        self.enable_PER = enable_PER
        self.model_name = model_name
        self.target_update_method = 'hard'
        self.online = self.make_nn()
        self.target = self.make_target()
        self.capacity = 10000
        
        
        if self.enable_PER:
            self.buffer = Prioritized_Buffer(self.capacity)
        else:
            self.buffer = ReplayBuffer(self.capacity)
            
        if self.enable_tb_logging:
            self.online_run_id, self.online_tensor_board_callback = self.create_tensor_board_callback(model_name=self.model_name)
            self.training_steps_counter = 0
        
    def reset_env(self):
        self.episode_counter+=1
        if self.enable_tb_logging:
            with self.online_tensor_board_callback.as_default():
                tf.summary.scalar(name='Episode_Reward',data=self.episodic_rewards,step=self.episode_counter)
                tf.summary.scalar(name='Episode_Time_Steps',data=self.episode_time_step_counter,step=self.episode_counter)
        self.episodic_rewards = 0
        self.episode_time_step_counter = 0
        self.env_state = self.env.reset()
        
    
    def __dict__(self):
        parameter_dict = {'Model_Name': self.model_name,
                        'Date': time.strftime('%Y_%m_%D'),
                        'DDQN': self.enable_DDQN,
                        'Dueling': self.enable_dueling,
                        'Gamma': self.gamma,
                        'Inital_Epsilon':self.epsilon,
                        'Epsilon_Decay_Rate': self.epsilon_decay_rate,
                        'Epsilon_Minimum': self.min_epsilon,
                        'Optimizer': self.optimizer.__dict__['_name'],
                        'Learning_Rate': self.lr,
                        'Loss_Function' : self.loss_fn.__dict__['name'],
                        'Loss_Parameter': self.loss_parameter}
        return str(parameter_dict)
        

    def create_tensor_board_callback(self,model_name):
        run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
        run_id = model_name+'_'+run_id
        base_dir = 'C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL'
        os.chdir('C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL/log_dir')
        os.mkdir(run_id)
        os.chdir(run_id)
        text_file_name = 'model_summary_' + model_name +'.txt'
        text_file_name2 = 'model_hyper_parameters_' + model_name +'.txt'
        f = open(text_file_name,"w+")
        f.write(self.online.to_json())
        f.close()
        f2 = open(text_file_name2,"w+")
        f2.write(self.__dict__())
        f2.close()
        os.chdir(base_dir)
        root_log_dir = os.path.join(os.curdir,'log_dir')
        model_cb_path = os.path.join(root_log_dir,run_id)
        file_writer = tf.summary.create_file_writer(model_cb_path)
        
        return run_id, file_writer
    
    def make_nn(self):
        
        if self.enable_dueling:
            Dueling_Input = tf.keras.layers.Input(shape=(self.n_states,),name='Dueling_Input')
            layer_1_Dense = tf.keras.layers.Dense(256,activation='swish',kernel_initializer=self.initializer,name='layer_1_Dense')(Dueling_Input)
            layer_2_Dense = tf.keras.layers.Dense(128,activation='swish',kernel_initializer=self.initializer,name='layer_2_Dense')(layer_1_Dense)
            layer_3_Dense = tf.keras.layers.Dense(64,activation='swish',kernel_initializer=self.initializer,name='layer_3_Dense')(layer_2_Dense)
            Advantage_Layer = tf.keras.layers.Dense(32,activation='swish',kernel_initializer=self.initializer,name='Advantage_Layer')(layer_3_Dense)
            Value_Layer = tf.keras.layers.Dense(32,activation='swish',kernel_initializer=self.initializer,name='Value_Layer')(layer_3_Dense)
            Advantage_Layer2 = tf.keras.layers.Dense(self.n_actions,activation='linear',kernel_initializer=self.initializer,name='Advantage_Layer2')(Advantage_Layer)
            Value_Layer2 = tf.keras.layers.Dense(1,activation='linear',kernel_initializer=self.initializer,name='Value_Layer2')(Value_Layer)

            if self.dueling_method == 'average':
                Value_Expanded = tf.keras.layers.Lambda(lambda s: K.expand_dims(s[:,0],-1),output_shape=(self.n_actions,))(Value_Layer2)
                Average_Advantage = tf.keras.layers.Lambda(lambda a: a[:,:] - K.mean(a[:,:],keepdims=True),output_shape=(self.n_states,))(Advantage_Layer2)
                online_output = tf.keras.layers.Add()([Value_Expanded,Average_Advantage])


            elif self.dueling_method == 'sum':
                Value_Expanded = tf.keras.layers.Lambda(lambda s: K.expand_dims(s[:,0],-1),output_shape=(self.n_actions,))(Value_Layer)
                Average_Advantage = tf.keras.layers.Lambda(lambda a: a[:,:],output_shape=(self.n_states,))(Advantage_Layer)
                online_output = tf.keras.layers.Add()([Value_Expanded,Average_Advantage])

            else:
                Value_Expanded = tf.keras.layers.Lambda(lambda s: K.expand_dims(s[:,0],-1),output_shape=(self.n_actions,))(Value_Layer)
                Average_Advantage = tf.keras.layers.Lambda(lambda a: a[:,:] - K.max(a[:,:],keepdims=True),output_shape=(self.n_states,))(Advantage_Layer)
                online_output = tf.keras.layers.Add()([Value_Expanded,Average_Advantage])

            online = tf.keras.Model(Dueling_Input,online_output)
            online.compile(loss=self.loss_fn,optimizer=self.optimizer,metrics=['accuracy'])
            online.summary()
            return online

        else:
            online = tf.keras.models.Sequential()
            online.add(tf.keras.layers.Dense(128,input_dim=self.n_states,activation='swish',kernel_initializer=self.initializer))
            online.add(tf.keras.layers.Dense(64, activation='swish',kernel_initializer=self.initializer))
            online.add(tf.keras.layers.Dense(32, activation='swish',kernel_initializer=self.initializer))
            online.add(tf.keras.layers.Dense(self.n_actions,activation='linear',kernel_initializer=self.initializer))
            online.compile(loss=self.loss_fn,optimizer=self.optimizer,metrics=['accuracy'])
            return online
        
        
    
    def make_target(self):
        target = tf.keras.models.clone_model(self.online)
        target.set_weights(self.online.get_weights())
        return target
        
        
    def epsilon_greedy_policy(self):
        if np.random.random() <= self.epsilon:
            return np.random.randint(low=0,high=self.n_actions)
        else:
            Q_values = self.online.predict(np.reshape(self.env_state,(1,len(self.env_state))))
            return np.argmax(Q_values[0])
        
    def apply_epsilon_decay(self):
        self.epsilon = max(self.epsilon * self.epsilon_decay_rate,self.min_epsilon)   
        

    def play_one_step(self):
        state = np.array(self.env_state)
        action = self.epsilon_greedy_policy()
        next_state, reward, done, _ = self.env.step(action)
        if self.enable_PER:
            self.buffer.add_exp_per(state=state, action=action, reward=reward, next_state=next_state, done=done)
        else:
            self.buffer.add_exp(state=state, action=action, reward=reward, next_state=next_state, done=done)
        self.env_state = next_state
        self.episodic_rewards += reward
        self.episode_time_step_counter += 1
        return state, action, reward, next_state, done
        
         
    def training_step(self):
        
        if len(self.buffer)<self.batch_size:
            batch_size_instance = len(self.buffer)
        else:
            batch_size_instance = self.batch_size
        
        if self.enable_PER:
            tree_index, weights,states,actions,rewards,next_states,dones = self.buffer.sample_PER(batch_size=batch_size_instance)
        else:
            states,actions,rewards,next_states,dones = self.buffer.exp_sample(batch_size=batch_size_instance)
            weights = None
        
        if self.enable_DDQN:
            online_state_prediction = self.online.predict(states)
            online_state_prediction_array = np.array(online_state_prediction)
            online_next_state_prediciton = self.online.predict(next_states)
            target_next_state_prediction = self.target.predict(next_states)
            online_next_state_arg_max = tf.argmax(online_next_state_prediciton,axis=1)
            double_learning_mask = tf.one_hot(online_next_state_arg_max,depth=self.n_actions)
            masked_target_qs = tf.reduce_sum(target_next_state_prediction*double_learning_mask,axis=1,keepdims=True)
            q_update = rewards + (1 - dones) * self.gamma * masked_target_qs
            q_update = tf.squeeze(q_update,axis=-1)
        
        else:
            states,actions,rewards,next_states,dones = self.buffer.exp_sample(batch_size=batch_size_instance)
            online_state_prediction = self.online.predict(states)
            online_state_prediction_array = np.array(online_state_prediction)
            target_max_next_state_action_value_prediction = np.amax(self.target.predict(next_states),axis=1,keepdims=True)
            q_update = rewards + (np.ones(shape=(batch_size_instance,1)) - dones) * self.gamma * target_max_next_state_action_value_prediction
            q_update = tf.squeeze(q_update,axis=-1)

        if self.enable_PER:
            absolute_td_errors = []
            for i in range(0,online_state_prediction.shape[0]):
                absolute_td_errors.append(abs(online_state_prediction_array[i,actions[i]]- q_update[i]))
                online_state_prediction_array[i,actions[i]] = q_update[i]
            self.buffer.update_td_batch(tree_index,absolute_td_errors)
        
        else:
            for i in range(0,online_state_prediction.shape[0]):
                online_state_prediction_array[i,actions[i]] = q_update[i]
        
    
        if self.enable_tb_logging:
            metrics_output = self.online.train_on_batch(states,online_state_prediction_array,sample_weight=weights,reset_metrics=True)
            self.training_steps_counter+=1
            with self.online_tensor_board_callback.as_default():
                tf.summary.scalar(name='Batch_Huber_Loss',data=metrics_output[0],step=self.training_steps_counter)
                tf.summary.scalar(name='Accuracy',data=metrics_output[1],step=self.training_steps_counter)
                tf.summary.scalar(name='Epsilon',data=self.epsilon,step=self.training_steps_counter)
        else:
            self.online.train_on_batch(states,online_state_prediction_array,sample_weight=weights)
            
     
    def update_target_network(self):
        
        if self.target_update_method == 'soft':
            target_weights_current = self.target.get_weights()
            online_weights_current = self.online.get_weights()
            layer_counter = 0
            for tw, ow in zip(target_weights_current,online_weights_current):
                new_target_weights = (1-self.tau)*tw + (self.tau)*ow
                target_weights_current[layer_counter] = new_target_weights
                layer_counter += 1
            self.target.set_weights(target_weights_current)
        
        else:
            self.target.set_weights(self.online.get_weights())
            
    

In [5]:
class ReplayBuffer():
    
    def __init__(self,max_len):
        self.max_len = max_len
        self.buffer = deque(maxlen=self.max_len)
        
    
    def add_exp(self,state,action,reward,next_state,done):
        exp = (state,action,reward, next_state, done)
        if len(self.buffer)<= self.max_len:
            self.buffer.append(exp)
        else:
            self.buffer[0] = exp
        
    def __len__(self):
        return len(self.buffer)
    
    def exp_sample(self,batch_size):
        indices = np.random.randint(len(self.buffer), size=batch_size)
        batch = [self.buffer[index] for index in indices]
        states, actions, rewards, next_states, dones = [np.array([experience[entry] for experience in batch])for entry in range(5)]
        return states, actions, rewards[:,np.newaxis], next_states, dones[:,np.newaxis]

In [6]:
class Prioritized_Buffer():
    
    def __init__(self,capacity):
        self.capacity = capacity
        self.alpha = 0.6
        self.beta = 0.1
        self.td_constant = 0.01
        self.beta_rate = 0.99992
        self.tree = SumTree(self.capacity)
        self.upper_bound_error = 1.0
        
        
    def add_exp_per(self,state,action,reward,next_state,done):
        max_priority = np.max(self.tree.tree[-self.tree.capacity:])
        if max_priority == 0:
            max_priority = self.upper_bound_error
        experience = (state,action,reward, next_state, done)
        self.tree.add(max_priority,experience)
        
    def __len__(self):
        return np.count_nonzero(self.tree.data_store)
    
    def update_beta(self):
        self.beta = min(1.0,self.beta*self.beta_rate**-1)
    
    def sample_PER(self,batch_size):
        tree_index_list = []
        data_index_list = []
        priority_list = []
        priority_segment = self.tree.total_priority / batch_size
        for i in range(batch_size):
            start_uniform,end_uniform = priority_segment * i, priority_segment * (i + 1)
            value = np.random.uniform(start_uniform,end_uniform)
            leaf_index, priority, data_index = self.tree.get_node(value)
            tree_index_list.append(leaf_index)
            data_index_list.append(data_index)
            priority_list.append(priority)
        priority_vector = np.array(priority_list)
        probabilities = (priority_vector**self.alpha) / np.sum(priority_vector**self.alpha)
        weights = (probabilities * batch_size)**-self.beta
        weights = weights / np.max(weights)
        batch = [self.tree.data_store[index] for index in data_index_list]
        states, actions, rewards, next_states, dones = [np.array([experience[entry] for experience in batch])for entry in range(5)]
        self.update_beta()
        return tree_index_list, weights, states, actions, rewards[:,np.newaxis], next_states, dones[:,np.newaxis]
    
    def update_td_batch(self,tree_index,absolute_td_errors):
        
        absolute_td_errors = np.array(absolute_td_errors) + self.td_constant
        for a, b in zip(tree_index,absolute_td_errors):
            self.tree.update_tree(a,b)
        

In [7]:
my_agent = Agent('LunarLander-v2',model_name='Lunar_Lander_NoPER_DDQN',enable_PER=False,enable_tb_logging=True,enable_DDQN=True,enable_dueling=False)
rewards_buffer = []
max_episodes = 800
mean_score_buffer = deque(maxlen=50)

In [8]:

for i in range(0,max_episodes):
    my_agent.reset_env()
    done = False
    rewards_episodic = 0
    while not(done):
        state, action, reward, next_state, done = my_agent.play_one_step()
        
        if not done or i == my_agent.env._max_episode_steps-1:
            reward = reward
        else:
            reward = -100
        
        #rewards_episodic += reward
        rewards_episodic += 1
        my_agent.training_step()
        
        
    
    my_agent.apply_epsilon_decay()
    my_agent.update_target_network()
    
    rewards_buffer.append(round(rewards_episodic,2))
    mean_score = round(np.mean(rewards_buffer))
    mean_score_buffer.append(rewards_episodic)
    recent_mean = round(np.mean(list(mean_score_buffer)))
    
    print(f" Episode: {i}, Episode_Reward: {rewards_episodic}, 10_Episode_Reward_Avg {recent_mean}",end='\r',flush=True)
    
    
    '''if i == 500:
        my_agent.online.save('C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL/Dueling_DDQN_Episode_500')
        
    if i == 1000:
        my_agent.online.save('C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL/Dueling_DDQN_Episode_1000')
        
    if i == 1500:
        my_agent.online.save('C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL/Dueling_DDQN_Episode_1500')
        
    if i == 2000:
        my_agent.online.save('C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL/Dueling_DDQN_Episode_2000')'''
        
        
    if recent_mean >= 200.0:
        print('DQN solved problem terminating...')
        break
        
    

    
     
          
    
        

DQN solved problem terminating..., 10_Episode_Reward_Avg 2057
