In [1]:
import tensorflow as tf
import sys
import time
import os
from collections import deque
import numpy as np
import matplotlib.pyplot as plt
import gym
import random
import tensorboard
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)
#tf.compat.v1.disable_eager_execution()


In [2]:
def clean_tensorboard():
    import os
    tensor_board_sess_path = 'C:/Users/Eaj59/AppData/Local/Temp/.tensorboard-info/'
    temp_sess_files = os.listdir(tensor_board_sess_path)
    for i in temp_sess_files:
        temp_file_path = os.path.join(tensor_board_sess_path,i)
        os.remove(temp_file_path)

In [3]:
#clean_tensorboard()
#%load_ext tensorboard
#%tensorboard --logdir 'C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL/log_dir'

In [2]:
class Agent():
    
    def __init__(self,env_name,enable_tb_logging=False,enable_dueling=True):
        self.env = gym.make(env_name)
        self.env_state = self.env.reset()
        self.lr = 0.001
        self.gamma = .95
        self.epsilon_decay_rate = .99
        self.initializer = tf.keras.initializers.HeUniform(seed=None)
        self.loss_fn = tf.keras.losses.Huber(delta=1.0)
        self.optimizer = tf.keras.optimizers.Adam(lr=self.lr)
        self.n_actions = self.env.action_space.n
        self.n_states = self.env.observation_space.shape[0]
        self.buffer = ReplayBuffer(max_len=10000)
        self.tau = 0.10
        self.batch_size = 32
        self.epsilon = 1
        self.min_epsilon = .001
        self.dueling_method = 'max'
        self.enable_tb_logging = enable_tb_logging
        self.enable_dueling = enable_dueling
        self.episodic_rewards = 0
        self.episode_counter = 0
        self.episode_time_step_counter = 0
        

        if self.enable_dueling:
            self.online = self.make_dueling_nn()
        else:
            self.online = self.make_nn()
            
        self.target = self.make_target()
        
        if self.enable_tb_logging:
            self.online_run_id, self.online_tensor_board_callback = self.create_tensor_board_callback(model_name='Soft_10_Target_MAX_Dueling_DDQN_Huber')
            self.training_steps_counter = 0
        
    def reset_env(self):
        self.episode_counter+=1
        if self.enable_tb_logging:
            with self.online_tensor_board_callback.as_default():
                tf.summary.scalar(name='Episode_Reward',data=self.episodic_rewards,step=self.episode_counter)
                tf.summary.scalar(name='Episode_Time_Steps',data=self.episode_time_step_counter,step=self.episode_counter)
        self.episodic_rewards = 0
        self.episode_time_step_counter = 0
        self.env_state = self.env.reset()
        
        
    def generate_run_directory(self,root_log_dir,model_name):
        run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
        run_id = model_name+'_'+run_id
        base_dir = 'C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL'
        os.chdir('C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL/log_dir')
        os.mkdir(run_id)
        os.chdir(base_dir)
        return run_id
        
    
    def create_tensor_board_callback(self,model_name):
        root_log_dir = os.path.join(os.curdir,'log_dir')
        run_id = self.generate_run_directory(root_log_dir=root_log_dir,model_name=model_name)
        model_cb_path = os.path.join(root_log_dir,run_id)
        file_writer = tf.summary.create_file_writer(model_cb_path)
        return run_id, file_writer
    
        
        
    def make_nn(self):
        online = tf.keras.models.Sequential()
        online.add(tf.keras.layers.Dense(24,input_dim=self.n_states,activation='swish',kernel_initializer=self.initializer))
        online.add(tf.keras.layers.Dense(24, activation='swish',kernel_initializer=self.initializer))
        online.add(tf.keras.layers.Dense(self.n_actions,activation='linear',kernel_initializer=self.initializer))
        online.compile(loss=self.loss_fn,optimizer=self.optimizer,metrics=['accuracy'])
        return online
    
    def make_dueling_nn(self):
        Dueling_Input = tf.keras.layers.Input(shape=(self.n_states,),name='Dueling_Input')
        layer_1_Dense = tf.keras.layers.Dense(64,activation='swish',kernel_initializer=self.initializer,name='layer_1_Dense')(Dueling_Input)
        layer_2_Dense = tf.keras.layers.Dense(64,activation='swish',kernel_initializer=self.initializer,name='layer_2_Dense')(layer_1_Dense)
        layer_3_Dense = tf.keras.layers.Dense(36,activation='swish',kernel_initializer=self.initializer,name='layer_3_Dense')(layer_2_Dense)
        Value_Function_Dense_Layer = tf.keras.layers.Dense(512,activation='swish',kernel_initializer=self.initializer,name='Value_Function_Dense_Layer')(layer_3_Dense)
        Value_Function_Output_Layer = tf.keras.layers.Dense(1,activation='linear',kernel_initializer=self.initializer,name='Value_Function_Output_Layer')(Value_Function_Dense_Layer)
        Advantage_Function_Dense_Layer = tf.keras.layers.Dense(512,activation='swish',kernel_initializer=self.initializer,name='Advantage_Function_Dense_Layer')(layer_3_Dense)
        Advantage_Function_Ouput_Layer = tf.keras.layers.Dense(self.n_actions,activation='linear',kernel_initializer=self.initializer,name='Advantage_Function_Output_Layer')(Advantage_Function_Dense_Layer)
        
        if self.dueling_method == 'average':
            online_output = Value_Function_Output_Layer + (Advantage_Function_Ouput_Layer - tf.reduce_mean(Advantage_Function_Ouput_Layer,axis=1,keepdims=True))
            
            
        elif self.dueling_method == 'sum':
            online_output = Value_Function_Output_Layer + Advantage_Function_Ouput_Layer
           
            
        else:
            online_output = Value_Function_Output_Layer + (Advantage_Function_Ouput_Layer - tf.reduce_max(Advantage_Function_Ouput_Layer,axis=1,keepdims=True))
            
            
        online = tf.keras.Model(Dueling_Input,online_output)
        online.compile(loss=self.loss_fn,optimizer=self.optimizer,metrics=['accuracy'])
        online.summary()
        return online
            

    def make_target(self):
        target = tf.keras.models.clone_model(self.online)
        target.set_weights(self.online.get_weights())
        return target
        
        
    def epsilon_greedy_policy(self):
        if np.random.random() <= self.epsilon:
            return np.random.randint(low=0,high=self.n_actions)
        else:
            Q_values = self.online.predict(np.reshape(self.env_state,(1,len(self.env_state))))
            return np.argmax(Q_values[0])
        
        
        
    def play_one_step(self):
        state = np.array(self.env_state)
        action = self.epsilon_greedy_policy()
        next_state, reward, done, _ = self.env.step(action)
        self.buffer.add_exp(state=state, action=action, reward=reward, next_state=next_state, done=done)
        self.env_state = next_state
        self.episodic_rewards += reward
        self.episode_time_step_counter +=1
        return state, action, reward, next_state, done
        
        
    def soft_target_update(self):
        target_weights_current = self.target.get_weights()
        online_weights_current = self.online.get_weights()
        layer_counter = 0
        for tw, ow in zip(target_weights_current,online_weights_current):
            new_target_weights = (1-self.tau)*tw + (self.tau)*ow
            target_weights_current[layer_counter] = new_target_weights
            layer_counter += 1
        self.target.set_weights(target_weights_current)
        
        
    def training_step(self):
        
        if len(self.buffer)<self.batch_size:
            batch_size_instance = len(self.buffer)
        else:
            batch_size_instance = self.batch_size
            
        states,actions,rewards,next_states,dones = self.buffer.exp_sample(batch_size=batch_size_instance)
        online_state_prediction = self.online.predict(states)
        online_next_state_arg_max = tf.math.argmax(self.online.predict(next_states),axis=1)
        online_arg_max_mask = tf.one_hot(online_next_state_arg_max,depth=self.n_actions)
        target_next_state_preds = self.target.predict(next_states)
        max_target_next_state_preds = tf.reduce_sum(target_next_state_preds*online_arg_max_mask,axis=1,keepdims=True)
        q_update = rewards + (1 - dones) * self.gamma * max_target_next_state_preds
        actual_actions_mask = tf.one_hot(actions,depth=self.n_actions)
        inverse_actions_mask = tf.reverse(actual_actions_mask,axis=[-1])
        online_state_prediction = (q_update*actual_actions_mask)+(online_state_prediction*inverse_actions_mask)

        if self.enable_tb_logging:
            metrics_output = self.online.train_on_batch(states,online_state_prediction,reset_metrics=True)
            self.training_steps_counter+=1
            with self.online_tensor_board_callback.as_default():
                tf.summary.scalar(name='Batch_MSE_Loss',data=metrics_output[0],step=self.training_steps_counter)
                tf.summary.scalar(name='Accuracy',data=metrics_output[1],step=self.training_steps_counter)
                tf.summary.scalar(name='Epsilon',data=self.epsilon,step=self.training_steps_counter)
        else:
            self.online.train_on_batch(states,online_state_prediction)
        
        
        
            

    def apply_epsilon_decay(self):
        self.epsilon = max(self.epsilon * self.epsilon_decay_rate,self.min_epsilon)
        
            
    def hard_target_update(self):
        self.target.set_weights(self.online.get_weights())
            
            
        
        
        
    

In [3]:
class ReplayBuffer():
    
    def __init__(self,max_len):
        self.max_len = max_len
        self.buffer = deque(maxlen=self.max_len)
        
    
    def add_exp(self,state,action,reward,next_state,done):
        exp = (state,action,reward, next_state, done)
        if len(self.buffer)<= self.max_len:
            self.buffer.append(exp)
        else:
            self.buffer[0] = exp
        
    def __len__(self):
        return len(self.buffer)
    
    def exp_sample(self,batch_size):
        indices = np.random.randint(len(self.buffer), size=batch_size)
        batch = [self.buffer[index] for index in indices]
        states, actions, rewards, next_states, dones = [np.array([experience[entry] for experience in batch])for entry in range(5)]
        return states, actions, rewards[:,np.newaxis], next_states, dones[:,np.newaxis]

In [4]:
my_agent = Agent('LunarLander-v2',enable_tb_logging=True,enable_dueling=True)
rewards_buffer = []
max_episodes = 750
mean_score_buffer = deque(maxlen=100)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Dueling_Input (InputLayer)      [(None, 8)]          0                                            
__________________________________________________________________________________________________
layer_1_Dense (Dense)           (None, 64)           576         Dueling_Input[0][0]              
__________________________________________________________________________________________________
layer_2_Dense (Dense)           (None, 64)           4160        layer_1_Dense[0][0]              
__________________________________________________________________________________________________
layer_3_Dense (Dense)           (None, 36)           2340        layer_2_Dense[0][0]              
______________________________________________________________________________________________

In [5]:

for i in range(0,max_episodes):
    my_agent.reset_env()
    done = False
    rewards_episodic = 0
    while not(done):
        state, action, reward, next_state, done = my_agent.play_one_step()
        reward = reward if not done else -100
        rewards_episodic += reward
        #my_agent.buffer.add_exp(state, action, reward, next_state, done)
        my_agent.training_step()
        
        
    
    my_agent.apply_epsilon_decay()
    my_agent.soft_target_update()
    
    rewards_buffer.append(round(rewards_episodic,2))
    mean_score = round(np.mean(rewards_buffer))
    mean_score_buffer.append(rewards_episodic)
    recent_mean = round(np.mean(list(mean_score_buffer)))
    print("\r Episode: {}, Episode_Reward: {}, 10_Episode_Reward_Avg {}".format(i, rewards_episodic, recent_mean, end=""))
    
    
    '''if i == 500:
        my_agent.online.save('C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL/Dueling_DDQN_Episode_500')
        
    if i == 1000:
        my_agent.online.save('C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL/Dueling_DDQN_Episode_1000')
        
    if i == 1500:
        my_agent.online.save('C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL/Dueling_DDQN_Episode_1500')
        
    if i == 2000:
        my_agent.online.save('C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL/Dueling_DDQN_Episode_2000')'''
        
        
    if recent_mean >= 200.0:
        print('DQN solved problem terminating...')
        break
        
    

    
     
          
    
        

 Episode: 0, Episode_Reward: -120.31704943306323, 10_Episode_Reward_Avg -120
 Episode: 1, Episode_Reward: -109.83881424937844, 10_Episode_Reward_Avg -115
 Episode: 2, Episode_Reward: -136.4321925280359, 10_Episode_Reward_Avg -122
 Episode: 3, Episode_Reward: -144.54061712085263, 10_Episode_Reward_Avg -128
 Episode: 4, Episode_Reward: -145.31802907352966, 10_Episode_Reward_Avg -131
 Episode: 5, Episode_Reward: -62.62096238089086, 10_Episode_Reward_Avg -120
 Episode: 6, Episode_Reward: -82.7771844083966, 10_Episode_Reward_Avg -115
 Episode: 7, Episode_Reward: -90.71188897467833, 10_Episode_Reward_Avg -112
 Episode: 8, Episode_Reward: -165.36666895502663, 10_Episode_Reward_Avg -118
 Episode: 9, Episode_Reward: -98.81136855698135, 10_Episode_Reward_Avg -116
 Episode: 10, Episode_Reward: -305.1513542613416, 10_Episode_Reward_Avg -133
 Episode: 11, Episode_Reward: -147.05669418880532, 10_Episode_Reward_Avg -134
 Episode: 12, Episode_Reward: -63.43201318261844, 10_Episode_Reward_Avg -129
 Epi