In [1]:
import tensorflow as tf
import sys
import time
import os
from collections import deque
import numpy as np
import matplotlib.pyplot as plt
import gym
import random
import tensorboard
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.
# Iterating over the list (rather than indexing [0]) keeps this cell working on
# CPU-only machines and applies the same setting to every visible GPU.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)
#tf.compat.v1.disable_eager_execution()


In [2]:
def clean_tensorboard(tensor_board_sess_path=None):
    """Delete the files found in a TensorBoard session-info directory.

    Parameters
    ----------
    tensor_board_sess_path : str or None, optional
        Directory whose files are removed. When ``None`` (the default) the
        ``.tensorboard-info`` folder inside the system temp directory is used,
        so the function no longer depends on one user's Windows profile path.

    Notes
    -----
    A missing directory is treated as "nothing to clean" and sub-directories
    are left untouched; only regular files are removed.
    """
    import os
    import tempfile

    if tensor_board_sess_path is None:
        tensor_board_sess_path = os.path.join(tempfile.gettempdir(), '.tensorboard-info')

    # Nothing to do if the directory was never created.
    if not os.path.isdir(tensor_board_sess_path):
        return

    for entry_name in os.listdir(tensor_board_sess_path):
        temp_file_path = os.path.join(tensor_board_sess_path, entry_name)
        # os.remove() fails on directories, so only regular files are deleted.
        if os.path.isfile(temp_file_path):
            os.remove(temp_file_path)

In [3]:
# Delete leftover TensorBoard session files before launching a new instance.
clean_tensorboard()
# Load the TensorBoard notebook extension and start it on the log directory.
# NOTE(review): the --logdir value is an absolute, machine-specific path.
%load_ext tensorboard
%tensorboard --logdir 'C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL/log_dir'

ERROR: Timed out waiting for TensorBoard to start. It may still be running as pid 5296.

In [6]:
class Agent():
    """DQN agent with an online network, a target network and a replay buffer.

    The agent owns the gym environment, chooses actions with an
    epsilon-greedy policy, stores transitions in a ``ReplayBuffer`` and fits
    the online network on sampled mini-batches. Optionally, per-batch metrics
    are written to TensorBoard through a ``tf.summary`` file writer.
    """

    def __init__(self,env_name,enable_tb_logging):
        """Build the environment, both networks and the replay buffer.

        Parameters
        ----------
        env_name : str
            Identifier passed to ``gym.make``.
        enable_tb_logging : bool
            When true, a run directory and summary writer are created and
            ``training_step`` logs scalars on every call.
        """
        self.env = gym.make(env_name)
        self.env.reset()
        self.lr = 0.001
        self.gamma = .99
        self.epsilon_decay_rate = .99
        self.initializer = tf.keras.initializers.HeUniform(seed=None)
        self.loss_fn = tf.keras.losses.MeanSquaredError()
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)
        self.n_actions = self.env.action_space.n
        self.n_states = self.env.observation_space.shape[0]
        self.online = self.make_nn()
        self.target = self.make_target()
        self.buffer = ReplayBuffer(max_len=1000)
        self.tau = 0.05
        self.batch_size = 32
        self.epsilon = .75
        self.min_epsilon = .001
        self.enable_tb_logging = enable_tb_logging
        if self.enable_tb_logging:
            self.online_run_id, self.online_tensor_board_callback = self.create_tensor_board_callback(model_name='Online_DQN')
            self.training_steps_counter = 0


    def generate_run_directory(self,root_log_dir,model_name):
        """Create a timestamped run folder under ``root_log_dir``.

        The folder name is ``<model_name>_run_<YYYY_mm_dd-HH_MM_SS>``. The
        directory (and ``root_log_dir`` itself, if missing) is created without
        changing the process working directory.

        Returns
        -------
        str
            The run identifier, i.e. the name of the created folder.
        """
        run_id = model_name+'_'+time.strftime("run_%Y_%m_%d-%H_%M_%S")
        # exist_ok: two agents created within the same second share a run_id.
        os.makedirs(os.path.join(root_log_dir,run_id),exist_ok=True)
        return run_id


    def create_tensor_board_callback(self,model_name):
        """Create a run directory and a summary file writer pointing at it.

        Logs go to ``./log_dir/<run_id>`` relative to the current working
        directory.

        Returns
        -------
        tuple
            ``(run_id, file_writer)`` where ``file_writer`` comes from
            ``tf.summary.create_file_writer``.
        """
        root_log_dir = os.path.join(os.curdir,'log_dir')
        run_id = self.generate_run_directory(root_log_dir=root_log_dir,model_name=model_name)
        model_cb_path = os.path.join(root_log_dir,run_id)
        file_writer = tf.summary.create_file_writer(model_cb_path)
        return run_id, file_writer


    def make_nn(self):
        """Build and compile the online Q-network.

        Two hidden ``Dense`` layers of 24 swish units followed by a linear
        layer with one output per action. Compiled with the agent's MSE loss
        and Adam optimizer, tracking ``accuracy`` and ``mean_squared_error``.
        """
        online = tf.keras.models.Sequential()
        online.add(tf.keras.layers.Dense(24,input_dim=self.n_states,activation='swish',kernel_initializer=self.initializer))
        online.add(tf.keras.layers.Dense(24, activation='swish',kernel_initializer=self.initializer))
        online.add(tf.keras.layers.Dense(self.n_actions,activation='linear',kernel_initializer=self.initializer))
        online.compile(loss=self.loss_fn,optimizer=self.optimizer,metrics=['accuracy','mean_squared_error'])
        return online


    def make_target(self):
        """Return a structural clone of the online network with copied weights."""
        target = tf.keras.models.clone_model(self.online)
        target.set_weights(self.online.get_weights())
        return target


    def epsilon_greedy_policy(self):
        """Pick an action: random with probability ``epsilon``, else greedy.

        The greedy branch feeds ``self.env.state`` (reshaped to a batch of
        one) to the online network and returns the arg-max action index.
        """
        if np.random.random() <= self.epsilon:
            return np.random.randint(low=0,high=self.n_actions)
        current_state = np.reshape(self.env.state,(1,len(self.env.state)))
        Q_values = self.online.predict(current_state)
        return np.argmax(Q_values[0])


    def play_one_step(self):
        """Act once in the environment and store the transition.

        Returns
        -------
        tuple
            ``(state, action, reward, next_state, done)`` where ``state`` is a
            copy of ``self.env.state`` taken before stepping and the remaining
            values come from ``self.env.step`` (its fourth return value, the
            info object, is discarded).
        """
        state = np.array(self.env.state)
        action = self.epsilon_greedy_policy()
        next_state, reward, done, _ = self.env.step(action)
        self.buffer.add_exp(state=state, action=action, reward=reward, next_state=next_state, done=done)
        return state, action, reward, next_state, done


    def soft_target_update(self):
        """Blend the online weights into the target network.

        Every weight array becomes ``(1 - tau) * target + tau * online``.
        """
        blended_weights = [
            (1-self.tau)*tw + (self.tau)*ow
            for tw, ow in zip(self.target.get_weights(),self.online.get_weights())
        ]
        self.target.set_weights(blended_weights)


    def training_step(self):
        """Fit the online network on one mini-batch from the replay buffer.

        The batch size is ``min(len(buffer), batch_size)``; nothing happens
        while the buffer is empty. The regression target equals the online
        network's own prediction except at the action actually taken, where it
        is replaced by ``reward + (1 - done) * gamma * max_a Q_target(s', a)``.
        With TensorBoard logging enabled, the batch loss, the two compiled
        metrics and the current epsilon are written as scalars.
        """
        batch_size_instance = min(len(self.buffer),self.batch_size)
        if batch_size_instance == 0:
            # No transitions stored yet, so there is nothing to sample.
            return

        states,actions,rewards,next_states,dones = self.buffer.exp_sample(batch_size=batch_size_instance)
        online_state_prediction = self.online.predict(states)

        # One target-network forward pass is enough; its row-wise maximum is
        # the bootstrap value of the next state.
        target_next_state_prediction = self.target.predict(next_states)
        target_max_next_state_action_value_prediction = np.amax(target_next_state_prediction,axis=1,keepdims=True)
        not_done = 1.0 - np.asarray(dones,dtype=np.float64)
        q_update = rewards + not_done * self.gamma * target_max_next_state_action_value_prediction

        # Overwrite only the Q-value of the action taken in each row; indexing
        # by the action itself works for any number of actions.
        row_index = np.arange(online_state_prediction.shape[0])
        taken_actions = np.asarray(actions).astype(int)
        online_state_prediction[row_index,taken_actions] = q_update[:,0]

        if self.enable_tb_logging:
            metrics_output = self.online.train_on_batch(states,online_state_prediction,reset_metrics=True)
            self.training_steps_counter+=1
            with self.online_tensor_board_callback.as_default():
                tf.summary.scalar(name='Batch_MSE_Loss',data=metrics_output[0],step=self.training_steps_counter)
                tf.summary.scalar(name='Accuracy',data=metrics_output[1],step=self.training_steps_counter)
                tf.summary.scalar(name='Model_MSE_Loss',data=metrics_output[2],step=self.training_steps_counter)
                tf.summary.scalar(name='Epsilon',data=self.epsilon,step=self.training_steps_counter)
        else:
            self.online.train_on_batch(states,online_state_prediction)


    def apply_epsilon_decay(self):
        """Multiply epsilon by the decay rate, never going below ``min_epsilon``."""
        self.epsilon = max(self.epsilon * self.epsilon_decay_rate,self.min_epsilon)


    def hard_target_update(self):
        """Copy the online network's weights into the target network verbatim."""
        self.target.set_weights(self.online.get_weights())
            
            
        
        
        
    

In [7]:
class ReplayBuffer():
    """Fixed-capacity FIFO store of ``(state, action, reward, next_state, done)`` tuples."""

    def __init__(self,max_len):
        """Create an empty buffer holding at most ``max_len`` transitions."""
        self.max_len = max_len
        self.buffer = deque(maxlen=self.max_len)


    def add_exp(self,state,action,reward,next_state,done):
        """Append one transition.

        The underlying ``deque`` was created with ``maxlen``, so once the
        buffer is full the oldest transition is dropped automatically; no
        manual length check is required.
        """
        self.buffer.append((state,action,reward, next_state, done))

    def __len__(self):
        """Return the number of stored transitions."""
        return len(self.buffer)

    def exp_sample(self,batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns
        -------
        tuple of numpy.ndarray
            ``(states, actions, rewards, next_states, dones)``. ``rewards``
            and ``dones`` get a trailing axis of length one (shape
            ``(batch_size, 1)``); the other arrays are stacked as-is.
        """
        indices = np.random.randint(len(self.buffer), size=batch_size)
        batch = [self.buffer[index] for index in indices]
        # Transpose the list of transitions into one array per field.
        states, actions, rewards, next_states, dones = [
            np.array([experience[entry] for experience in batch]) for entry in range(5)
        ]
        return states, actions, rewards[:,np.newaxis], next_states, dones[:,np.newaxis]

In [9]:
# Train a DQN agent on CartPole until the mean score of the last 10 episodes
# reaches 195, or until max_episodes have been played.
my_agent = Agent('CartPole-v0',enable_tb_logging=True)
rewards_buffer = []
learning_start_min = 1000
max_episodes = 500
mean_score_buffer = deque(maxlen=10)
for i in range(0,max_episodes):
    my_agent.env.reset()
    done = False
    rewards_episodic = 0
    while not(done):
        # play_one_step() also stores the transition in the replay buffer.
        state, action, reward, next_state, done = my_agent.play_one_step()
        rewards_episodic+= reward
        # NOTE(review): this terminal penalty has no effect on learning, because
        # the transition was already stored with the unmodified reward above.
        reward = reward if not done else -100
        my_agent.training_step()

    # Once per episode: decay exploration and sync the target network.
    my_agent.apply_epsilon_decay()
    rewards_buffer.append(rewards_episodic)
    my_agent.hard_target_update()
    mean_score = round(np.mean(rewards_buffer))
    mean_score_buffer.append(rewards_episodic)
    recent_mean = round(np.mean(list(mean_score_buffer)))

    if recent_mean >= 195.0:
        # Finish the in-place progress line before printing the final message.
        print()
        print('DQN solved problem terminating...')
        break

    # `end=""` belongs to print(), not str.format(); together with the leading
    # "\r" it makes the progress line update in place.
    print("\rEpisode: {}, Steps: {}, recent_avg {}".format(i, rewards_episodic, recent_mean), end="")

    
     
          
    
        

Episode: 0, Steps: 21.0, recent_avg 21
Episode: 1, Steps: 38.0, recent_avg 30
Episode: 2, Steps: 17.0, recent_avg 25
Episode: 3, Steps: 31.0, recent_avg 27
Episode: 4, Steps: 13.0, recent_avg 24
Episode: 5, Steps: 13.0, recent_avg 22
Episode: 6, Steps: 23.0, recent_avg 22
Episode: 7, Steps: 16.0, recent_avg 22
Episode: 8, Steps: 21.0, recent_avg 21
Episode: 9, Steps: 13.0, recent_avg 21
Episode: 10, Steps: 10.0, recent_avg 20
Episode: 11, Steps: 12.0, recent_avg 17
Episode: 12, Steps: 14.0, recent_avg 17
Episode: 13, Steps: 21.0, recent_avg 16
Episode: 14, Steps: 57.0, recent_avg 20
Episode: 15, Steps: 9.0, recent_avg 20
Episode: 16, Steps: 13.0, recent_avg 19
Episode: 17, Steps: 16.0, recent_avg 19
Episode: 18, Steps: 17.0, recent_avg 18
Episode: 19, Steps: 15.0, recent_avg 18
Episode: 20, Steps: 8.0, recent_avg 18
Episode: 21, Steps: 12.0, recent_avg 18
Episode: 22, Steps: 15.0, recent_avg 18
Episode: 23, Steps: 14.0, recent_avg 18
Episode: 24, Steps: 14.0, recent_avg 13
Episode: 25,