In [1]:
import tensorflow as tf
import sys
import time
import os
from collections import deque
import numpy as np
import matplotlib.pyplot as plt
import gym
import random
import tensorboard
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)
#tf.compat.v1.disable_eager_execution()


In [2]:
def clean_tensorboard():
    import os
    tensor_board_sess_path = 'C:/Users/Eaj59/AppData/Local/Temp/.tensorboard-info/'
    temp_sess_files = os.listdir(tensor_board_sess_path)
    for i in temp_sess_files:
        temp_file_path = os.path.join(tensor_board_sess_path,i)
        os.remove(temp_file_path)

In [3]:
clean_tensorboard()

In [4]:
#clean_tensorboard()
%load_ext tensorboard
%tensorboard --logdir 'C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL/log_dir/'

ERROR: Timed out waiting for TensorBoard to start. It may still be running as pid 11612.

In [11]:
class Agent():
    
    def __init__(self,env_name,enable_tb_logging):
        self.env = gym.make(env_name)
        self.env_state = self.env.reset()
        self.lr = 0.001
        self.gamma = .99
        self.epsilon_decay_rate = .99
        self.initializer = tf.keras.initializers.HeUniform(seed=None)
        self.loss_fn = tf.keras.losses.Huber(delta=1.0)
        self.optimizer = tf.keras.optimizers.Adam(lr=self.lr)
        self.n_actions = self.env.action_space.n
        self.n_states = self.env_state.shape[0]
        self.online = self.make_nn()
        self.target = self.make_target()
        self.buffer = ReplayBuffer(max_len=10000)
        self.tau = 0.05
        self.batch_size = 256
        self.epsilon = .99
        self.min_epsilon = .005
        self.episode_counter = 0
        self.episode_step_counter = 0
        self.episode_reward = 0
        self.enable_tb_logging = enable_tb_logging
        if self.enable_tb_logging:
            self.online_run_id, self.online_tensor_board_callback = self.create_tensor_board_callback(model_name='Lunar_Lander_Online_DQN')
            self.training_steps_counter = 0
        #self.target_run_id, self.target_tensor_board_callback = self.create_tensor_board_callback(model_name='Target_DQN')
        
        
    def generate_run_directory(self,root_log_dir,model_name):
        run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
        run_id = model_name+'_'+run_id
        base_dir = 'C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL'
        os.chdir('C:/Users/Eaj59/Documents/RL_Projects/Project_2_DRL/log_dir')
        os.mkdir(run_id)
        os.chdir(base_dir)
        return run_id
        
    
    def create_tensor_board_callback(self,model_name):
        root_log_dir = os.path.join(os.curdir,'log_dir')
        run_id = self.generate_run_directory(root_log_dir=root_log_dir,model_name=model_name)
        model_cb_path = os.path.join(root_log_dir,run_id)
        file_writer = tf.summary.create_file_writer(model_cb_path)
        return run_id, file_writer
    
    def reset_env_state(self):
        self.env_state = self.env.reset()
        if self.enable_tb_logging:
            with self.online_tensor_board_callback.as_default():
                tf.summary.scalar(name='Episode_Time_Steps_Taken',data=self.episode_step_counter,step=self.episode_counter)
                tf.summary.scalar(name='Episode Reward',data=self.episode_reward,step=self.episode_counter)
        
        self.episode_counter += 1
        self.episode_step_counter = 0
        self.episode_reward = 0
        
    def make_nn(self):
        online = tf.keras.models.Sequential()
        online.add(tf.keras.layers.Dense(128,input_dim=self.n_states,activation='swish',kernel_initializer=self.initializer))
        online.add(tf.keras.layers.Dense(64, activation='swish',kernel_initializer=self.initializer))
        online.add(tf.keras.layers.Dense(32, activation='swish',kernel_initializer=self.initializer))
        online.add(tf.keras.layers.Dense(24, activation='swish',kernel_initializer=self.initializer))
        online.add(tf.keras.layers.Dense(self.n_actions,activation='linear',kernel_initializer=self.initializer))
        online.compile(loss=self.loss_fn,optimizer=self.optimizer,metrics=['accuracy'])
        return online
        
        
    def make_target(self):
        target = tf.keras.models.clone_model(self.online)
        target.set_weights(self.online.get_weights())
        return target
        
        
    def epsilon_greedy_policy(self):
        if np.random.random() <= self.epsilon:
            return np.random.randint(low=0,high=self.n_actions)
        else:
            Q_values = self.online.predict(np.reshape(self.env_state,(1,len(self.env_state))))
            return np.argmax(Q_values[0])
        
        
        
    def play_one_step(self):
        state = np.array(self.env_state)
        action = self.epsilon_greedy_policy()
        next_state, reward, done, _ = self.env.step(action)
        self.buffer.add_exp(state=state, action=action, reward=reward, next_state=next_state, done=done)
        self.env_state = next_state
        self.episode_reward += reward
        self.episode_step_counter +=1
        return state, action, reward, next_state, done
        
        
    def soft_target_update(self):
        target_weights_current = self.target.get_weights()
        online_weights_current = self.online.get_weights()
        layer_counter = 0
        for tw, ow in zip(target_weights_current,online_weights_current):
            new_target_weights = (1-self.tau)*tw + (self.tau)*ow
            target_weights_current[layer_counter] = new_target_weights
            layer_counter += 1
        self.target.set_weights(target_weights_current)
        
        
    def training_step(self):
        
        if len(self.buffer)<self.batch_size:
            batch_size_instance = len(self.buffer)
        else:
            batch_size_instance = self.batch_size
            
        states,actions,rewards,next_states,dones = self.buffer.exp_sample(batch_size=batch_size_instance)
        online_state_prediction = self.online.predict(states)
        target_max_next_state_action_value_prediction = np.amax(self.target.predict(next_states),axis=1,keepdims=True)
        q_update = rewards + (np.ones(shape=(batch_size_instance,1)) - dones) * self.gamma * target_max_next_state_action_value_prediction

        #keep the action not taken the same value only change the action taken value
        for i in range(0,online_state_prediction.shape[0]):
            if actions[i] == 0:
                online_state_prediction[i,0] = q_update[i]
            else:
                online_state_prediction[i,1] = q_update[i]
        
        if self.enable_tb_logging:
            metrics_output = self.online.train_on_batch(states,online_state_prediction,reset_metrics=True)
            self.training_steps_counter+=1
            with self.online_tensor_board_callback.as_default():
                tf.summary.scalar(name='Batch_MSE_Loss',data=metrics_output[0],step=self.training_steps_counter)
                tf.summary.scalar(name='Accuracy',data=metrics_output[1],step=self.training_steps_counter)
                tf.summary.scalar(name='Epsilon',data=self.epsilon,step=self.training_steps_counter)
                tf.summary.scalar(name='Episode',data=self.episode_counter,step=self.training_steps_counter)
                
        else:
            self.online.train_on_batch(states,online_state_prediction)
        
        
        
            

    def apply_epsilon_decay(self):
        self.epsilon = max(self.epsilon * self.epsilon_decay_rate,self.min_epsilon)
        
            
    def hard_target_update(self):
        self.target.set_weights(self.online.get_weights())
            
            
        
        
        
    

In [12]:
class ReplayBuffer():
    
    def __init__(self,max_len):
        self.max_len = max_len
        self.buffer = deque(maxlen=self.max_len)
        
    
    def add_exp(self,state,action,reward,next_state,done):
        exp = (state,action,reward, next_state, done)
        if len(self.buffer)<= self.max_len:
            self.buffer.append(exp)
        else:
            self.buffer[0] = exp
        
    def __len__(self):
        return len(self.buffer)
    
    def exp_sample(self,batch_size):
        indices = np.random.randint(len(self.buffer), size=batch_size)
        batch = [self.buffer[index] for index in indices]
        states, actions, rewards, next_states, dones = [np.array([experience[entry] for experience in batch])for entry in range(5)]
        return states, actions, rewards[:,np.newaxis], next_states, dones[:,np.newaxis]

In [13]:
my_agent = Agent('LunarLander-v2',enable_tb_logging=True)
rewards_buffer = []
max_episodes = 500
mean_score_buffer = deque(maxlen=10)
for i in range(0,max_episodes):
    my_agent.reset_env_state()
    done = False
    rewards_episodic = 0
    env_steps = 0
    while not(done):
        state, action, reward, next_state, done = my_agent.play_one_step()
        rewards_episodic+= reward
        env_steps += 1
        reward = reward if env_steps<=999 else -100
        #my_agent.buffer.add_exp(state, action, reward, next_state, done)
        my_agent.training_step()
        
    my_agent.apply_epsilon_decay()
    rewards_buffer.append(rewards_episodic)
    my_agent.hard_target_update()
    mean_score = round(np.mean(rewards_buffer))
    mean_score_buffer.append(rewards_episodic)
    recent_mean = round(np.mean(list(mean_score_buffer)))
    
    if recent_mean >= 195.0:
        print('DQN solved problem terminating...')
        break
        
    print("\rEpisode: {}, Steps: {}, recent_avg {}".format(i, rewards_episodic, recent_mean, end=""))

    
     
          
    
        

Episode: 0, Steps: -100.11179035419424, recent_avg -100
Episode: 1, Steps: -163.68413254461555, recent_avg -132
Episode: 2, Steps: -32.106811270142416, recent_avg -99
Episode: 3, Steps: -18.638402572311193, recent_avg -79
Episode: 4, Steps: -144.84745616949522, recent_avg -92
Episode: 5, Steps: -275.1049925926842, recent_avg -122
Episode: 6, Steps: -152.9917446888412, recent_avg -127
Episode: 7, Steps: -205.49918111807148, recent_avg -137
Episode: 8, Steps: -103.87257899576888, recent_avg -133
Episode: 9, Steps: -128.61569773251213, recent_avg -133
Episode: 10, Steps: -202.6781782474799, recent_avg -143
Episode: 11, Steps: -110.78005233025328, recent_avg -138
Episode: 12, Steps: -229.67611839320517, recent_avg -157
Episode: 13, Steps: -126.08622225785221, recent_avg -168
Episode: 14, Steps: -74.11508240703837, recent_avg -161
Episode: 15, Steps: -108.07836088003924, recent_avg -144
Episode: 16, Steps: -132.0526782699666, recent_avg -142
Episode: 17, Steps: -126.91165708089086, recent_a

In [3]:
env_test = gym.make('LunarLander-v2')
test_state = env_test.reset()

In [5]:
env_test.__dict__

{'env': <gym.envs.box2d.lunar_lander.LunarLander at 0x26691bf8f70>,
 'action_space': Discrete(4),
 'observation_space': Box(-inf, inf, (8,), float32),
 'reward_range': (-inf, inf),
 'metadata': {'render.modes': ['human', 'rgb_array'],
  'video.frames_per_second': 50},
 '_max_episode_steps': 1000,
 '_elapsed_steps': 0}