In [1]:
import tensorflow as tf
import sys
import time
import os
from collections import deque
import numpy as np
import matplotlib.pyplot as plt
import gym
import random
import tensorboard
from tensorflow.keras import backend as K
from Sum_Tree import SumTree
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all
# up front.  The loop is a no-op on CPU-only hosts (the list is empty), and it
# configures every visible GPU because TensorFlow requires the memory-growth
# setting to be identical across GPUs.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)


class Agent():
    """Double-DQN agent with a dueling Q-network for a discrete-action Gym env.

    The agent owns the environment, an online network, a target network and a
    replay buffer.  ``play_one_step`` collects one transition,
    ``training_step`` fits the online network on a sampled mini-batch and
    ``update_target_network`` synchronises the target network.
    """

    def __init__(self, env_name):
        """Create the environment, both networks and the replay buffer.

        Args:
            env_name: Environment id handed to ``gym.make``.
        """
        self.env = gym.make(env_name)
        #self.env.seed(0)
        self.env_state = self.env.reset()

        # Optimisation hyper-parameters.
        # NOTE(review): 0.1 is unusually large for Adam (the Keras default is
        # 0.001) -- confirm this value is intended.
        self.lr = 0.1
        self.gamma = .99
        self.epsilon_decay_rate = .8
        self.initializer = tf.keras.initializers.HeUniform()
        self.loss_parameter = 1.0
        self.loss_fn = tf.keras.losses.Huber(delta=self.loss_parameter)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)

        # Problem dimensions taken from the environment spaces.
        self.n_actions = self.env.action_space.n
        self.n_states = self.env.observation_space.shape[0]

        # Exploration / target-update settings.
        self.tau = 0.10
        self.batch_size = 32
        self.epsilon = .5
        self.min_epsilon = .05
        self.dueling_method = 'max'   # 'average' or anything else for max
        self.target_update_method = 'hard'   # 'soft' or anything else for hard

        # Per-episode bookkeeping.
        self.episodic_rewards = 0
        self.episode_counter = 0
        self.episode_time_step_counter = 0

        self.online = self.make_nn()
        self.target = self.make_target()
        self.capacity = 5000
        self.buffer = ReplayBuffer(self.capacity)

    def reset_env(self):
        """Start a new episode: bump the episode counter, clear the per-episode
        counters and reset the environment."""
        self.episode_counter += 1
        self.episodic_rewards = 0
        self.episode_time_step_counter = 0
        self.env_state = self.env.reset()

    def make_nn(self):
        """Build and compile the dueling Q-network.

        A shared hidden layer feeds a value stream (one output) and an
        advantage stream (``n_actions`` outputs).  They are combined as
        ``Q = V + (A - reduce(A))`` where ``reduce`` is the mean over actions
        when ``dueling_method == 'average'`` and the max over actions
        otherwise.

        Returns:
            The compiled ``tf.keras.Model`` mapping states to Q-values.
        """
        dueling_input = tf.keras.layers.Input(
            shape=(self.n_states,), name='Dueling_Input')

        shared_hidden = tf.keras.layers.Dense(
            64, activation='swish', kernel_initializer=self.initializer,
            name='layer_2_Dense')(dueling_input)

        advantage_hidden = tf.keras.layers.Dense(
            32, activation='swish', kernel_initializer=self.initializer,
            name='Advantage_Layer')(shared_hidden)
        value_hidden = tf.keras.layers.Dense(
            32, activation='swish', kernel_initializer=self.initializer,
            name='Value_Layer')(shared_hidden)

        advantage = tf.keras.layers.Dense(
            self.n_actions, activation='linear',
            kernel_initializer=self.initializer,
            name='Advantage_Layer2')(advantage_hidden)
        value = tf.keras.layers.Dense(
            1, activation='linear', kernel_initializer=self.initializer,
            name='Value_Layer2')(value_hidden)

        # The reduction must run over the action axis only (axis=1).  Without
        # an explicit axis the backend reduces over the batch as well, which
        # makes one sample's Q-values depend on the rest of the batch.
        # The lambdas reference only the module-level ``K`` (no closures).
        if self.dueling_method == 'average':
            centered_advantage = tf.keras.layers.Lambda(
                lambda a: a - K.mean(a, axis=1, keepdims=True),
                output_shape=(self.n_actions,))(advantage)
        else:
            centered_advantage = tf.keras.layers.Lambda(
                lambda a: a - K.max(a, axis=1, keepdims=True),
                output_shape=(self.n_actions,))(advantage)

        # Shape (batch, 1); the Add layer broadcasts it across the actions.
        value_expanded = tf.keras.layers.Lambda(
            lambda s: K.expand_dims(s[:, 0], -1),
            output_shape=(1,))(value)

        online_output = tf.keras.layers.Add()(
            [value_expanded, centered_advantage])

        online = tf.keras.Model(dueling_input, online_output)
        # NOTE: 'accuracy' carries no meaning for Q-value regression; it is
        # kept only so the compiled model's metric outputs stay unchanged.
        online.compile(loss=self.loss_fn, optimizer=self.optimizer,
                       metrics=['accuracy'])
        return online

    def make_target(self):
        """Return a structural clone of the online network carrying a copy of
        its current weights."""
        target = tf.keras.models.clone_model(self.online)
        target.set_weights(self.online.get_weights())
        return target

    @staticmethod
    def _q_values(model, states):
        """Run ``model`` on one batch of states and return a writable float
        NumPy array of Q-values.

        The model is called directly instead of through ``Model.predict``:
        every batch used here fits in a single forward pass, and ``predict``
        adds per-call overhead that dominates when it runs every env step.
        """
        batch = np.asarray(states, dtype=np.float32)
        # np.array(...) copies, so callers may safely edit the result in place.
        return np.array(model(batch, training=False))

    def epsilon_greedy_policy(self):
        """Pick an action for ``self.env_state``.

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the arg-max of the online network's Q-values.
        """
        if np.random.random() <= self.epsilon:
            return np.random.randint(low=0, high=self.n_actions)
        state_batch = np.reshape(self.env_state, (1, self.n_states))
        q_values = self._q_values(self.online, state_batch)
        return np.argmax(q_values[0])

    def apply_epsilon_decay(self):
        """Multiply ``epsilon`` by the decay rate, never going below
        ``min_epsilon``."""
        self.epsilon = max(self.epsilon * self.epsilon_decay_rate,
                           self.min_epsilon)

    def play_one_step(self):
        """Act once in the environment and record the transition.

        The transition is appended to the replay buffer, the current state is
        advanced and the per-episode reward / step counters are updated.

        Returns:
            Tuple ``(state, action, reward, next_state, done)``.
        """
        state = np.array(self.env_state)
        action = self.epsilon_greedy_policy()
        # Expects the four-value ``env.step`` return signature.
        next_state, reward, done, _ = self.env.step(action)
        self.buffer.add_exp(state=state, action=action, reward=reward,
                            next_state=next_state, done=done)
        self.env_state = next_state
        self.episodic_rewards += reward
        self.episode_time_step_counter += 1
        return state, action, reward, next_state, done

    def training_step(self):
        """Fit the online network on one mini-batch using Double-DQN targets.

        The online network chooses the best next action and the target
        network evaluates it.  Only the Q-value of the action actually taken
        is replaced by the bootstrapped target; the other outputs keep the
        network's own predictions so they contribute no loss.  Does nothing
        while the buffer is empty.
        """
        # Use whatever is available until the buffer holds a full batch.
        batch_size_instance = min(len(self.buffer), self.batch_size)
        if batch_size_instance == 0:
            return

        states, actions, rewards, next_states, dones = self.buffer.exp_sample(
            batch_size=batch_size_instance)
        rows = np.arange(batch_size_instance)
        actions = np.asarray(actions, dtype=np.int64)

        q_targets = self._q_values(self.online, states)
        online_next_q = self._q_values(self.online, next_states)
        target_next_q = self._q_values(self.target, next_states)

        # Double DQN: select with the online net, evaluate with the target net.
        best_next_actions = np.argmax(online_next_q, axis=1)
        next_values = target_next_q[rows, best_next_actions]

        # The buffer returns rewards and dones with shape (batch, 1).
        not_done = 1.0 - np.asarray(dones, dtype=np.float32)[:, 0]
        q_update = (np.asarray(rewards, dtype=np.float32)[:, 0]
                    + not_done * self.gamma * next_values)

        q_targets[rows, actions] = q_update
        self.online.train_on_batch(states, q_targets)

    def update_target_network(self):
        """Synchronise the target network with the online network.

        ``'soft'`` blends the weights as ``(1 - tau) * target + tau * online``;
        any other ``target_update_method`` copies the online weights verbatim.
        """
        online_weights = self.online.get_weights()
        if self.target_update_method == 'soft':
            blended_weights = [
                (1 - self.tau) * target_w + self.tau * online_w
                for target_w, online_w in zip(self.target.get_weights(),
                                              online_weights)
            ]
            self.target.set_weights(blended_weights)
        else:
            self.target.set_weights(online_weights)

In [2]:
class ReplayBuffer():
    """FIFO store of ``(state, action, reward, next_state, done)`` transitions.

    Backed by a ``deque`` created with ``maxlen=max_len``, so once the buffer
    is full each new transition evicts the oldest one.  ``max_len=None``
    gives an unbounded buffer.
    """

    def __init__(self, max_len):
        """Create an empty buffer holding at most ``max_len`` transitions."""
        self.max_len = max_len
        self.buffer = deque(maxlen=self.max_len)

    def add_exp(self, state, action, reward, next_state, done):
        """Append one transition; the deque drops the oldest entry when full."""
        # No manual capacity check: deque(maxlen=...) already enforces the
        # bound, and comparing against max_len would fail when it is None.
        self.buffer.append((state, action, reward, next_state, done))

    def __len__(self):
        """Return the number of stored transitions."""
        return len(self.buffer)

    def exp_sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of NumPy
            arrays; ``rewards`` and ``dones`` carry a trailing axis of size 1.

        Raises:
            ValueError: If the buffer is empty.
        """
        if not self.buffer:
            raise ValueError("cannot sample from an empty replay buffer")
        indices = np.random.randint(len(self.buffer), size=batch_size)
        batch = [self.buffer[index] for index in indices]
        # Transpose the list of transitions into one array per field.
        states, actions, rewards, next_states, dones = [
            np.array([experience[entry] for experience in batch])
            for entry in range(5)
        ]
        return (states, actions, rewards[:, np.newaxis],
                next_states, dones[:, np.newaxis])

In [3]:
# Run configuration and bookkeeping containers for the training loop.
max_episodes = 150                    # upper bound on training episodes
rewards_buffer = list()               # score of every finished episode
mean_score_buffer = deque(maxlen=5)   # sliding window of recent scores
my_agent = Agent('CartPole-v0')

In [None]:
# Main training loop: play one episode at a time, training after every step.
for i in range(0,max_episodes):
    my_agent.reset_env()
    done = False
    rewards_episodic = 0
    while not done:
        state, action, reward, next_state, done = my_agent.play_one_step()

        # An episode that ends before the step limit ended in failure, so
        # penalise it.  The limit check must use the in-episode step counter
        # (already incremented by play_one_step), not the episode index.
        hit_time_limit = (my_agent.episode_time_step_counter
                          >= my_agent.env._max_episode_steps)
        if done and not hit_time_limit:
            reward = -100
            # play_one_step has already stored this transition with the raw
            # reward; overwrite that newest entry so the penalty is actually
            # seen by training.
            my_agent.buffer.buffer[-1] = (state, action, reward, next_state, done)

        # The episode score counts steps survived, not the shaped reward.
        rewards_episodic += 1
        my_agent.training_step()

    my_agent.apply_epsilon_decay()
    my_agent.update_target_network()

    rewards_buffer.append(round(rewards_episodic,2))
    mean_score_buffer.append(rewards_episodic)
    recent_mean = round(np.mean(mean_score_buffer))

    # The label reports the real window length instead of a hard-coded 10.
    print("Episode: {}, Episode_Reward: {}, {}_Episode_Reward_Avg {}".format(
        i, round(rewards_episodic,2), mean_score_buffer.maxlen, recent_mean))

    if recent_mean >= 195.0:
        print('DQN solved problem terminating...')
        break
        
    

 Episode: 0, Episode_Reward: 41, 10_Episode_Reward_Avg 41
 Episode: 1, Episode_Reward: 23, 10_Episode_Reward_Avg 32
 Episode: 2, Episode_Reward: 14, 10_Episode_Reward_Avg 26
 Episode: 3, Episode_Reward: 24, 10_Episode_Reward_Avg 26
 Episode: 4, Episode_Reward: 15, 10_Episode_Reward_Avg 23
 Episode: 5, Episode_Reward: 14, 10_Episode_Reward_Avg 18
 Episode: 6, Episode_Reward: 10, 10_Episode_Reward_Avg 15
 Episode: 7, Episode_Reward: 11, 10_Episode_Reward_Avg 15
 Episode: 8, Episode_Reward: 8, 10_Episode_Reward_Avg 12
 Episode: 9, Episode_Reward: 10, 10_Episode_Reward_Avg 11
 Episode: 10, Episode_Reward: 19, 10_Episode_Reward_Avg 12
 Episode: 11, Episode_Reward: 10, 10_Episode_Reward_Avg 12
 Episode: 12, Episode_Reward: 12, 10_Episode_Reward_Avg 12
 Episode: 13, Episode_Reward: 16, 10_Episode_Reward_Avg 13
 Episode: 14, Episode_Reward: 11, 10_Episode_Reward_Avg 14
 Episode: 15, Episode_Reward: 9, 10_Episode_Reward_Avg 12
 Episode: 16, Episode_Reward: 23, 10_Episode_Reward_Avg 14
 Episode: