In [1]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np
import gym

class ExpReplay():
    """Bounded first-in/first-out experience-replay buffer.

    Transitions are kept in ``self.exp``, a dict of five parallel lists keyed
    by 'state', 'action', 'reward', 'next_state' and 'done'; index ``i`` of
    every list belongs to the same transition.

    Args:
        e_max: maximum number of experiences kept; the oldest one is evicted
            when a new one arrives at capacity.
        e_min: minimum number of experiences callers should wait for before
            they start training (only reported here, not enforced).
    """
    _FIELDS = ('state', 'action', 'reward', 'next_state', 'done')

    def __init__(self, e_max=15000, e_min=100):
        self._max = e_max # maximum number of experiences
        self._min = e_min # minimum number of experiences for training
        # total experiences the Agent stored, one list per field
        self.exp = {field: [] for field in self._FIELDS}

    def get_max(self):
        """return the maximum number of experiences"""
        return self._max

    def get_min(self):
        """return the minimum number of experiences"""
        return self._min

    def get_num(self):
        """return the current number of experiences"""
        return len(self.exp['state'])

    def get_batch(self, batch_size=64):
        """randomly choose a batch of distinct experiences for training

        Returns a tuple (state, action, reward, next_state, done) where state
        and next_state are numpy arrays and the other three are lists.

        Raises:
            ValueError: (from np.random.choice) if batch_size is larger than
                the number of stored experiences, because sampling is done
                without replacement.
        """
        idx = np.random.choice(self.get_num(), size=batch_size, replace=False)
        state = np.array([self.exp['state'][i] for i in idx])
        action = [self.exp['action'][i] for i in idx]
        reward = [self.exp['reward'][i] for i in idx]
        next_state = np.array([self.exp['next_state'][i] for i in idx])
        done = [self.exp['done'][i] for i in idx]
        return state, action, reward, next_state, done

    def add(self, state, action, reward, next_state, done):
        """add single experience, evicting the oldest one when full"""
        count = self.get_num()
        # `>=` (not `>`) so the buffer never grows past get_max() entries;
        # the `count > 0` guard keeps a non-positive e_max from deleting
        # out of an empty list.
        if count > 0 and count >= self.get_max():
            for field in self._FIELDS:
                del self.exp[field][0]
        self.exp['state'].append(state)
        self.exp['action'].append(action)
        self.exp['reward'].append(reward)
        self.exp['next_state'].append(next_state)
        self.exp['done'].append(done)
        
        
class TNET():
    """
    Target network: a feed-forward net with two tanh hidden layers and a
    linear, bias-free output layer, built under the 'tnet' variable scope.

    Attributes set while building:
        x      -- float32 input placeholder of shape (None, in_units)
        q      -- output tensor of shape (None, out_units)
        params -- trainable variables collected from the 'tnet' scope
    """
    def __init__(self, in_units, out_units, hidden_units=250):
        self.in_units, self.out_units = in_units, out_units
        self.hidden_units = hidden_units
        self._model()

    def _model(self):
        """Create the placeholder, the variables and the forward pass."""
        n_in = self.in_units
        n_hidden = self.hidden_units
        n_out = self.out_units

        with tf.variable_scope('tnet'):
            self.x = tf.placeholder(tf.float32, shape=(None, n_in))

            # Variables are created in the order W1, W2, W3, b1, b2; that is
            # also the order in which they appear in self.params.
            weight_specs = (
                ('W1', (n_in, n_hidden)),
                ('W2', (n_hidden, n_hidden)),
                ('W3', (n_hidden, n_out)),
            )
            weights = [
                tf.get_variable(name, shape=shape, initializer=tf.random_normal_initializer())
                for name, shape in weight_specs
            ]
            biases = [
                tf.get_variable(name, shape=(n_hidden), initializer=tf.zeros_initializer())
                for name in ('b1', 'b2')
            ]

            # Two tanh hidden layers followed by a linear read-out.
            hidden = self.x
            for weight, bias in zip(weights[:2], biases):
                hidden = tf.nn.tanh(tf.matmul(hidden, weight) + bias)
            self.q = tf.matmul(hidden, weights[2])

            self.params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='tnet')
            
            
class QNET():
    """Online Q-network trained with the Double DQN rule, plus its target net.

    The online network (variable scope 'qnet') picks the greedy next action;
    the target network (``self.tnet``) supplies the value of that action for
    the TD target.

    Args:
        in_units: size of one flattened state vector.
        out_units: number of discrete actions.
        exp: replay buffer providing get_num(), get_min() and get_batch().
        hidden_units: width of both hidden layers (online and target net).
        gamma: discount factor used in the TD target.
        learning_rate: step size handed to the Adam optimizer.

    A TensorFlow session must be supplied with set_session() before
    batch_train(), update() or greedy get_action() calls are made.
    """
    def __init__(self, in_units, out_units, exp, hidden_units=250,
                 gamma=0.95, learning_rate=3e-4):
        # Target Network -- built with the same hidden width as the online
        # network, otherwise the assign ops in _tnet_update() would be asked
        # to copy between variables of different shapes.
        self.tnet = TNET(in_units, out_units, hidden_units)

        # Q network architecture
        self.in_units = in_units
        self.out_units = out_units
        self.hidden_units = hidden_units

        # training hyper parameters
        self.gamma = gamma
        self.learning_rate = learning_rate

        # session is injected later through set_session()
        self.session = None

        self._model()
        self._batch_learning_model()
        self._tnet_update()

        # experience replay
        self.exp = exp

    def _model(self):
        """ Q-network architecture: two tanh hidden layers, linear bias-free output """
        with tf.variable_scope('qnet'):
            self.x = tf.placeholder(tf.float32, shape=(None, self.in_units))

            W1 = tf.get_variable('W1', shape=(self.in_units, self.hidden_units), initializer=tf.random_normal_initializer())
            W2 = tf.get_variable('W2', shape=(self.hidden_units, self.hidden_units), initializer=tf.random_normal_initializer())
            W3 = tf.get_variable('W3', shape=(self.hidden_units, self.out_units), initializer=tf.random_normal_initializer())

            b1 = tf.get_variable('b1', shape=(self.hidden_units), initializer=tf.zeros_initializer())
            b2 = tf.get_variable('b2', shape=(self.hidden_units), initializer=tf.zeros_initializer())

            h1 = tf.nn.tanh(tf.matmul(self.x, W1)+b1)
            h2 = tf.nn.tanh(tf.matmul(h1, W2)+b2)
            self.q = tf.matmul(h2, W3)

    def _batch_learning_model(self):
        """Build the loss and the optimizer op used for batch learning"""
        with tf.variable_scope('qnet'):
            # TD-target, one scalar per sample
            self.target = tf.placeholder(tf.float32, shape=(None, ))
            # (row, action) index pairs selecting the taken action per sample
            self.selected_idx = tf.placeholder(tf.int32, shape=(None, 2))
            # Q-value of the taken action
            self.selected_q = tf.gather_nd(self.q, self.selected_idx)

            self.params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='qnet')

            # Q-network optimization algorithm
            loss = tf.losses.mean_squared_error(self.target, self.selected_q)
            gradients = tf.gradients(loss, self.params)
            self.train_opt = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(zip(gradients, self.params))

    def _tnet_update(self):
        """ Build the ops that copy the Q-network parameters into the target network

        Raises:
            ValueError: if the two networks expose a different number of
                parameters (zip would otherwise silently drop the surplus).
        """
        if len(self.tnet.params) != len(self.params):
            raise ValueError(
                "target network has %d parameters but Q-network has %d"
                % (len(self.tnet.params), len(self.params)))
        with tf.variable_scope('qnet'):
            # Parameters are paired by position, so both networks must create
            # their variables in the same order.
            self.update_opt = [t.assign(q) for t, q in zip(self.tnet.params, self.params)]

    def batch_train(self, batch_size=64):
        """Run one Double DQN training step on a random batch of experiences

        Does nothing (returns None) while the replay buffer holds fewer
        experiences than its configured minimum.
        """
        if self.exp.get_num() < self.exp.get_min():
            #The number of experiences is not enough for batch training
            return

        # get a batch of experiences
        state, action, reward, next_state, done = self.exp.get_batch(batch_size)
        state = state.reshape(batch_size, self.in_units)
        next_state = next_state.reshape(batch_size, self.in_units)

        # evaluate both networks on the next states in a single session call
        qnet_q_values, tnet_q_values = self.session.run(
            [self.q, self.tnet.q],
            feed_dict={self.x: next_state, self.tnet.x: next_state})

        # the Q-network chooses the action, the target network scores it
        rows = np.arange(batch_size)
        qnet_actions = np.argmax(qnet_q_values, axis=1)
        tnet_q = tnet_q_values[rows, qnet_actions]

        # TD-target: r for terminal transitions, r + gamma * Q_target otherwise
        reward = np.asarray(reward, dtype=np.float32)
        done = np.asarray(done, dtype=bool)
        qnet_update_q = np.where(done, reward, reward + self.gamma * tnet_q)

        # optimization
        indices = np.column_stack((rows, action)).astype(np.int32)
        feed_dict = {self.x: state, self.target: qnet_update_q, self.selected_idx: indices}
        self.session.run(self.train_opt, feed_dict)

    def update(self):
        """ copy the current Q-network parameters into the target network"""
        self.session.run(self.update_opt)

    def set_session(self, sess):
        """ store the TensorFlow session used by every run call"""
        self.session = sess

    def get_action(self, state, e_rate):
        """ epsilon-greedy action for the training stage of the Agent

        With probability e_rate a uniformly random action index is returned
        (exploration); otherwise the argmax of the Q-values for `state`
        (exploitation).
        """
        if np.random.random() < e_rate:
            return np.random.choice(self.out_units)
        return np.argmax(self.session.run(self.q, feed_dict={self.x: state}))
        
        
class Agent():
    """Trains a Double DQN (QNET) on a gym environment with discrete actions.

    The code unpacks ``env.step`` into four values and reshapes the result
    of ``env.reset`` directly, i.e. it is written against the classic gym
    API (reset -> observation, step -> (obs, reward, done, info)).
    """
    def __init__(self, env):
        # set hyper parameters
        self.max_episodes = 10000
        self.max_actions = 10000
        self.exploration_rate = 1.0
        self.exploration_decay = 0.0001
        # number of environment steps between two target-network syncs
        self.target_update_steps = 25

        # set environment
        self.env = env
        self.states = env.observation_space.shape[0]
        self.actions = env.action_space.n

        # Experience Replay for batch learning
        self.exp = ExpReplay()
        # the number of experience per batch for batch learning
        self.batch_size = 64

        # Deep Q Network
        self.qnet = QNET(self.states, self.actions, self.exp)
        # For execute Deep Q Network
        session = tf.InteractiveSession()
        session.run(tf.global_variables_initializer())
        self.session = session
        self.qnet.set_session(session)
        # Start with the target network equal to the Q-network; without this
        # the first TD targets come from an unrelated random network.
        self.qnet.update()

    def train(self):
        """Run the training loop and print the mean episode reward periodically.

        Every transition is stored in the replay buffer and followed by one
        batch-training call. Terminal transitions are stored with an extra
        penalty of 100 subtracted from the reward. The target network is
        synchronised every `target_update_steps` environment steps, counted
        across episodes so that short episodes cannot skip the sync.
        """
        # set hyper parameters
        max_episodes = self.max_episodes
        max_actions = self.max_actions
        exploration_rate = self.exploration_rate
        exploration_decay = self.exploration_decay
        batch_size = self.batch_size

        # start training
        record_rewards = []
        global_step = 0
        for i in range(max_episodes):
            total_rewards = 0
            state = self.env.reset()
            state = state.reshape(1, self.states)
            for _ in range(max_actions):
                #self.env.render() # Uncomment this line to render the environment
                action = self.qnet.get_action(state, exploration_rate)
                next_state, reward, done, info = self.env.step(action)
                next_state = next_state.reshape(1, self.states)
                total_rewards += reward
                global_step += 1

                # penalise the transition that ended the episode
                stored_reward = (reward-100) if done else reward
                self.exp.add(state, action, stored_reward, next_state, done)
                self.qnet.batch_train(batch_size)

                # update target network
                if global_step % self.target_update_steps == 0:
                    self.qnet.update()

                if done:
                    break
                # next step of the same episode
                state = next_state

            record_rewards.append(total_rewards)
            # decay the exploration rate towards its floor of 0.001
            exploration_rate = 0.001 + (exploration_rate-0.001)*np.exp(-exploration_decay*(i+1))
            if i%100==0 and i>0:
                average_rewards = np.mean(record_rewards)
                record_rewards = []
                print("episodes: %i to %i, average_reward: %.3f, exploration: %.3f" %(i-100, i, average_rewards, exploration_rate))

if __name__ == '__main__':
    # Train a Double DQN agent on CartPole.
    env = gym.make('CartPole-v1')
    try:
        agent = Agent(env)
        agent.train()
    finally:
        # Release the environment even when training is interrupted
        # (e.g. by KeyboardInterrupt) or fails.
        env.close()
    

episodes: 0 to 100, average_reward: 31.109, exploration: 0.598
episodes: 100 to 200, average_reward: 109.020, exploration: 0.132


KeyboardInterrupt: 