In [68]:
import gym
import tensorflow as tf 
import numpy as np 
import random
from collections import deque

In [69]:
# --- DQN hyper-parameters ---------------------------------------------------

# Discount factor applied to the bootstrapped next-state Q-value.
GAMMA = 0.9

# Exploration rate the agent starts with.
INITIAL_EPSILON = 0.5

# Intended final exploration rate (epsilon annealing is currently disabled
# in DQN.edeepy_action, so this value is not used yet).
FINAL_EPSILON = 0.01

# Maximum number of transitions kept in the experience replay buffer.
REPLAY_SIZE = 10000

# Number of transitions sampled for each training step.
BATCH_SIZE = 32

# Number of units in the Q-network's single hidden layer.
H = 20

In [70]:
class DQN():
    """Deep Q-Network agent with one hidden layer and experience replay.

    The network maps a state vector to one Q-value per discrete action
    (``state_dim -> H relu units -> action_dim``). Every call to
    :meth:`perceive` stores a transition and, once the buffer holds more than
    ``BATCH_SIZE`` transitions, performs one minibatch training step.

    Uses the module-level hyper-parameters ``GAMMA``, ``INITIAL_EPSILON``,
    ``REPLAY_SIZE``, ``BATCH_SIZE`` and ``H``.
    """

    def __init__(self,env):
        """Build the TensorFlow graph and open a session for it.

        Args:
            env: environment object exposing ``action_space.n`` and
                ``observation_space.shape`` (the first dimension is taken as
                the state size).
        """
        # A bounded deque discards the oldest transition automatically once
        # REPLAY_SIZE entries are stored.
        self.replay_buffer = deque(maxlen=REPLAY_SIZE)

        self.epsilon = INITIAL_EPSILON
        self.action_dim = env.action_space.n
        self.state_dim = env.observation_space.shape[0]
        self.time_step = 0  # number of training steps performed so far

        self.creat_Q_network()
        self.creat_training_method()

        self.session = tf.InteractiveSession()
        self.session.run(tf.global_variables_initializer())

    def creat_Q_network(self):
        """Create the state placeholder and the Q-value output tensor."""
        w1 = self.weight_variable([self.state_dim, H])
        b1 = self.bias_variable([H])

        w2 = self.weight_variable([H, self.action_dim])
        b2 = self.bias_variable([self.action_dim])

        self.state_input = tf.placeholder('float', [None, self.state_dim])
        hidden = tf.nn.relu(tf.matmul(self.state_input, w1) + b1)
        # Linear output layer: one Q-value per action.
        self.Q_value = tf.matmul(hidden, w2) + b2

    def creat_training_method(self):
        """Create the loss (mean squared TD error) and the Adam training op."""
        self.action_input = tf.placeholder("float", [None, self.action_dim])
        self.input_y = tf.placeholder("float", [None])
        # The one-hot action mask selects the Q-value of the action that was
        # actually taken in each transition.
        Q_action = tf.reduce_sum(
            tf.multiply(self.Q_value, self.action_input), axis=1)
        self.cost = tf.reduce_mean(tf.square(self.input_y - Q_action))
        self.optimizer = tf.train.AdamOptimizer(0.001).minimize(self.cost)

    def perceive(self,state,action,reward,next_state,done):
        """Store one transition and train once enough data is available.

        Args:
            state: observation before the action.
            action: integer index of the action that was taken.
            reward: reward received for the action.
            next_state: observation after the action.
            done: truthy if the episode ended with this transition.
        """
        one_hot_action = np.zeros(self.action_dim)
        one_hot_action[action] = 1
        self.replay_buffer.append(
            (state, one_hot_action, reward, next_state, done))

        if len(self.replay_buffer) > BATCH_SIZE:
            self.train_Q_network()

    def train_Q_network(self):
        """Run one optimizer step on a random minibatch from the replay buffer."""
        self.time_step += 1
        minibatch = random.sample(self.replay_buffer, BATCH_SIZE)
        (state_batch, action_batch, reward_batch,
         next_state_batch, done_batch) = (
            list(column) for column in zip(*minibatch))

        # Compute the regression targets y: the plain reward for terminal
        # transitions, otherwise reward + GAMMA * max_a Q(next_state, a).
        next_q_values = self._q_values(next_state_batch)
        targets = [
            reward if done else reward + GAMMA * np.max(next_q)
            for reward, done, next_q
            in zip(reward_batch, done_batch, next_q_values)
        ]

        self.session.run(self.optimizer,
                         feed_dict={self.state_input: state_batch,
                                    self.action_input: action_batch,
                                    self.input_y: targets})

    def edeepy_action(self,state):
        """Choose an action epsilon-greedily for ``state``.

        With probability ``self.epsilon`` a uniformly random action index is
        returned; otherwise the greedy action from :meth:`action`.
        NOTE: epsilon is held constant at its initial value; no annealing
        toward FINAL_EPSILON is applied.
        """
        if random.random() < self.epsilon:
            # Exploring: no need to evaluate the network at all.
            return random.randrange(self.action_dim)
        return self.action(state)

    def action(self,state):
        """Return the index of the action with the highest Q-value for ``state``."""
        return int(np.argmax(self._q_values([state])[0]))

    def _q_values(self, states):
        """Evaluate the Q-network on a batch of states using this agent's session.

        Running through ``self.session`` explicitly (instead of relying on
        TensorFlow's default session) keeps the agent bound to the session in
        which its variables were initialised.
        """
        return self.session.run(self.Q_value,
                                feed_dict={self.state_input: states})

    def weight_variable(self,shape):
        """Create a weight variable initialised from ``tf.truncated_normal(shape)``."""
        return tf.Variable(tf.truncated_normal(shape))

    def bias_variable(self,shape):
        """Create a bias variable filled with the constant 0.01."""
        return tf.Variable(tf.constant(0.01, shape=shape))

In [71]:
# --- Run configuration ------------------------------------------------------

# Identifier of the gym environment to train on.
ENV_NAME = 'CartPole-v0'

# Upper bound on the number of training episodes.
EPISODE = 10000

# Upper bound on the number of steps within a single episode.
STEP = 201

# Number of evaluation episodes averaged at each periodic evaluation.
TEST_SIZE = 100

In [72]:
def _run_training_episode(env, agent, max_steps):
    """Play one exploratory episode, feeding every transition to the agent."""
    state = env.reset()
    for _ in range(max_steps):
        action = agent.edeepy_action(state)
        next_state, reward, done, _info = env.step(action)
        agent.perceive(state, action, reward, next_state, done)
        state = next_state
        if done:
            break


def _evaluate_policy(env, agent, episodes, max_steps):
    """Return the mean total reward of the greedy policy over ``episodes`` runs."""
    reward_sum = 0
    for _ in range(episodes):
        state = env.reset()
        for _ in range(max_steps):
            action = agent.action(state)
            state, reward, done, _info = env.step(action)
            reward_sum += reward
            if done:
                break
    return reward_sum / episodes


def main():
    """Train a DQN agent on ENV_NAME, evaluating it periodically.

    Runs up to EPISODE training episodes of at most STEP steps each. Every
    100th episode (except episode 0) the greedy policy is evaluated over
    TEST_SIZE episodes and the average reward is printed; training stops
    early once that average exceeds 195. The environment is always closed
    on exit, including when an error occurs.
    """
    eval_interval = 100      # evaluate after every this many episodes
    solved_threshold = 195   # average reward above which training stops

    env = gym.make(ENV_NAME)
    try:
        agent = DQN(env)

        for episode in range(EPISODE):
            _run_training_episode(env, agent, STEP)

            if episode == 0 or episode % eval_interval != 0:
                continue

            average_reward = _evaluate_policy(env, agent, TEST_SIZE, STEP)
            print('Average reward for episode %d : %f' %
                  (episode, average_reward))

            if average_reward > solved_threshold:
                print("Task solved in",episode,'episodes!')
                break
    finally:
        # Release the environment's resources even if training fails.
        env.close()


In [73]:
# Entry point: start training only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == '__main__':
    main()

[2017-06-08 15:21:46,509] Making new env: CartPole-v0


Average reward for episode 100 : 26.690000
Average reward for episode 200 : 128.770000
Average reward for episode 300 : 200.000000
Task solved in 300 episodes!
