## Tackling Pong (bonus part)
The approach here basically borrows its ideas from DQN.

In [16]:
import gym
import numpy as np
import random
import tensorflow as tf
import tensorflow.contrib.slim as slim
import matplotlib.pyplot as plt
import scipy.misc
import os
%matplotlib inline

In [17]:
# Create the Gym environment registered under the id 'Pong-v0'.
# Dependency setup instructions are at: https://github.com/openai/gym
env = gym.make('Pong-v0')

In [18]:
# Inspect the environment: show the action space, then the observation space,
# one per line.
print(env.action_space, env.observation_space, sep="\n")

# The environment reports Discrete(6) actions and Box(210, 160, 3) observations,
# so a flattened frame holds 210 * 160 * 3 values.
n_action = 6
observation_size = 210 * 160 * 3

def processState(states):
    """Flatten one observation into a 1-D array of ``observation_size`` values.

    No resizing or colour conversion is performed; the input must already
    contain exactly ``observation_size`` elements (a 210 x 160 x 3 frame).

    Args:
        states: array-like frame as returned by the environment.

    Returns:
        A 1-D view/copy of the frame with shape ``(observation_size,)``.
    """
    flat_shape = (observation_size,)
    return np.reshape(states, flat_shape)

Discrete(6)
Box(210, 160, 3)


In [19]:
# experience buffer
class experience_buffer():
    """Bounded FIFO store of transitions used for experience replay.

    Items are kept in insertion order in ``self.buffer`` (a plain list). Once
    the number of stored items would exceed ``buffer_size``, the oldest items
    are discarded first.
    """

    def __init__(self, buffer_size = 50000):
        """Create an empty buffer that holds at most ``buffer_size`` items."""
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self,experience):
        """Append every item of ``experience`` and evict the oldest overflow.

        Args:
            experience: iterable of items to store. Each item is expected to
                be a 5-field record, since ``sample`` reshapes to ``[size, 5]``.

        The capacity is enforced even when ``experience`` alone is longer than
        ``buffer_size``: only the most recent ``buffer_size`` items survive.
        """
        self.buffer.extend(experience)
        # Trim from the front (oldest entries) so the list never stays above
        # its capacity, regardless of how large the incoming batch was.
        overflow = len(self.buffer) - self.buffer_size
        if overflow > 0:
            del self.buffer[:overflow]

    def sample(self,size):
        """Draw ``size`` distinct stored items uniformly at random.

        Args:
            size: number of items to draw.

        Returns:
            A numpy array of shape ``[size, 5]`` (one row per sampled item).

        Raises:
            ValueError: from ``random.sample`` if ``size`` exceeds the number
                of stored items.
        """
        drawn = random.sample(self.buffer, size)
        return np.reshape(np.array(drawn), [size, 5])

In [20]:
def updateTargetGraph(tfVars,tau):
    """Build the assign ops for a soft update of the second half of ``tfVars``.

    The variable list is treated as two halves of equal length: element ``k``
    of the first half is paired with element ``k`` of the second half. Each
    second-half variable is assigned
    ``tau * first_half_value + (1 - tau) * its_own_value``.
    With an odd number of variables the last one is left unpaired.

    Args:
        tfVars: sequence of variables exposing ``value()`` and ``assign()``
            (presumably primary-network variables followed by target-network
            variables, in creation order -- verify against the caller).
        tau: blend factor applied to the first-half value.

    Returns:
        A list with one assign op per variable pair; nothing is executed here.
    """
    half = len(tfVars) // 2
    source_vars = tfVars[0:half]
    destination_vars = tfVars[half:half + half]
    return [
        destination.assign((source.value()*tau) + ((1-tau)*destination.value()))
        for source, destination in zip(source_vars, destination_vars)
    ]

def updateTarget(op_holder,sess):
    """Execute each op in ``op_holder`` on ``sess``, in order.

    Every op gets its own ``sess.run`` call; nothing is returned.

    Args:
        op_holder: iterable of ops, e.g. the list built by updateTargetGraph.
        sess: session object providing ``run(op)``.
    """
    for assign_op in op_holder:
        sess.run(assign_op)

In [21]:
class Qnetwork():
    """Dueling Q-network over raw, flattened 210 x 160 x 3 frames (TF1 graph mode).

    Constructing an instance adds placeholders, layers, loss and training op to
    the current default graph and exposes them as attributes:

    * ``scalarInput`` -- float32 placeholder, shape ``[None, observation_size]``.
    * ``Qout``        -- Q-values, one column per action (``n_action`` columns).
    * ``predict``     -- index of the highest Q-value per row.
    * ``targetQ``     -- float32 placeholder for the regression targets.
    * ``actions``     -- int32 placeholder for the actions that were taken.
    * ``loss``        -- mean squared error between ``targetQ`` and the Q-value
                         of the taken action.
    * ``updateModel`` -- Adam step (learning rate 1e-4) minimising ``loss``.
    """
    def __init__(self,h_size):
        """Build the graph.

        Args:
            h_size: number of output channels of the last convolution. It is
                split into two equal halves along the channel axis, so it
                should be even.
        """
        # The network receives a frame from the game, flattened into an array,
        # and reshapes it back to an image of shape [N, 210, 160, 3].
        # A 2x2 max-pool (stride 2) halves the resolution, then four
        # slim.conv2d layers (VALID padding, no bias terms) process it.
        self.scalarInput =  tf.placeholder(shape=[None, observation_size],dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput,shape=[-1,210, 160, 3])
        self.pool1 = tf.nn.max_pool(value = self.imageIn, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = 'VALID',)
        self.conv1 = slim.conv2d( \
            inputs=self.pool1,num_outputs=32,kernel_size=[8,8],stride=[4,4],padding='VALID', biases_initializer=None)
        self.conv2 = slim.conv2d( \
            inputs=self.conv1,num_outputs=64,kernel_size=[4,4],stride=[2,2],padding='VALID', biases_initializer=None)
        self.conv3 = slim.conv2d( \
            inputs=self.conv2,num_outputs=64,kernel_size=[3,3],stride=[1,1],padding='VALID', biases_initializer=None)
        self.conv4 = slim.conv2d( \
            inputs=self.conv3,num_outputs=h_size,kernel_size=[5,5],stride=[1,1],padding='VALID', biases_initializer=None)
        # conv4 has shape (N, 5, 2, h_size) for 210 x 160 input frames.
        # The 5 * 2 below is hard-coded from that shape and must be updated if
        # the input resolution or any kernel/stride above changes.
        self.final_conv = self.conv4
        self.final_conv_size = 5 * 2 * h_size

        ################################################################################
        # Dueling DQN                                                                  #
        # The output of the final convolutional layer (self.conv4) is split into       #
        # separate advantage and value streams.                                        #
        # Output: self.Advantage, self.Value                                           #
        # See Fig. 1 in [Dueling DQN](https://arxiv.org/pdf/1511.06581.pdf)            #
        ################################################################################
        # Split the channel axis in two: first half -> advantage, second half -> value.
        self.splitA, self.splitV = tf.split(self.conv4, 2, axis = 3)
        # Flatten each stream to [N, final_conv_size // 2].
        self.flattenA = tf.reshape(self.splitA, [-1, self.final_conv_size//2])
        self.flattenV = tf.reshape(self.splitV, [-1, self.final_conv_size//2])
        # Dense layers with Xavier-initialised kernels (no activation argument is
        # passed): one output per action for the advantage, a single output for
        # the state value.
        self.Advantage = tf.layers.dense(self.flattenA, n_action, 
                                         kernel_initializer=tf.contrib.layers.xavier_initializer())
        self.Value = tf.layers.dense(self.flattenV, 1, 
                                     kernel_initializer=tf.contrib.layers.xavier_initializer())
        ################################################################################
        #                          END OF DUELING STREAMS                              #
        ################################################################################

        # Combine the streams into the final Q-values: Q = V + (A - mean_a(A)).
        # See Equation (9) in [Dueling DQN](https://arxiv.org/pdf/1511.06581.pdf)
        self.Qout = self.Value + tf.subtract(self.Advantage,tf.reduce_mean(self.Advantage,axis=1,keep_dims=True))
        self.predict = tf.argmax(self.Qout,1)

        # Inputs for the loss: per-sample target Q-value and the action that was
        # taken, one-hot encoded over the n_action actions.
        self.targetQ = tf.placeholder(shape=[None],dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions, n_action, dtype=tf.float32)

        ################################################################################
        # Loss: squared difference between the target and the predicted Q-value of     #
        # the taken action, averaged over the batch (reduce_mean, not a sum).          #
        ################################################################################
        # Multiplying by the one-hot mask zeroes the Q-values of the actions that
        # were not chosen, so only the chosen action's Q-value remains after the
        # sum and only it receives a gradient.
        self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1)

        self.loss = tf.reduce_mean(tf.square(self.targetQ - self.Q))
        ################################################################################
        #                                 END OF LOSS                                  #
        ################################################################################

        self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
        self.updateModel = self.trainer.minimize(self.loss)
# Qnetwork(64)

In [22]:
# Hyperparameters, grouped by what they control.

# --- Learning -----------------------------------------------------------------
# Number of experiences used for each training step.
batch_size = 32
# A training step is performed every `update_freq` environment steps.
update_freq = 4
# Discount factor applied to the bootstrapped target Q-values.
y = 0.99
# Rate at which a target network would be moved toward the primary network.
tau = 0.001
# Channels of the final convolutional layer, before it is split into the
# Advantage and Value streams.
h_size = 256

# --- Exploration --------------------------------------------------------------
# Chance of a random action at the start / at the end of annealing.
startE = 1
endE = 0.1
# Number of training steps over which startE is reduced to endE.
annealing_steps = 10000.0
# Number of purely random steps taken before training begins.
pre_train_steps = 10000

# --- Run length ---------------------------------------------------------------
# Number of game episodes to train on.
num_episodes = 5000
# Maximum number of steps allowed in a single episode.
max_epLength = 1000

# --- Checkpointing ------------------------------------------------------------
# Whether to restore a previously saved model before training.
load_model = False
# Directory the model checkpoints are written to.
path = "./classic_dqn"

In [23]:
# Training script: builds the network, then plays `num_episodes` episodes with an
# epsilon-greedy policy, storing transitions in a replay buffer and training the
# network on random mini-batches every `update_freq` steps once
# `pre_train_steps` random steps have been taken.
tf.reset_default_graph()
# Only a single (primary) network is built. There is no separate target network,
# so the bootstrap targets below are produced by mainQN itself (plain DQN rather
# than Double-DQN); `tau`, updateTargetGraph and updateTarget are not used here.
mainQN = Qnetwork(h_size)

init = tf.global_variables_initializer()

saver = tf.train.Saver()

trainables = tf.trainable_variables()

# Replay buffer shared across episodes.
myBuffer = experience_buffer()

#Set the rate of random action decrease. 
e = startE
stepDrop = (startE - endE)/annealing_steps

#create lists to contain total rewards and steps per episode
jList = []
rList = []
total_steps = 0

#Make a path for our model to be saved in.
if not os.path.exists(path):
    os.makedirs(path)

with tf.Session() as sess:
    sess.run(init)
    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    for i in range(num_episodes):
        # Transitions of the current episode; merged into myBuffer when it ends.
        episodeBuffer = experience_buffer()
        #Reset environment, then fire (action 1). The frame returned by that
        #step is the first state: the emulator has already advanced past the
        #reset frame, so using the reset frame would pair a stale state with
        #the first action.
        env.reset()
        s, _, _, _ = env.step(1)

        s = processState(s)
        d = False
        rAll = 0
        j = 0
        #The Q-Network
        while j < max_epLength: #Cut the episode off after max_epLength steps.
            j+=1
            #Choose an action by greedily (with e chance of random action) from the Q-network.
            #During the first pre_train_steps steps every action is random.
            if np.random.rand(1) < e or total_steps < pre_train_steps:
                a = np.random.randint(0,n_action)
            else:
                a = sess.run(mainQN.predict,feed_dict={mainQN.scalarInput:[s]})[0]
            total_steps += 1

            # Take the action: new (flattened) state s1, reward r and done flag d.
            s1, r, d, _ = env.step(a)
            s1 = processState(s1)
            # experience_buffer.add() uses extend(), so the single transition is
            # wrapped in an extra leading dimension (shape [1, 5]).
            episodeBuffer.add( np.array([[s, a, r, s1, d]]) )

            if total_steps > pre_train_steps:
                # Anneal epsilon linearly down to endE.
                if e > endE:
                    e -= stepDrop

                # Train every update_freq steps, but only once the replay buffer
                # can supply a full batch (random.sample raises otherwise).
                if total_steps % (update_freq) == 0 and len(myBuffer.buffer) >= batch_size:
                    # Each row of the sampled batch is one transition with
                    # columns: 0 = s, 1 = a, 2 = r, 3 = s1, 4 = d.
                    random_experience = myBuffer.sample(batch_size)
                    # The state columns are 1-D object arrays of 1-D arrays;
                    # np.vstack turns them into [batch_size, observation_size].
                    states = np.vstack(random_experience[:,0])
                    actions = random_experience[:,1].astype(np.int32)
                    rewards = random_experience[:,2].astype(np.float32)
                    next_states = np.vstack(random_experience[:,3])
                    # 0.0 where the transition ended the episode, 1.0 otherwise.
                    not_done = 1.0 - random_experience[:,4].astype(np.float32)

                    # One forward pass over the next states yields both the
                    # greedy action and all Q-values.
                    actionQ, predictQ = sess.run([mainQN.predict, mainQN.Qout], feed_dict = {
                            mainQN.scalarInput: next_states
                        })

                    # Q-value of the greedy action in the next state.
                    doubleQ = predictQ[range(batch_size), actionQ]
                    # Bootstrapped target; terminal transitions must not
                    # bootstrap from s1, so their target is just the reward.
                    targetQ = rewards + y * doubleQ * not_done

                    # update the mainQN
                    _ = sess.run(mainQN.updateModel, feed_dict= {
                            mainQN.scalarInput: states,
                            mainQN.targetQ: targetQ,
                            mainQN.actions: actions
                        })

            rAll += r
            s = s1

            if d:
                break

        myBuffer.add(episodeBuffer.buffer)
        jList.append(j)
        rList.append(rAll)
        #Periodically save the model. 
        if i % 1000 == 0:
            saver.save(sess,path+'/model-'+str(i)+'.ckpt')
            print("Saved Model")
        if len(rList) % 10 == 0:
            print("Episode",i,"reward:",np.mean(rList[-10:]))
    saver.save(sess,path+'/model-'+str(i)+'.ckpt')
print("Mean reward per episode: " + str(sum(rList)/num_episodes))

Tensor("Conv_3/Relu:0", shape=(?, 5, 2, 64), dtype=float32)
Saved Model
Episode 9 reward: -14.5


KeyboardInterrupt: 