## Grid World Environment setting
* states, actions, transition probability

In [1]:
# ---- state space -----------------------------------------------------------
import numpy as np

nCols = 3   # size of the grid along the first axis of `map`
nRows = 4   # size of the grid along the second axis of `map`
nWalls = 1
# one integer id per free (non-wall) cell
states = list(range(nCols * nRows - nWalls))
N_STATES = len(states)

win_state = [3]
lose_state = [6]
terminal_state = win_state + lose_state
# an episode may start in any state that is not terminal
start_state = [state_id for state_id in states if state_id not in terminal_state]

# ---- map -------------------------------------------------------------------
# -1 marks a blocked cell, 0 a free one. The grid is surrounded by a
# one-cell border of -1 so that stepping off the grid looks like hitting a wall.
map = np.full((nCols + 2, nRows + 2), -1.0)
map[1:nCols + 1, 1:nRows + 1] = 0
map[2, 2] = -1  # interior wall

# ---- actions ---------------------------------------------------------------
actions = [0, 1, 2, 3]
N_ACTIONS = len(actions)

# ---- state id -> map coordinate --------------------------------------------
# Free cells are enumerated in row-major order, so locations[k] is the
# coordinate of states[k].
locations = [
    (row, col)
    for row in range(1, nCols + 1)
    for col in range(1, nRows + 1)
    if map[row, col] == 0
]
index = len(locations)

# ---- action id -> coordinate offset ----------------------------------------
# move[k] is the (d_axis0, d_axis1) displacement applied by actions[k].
move = [(0, -1), (-1, 0), (0, 1), (1, 0)]

# set transition probability
# P[s, a, s'] is the probability of ending in state s' after choosing action
# a in state s. The chosen direction is followed with probability 0.8; with
# probability 0.1 each, the agent instead moves along the previous or the
# next entry of `move` (cyclically). A move into a blocked cell (map == -1)
# leaves the agent where it is.
P = np.zeros((N_STATES,N_ACTIONS,N_STATES)) # P[S,A,S']
for s in range(N_STATES):
    current_location = locations[s]
    for a in range(N_ACTIONS):
        # (direction actually taken, probability), in the order:
        # heading correctly, left error, right error
        outcomes = (
            (a, 0.8),
            ((a - 1) % N_ACTIONS, 0.1),
            ((a + 1) % N_ACTIONS, 0.1),
        )
        for taken_action, probability in outcomes:
            d_row, d_col = move[taken_action]
            next_location = (current_location[0] + d_row, current_location[1] + d_col)
            if map[next_location[0], next_location[1]] == -1: # there is barrier or wall
                next_location = current_location
            next_s = states[locations.index(next_location)]
            # accumulate: several outcomes may lead to the same next state
            P[s, a, next_s] += probability
        
# Reward table: R[s, a] is the reward for taking action a in state s.
# Every entry starts at the per-step cost; change the literal `True` to
# `False` to use the harsher -0.5 step cost instead of -0.02.
step_cost = -0.02 if True else -0.5
R = np.full((N_STATES, N_ACTIONS), step_cost)
R[3] = 1    # state 3: +1 for every action
R[6] = -1   # state 6: -1 for every action

# discount factor
gamma = 0.99

# Policies: policy[s, a] is the probability of choosing action a in state s.
# The policies are assumed to be known; they are evaluated, not learned.
def _deterministic_policy(chosen_actions):
    """Return an (N_STATES, N_ACTIONS) table with probability 1 on chosen_actions[s] in row s."""
    table = np.zeros((N_STATES, N_ACTIONS))
    for state_id, action_id in enumerate(chosen_actions):
        table[state_id, action_id] = 1
    return table


# one chosen action per state id 0..10
bad_policy = _deterministic_policy([2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1])

# uniform over the four actions in every state
random_policy = np.full((N_STATES, N_ACTIONS), 0.25)

optimal_policy = _deterministic_policy([2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0])

# Same action choices as optimal_policy, but every action gets an extra
# ep/4 of probability mass before each row is renormalised to sum to 1.
ep = 0.1
optimalWithNoise_policy = _deterministic_policy([2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0])
optimalWithNoise_policy = optimalWithNoise_policy + (ep / 4) * np.ones((N_STATES, N_ACTIONS))
row_totals = optimalWithNoise_policy.sum(axis=1, keepdims=True)
optimalWithNoise_policy = optimalWithNoise_policy / row_totals

In [2]:
import tensorflow as tf

## TD policy evaluation using function approximation - online
* Function Approximation(linear combination)
$$V(s) = X(s)^{T}w$$
* loss function
$$J(w) = E_{\pi}[(R_{t+1}+\gamma X(s')^{T}w^{-} \:-\:X(s)^{T}w)^{2}]$$
* gradient descent
$$\nabla_{w}J(w) = -2\:*\:E_{\pi}[(R_{t+1}+\gamma X(s')^{T}w^{-}\:-\:X(s)^{T}w)\:*\:X(s)]$$
* stochastic gradient descent(batch size 1)
$$\nabla_{w}J(w) = -2\:*(R_{t+1}+\gamma X(s')^{T}w^{-}\:-\:X(s)^{T}w)\:*\:X(s)$$
* update parameter vector w (the constant factor 2 is absorbed into the learning rate $\alpha$)
$$\Delta w = \alpha\:*\:(R_{t+1}+\gamma X(s')^{T}w^{-}\:-\:X(s)^{T}w)*X(s) $$

In [None]:
# Clear the default graph before the graph for this cell is built.
tf.reset_default_graph()
import time
start = time.time()  # wall-clock reference for the timing report at the end
print("Tensorflow version : ", tf.__version__, sep="\n")
print()

## set HyperParameters
epoch = 10000                     # number of episodes to sample
lr_rate = 0.01                    # gradient-descent step size
copy_period = 1                   # gradient updates between target-weight syncs
policy = optimalWithNoise_policy  # Evaluation -> follow given policy

# Filled with the learned Q(s, a) estimates once training has finished.
Q_final = np.empty((N_STATES, N_ACTIONS))


# s a r s a
# Placeholders for a batch of transitions (state, action, reward,
# next_state, next_action) plus a flag telling whether next_state ended
# the episode. Each one is a rank-1 tensor of length batch.
state_tf = tf.placeholder(tf.int32, shape=[None], name='state')
action_tf = tf.placeholder(tf.int32, shape=[None], name='action')
reward_tf = tf.placeholder(tf.float32, shape=[None], name='reward')
next_state_tf = tf.placeholder(tf.int32, shape=[None], name='next_state')
next_action_tf = tf.placeholder(tf.int32, shape=[None], name='next_action')
done_holder = tf.placeholder(tf.bool, shape=[None], name='done')

# Trainable weights of the linear approximator: one weight per state plus
# one weight per action.
with tf.variable_scope('main_net'):
    W = tf.get_variable(name='W',
                        shape=[N_STATES + N_ACTIONS, 1],
                        dtype=tf.float32,
                        initializer=tf.random_uniform_initializer(-1.0, 1.0))

# Copy of the weights used only to compute the TD target.
with tf.variable_scope('target_net'):
    W_target = tf.get_variable(name='W_target',
                               shape=[N_STATES + N_ACTIONS, 1],
                               dtype=tf.float32,
                               initializer=tf.random_uniform_initializer(-1.0, 1.0))

# Feature vector X(s, a): one-hot state concatenated with one-hot action.
state_tf_one_hot = tf.one_hot(state_tf, N_STATES)
action_tf_one_hot = tf.one_hot(action_tf, N_ACTIONS)
next_state_tf_one_hot = tf.one_hot(next_state_tf, N_STATES)
next_action_tf_one_hot = tf.one_hot(next_action_tf, N_ACTIONS)

state_action_tf_one_hot = tf.concat([state_tf_one_hot, action_tf_one_hot], 1)
next_state_action_tf_one_hot = tf.concat([next_state_tf_one_hot, next_action_tf_one_hot], 1)

# Ops that overwrite the target weights with the current main weights.
from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'main_net')
to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'target_net')

copy_ops = [target_var.assign(main_var) for main_var, target_var in zip(from_vars, to_vars)]

# The matmul outputs below have shape [batch, 1]. Reshape the rank-1 inputs
# to columns as well; otherwise [batch] combined with [batch, 1] broadcasts
# to a [batch, batch] matrix as soon as batch > 1.
reward_col = tf.reshape(reward_tf, [-1, 1])
done_col = tf.reshape(tf.cast(done_holder, tf.float32), [-1, 1])

TD_target_not_done = reward_col + gamma * tf.matmul(next_state_action_tf_one_hot, W_target)
# Terminal transition: no bootstrapping, the target is just the fed reward.
# Any payoff of the terminal state itself has to be part of that fed value.
TD_target_done = reward_col

Q = tf.matmul(state_action_tf_one_hot, W) # function approximation

TD_error_not_done = TD_target_not_done - Q
TD_error_done = TD_target_done - Q

# Select the matching TD error per sample, then minimise its mean square.
# Only the main weights are updated; W_target changes through copy_ops only.
error = done_col * TD_error_done + (1 - done_col) * TD_error_not_done
loss = tf.reduce_mean(tf.square(error))
opt = tf.train.GradientDescentOptimizer(learning_rate=lr_rate)
train_ops = opt.minimize(loss, var_list=from_vars)
    
    
with tf.Session() as sess:
    tf.global_variables_initializer().run() # initialize parameters following pre-defined recipe
    gradient_update_number = 0

    for episode in range(epoch):
        print(str(episode)+"th iteration")
        # Each episode starts in a random non-terminal state and follows `policy`.
        s = np.random.choice(start_state)
        a = np.random.choice(actions, p=policy[s, :])
        done = False

        while not done:
            if gradient_update_number % copy_period == 0:
                sess.run(copy_ops) # W target is previous W to make W converge

            # Sample one transition (s, a) -> (s1, a1).
            s1 = np.random.choice(states, p=P[s, a, :])
            a1 = np.random.choice(actions, p=policy[s1, :])
            done = s1 in terminal_state

            r = R[s, a]
            if done:
                # The episode stops on entering a terminal state, so no action
                # is ever taken there and R[s1, :] would never be read. Fold
                # the discounted terminal payoff into the last reward, so that
                # win and lose endings produce different targets.
                r = r + gamma * R[s1, a1]

            feed_dict = {state_tf: [s], action_tf: [a], reward_tf: [r],
                         next_state_tf: [s1], next_action_tf: [a1], done_holder: [done]}
            sess.run(train_ops, feed_dict=feed_dict)
            gradient_update_number += 1

            s, a = s1, a1

    # Read the learned estimate for every (state, action) pair. Terminal
    # states are never the source state of an update in the loop above.
    for s in range(N_STATES):
        for a in range(N_ACTIONS):
            Q_now = sess.run(Q, feed_dict={state_tf: [s], action_tf: [a]})
            Q_final[s, a] = Q_now[0][0]


print(Q_final)
print()
print("it takes "+str(round(time.time()-start))+" sec")

## Discussion
* With TD, it takes a long time to converge