## Grid World Environment Setting
* states, actions, transition probability

In [8]:
# set state
import numpy as np
# Grid dimensions. The map below is indexed as map[i, j] with
# i in 1..nCols and j in 1..nRows; one cell of the grid is a wall.
nCols, nRows = 3, 4
nWalls = 1

# One integer id per free (non-wall) cell.
states = list(range(nCols * nRows - nWalls))
N_STATES = len(states)

# Episodes end in these states; every other state may start an episode.
win_state = [3]
lose_state = [6]
terminal_state = [3, 6]
start_state = [sid for sid in states if sid not in terminal_state]

# set map: -1 marks a blocked cell (outer border or wall), 0 a free cell.
# The one-cell border lets the transition code probe a neighbour without
# any bounds checking.
map = np.full((nCols + 2, nRows + 2), -1.0)
map[1:nCols + 1, 1:nRows + 1] = 0
map[2, 2] = -1  # add wall

# set action
actions = [0, 1, 2, 3]
N_ACTIONS = len(actions)

# states -> location: locations[k] is the (i, j) map coordinate of state k,
# enumerated over i first and then j, skipping blocked cells.
locations = [
    (i, j)
    for i in range(1, nCols + 1)
    for j in range(1, nRows + 1)
    if map[i, j] == 0
]
index = len(locations)  # number of free cells found

# action -> move: move[k] is the (di, dj) offset applied by action k.
move = [(0, -1), (-1, 0), (0, 1), (1, 0)]

# set transition probability
# P[s, a, s'] is the probability of ending in state s' after choosing
# action a in state s. The chosen move is executed with probability 0.8;
# with probability 0.1 each, the neighbouring entry of `move` on either
# side (a-1, resp. (a+1) % 4) is executed instead. A move into a blocked
# cell (map == -1) leaves the agent where it is.
P = np.zeros((N_STATES, N_ACTIONS, N_STATES))  # P[S,A,S']
for s_idx in range(N_STATES):
    here = locations[s_idx]
    for a_idx in range(N_ACTIONS):
        # (index into `move`, probability) in the order:
        # heading correctly, left error, right error.
        outcomes = (
            (a_idx, 0.8),
            (a_idx - 1, 0.1),  # a_idx == 0 wraps to the last move via index -1
            ((a_idx + 1) % 4, 0.1),
        )
        for move_idx, prob in outcomes:
            di, dj = move[move_idx]
            target = (here[0] + di, here[1] + dj)
            if map[target[0], target[1]] == -1:  # there is barrier or wall
                target = here
            dest = states[locations.index(target)]
            P[s_idx, a_idx, dest] += prob
        
# rewards s,a ---  R(s,a)  ---> s'
# R[s, a] is the reward collected for taking action a in state s.
# Every non-terminal pair pays the same step reward; the terminal rows are
# overwritten below. (-0.5 was the alternative step reward in the former
# dead `else` branch; change `step_reward` to experiment with it.)
step_reward = -0.02
R = np.full((N_STATES, N_ACTIONS), step_reward)
# Terminal rewards are keyed on win_state / lose_state so they cannot drift
# from the terminal-state definition.
R[win_state, :] = 1
R[lose_state, :] = -1
# discount factor
gamma = 0.99

# policy : given state which action would u choose
# assume that we know the policy
# Each policy is an (N_STATES, N_ACTIONS) array whose row s holds the
# probability of picking each action in state s. The deterministic policies
# are written as one chosen action per state and expanded to one-hot rows.
_state_ids = np.arange(N_STATES)

bad_policy = np.zeros((N_STATES, N_ACTIONS))
bad_policy[_state_ids, [2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1]] = 1

# Uniform over the four actions in every state.
random_policy = np.full((N_STATES, N_ACTIONS), 0.25)

optimal_policy = np.zeros((N_STATES, N_ACTIONS))
optimal_policy[_state_ids, [2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0]] = 1

# Same action choices as optimal_policy, with ep/4 added to every entry
# before each row is renormalised to sum to 1.
ep = 0.1
optimalWithNoise_policy = optimal_policy + ep / 4
optimalWithNoise_policy = optimalWithNoise_policy / optimalWithNoise_policy.sum(axis=1, keepdims=True)

In [9]:
import tensorflow as tf

In [None]:
tf.reset_default_graph()
print("Tensorflow version : ")
print(tf.__version__)
# Every-visit Monte Carlo Policy Evaluation
## set hyperparameters
epoch = 1000    # number of simulated episodes
lr_rate = 0.01  # step size of the gradient-descent optimizer
policy = optimalWithNoise_policy # Evaluation -> follow given policy
## MC evaluation
num_visit = np.zeros(N_STATES) # N(s): how many times each state was visited
cum_gain = np.zeros(N_STATES) # S(s): sum of the gains observed from each state
with tf.device('/gpu:0'):
    ## set tensorflow variable
    state_tf = tf.placeholder(tf.int32, shape=[None], name="state")
    gain_tf = tf.placeholder(tf.float32, shape=[None], name='gain')
    # One weight per state; with one-hot features, W[s] is the value estimate of s.
    W = tf.get_variable(name='W',
                        shape=[N_STATES, 1],
                        dtype=tf.float32,
                        initializer=tf.random_uniform_initializer(-1.0, 1.0))
#### number state -> matrix ex. 3 -> [0 0 0 1 0 0 0 0 0 0 0]
state_tf_one_hot = tf.one_hot(state_tf, N_STATES)
V = tf.matmul(state_tf_one_hot, W) # linear combination representation of state value function, shape [batch, 1]
# gain_tf has shape [batch] while V has shape [batch, 1]; subtracting them
# directly would broadcast to [batch, batch]. Drop V's trailing axis so the
# error is computed element-wise.
MC_error = gain_tf - tf.squeeze(V, axis=1)
loss = tf.reduce_mean(tf.square(MC_error)) # mean-square-error
opt = tf.train.GradientDescentOptimizer(learning_rate=lr_rate)
train_ops = opt.minimize(loss)

# Run `epoch` simulated episodes under `policy`. After each episode the
# discounted gain G(t) of every visited state is used twice:
#   * exact every-visit MC: accumulate into cum_gain / num_visit,
#   * function approximation: one gradient step on (state, G(t)).
# allow_soft_placement lets ops pinned to a device this machine does not
# have fall back to an available one instead of raising.
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    tf.global_variables_initializer().run()

    for _ in range(epoch):
        # Trajectory buffers must start empty for every episode; otherwise
        # earlier episodes leak into this episode's gain computation.
        simulation_history = []
        reward_history = []

        s = np.random.choice(start_state) # random initial state
        done = False
        while not done:
            simulation_history.append(s)
            a = np.random.choice(actions, p=policy[s, :])
            reward_history.append(R[s, a])
            s1 = np.random.choice(states, p=P[s, a, :])

            if s1 in terminal_state:
                # Record the terminal state together with its reward and stop.
                done = True
                simulation_history.append(s1)
                reward_history.append(R[s1, 0])
            else:
                s = s1

        # After finish one simulation update value function -> offline
        # evaluate G(t) backwards in time:
        #   G(T) = r(T),  G(t-1) = r(t-1) + gamma * G(t)
        gain_history = []
        gain = 0.0
        for r in reversed(reward_history):
            gain = r + gamma * gain
            gain_history.append(gain)
        gain_history.reverse() # gain_history[i] now belongs to simulation_history[i]

        for visited, g in zip(simulation_history, gain_history):
            ##-------------------- This is for Exact MC
            # Every-visit: each occurrence of a state contributes its G(t).
            num_visit[visited] += 1
            cum_gain[visited] += g
            ##-------------------- This is for Function approximation MC
            sess.run(train_ops, feed_dict={state_tf: [visited], gain_tf: [g]})

    # after finish all epoch: read the learned value of every state at once
    V_final = list(sess.run(V, feed_dict={state_tf: states})[:, 0])
    print("Function Approximation result")
    print(V_final)
# Mean observed gain per state; the small constant avoids dividing by zero
# for a state that was never visited.
V = cum_gain/(num_visit+1.0e-8)
print("Exact Value Function from MC")
print(V)

Tensorflow version : 
1.5.0


In [None]:
# Reset TensorFlow's global default graph.
tf.reset_default_graph()

In [2]:
def get_available():
    """Return the names of the devices TensorFlow reports on this machine.

    Returns:
        list of str: one name per local device, e.g. '/device:CPU:0'.
    """
    # Imported here so the function does not rely on a `device_lib` name
    # having been imported elsewhere.
    from tensorflow.python.client import device_lib

    return [device.name for device in device_lib.list_local_devices()]

In [4]:
# Show the names of the devices TensorFlow reports locally.
get_available()

['/device:CPU:0', '/device:GPU:0']