## Grid World Environment setting
* states, actions, transition probability

In [1]:
# set state
import numpy as np

# Grid dimensions as this notebook uses them: the first array axis has nCols
# entries, the second has nRows entries, and nWalls interior cells are blocked.
nCols = 3
nRows = 4
nWalls = 1
states = list(range(nCols * nRows - nWalls))  # one integer id per free cell
N_STATES = len(states)

# set map
# -1 marks a blocked cell (outer border or wall), 0 marks a free cell.
# The array is padded by one cell on every side so that a move off the grid
# lands on a -1 border cell instead of indexing out of bounds.
map = np.full((nCols + 2, nRows + 2), -1.0)
map[1:nCols + 1, 1:nRows + 1] = 0
map[2, 2] = -1  # add wall

# set action
actions = [0, 1, 2, 3]
N_ACTIONS = len(actions)

# states -> location
# locations[k] is the (first-axis, second-axis) map coordinate of state k;
# free cells are enumerated first-axis-major, skipping blocked cells.
locations = [
    (r, c)
    for r in range(1, nCols + 1)
    for c in range(1, nRows + 1)
    if map[r, c] == 0
]
index = len(locations)  # running count of free cells found

# action -> move
# move[k] is the coordinate offset applied by action k.
move = [(0, -1), (-1, 0), (0, 1), (1, 0)]

# set transition probability
# P[s, a, s'] is the probability of landing in s' after choosing a in s.
# The chosen heading is followed with probability 0.8; with probability 0.1
# each, the neighbouring entry of `move` on either side is followed instead.
# A move into a blocked cell (map value -1) leaves the agent where it is.
P = np.zeros((N_STATES, N_ACTIONS, N_STATES))
for s in range(N_STATES):
    here = locations[s]
    for a in range(N_ACTIONS):
        # (index into `move`, probability) in the order: heading correctly,
        # left error, right error. For a == 0, `a - 1` is -1 and relies on
        # negative indexing to wrap to the last entry of `move`.
        outcomes = ((a, 0.8), (a - 1, 0.1), ((a + 1) % 4, 0.1))
        for heading, prob in outcomes:
            step = move[heading]
            target = (here[0] + step[0], here[1] + step[1])
            if map[target[0], target[1]] == -1:  # there is barrier or wall
                target = here
            landing = states[locations.index(target)]
            P[s, a, landing] += prob
        
# rewards s,a ---  R(s,a)  ---> s'
# Every (state, action) pair gets -0.02 by default (the original toggle also
# offered -0.5 as an alternative); rows 3 and 6 are then overridden for all
# actions with +1 and -1 respectively.
R = np.full((N_STATES, N_ACTIONS), -0.02)
R[3, :] = 1
R[6, :] = -1

# discount factor
gamma = 0.99

# policy : given a state, which action would you choose
# assume that we know the policy
# Each policy is an (N_STATES, N_ACTIONS) array; row s holds the probability
# of each action in state s.

# Deterministic policies are written as one chosen action per state, in state
# order, and expanded to one-hot rows.
bad_actions = [2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1]
bad_policy = np.zeros((N_STATES, N_ACTIONS))
bad_policy[np.arange(len(bad_actions)), bad_actions] = 1

# Uniform over the four actions.
random_policy = 0.25 * np.ones((N_STATES, N_ACTIONS))

optimal_actions = [2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0]
optimal_policy = np.zeros((N_STATES, N_ACTIONS))
optimal_policy[np.arange(len(optimal_actions)), optimal_actions] = 1

# Same choices as optimal_policy, but every action first receives an extra
# ep/4 of weight and each row is then renormalised to sum to 1.
ep = 0.1
optimalWithNoise_policy = optimal_policy + (ep / 4) * np.ones((N_STATES, N_ACTIONS))
optimalWithNoise_policy = optimalWithNoise_policy / optimalWithNoise_policy.sum(axis=1, keepdims=True)

# Monte Carlo Evaluation

### First-visit Monte Carlo

* Update the visit count $N(s)$ and the cumulative return $S(s)$ only for the first visit to each state in a simulation.

In [70]:
# First-visit Monte Carlo evaluation of the state-value function V.
#
# Runs `epoch` simulated episodes under `policy`. For every state visited in
# an episode, N(s) is incremented once and the return G(t) observed from the
# FIRST visit to that state is added to S(s). The estimate is V(s) = S(s)/N(s).
epoch = 1000  # number of simulations

terminal_states = [3, 6]
start_states = [x for x in states if x not in terminal_states]

policy = optimal_policy  # policy being evaluated
num_visit = np.zeros(N_STATES)  # N(s)
cum_gain = np.zeros(N_STATES)  # S(s)

for _ in range(epoch):
    Isvisit = np.zeros(N_STATES)  # per-episode visit counter
    s = np.random.choice(start_states)  # randomly choose initial state
    Isvisit[s] = 1
    done = False
    simulation_history = []  # (s, a) pairs in time order
    reward_history = []  # reward recorded for each entry of simulation_history
    goal_history = []  # return G(t) for each entry of simulation_history
    while not done:
        # s -> a -> reward -> s1
        a = np.random.choice(actions, p=policy[s, :])  # follow policy
        reward = R[s, a]
        simulation_history.append((s, a))
        reward_history.append(reward)
        s1 = np.random.choice(states, p=P[s, a, :])  # environment

        Isvisit[s1] += 1

        if s1 not in terminal_states:
            s = s1
            continue

        done = True
        # Record the terminal state itself with its terminal reward:
        # +1 for state 3 (goal), -1 otherwise (fail).
        simulation_history.append((s1, 0))
        reward_history.append(1. if s1 == 3 else -1.)

        # Evaluate G(t) backwards in time:
        #   G(T) = r(T);  G(t-1) = r(t-1) + gamma * G(t)
        for i, r in enumerate(reward_history[::-1]):
            if i == 0:
                goal_history.append(r)
            else:
                goal_history.append(gamma * goal_history[i - 1] + r)
        goal_history = goal_history[::-1]  # back to time order

        # Builtin `bool` is used because the `np.bool` alias was removed
        # in NumPy 1.24.
        flag = Isvisit.astype(bool)  # True for states seen this episode
        num_visit = num_visit + flag.astype(np.float32)  # count each state once

        # S(s) = S(s) + G(t) for the first visit only: the flag is cleared
        # after the first occurrence so later visits in the same episode are
        # ignored.
        for i, (visited_state, _a) in enumerate(simulation_history):
            if flag[visited_state]:
                cum_gain[visited_state] = cum_gain[visited_state] + goal_history[i]
                flag[visited_state] = False

# The small epsilon avoids division by zero for states that were never visited.
V = cum_gain / (num_visit + 1.0e-8)
print(V)


[ 0.8619897   0.90557279  0.94226064  1.          0.82993086  0.71237024
 -1.          0.79198555  0.76149765  0.74570073  0.56300574]


### Every-visit Monte Carlo

* Update the visit count $N(s)$ and the cumulative return $S(s)$ for every visit to a state in a simulation.