In [1]:
import numpy as np

In [2]:
### Markov Decision Process for Small Gridworld
# 1) State-space -> self.states
# 2) Action-space -> self.actions
# 3) State transition probabilities -> self.P
# 4) Reward function -> self.rewards

# Grid dimensions as (rows, columns).
SHAPE = (4, 4)
# Integer codes for the four moves; they double as indices on the action axis of P.
UP, DOWN, LEFT, RIGHT = range(4)

# Helper function to convert state index from 2d to 1d
def get_state_idx_1d(x,y,shape):
    """Return the row-major 1-D index of cell (x, y) in a grid of the given shape.

    Only ``shape[1]`` (the number of columns) is used: each full row above
    row ``x`` contributes ``shape[1]`` cells, and ``y`` is the offset inside the row.
    """
    row_width = shape[1]
    row_offset = x * row_width
    return row_offset + y

class Gridworld:
    """Deterministic gridworld MDP with terminal cells in two opposite corners.

    Attributes set by the constructor:
        shape   -- the grid shape passed in, as (rows, columns)
        states  -- ``np.arange`` over all cells (row-major state indices)
        actions -- ``[UP, DOWN, LEFT, RIGHT]``
        gamma   -- discount factor, fixed to 1
        rewards -- list with one entry per state: 0 for the first and last
                   state, -1 for every other state
        P       -- array of shape (num_states, num_actions, num_states, 2);
                   ``P[s, a, s2, 0]`` is the probability of moving from ``s``
                   to ``s2`` under ``a`` and ``P[s, a, s2, 1]`` is the reward
                   attached to that transition (``rewards[s]``)
    """

    def __init__(self, shape):
        self.shape = shape
        n_states = np.prod(shape)
        self.states = np.arange(n_states)
        self.actions = [UP, DOWN, LEFT, RIGHT]
        self.gamma = 1

        # Every step costs -1 except when taken from the first or last state.
        step_rewards = [-1 for _ in range(n_states)]
        step_rewards[0] = step_rewards[n_states - 1] = 0
        self.rewards = step_rewards

        # (row, column) displacement per action; any other action code is
        # treated as RIGHT, mirroring a trailing ``else`` branch.
        offsets = {UP: (-1, 0), DOWN: (1, 0), LEFT: (0, -1)}

        def successor(x, y, a):
            """Return the 1-D index of the cell reached from (x, y) by action a."""
            last_row, last_col = shape[0] - 1, shape[1] - 1
            # The top-left and bottom-right cells are absorbing: every action stays put.
            if (x, y) in ((0, 0), (last_row, last_col)):
                return get_state_idx_1d(x, y, shape)
            dx, dy = offsets.get(a, (0, 1))
            nx, ny = x + dx, y + dy
            # A move that would leave the grid keeps the agent where it is.
            if not 0 <= nx <= last_row:
                nx = x
            if not 0 <= ny <= last_col:
                ny = y
            return get_state_idx_1d(nx, ny, shape)

        # For each (state, action) pair, store the successor's probability
        # in slot 0 and the reward collected in slot 1.
        transitions = np.zeros((n_states, len(self.actions), n_states, 2))
        for x, y in np.ndindex(*shape):
            here = get_state_idx_1d(x, y, shape)
            for a in self.actions:
                there = successor(x, y, a)
                transitions[here, a, there, 0] = 1
                transitions[here, a, there, 1] = step_rewards[here]

        self.P = transitions

# Build the environment instance that the later cells evaluate policies on.
gridworld = Gridworld(shape=SHAPE)

In [3]:
# Policy Evaluation Algorithm

def get_value_grid(values, shape=None):
    """Round a flat value function to 2 decimals and lay it out as a 2-D grid.

    Args:
        values: array-like holding one value per state, e.g. an array of
            shape (1, n) or (n,).
        shape: optional (rows, columns) of the grid. When omitted the grid
            is assumed to be square and its side is inferred from the number
            of values.

    Returns:
        ``np.ndarray`` with the rounded values reshaped to ``shape`` (or to
        ``(side, side)`` when ``shape`` is not given).

    Raises:
        ValueError: if ``shape`` is omitted and the number of values is not a
            perfect square, or if the values do not fit the requested shape.
    """
    rounded = np.round(values, 2)
    if shape is None:
        n = rounded.size
        side = int(round(n ** 0.5))
        # Verify the inferred side so that a non-square grid fails with a
        # clear message rather than an opaque reshape error.
        if side * side != n:
            raise ValueError(
                "cannot infer a square grid from {} values; pass shape=(rows, cols)".format(n)
            )
        shape = (side, side)
    return rounded.reshape(tuple(shape))

def policy_evaluation(policy, gridworld, threshold=1e-5, verbose=True, seed=None):
    """Evaluate a policy by repeated in-place Bellman expectation sweeps.

    Args:
        policy: indexable as ``policy[s][a]``, giving the probability of
            taking action ``a`` in state ``s``.
        gridworld: object exposing ``states``, ``actions``, ``gamma`` and
            ``P``, where ``P[s, a, :, 0]`` holds next-state probabilities and
            ``P[s, a, :, 1]`` the matching rewards.
        threshold: a sweep whose largest absolute value change is below this
            number ends the evaluation.
        verbose: when True (default) print the initial, selected intermediate
            (sweep 0, then sweeps 2, 4, 8, ...) and final value functions plus
            the iteration count. When False nothing is printed.
        seed: optional seed for the random initial values. ``None`` (default)
            draws from NumPy's global random state, so ``np.random.seed``
            still applies; any other value seeds an independent generator so
            that the run is reproducible.

    Returns:
        ``np.ndarray`` of shape ``(1, len(gridworld.states))`` with the
        estimated state values.

    Note:
        The first and last states are initialised to 0. The loop only stops
        once a sweep changes every value by less than ``threshold``.
    """
    states = gridworld.states
    actions = gridworld.actions
    gamma = gridworld.gamma
    n = len(states)

    # Random start everywhere except the first and last state, pinned to 0.
    rng = np.random if seed is None else np.random.default_rng(seed)
    values = rng.random((1, n))
    values[0][0] = values[0][n-1] = 0
    state_values = values[0]  # 1-D view; writes through to `values`
    iter_num = 0
    print_iter = 0

    if verbose:
        print("Initial Value Function:\n{}\n".format(get_value_grid(values)))

    while True:
        delta = 0
        for s in states:
            prev_val = state_values[s]
            new_val = 0
            for a in actions:
                transition_prob = gridworld.P[s,a,:,0]
                reward_fun = gridworld.P[s,a,:,1]
                # Expected one-step return of action a, weighted by the policy.
                new_val += policy[s][a] * (transition_prob * (reward_fun + gamma*state_values)).sum()
            # In-place update: later states in this sweep already see the new value.
            state_values[s] = new_val
            delta = max(delta, abs(state_values[s] - prev_val))
        if delta < threshold:
            break

        if verbose and (iter_num == 0 or iter_num == 2**print_iter):
            print("Iteration {}".format(iter_num))
            print("Current Value Function:\n{}\n".format(get_value_grid(values)))
            print_iter += 1
        iter_num += 1

    if verbose:
        print("Final Value Function:\n{}\n".format(get_value_grid(values)))
        print ("Converged in {} iterations.".format(iter_num))
    return values

In [4]:
# Equiprobable random policy: each action gets probability 0.25 in every state.
states = gridworld.states
actions = gridworld.actions
policy = np.full((len(states), len(actions)), 0.25)

values = policy_evaluation(policy, gridworld, threshold=1e-5)

Initial Value Function:
[[0.   0.13 0.68 0.48]
 [0.06 0.25 0.66 0.96]
 [0.8  0.73 0.72 0.58]
 [0.34 0.23 0.45 0.  ]]

Iteration 0
Current Value Function:
[[ 0.   -0.74 -0.73 -0.7 ]
 [-0.72 -1.02 -1.02 -1.05]
 [-0.71 -1.19 -1.29 -1.44]
 [-0.95 -1.36 -1.55  0.  ]]

Iteration 2
Current Value Function:
[[ 0.   -2.5  -3.33 -3.6 ]
 [-2.52 -3.62 -4.22 -4.37]
 [-3.44 -4.3  -4.57 -3.95]
 [-3.79 -4.5  -4.    0.  ]]

Iteration 4
Current Value Function:
[[ 0.   -4.2  -5.87 -6.43]
 [-4.24 -5.92 -6.83 -6.97]
 [-5.96 -6.88 -6.87 -5.68]
 [-6.55 -7.04 -5.71  0.  ]]

Iteration 8
Current Value Function:
[[  0.    -7.06 -10.02 -11.01]
 [ -7.08  -9.49 -10.75 -10.85]
 [-10.05 -10.77 -10.2   -8.17]
 [-11.06 -10.88  -8.18   0.  ]]

Iteration 16
Current Value Function:
[[  0.   -10.56 -15.05 -16.55]
 [-10.56 -13.78 -15.41 -15.46]
 [-15.05 -15.41 -14.13 -11.11]
 [-16.56 -15.47 -11.11   0.  ]]

Iteration 32
Current Value Function:
[[  0.   -13.15 -18.78 -20.66]
 [-13.15 -16.96 -18.87 -18.88]
 [-18.78 -18.87 -17.