## Iterative Policy Evaluation

In [1]:
# Policy Evaluation in Python

import numpy as np

class GridWorld:
    """Deterministic 4x4 grid world with a single goal cell.

    Positions are ``[y, x]`` pairs (row, column). Moves that would leave the
    grid are clamped to the border, and the goal sits at ``(3, 3)``.
    """

    def __init__(self):
        # One entry per grid cell; the evaluation code in this file stores
        # the per-cell values here.
        self.state = np.zeros([4, 4])  # 4x4 grid
        self.action_space = [0, 1, 2, 3]  # Up, Down, Left, Right

        self.goal_pos = {'y': 3, 'x': 3}
        self.y_min, self.x_min, self.y_max, self.x_max = 0, 0, 3, 3

        # Multiplies the successor value in the Bellman backups.
        self.gamma = 1.0

    def reset(self):
        """Replace ``self.state`` with a fresh all-zero 4x4 array and return it."""
        self.state = np.zeros([4, 4])
        return self.state

    def s_next(self, s_t, a_t):
        """Return the position reached by taking action ``a_t`` from ``s_t``.

        Args:
            s_t: Current position as a ``[y, x]`` sequence. It is not modified.
            a_t: Action index: 0 = up, 1 = down, 2 = left, 3 = right.

        Returns:
            A new ``[y, x]`` list, clamped to the grid bounds.

        Raises:
            AssertionError: If ``a_t`` is not one of the four actions.
        """
        # Work on a copy so the caller's position is never mutated.
        pos = list(s_t)

        if a_t == 0:
            pos[0] = max(pos[0] - 1, self.y_min)
        elif a_t == 1:
            pos[0] = min(pos[0] + 1, self.y_max)
        elif a_t == 2:
            pos[1] = max(pos[1] - 1, self.x_min)
        elif a_t == 3:
            pos[1] = min(pos[1] + 1, self.x_max)
        else:
            # Raised explicitly (rather than ``assert False``) so the check
            # still fires when Python runs with ``-O``.
            raise AssertionError("Invalid action")

        return pos

    def reward(self, s_t, a_t, s_next):
        """Return 0 when ``s_t`` is the goal cell, otherwise -1.

        Only ``s_t`` is inspected; ``a_t`` and ``s_next`` are accepted but
        not used.
        """
        if s_t[0] == self.goal_pos['y'] and s_t[1] == self.goal_pos['x']:
            return 0
        return -1

In [5]:
def policy_evaluation(env, policy, theta=0.01):
    """Iteratively evaluate ``policy`` on ``env`` and return the value table.

    Each sweep builds a new value table from the previous one (synchronous
    backup), stores it in ``env.state`` and prints the total change. Sweeps
    repeat until the summed absolute change drops below ``theta``.

    Args:
        env: Environment exposing ``state`` (2-D value array), ``action_space``,
            ``gamma``, ``goal_pos`` (dict with ``'y'``/``'x'``),
            ``s_next(s, a)`` and ``reward(s, a, s_next)``.
        policy: Sequence indexed by flat row-major state index; each entry
            holds one probability per action.
        theta: Convergence threshold on the summed absolute value change.

    Returns:
        ``env.state`` after the final sweep.
    """
    n_rows, n_cols = env.state.shape
    goal = (env.goal_pos['y'], env.goal_pos['x'])

    loop_count = 0
    while True:
        new_state = np.zeros((n_rows, n_cols))

        for i in range(n_rows * n_cols):
            # Row-major decoding: divide by the number of columns so that
            # non-square grids map to valid (row, column) pairs.
            s_t = divmod(i, n_cols)
            if s_t == goal:
                # The goal cell is never backed up; its value stays 0.
                continue

            v_s = 0
            for a_t in env.action_space:
                pi_a = policy[i][a_t]
                p_ss = 1.0  # each action leads to exactly one successor
                s_t1 = env.s_next(list(s_t), a_t)
                reward = env.reward(s_t, a_t, s_t1)

                v_s += pi_a * p_ss * (
                    reward + env.gamma * env.state[s_t1[0], s_t1[1]]
                )

            new_state[s_t[0], s_t[1]] = v_s

        # Total (L1) change across the whole table for this sweep.
        delta = float(np.sum(np.abs(new_state - env.state)))
        env.state = new_state

        loop_count += 1
        print(f"[{loop_count}] Delta : {delta}")

        if delta < theta:
            break

    return env.state

In [6]:
# Evaluate the uniform random policy: every one of the 16 states picks each
# of the four actions with probability 0.25.
env = GridWorld()
policy = [np.full(4, 0.25) for _ in range(16)]

res = policy_evaluation(env, policy)
print(res.reshape(4, 4))

[1] Delta : 15.0
[2] Delta : 14.5
[3] Delta : 14.125
[4] Delta : 13.78125
[5] Delta : 13.46875
[6] Delta : 13.171875
[7] Delta : 12.890625
[8] Delta : 12.619384765625
[9] Delta : 12.358154296875
[10] Delta : 12.104400634765625
[11] Delta : 11.85809326171875
[12] Delta : 11.617958068847656
[13] Delta : 11.383915901184082
[14] Delta : 11.155274629592896
[15] Delta : 10.931927978992462
[16] Delta : 10.713471204042435
[17] Delta : 10.499791122972965
[18] Delta : 10.290631527081132
[19] Delta : 10.085883170366287
[20] Delta : 9.885369663126767
[21] Delta : 9.688989738759119
[22] Delta : 9.49661156568618
[23] Delta : 9.3081424066404
[24] Delta : 9.123476347296673
[25] Delta : 8.942528328504523
[26] Delta : 8.765208237020943
[27] Delta : 8.591437486539633
[28] Delta : 8.42113614051064
[29] Delta : 8.25423096130434
[30] Delta : 8.090648987349121
[31] Delta : 7.930321420373927
[32] Delta : 7.773180405994669
[33] Delta : 7.619160896299107
[34] Delta : 7.468199024496759
[35] Delta : 7.32023298601

## Policy Improvement

In [None]:
def policy_improvement(env, policy):
    """Make ``policy`` greedy with respect to the values in ``env.state``.

    For every state the one-step lookahead value of each action is computed,
    and the policy row is overwritten in place with a one-hot vector on the
    best action (ties go to the lowest action index).

    Args:
        env: Environment exposing ``state`` (2-D value array), ``action_space``,
            ``gamma``, ``s_next(s, a)`` and ``reward(s, a, s_next)``.
        policy: Sequence of NumPy arrays indexed by flat row-major state
            index, one probability per action. Modified in place.

    Returns:
        The same ``policy`` object, now deterministic.
    """
    n_rows, n_cols = env.state.shape

    for i in range(n_rows * n_cols):
        # Row-major decoding: divide by the number of columns so that
        # non-square grids map to valid (row, column) pairs.
        s_t = divmod(i, n_cols)
        action_values = np.zeros(len(env.action_space))

        for a in env.action_space:
            # Bellman backup; each action leads to exactly one successor,
            # so the transition probability is 1.
            p_s_next = 1.0
            s_next = env.s_next(list(s_t), a)
            reward = env.reward(s_t, a, s_next)

            # Use the environment's discount so this stays consistent with
            # how the values were evaluated.
            action_values[a] = p_s_next * (
                reward + env.gamma * env.state[s_next[0], s_next[1]]
            )

        # Index of the action with the highest lookahead value.
        a_max = action_values.argmax()

        policy[i][:] = 0
        policy[i][a_max] = 1

    return policy

In [6]:
# Policy iteration: alternate full policy evaluation with greedy improvement
# until the evaluated value table stops changing by at least `delta`.
env = GridWorld()

# Start from the uniform random policy (equal probability for every action).
n_actions = len(env.action_space)
policy = [np.full(n_actions, 1.0 / n_actions) for _ in range(env.state.size)]

# Same shape as the value table so the whole table can be compared.
value_vector = np.zeros_like(env.state)
delta = 5  # stop once the summed value change falls below this
cnt = 0
Delta = 0

while True:
    # Evaluate the current policy starting from an all-zero value table.
    env.reset()
    new_value_vector = policy_evaluation(env, policy)

    # Compare every state's value, not just the first row of the grid.
    Delta = np.sum(np.abs(new_value_vector - value_vector))
    if Delta < delta:
        break

    value_vector = new_value_vector
    policy = policy_improvement(env, policy)
    print(policy)
    cnt += 1
    print(f"[{cnt}] Delta : {Delta}")

[1] Delta : 15.0
[2] Delta : 14.5
[3] Delta : 14.125
[4] Delta : 13.78125
[5] Delta : 13.46875
[6] Delta : 13.171875
[7] Delta : 12.890625
[8] Delta : 12.619384765625
[9] Delta : 12.358154296875
[10] Delta : 12.104400634765625
[11] Delta : 11.85809326171875
[12] Delta : 11.617958068847656
[13] Delta : 11.383915901184082
[14] Delta : 11.155274629592896
[15] Delta : 10.931927978992462
[16] Delta : 10.713471204042435
[17] Delta : 10.499791122972965
[18] Delta : 10.290631527081132
[19] Delta : 10.085883170366287
[20] Delta : 9.885369663126767
[21] Delta : 9.688989738759119
[22] Delta : 9.49661156568618
[23] Delta : 9.3081424066404
[24] Delta : 9.123476347296673
[25] Delta : 8.942528328504523
[26] Delta : 8.765208237020943
[27] Delta : 8.591437486539633
[28] Delta : 8.42113614051064
[29] Delta : 8.25423096130434
[30] Delta : 8.090648987349121
[31] Delta : 7.930321420373927
[32] Delta : 7.773180405994669
[33] Delta : 7.619160896299107
[34] Delta : 7.468199024496759
[35] Delta : 7.32023298601

In [18]:
policy

[array([0., 1., 0., 0.]),
 array([0., 1., 0., 0.]),
 array([0., 1., 0., 0.]),
 array([0., 1., 0., 0.]),
 array([0., 1., 0., 0.]),
 array([0., 1., 0., 0.]),
 array([0., 1., 0., 0.]),
 array([0., 1., 0., 0.]),
 array([0., 1., 0., 0.]),
 array([0., 1., 0., 0.]),
 array([0., 1., 0., 0.]),
 array([0., 1., 0., 0.]),
 array([0., 0., 0., 1.]),
 array([0., 0., 0., 1.]),
 array([0., 0., 0., 1.]),
 array([0., 1., 0., 0.])]

In [24]:
new_value_vector

array([[-6., -5., -4., -3.],
       [-5., -4., -3., -2.],
       [-4., -3., -2., -1.],
       [-3., -2., -1.,  0.]])