# Policy Evaluation for Gridworld
![gridworld_policy_eval](../images/lecture_1/grid_world_policy_evaluation.png)


In [1]:
import numpy as np
from envs import GridworldEnv
# Build the gridworld environment and report the sizes of its state and
# action spaces before showing the transition table.
env = GridworldEnv()
print("number of states: ", env.observation_space.n, sep="")
print("number of available actions: ", env.action_space.n, sep="")
print("transition matrix P:")
# Kept as the final bare expression so the notebook cell displays env.P.
env.P

number of states: 16
number of available actions: 4
transition matrix P:


{0: {0: [(1.0, 0, 0.0, True)],
  1: [(1.0, 0, 0.0, True)],
  2: [(1.0, 0, 0.0, True)],
  3: [(1.0, 0, 0.0, True)]},
 1: {0: [(1.0, 1, -1.0, False)],
  1: [(1.0, 2, -1.0, False)],
  2: [(1.0, 5, -1.0, False)],
  3: [(1.0, 0, -1.0, True)]},
 2: {0: [(1.0, 2, -1.0, False)],
  1: [(1.0, 3, -1.0, False)],
  2: [(1.0, 6, -1.0, False)],
  3: [(1.0, 1, -1.0, False)]},
 3: {0: [(1.0, 3, -1.0, False)],
  1: [(1.0, 3, -1.0, False)],
  2: [(1.0, 7, -1.0, False)],
  3: [(1.0, 2, -1.0, False)]},
 4: {0: [(1.0, 0, -1.0, True)],
  1: [(1.0, 5, -1.0, False)],
  2: [(1.0, 8, -1.0, False)],
  3: [(1.0, 4, -1.0, False)]},
 5: {0: [(1.0, 1, -1.0, False)],
  1: [(1.0, 6, -1.0, False)],
  2: [(1.0, 9, -1.0, False)],
  3: [(1.0, 4, -1.0, False)]},
 6: {0: [(1.0, 2, -1.0, False)],
  1: [(1.0, 7, -1.0, False)],
  2: [(1.0, 10, -1.0, False)],
  3: [(1.0, 5, -1.0, False)]},
 7: {0: [(1.0, 3, -1.0, False)],
  1: [(1.0, 7, -1.0, False)],
  2: [(1.0, 11, -1.0, False)],
  3: [(1.0, 6, -1.0, False)]},
 8: {0: [(1.0, 4

In [2]:
def q_state_action(v: np.ndarray, state: int, action: int, P, gamma):
    """Return the expected one-step return of taking `action` in `state`.

    Args:
        v: Current state-value estimates, indexed by state.
        state: State in which the action is taken.
        action: Action whose value is computed.
        P: Transition table; ``P[state][action]`` is an iterable of 4-tuples
            ``(probability, next_state, reward, _)``. The fourth element is
            not used here.
        gamma: Discount factor applied to the successor state's value.

    Returns:
        The sum over transitions of
        ``probability * (reward + gamma * v[next_state])``; ``0`` when the
        transition list is empty.
    """
    expected_return = 0
    for prob, successor, reward, _unused in P[state][action]:
        # Immediate reward plus the discounted value of the successor state.
        backup = reward + gamma * v[successor]
        expected_return = expected_return + prob * backup
    return expected_return


In [3]:
"""For simplicity, let's assume policy is fixed"""
from typing import List


def policy_evaluation(policy: List[int], num_states: int, P, gamma, epsilon):
    """Iteratively evaluate a fixed deterministic policy.

    Starting from an all-zero value estimate, every state's value is
    repeatedly replaced by the one-step lookahead value of the action the
    policy selects there, until two successive sweeps are close enough.

    Args:
        policy: ``policy[s]`` is the action taken in state ``s``.
        num_states: Number of states; states are ``0 .. num_states - 1``.
        P: Transition table in the format expected by ``q_state_action``.
        gamma: Discount factor.
        epsilon: Convergence threshold on the summed squared difference
            between two successive value estimates.

    Returns:
        A NumPy array of length ``num_states`` holding the most recent
        value estimate, i.e. the result of the final sweep.
    """
    v_prev = np.zeros(num_states)
    while True:
        # Synchronous sweep: every update reads only the previous estimate.
        v_next = np.zeros(num_states)
        for s_i in range(num_states):
            v_next[s_i] = q_state_action(v_prev, s_i, policy[s_i], P, gamma)
        if np.sum(np.power(v_next - v_prev, 2)) < epsilon:
            # Return the sweep that was just computed; returning v_prev
            # would throw away the last (most accurate) backup.
            return v_next
        v_prev = v_next


In [4]:
def policy_improvement(v: np.ndarray, policy: List, num_actions: int, num_states: int, P, gamma):
    """Make the policy greedy with respect to the value estimate `v`.

    Args:
        v: State-value estimates, indexed by state.
        policy: Current policy; ``policy[s]`` is the action taken in ``s``.
            It is copied, not modified.
        num_actions: Number of actions; actions are ``0 .. num_actions - 1``.
        num_states: Number of states; states are ``0 .. num_states - 1``.
        P: Transition table in the format expected by ``q_state_action``.
        gamma: Discount factor.

    Returns:
        A tuple ``(new_policy, is_policy_stable)``. ``new_policy`` is a copy
        of ``policy`` with each state's action replaced by the greedy one
        where it differs; ``is_policy_stable`` is ``True`` when no state's
        action changed.
    """
    new_policy = policy.copy()
    is_policy_stable = True
    for s_i in range(num_states):
        qs = [q_state_action(v, s_i, action, P, gamma)
              for action in range(num_actions)]
        # np.argmax returns a NumPy integer scalar; cast it so the policy
        # holds plain Python ints. Ties resolve to the lowest action index.
        best_action = int(np.argmax(qs))
        if best_action != policy[s_i]:
            new_policy[s_i] = best_action
            is_policy_stable = False
    return new_policy, is_policy_stable


In [5]:
def policy_iteration(P, num_states: int, num_actions: int, gamma=0.9, epsilon=1e-3):
    """Alternate policy evaluation and greedy improvement until stable.

    Args:
        P: Transition table passed through to ``policy_evaluation`` and
            ``policy_improvement``.
        num_states: Number of states.
        num_actions: Number of actions.
        gamma: Discount factor.
        epsilon: Convergence threshold forwarded to ``policy_evaluation``.

    Returns:
        The policy (one action per state) from the first improvement step
        that changes no action.
    """
    # Initial policy: action 0 in every state.
    policy = [0 for _ in range(num_states)]
    values = policy_evaluation(policy, num_states, P, gamma, epsilon)
    stable = False
    while not stable:
        policy, stable = policy_improvement(
            values, policy, num_actions, num_states, P, gamma)
        if not stable:
            # The policy changed, so its value function must be recomputed.
            values = policy_evaluation(policy, num_states, P, gamma, epsilon)
    return policy
        

In [6]:
# Run policy iteration on the gridworld; left as a bare expression so the
# notebook cell displays the resulting policy.
policy_iteration(
    P=env.P,
    num_states=env.observation_space.n,
    num_actions=env.action_space.n,
    gamma=0.9,
)


[0, 3, 3, 2, 0, 0, 0, 2, 0, 0, 1, 2, 0, 1, 1, 0]