# Policy Evaluation for Gridworld
![gridworld_policy_eval](../images/lecture_1/grid_world_policy_evaluation.png)


In [8]:
import gymnasium as gym
import numpy as np


In [9]:
from envs import GridworldEnv
# Build the gridworld environment and display its transition table.
# As the printed output shows, env.P[state][action] is a list of
# (probability, next_state, reward, flag) tuples.
# NOTE(review): the boolean flag is presumably a "terminal reached" marker --
# confirm against envs.GridworldEnv.
env=GridworldEnv()
env.P

{0: {0: [(1.0, 0, 0.0, True)],
  1: [(1.0, 0, 0.0, True)],
  2: [(1.0, 0, 0.0, True)],
  3: [(1.0, 0, 0.0, True)]},
 1: {0: [(1.0, 1, -1.0, False)],
  1: [(1.0, 2, -1.0, False)],
  2: [(1.0, 5, -1.0, False)],
  3: [(1.0, 0, -1.0, True)]},
 2: {0: [(1.0, 2, -1.0, False)],
  1: [(1.0, 3, -1.0, False)],
  2: [(1.0, 6, -1.0, False)],
  3: [(1.0, 1, -1.0, False)]},
 3: {0: [(1.0, 3, -1.0, False)],
  1: [(1.0, 3, -1.0, False)],
  2: [(1.0, 7, -1.0, False)],
  3: [(1.0, 2, -1.0, False)]},
 4: {0: [(1.0, 0, -1.0, True)],
  1: [(1.0, 5, -1.0, False)],
  2: [(1.0, 8, -1.0, False)],
  3: [(1.0, 4, -1.0, False)]},
 5: {0: [(1.0, 1, -1.0, False)],
  1: [(1.0, 6, -1.0, False)],
  2: [(1.0, 9, -1.0, False)],
  3: [(1.0, 4, -1.0, False)]},
 6: {0: [(1.0, 2, -1.0, False)],
  1: [(1.0, 7, -1.0, False)],
  2: [(1.0, 10, -1.0, False)],
  3: [(1.0, 5, -1.0, False)]},
 7: {0: [(1.0, 3, -1.0, False)],
  1: [(1.0, 7, -1.0, False)],
  2: [(1.0, 11, -1.0, False)],
  3: [(1.0, 6, -1.0, False)]},
 8: {0: [(1.0, 4

In [10]:
# Number of discrete actions available in each state.
env.action_space.n

4

In [11]:
# Number of discrete states in the gridworld.
env.observation_space.n

16

In [12]:
def q_state_action(v: np.ndarray, state: int, action: int, P, gamma) -> float:
    """Return the one-step lookahead value of taking `action` in `state`.

    Args:
        v: current state-value estimates, indexable by state id.
        state: id of the state the action is taken from.
        action: id of the action being evaluated.
        P: transition table; P[state][action] is an iterable of
            (probability, next_state, reward, flag) 4-tuples.
        gamma: discount factor applied to the successor state's value.

    Returns:
        The sum over all listed outcomes of
        probability * (reward + gamma * v[next_state]);
        0 when the outcome list is empty.
    """
    expected_return = 0
    for outcome in P[state][action]:
        # The fourth field of each outcome is not used by this computation.
        prob, successor, immediate_reward, _ = outcome
        discounted_target = immediate_reward + gamma * v[successor]
        expected_return += prob * discounted_target
    return expected_return


In [13]:
def policy_evaluation(policy: np.ndarray, num_state: int, P, gamma=0.9, epsilon=1e-3) -> np.ndarray:
    """Iteratively estimate the state-value function of a fixed policy.

    Each sweep rebuilds the value of every state as the policy-weighted
    average of the one-step action values computed from the previous
    sweep's estimates.

    Args:
        policy: np.ndarray indexed as policy[state][action]; each entry is
            the probability of taking that action in that state.
        num_state: number of states; states are the integers
            0 .. num_state - 1.
        P: transition table forwarded to `q_state_action`.
        gamma: discount factor.
        epsilon: stop once the sum of squared differences between two
            consecutive sweeps falls below this threshold.

    Returns:
        The value estimates from the sweep *before* the one that met the
        stopping criterion (the final sweep's result is discarded).
    """
    values = np.zeros(num_state)
    converged = False
    while not converged:
        # Fresh array per sweep, so every update reads only the old estimates.
        updated = np.zeros(num_state)
        for state in range(num_state):
            for action, action_probability in enumerate(policy[state]):
                action_value = q_state_action(values, state, action, P, gamma)
                updated[state] += action_probability * action_value
        squared_change = np.sum(np.power(updated - values, 2))
        if squared_change < epsilon:
            converged = True
        else:
            values = updated
    return values


In [14]:
# Uniform random policy: in every state the agent picks each of the moves
# (LEFT, RIGHT, UP, DOWN) with the same probability 1 / env.action_space.n.
policy = np.full(
    (env.observation_space.n, env.action_space.n),
    1 / env.action_space.n,
)
# Undiscounted evaluation (gamma=1) of that policy; the result is displayed.
policy_evaluation(policy, env.observation_space.n, env.P, gamma=1, epsilon=1e-3)

array([  0.        , -13.88940057, -19.83611156, -21.81659914,
       -13.88940057, -17.85562398, -19.83720786, -19.83611156,
       -19.83611156, -19.83720786, -17.85562398, -13.88940057,
       -21.81659914, -19.83611156, -13.88940057,   0.        ])