# Import gym library

In [6]:
import gym

In [7]:
# Create the FrozenLake-v1 environment that the rest of the notebook works with.
env = gym.make('FrozenLake-v1')

  deprecation(
  deprecation(


# Model-based environment
We have access to the `P` dictionary, which describes the environment's transitions.

For each state *(s)* and each possible action *(a)*, it gives the probability of reaching each next state *(s')*, the reward obtained for that transition *(r)*, and whether or not the transition ends the episode *(done).*

***P[s][a] → list of (p(s'|s,a), s', r(s,a,s'), done)***

## States
4 * 4 tiles = 16 states

## Actions

0. LEFT

1. DOWN

2. RIGHT

3. UP


In [8]:
"""
Here, from state 6 and taking action 2,
we have 1/3 chance to get to state 10, 7 or 2, giving us 0 reward.
The environment stops if we end up in state 7 because it is a hole.
"""
# Transitions for state 6, action 2, as (probability, next state, reward, done) tuples.
env.P[6][2]

[(0.3333333333333333, 10, 0.0, False),
 (0.3333333333333333, 7, 0.0, True),
 (0.3333333333333333, 2, 0.0, False)]

In [9]:
# Sizes of the state space (nS) and the action space (nA), read from the environment
nS, nA = env.observation_space.n, env.action_space.n

# Policy Iteration

In [10]:
import numpy as np

In [11]:
def compute_q_value_for_s_a(env, V, s, a, gamma):
    """Return the expected one-step return of taking action ``a`` in state ``s``.

    Sums, over every transition listed in ``env.P[s][a]``, the transition
    probability times (reward + discounted value of the next state).

    Args:
        env: Object exposing ``P`` such that ``env.P[s][a]`` is an iterable of
            ``(probability, next_state, reward, done)`` tuples.
        V: Current value estimate, indexable by state.
        s: State to act from.
        a: Action to take.
        gamma: Discount factor applied to the next state's value.

    Returns:
        The Q-value of ``(s, a)``; same shape as a single entry ``V[s]``.
    """
    q = 0
    for p_sPrime, sPrime, r_sasPrime, done in env.P[s][a]:
        # A transition flagged ``done`` ends the episode, so nothing follows it:
        # mask out the bootstrap term instead of trusting V[sPrime] to be 0.
        continues = 0.0 if done else 1.0
        q += p_sPrime * (r_sasPrime + gamma * continues * V[sPrime])
    return q

In [12]:
def evaluate_policy(env, pi, V, gamma, theta):
    """Iteratively evaluate the policy ``pi``, starting from the estimate ``V``.

    Repeatedly sweeps over all states, overwriting each state's value with the
    policy-weighted sum of its action values. Values updated earlier in a sweep
    are used by later states of the same sweep. Sweeping stops once the largest
    change within a sweep is below ``theta``.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and the transition table read by ``compute_q_value_for_s_a``.
        pi: Policy indexed as ``pi[s][a]``, the probability of picking action
            ``a`` in state ``s``.
        V: Starting value estimate, indexable by state. It is copied, not
            modified.
        gamma: Discount factor forwarded to the Q-value computation.
        theta: Convergence threshold on the largest per-sweep value change.

    Returns:
        A tuple ``(V_updated, improved)``. ``V_updated`` is the new value
        estimate; ``improved`` is ``False`` only when ``V_updated`` is
        element-wise identical to the input ``V``.
    """
    # Size the sweep from the environment that was passed in, not from notebook
    # globals, so the function stays correct for any environment.
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    V_updated = np.copy(V)

    while True:
        delta = 0
        for s in range(n_states):
            V_new = 0
            for a in range(n_actions):
                prob_a = pi[s][a]
                q_s_a = compute_q_value_for_s_a(env, V_updated, s, a, gamma)
                V_new += prob_a * q_s_a
            # Track the largest change seen during this sweep.
            delta = max(delta, np.abs(V_new - V_updated[s]))
            V_updated[s] = V_new
        if delta < theta:
            break

    # Exact comparison: any change at all, however small, counts as improvement.
    improved = not np.array_equal(V, V_updated)

    return V_updated, improved

In [13]:
def improve_policy(env, pi, V, gamma):
    """Make ``pi`` greedy with respect to the value estimate ``V``.

    For every state, computes the Q-value of each action and rewrites that
    state's row of ``pi`` as a one-hot vector on the best action.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and the transition table read by ``compute_q_value_for_s_a``.
        pi: Policy array with one row per state and one column per action.
            It is modified in place.
        V: Value estimate, indexable by state.
        gamma: Discount factor forwarded to the Q-value computation.

    Returns:
        The same ``pi`` object, after the in-place update.
    """
    # Size the loops from the environment that was passed in, not from notebook
    # globals, so the function stays correct for any environment.
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Row k of the identity matrix is the deterministic choice of action k.
    # Built once here instead of once per state.
    one_hot = np.eye(n_actions)

    for s in range(n_states):
        q_s = np.zeros([n_actions, 1])

        for a in range(n_actions):
            q_s[a] = compute_q_value_for_s_a(env, V, s, a, gamma)

        # argmax returns the first maximal index, so ties go to the lowest action id.
        best_a = np.argmax(q_s)
        pi[s] = one_hot[best_a]

    return pi

In [14]:
# Start from the uniform random policy: every action equally likely in every state.
# The probability is derived from nA so it stays a valid distribution for any action count.
pi = np.full((nS, nA), 1.0 / nA)
# Initial value estimate: zero for every state, stored as a column vector.
V = np.zeros([nS, 1])

gamma = 0.99    # discount factor
theta = 0.0001  # policy evaluation stops once a full sweep changes every value by less than this

In [15]:
# Alternate policy evaluation and greedy improvement until an evaluation
# pass leaves the value estimate completely unchanged.
i = 0
improved = True
while improved:
    i += 1
    V, improved = evaluate_policy(env, pi, V, gamma, theta)
    pi = improve_policy(env, pi, V, gamma)

print(f"Completed after {i} iterations.")

Completed after 586 iterations.


In [16]:
# Show the final policy: row s holds the action probabilities for state s.
pi

array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.]])