In [58]:
import gym
import numpy as np
import time
from IPython import display

In [59]:
# Create the FrozenLake8x8-v0 environment used by every cell below; the
# alternative 'FrozenLake-v0' id is kept commented out for quick switching.
# env = gym.make('FrozenLake-v0')
env = gym.make('FrozenLake8x8-v0')

In [60]:
# Inspect the transition model for state 0, action 0: a list of
# (probability, next_state, reward, done) outcomes.
env.P[0][0]

[(0.3333333333333333, 0, 0.0, False),
 (0.3333333333333333, 0, 0.0, False),
 (0.3333333333333333, 8, 0.0, False)]

In [61]:
# Number of discrete states (the planning code iterates over range(n)).
env.observation_space.n

64

In [62]:
# Number of discrete actions available in each state.
env.action_space.n

4

In [63]:
def value_iteration(env, max_iters, gamma):
    """Estimate state values by repeated Bellman optimality backups.

    Args:
        env: Object exposing ``observation_space.n``, ``action_space.n`` and a
            transition table ``P`` where ``P[state][action]`` is an iterable
            of ``(prob, next_state, reward, done)`` tuples.
        max_iters: Upper bound on the number of full sweeps over the states.
        gamma: Discount factor applied to the next state's value.

    Returns:
        A 1-D float array with one value per state.

    Side effects:
        Prints ``Converged at {i}-th iteration.`` when two consecutive sweeps
        agree within NumPy's default closeness tolerances, then stops early.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    v_values = np.zeros(n_states)

    for i in range(max_iters):
        # Snapshot so every backup in this sweep reads the previous sweep's
        # values rather than ones updated earlier in the same sweep.
        prev_v_values = v_values.copy()

        for state in range(n_states):
            action_returns = []
            for action in range(n_actions):
                # Expected one-step return; the `done` flag is not used.
                expected = 0
                for prob, next_state, reward, _done in env.P[state][action]:
                    expected += prob * (reward + gamma * prev_v_values[next_state])
                action_returns.append(expected)

            # The state's value is the return of its best action.
            v_values[state] = np.max(action_returns)

        # Stop as soon as a sweep no longer changes the values.
        if np.allclose(v_values, prev_v_values):
            print(f'Converged at {i}-th iteration.')
            break

    return v_values

In [64]:
# Run value iteration with discount 0.9, capped at 1000 sweeps.
v_values = value_iteration(env, max_iters=1000, gamma=0.9)

Converged at 117-th iteration.


In [65]:
def policy_extraction(env, v_values, gamma=0.9):
    """Derive the greedy policy implied by a table of state values.

    Args:
        env: Object exposing ``observation_space.n``, ``action_space.n`` and a
            transition table ``P`` where ``P[state][action]`` is an iterable
            of ``(prob, next_state, reward, done)`` tuples.
        v_values: Sequence indexable by state giving each state's value.
        gamma: Discount factor applied to the next state's value.

    Returns:
        A 1-D integer array holding, for each state, the index of the action
        with the highest expected one-step return (ties go to the lowest
        action index, per ``np.argmax``).
    """
    # Use the builtin ``int`` as dtype: the ``np.int`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    policy = np.zeros(env.observation_space.n, dtype=int)

    for state in range(env.observation_space.n):
        q_values = []
        # Expected return of each action from this state.
        for action in range(env.action_space.n):
            q_value = 0
            # Weight every possible outcome by its probability.
            for prob, next_state, reward, done in env.P[state][action]:
                q_value += prob * (reward + gamma * v_values[next_state])
            q_values.append(q_value)

        # Act greedily with respect to the q-values.
        policy[state] = np.argmax(q_values)

    return policy

In [66]:
# Extract the greedy policy from the computed state values, using the same
# discount factor (0.9) that was passed to value_iteration.
policy = policy_extraction(env, v_values, gamma=0.9)

In [67]:
# Show the selected action index for every state.
policy

array([3, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 1, 3, 3, 0, 0, 2, 3,
       2, 1, 3, 3, 3, 1, 0, 0, 2, 1, 3, 3, 0, 0, 2, 1, 3, 2, 0, 0, 0, 1,
       3, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 2, 0, 1, 0, 0, 1, 1, 1, 0])

In [68]:
def play(env, policy, render=True, delay=0.2):
    """Run one episode following ``policy`` and return the total reward.

    Args:
        env: Environment whose ``reset()`` returns the initial state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        policy: Sequence indexable by state giving the action to take.
        render: When True (the default, matching the original behaviour), the
            episode is animated: an initial one-second pause, then each step
            prints its number, renders the environment and clears the
            previous frame. When False, no output, rendering or sleeping
            happens, which makes batch evaluation fast.
        delay: Seconds to pause after rendering each step; only used when
            ``render`` is True.

    Returns:
        The sum of the rewards collected until ``done`` is reported.
    """
    state = env.reset()
    total_reward = 0
    done = False
    steps = 0

    if render:
        time.sleep(1)
        display.clear_output(wait=True)

    while not done:
        action = policy[state]
        state, reward, done, info = env.step(action)
        total_reward += reward
        steps += 1

        if render:
            print(f'Step {steps}')
            env.render()
            time.sleep(delay)
            # Keep the final frame on screen; clear only between steps.
            if not done:
                display.clear_output(wait=True)

    return total_reward

In [69]:
# Play one rendered episode with the extracted policy; the cell's value is
# the episode's total reward.
play(env, policy)

Step 63
  (Right)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m


1.0

In [70]:
def play_multiple_times(env, policy, max_episodes):
    """Play ``max_episodes`` episodes and report how many were successful.

    An episode counts as a success when ``play`` returns a total reward
    greater than zero. The tally is printed; nothing is returned.
    """
    # Episodes run one after another, in order; each contributes 1 on success.
    success = sum(
        1 for _ in range(max_episodes) if play(env, policy) > 0
    )

    print(f'Number of successes: {success}/{max_episodes}')

In [71]:
# play_multiple_times(env, policy, 1000)