In [1]:
import gym
# Build the 8x8 FrozenLake environment. `.env` takes the environment held
# inside the object returned by gym.make
# NOTE(review): presumably this drops the TimeLimit wrapper so episodes only
# end on a hole or the goal — confirm against the installed gym version.
env = gym.make("FrozenLake8x8-v0").env

In [None]:
"""
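Documentation from https://gym.openai.com/envs/FrozenLake-v0/
Legend, illustrated on the 4x4 example map (this notebook uses the 8x8 map,
which is built from the same tile types):
SFFF       (S: starting point, safe)
FHFH       (F: frozen surface, safe)
FFFH       (H: hole, fall to your doom)
HFFG       (G: goal, where the frisbee is located)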
"""

In [2]:
# Start a new episode and draw the grid.
# In the rendered output the highlighted tile is the agent's current position.
env.reset()
env.render()


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


In [3]:
# Actions: the agent picks one of 4 moves, ["Left", "Down", "Right", "Up"]
# (see https://github.com/openai/gym/blob/master/gym/envs/toy_text/frozen_lake.py)
print(f"Action Space {env.action_space}")
# States: one per tile of the 8x8 grid, i.e. 64 possible agent positions
print(f"State Space {env.observation_space}")

Action Space Discrete(4)
State Space Discrete(64)


In [4]:
# Take one step with action 1, i.e. "Down" in ["Left", "Down", "Right", "Up"].
action = 1
# step() returns the new state, the reward, the episode-finished flag and an
# info dict (here it carries the probability of the transition that occurred).
state, reward, done, info = env.step(action)
env.render()
print(state, reward, done, info)

  (Down)
SFFFFFFF
[41mF[0mFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
8 0.0 False {'prob': 0.3333333333333333}


In [5]:
# Transition/reward table called P, indexed as P[state][action]:
# {action: [(probability, nextstate, reward, done)]}
# The probability of a given outcome can be lower than one:
# as the ice is slippery, the move won't always be the intended one.
# State 19 is row 2, column 3 of the 8x8 grid (19 = 2 * 8 + 3), which is a
# hole ("H") in the map rendered earlier in this notebook.
env.P[19]

{0: [(1.0, 19, 0, True)],
 1: [(1.0, 19, 0, True)],
 2: [(1.0, 19, 0, True)],
 3: [(1.0, 19, 0, True)]}

## Run random moves and print animated result

In [6]:
# Play one full episode with uniformly random actions and record every step
# (rendered frame plus step data) so the episode can be replayed afterwards.
env.reset()
imgs = []
done = False

while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # One record per step; 'frame' holds the text rendering of the grid.
    imgs += [{
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward,
        'done': done,
    }]

In [7]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames, sleep_time=0.1):
    """Replay recorded episode steps one at a time in the cell output.

    Each step clears the previous output, prints the rendered frame and the
    step details, then pauses so the sequence reads as an animation.

    Args:
        frames: Iterable of dicts with the keys 'frame', 'state', 'action',
            'reward' and 'done' (one dict per recorded step).
        sleep_time: Seconds to pause after printing each step.
    """
    # Iterate over the argument rather than the notebook-global `imgs`, so the
    # function replays exactly the sequence the caller passed in.
    for i, img in enumerate(frames):
        # wait=True delays clearing until new output is ready (less flicker)
        clear_output(wait=True)
        print(img['frame'])
        print("Step: {}".format(i))
        print("State: {}".format(img['state']))
        print("Action: {}".format(img['action']))
        print("Reward: {}".format(img['reward']))
        print("Done: {}".format(img['done']))
        sleep(sleep_time)

In [8]:
# Replay the steps currently stored in `imgs` (the random-action episode
# recorded in the cell that fills `imgs` before this one).
print_frames(imgs)

  (Right)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
F[41mH[0mHFFFHF
FHFFHFHF
FFFHFFFG

Step: 7
State: 41
Action: 2
Reward: 0.0
Done: True


## Create q-table

In [9]:
import numpy as np
# Q-table: one row per state and one column per action, every entry starts at 0.
q_table = np.zeros(shape=(env.observation_space.n, env.action_space.n))

## Fill q-table

In [10]:
import random

# Q-learning hyperparameters
alpha = 0.1    # weight given to the new estimate in each update
gamma = 0.9    # discount applied to the best value of the next state
epsilon = 0.4  # probability of picking a random action instead of the best known one

for i in range(200000):
    state = env.reset()
    done = False

    while not done:
        # epsilon-greedy selection: exploit the best known action unless the
        # random draw falls below epsilon, in which case explore at random
        if random.uniform(0, 1) >= epsilon:
            action = np.argmax(q_table[state])
        else:
            action = env.action_space.sample()

        # play the chosen move
        next_state, reward, done, info = env.step(action)

        ## replace the environment reward with shaped penalties / bonus
        if reward == 0 and not done:
            # ordinary step onto a frozen tile: small cost per move
            reward = -0.5
        elif done and reward != 1:
            # episode finished without the goal reward: the agent fell in a hole
            reward = -10
        else:
            # goal reached
            reward = 20

        # value currently stored for this (state, action) pair
        current_expected_reward = q_table[state, action]

        # best value currently stored for the state we landed in
        next_state_max_reward = q_table[next_state].max()

        # blend the old value with the newly observed return estimate
        updated_reward = (
            (1 - alpha) * current_expected_reward
            + alpha * (reward + gamma * next_state_max_reward)
        )
        q_table[state, action] = updated_reward

        state = next_state

    # refresh the progress line every 100 episodes
    if not i % 100:
        clear_output(wait=True)
        print(f"Try n°{i}")


Try n°199900


In [11]:
# Play one episode following the learned q-table greedily and record every
# step so it can be replayed afterwards.
#
# Keep the observation returned by reset(): the first action must be chosen
# from the episode's start state. Without this assignment `state` would still
# hold whatever value an earlier cell left behind (e.g. the terminal state of
# the last training episode), so the first move would be picked from the
# wrong q-table row.
state = env.reset()

imgs = []

done = False

while not done:
    # always exploit: take the action with the highest stored value
    action = np.argmax(q_table[state])
    state, reward, done, info = env.step(action)
    
    imgs.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward,
        'done': done
        }
    )

In [16]:
# Replay the steps currently stored in `imgs` (the episode played by
# following the q-table in the cell that fills `imgs` before this one).
print_frames(imgs)

  (Right)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m

Step: 30
State: 63
Action: 2
Reward: 1.0
Done: True
