In [1]:
import gym

# Build the 8x8 FrozenLake environment. `.env` takes the inner environment of
# the object returned by gym.make (presumably to drop the TimeLimit wrapper so
# episodes only end on a hole or the goal -- TODO confirm).
env = gym.make("FrozenLake8x8-v0").env

In [None]:
"""
Documentation from https://gym.openai.com/envs/FrozenLake-v0/
SFFF       (S: starting point, safe)
FHFH       (F: frozen surface, safe)
FFFH       (H: hole, fall to your doom)
HFFG       (G: goal, where the frisbee is located)
"""

In [2]:
# Reset the environment to its starting state and draw the grid;
# the highlighted cell in the rendering is the agent's current position.
env.reset()
env.render()


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


In [3]:
# The action space holds the possible moves: ["Left", "Down", "Right", "Up"] (https://github.com/openai/gym/blob/master/gym/envs/toy_text/frozen_lake.py)
print("Action Space {}".format(env.action_space))
# The states are the possible positions on the 8x8 grid, i.e. 64 discrete states
print("State Space {}".format(env.observation_space))

Action Space Discrete(4)
State Space Discrete(64)


In [4]:
# Take one step with action 1 ("Down"). step() returns the new state, the
# reward, whether the episode is over, and an info dict. Because the ice is
# slippery, the resulting state is not necessarily the cell below the agent.
state, reward, done, info = env.step(1)
env.render()
print(state, reward, done, info)

  (Down)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
1 0.0 False {'prob': 0.3333333333333333}


In [5]:
# Transition table P, indexed by state: env.P[state] is a dict of the form
# {action: [(probability, nextstate, reward, done), ...]}
# Each outcome's probability can be lower than one: "the ice is slippery, so you won't always move in the direction you intend."
env.P[4]

{0: [(0.3333333333333333, 4, 0.0, False),
  (0.3333333333333333, 3, 0.0, False),
  (0.3333333333333333, 12, 0.0, False)],
 1: [(0.3333333333333333, 3, 0.0, False),
  (0.3333333333333333, 12, 0.0, False),
  (0.3333333333333333, 5, 0.0, False)],
 2: [(0.3333333333333333, 12, 0.0, False),
  (0.3333333333333333, 5, 0.0, False),
  (0.3333333333333333, 4, 0.0, False)],
 3: [(0.3333333333333333, 5, 0.0, False),
  (0.3333333333333333, 4, 0.0, False),
  (0.3333333333333333, 3, 0.0, False)]}

## Run random moves and print animated result

In [6]:
# Play one episode with uniformly random actions and record every step
# (rendered frame plus the transition data) so it can be replayed later.
env.reset()
imgs = []
done = False

while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # Render first, then store the transition alongside the frame.
    snapshot = dict(
        frame=env.render(mode='ansi'),
        state=state,
        action=action,
        reward=reward,
        done=done,
    )
    imgs.append(snapshot)

In [9]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames, sleep_time=0.2):
    """Replay a recorded episode in the cell output, one frame at a time.

    Args:
        frames: iterable of dicts with the keys 'frame', 'state', 'action',
            'reward' and 'done' (one dict per recorded step).
        sleep_time: seconds to pause after printing each frame.
    """
    # Iterate over the `frames` argument; the previous version read the global
    # `imgs` and silently ignored whatever the caller passed in.
    for step, snapshot in enumerate(frames):
        # wait=True delays clearing until new output is ready, which avoids flicker.
        clear_output(wait=True)
        print(snapshot['frame'])
        print("Step: {}".format(step))
        print("State: {}".format(snapshot['state']))
        print("Action: {}".format(snapshot['action']))
        print("Reward: {}".format(snapshot['reward']))
        print("Done: {}".format(snapshot['done']))
        sleep(sleep_time)

In [10]:
# Replay the recorded episode frame by frame (default pause between frames).
print_frames(imgs)

  (Right)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFF[41mH[0mFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

Step: 20
State: 35
Action: 2
Reward: 0.0
Done: True


## Create q-table

In [11]:
import numpy as np
# Q-table: one row per state and one column per action, all values start at zero.
n_states = env.observation_space.n
n_actions = env.action_space.n
q_table = np.zeros((n_states, n_actions))

## Fill q-table

In [12]:
import random

# Q-learning hyper-parameters
alpha = 0.1    # weight given to the new estimate in each update
gamma = 0.9    # discount applied to the best value of the next state
epsilon = 0.4  # probability of picking a random action instead of the greedy one

for i in range(200000):
    state = env.reset()

    while True:
        # Epsilon-greedy action selection.
        explore = random.uniform(0, 1) < epsilon
        action = env.action_space.sample() if explore else np.argmax(q_table[state])

        next_state, reward, done, info = env.step(action)

        # Reshape the environment reward to add penalties:
        #   -10  when the episode ended without the reward of 1,
        #   -0.5 for any other zero-reward step,
        #   20   otherwise.
        reward = -10 if (done and reward != 1) else (-0.5 if reward == 0 else 20)

        # Blend the previous estimate with the newly observed target.
        td_target = reward + gamma * np.max(q_table[next_state])
        q_table[state, action] = (1 - alpha) * q_table[state, action] + alpha * td_target

        state = next_state
        if done:
            break

    # Report progress every 100 episodes, overwriting the previous message.
    if not i % 100:
        clear_output(wait=True)
        print(f"Try n°{i}")


Try n°199900


In [13]:
# Play one episode following the learned Q-table greedily (no exploration)
# and record every step so it can be replayed with print_frames.
#
# Keep the state returned by reset(): the previous version discarded it, so the
# first action was chosen from the stale `state` left by the preceding cell
# instead of from the actual starting position.
state = env.reset()

imgs = []

done = False

while not done:
    # Always take the action with the highest learned value for the current state.
    action = np.argmax(q_table[state])
    state, reward, done, info = env.step(action)

    imgs.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward,
        'done': done
        }
    )

In [18]:
# Replay the recorded episode with a 0.1 s pause between frames.
print_frames(imgs, 0.1)

  (Right)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m

Step: 51
State: 63
Action: 2
Reward: 1.0
Done: True
