In [1]:
import gym
import numpy as np
import random
from tqdm import tqdm
from gym.envs.registration import register

In [2]:
# Upper bound on the number of steps in one episode; handed to the registry
# below as max_episode_steps.
MAX_TIME_STEPS = 100

# Register a 4x4 FrozenLake variant with slipping switched off under its own id.
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    max_episode_steps=MAX_TIME_STEPS,
    kwargs=dict(map_name='4x4', is_slippery=False),
    # Author's note: optimum = .8196; changing this value seemed to have no
    # influence on the runs in this notebook.
    reward_threshold=0.8196,
)

In [3]:
# Build the environment from the id registered in the previous cell.
env = gym.make('FrozenLakeNotSlippery-v0')

In [4]:
# Zero-initialised table with one row per observation and one column per action.
q_table = np.zeros(
    shape=(env.observation_space.n, env.action_space.n),
)

In [5]:
# Q-learning hyperparameters.
LEARNING_RATE = 0.1    # step size applied to each temporal-difference correction
DISCOUNT_RATE = 0.9    # weight given to the best value of the next state
NUM_EPISODES = 10_000  # number of episodes the training loop plays

# Exploration probability. Only the commented-out get_action sketch reads it;
# the active training loop takes every action from exploration_action().
epsilon = 1

# def get_action(observation, epsilon):
#     rn = random.uniform(0,1)
#     if rn > epsilon:
#         # do exploitation
#         if np.where(q_table[observation] == 0)[0].sum() == env.action_space.n: action = env.action_space.sample()
#         else: action = np.argmax(q_table[observation])
#     else:
#         # do exploration
#         action = env.action_space.sample()
#     return action

def exploration_action():
    """Return an action sampled from the action space of the global ``env``."""
    draw = env.action_space.sample
    return draw()

def exploitation_action(observation):
    """Return the index of the largest entry in the global ``q_table`` row for ``observation``.

    Ties resolve the way ``np.argmax`` resolves them (first maximal index).
    """
    action_values = q_table[observation]
    greedy_action = np.argmax(action_values)
    return greedy_action

def update_q_table(q_table, cur_ob, new_ob, action, reward,
                   learning_rate=None, discount_rate=None):
    """Apply one Q-learning (Bellman) update to ``q_table`` in place.

    The entry ``q_table[cur_ob, action]`` is moved towards the target
    ``reward + discount_rate * max(q_table[new_ob])`` by a fraction
    ``learning_rate`` of the difference.

    Args:
        q_table: 2-D array indexed as ``[observation, action]``; modified in place.
        cur_ob: Observation the action was taken from.
        new_ob: Observation reached after taking the action.
        action: Index of the action that was taken.
        reward: Reward received for the transition.
        learning_rate: Optional step size; defaults to the module-level
            ``LEARNING_RATE`` when omitted.
        discount_rate: Optional discount factor; defaults to the module-level
            ``DISCOUNT_RATE`` when omitted.

    Returns:
        The same ``q_table`` object that was passed in.
    """
    # Resolve the globals at call time so later edits to the constants still apply.
    alpha = LEARNING_RATE if learning_rate is None else learning_rate
    gamma = DISCOUNT_RATE if discount_rate is None else discount_rate

    td_target = reward + gamma * np.max(q_table[new_ob])
    q_table[cur_ob, action] += alpha * (td_target - q_table[cur_ob, action])
    return q_table
    
# Train the Q-table: every action comes from exploration_action(), and each
# observed transition is fed to update_q_table().
# tqdm reads the length from range() itself, so no explicit total is needed.
for i_episode in tqdm(range(NUM_EPISODES)):
    observation = env.reset()
    # Use the same step cap the environment was registered with instead of a
    # second hard-coded 100.
    for t in range(MAX_TIME_STEPS):
        action = exploration_action()
        cur_ob = observation
        observation, reward, done, _info = env.step(action)
        update_q_table(q_table, cur_ob, observation, action, reward)
        if done:
            break
# env is deliberately not closed here: test() in a later cell resets and steps
# the same instance and closes it when it finishes.

100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [00:01<00:00, 9587.69it/s]


In [6]:
# Display the trained Q-table (rows are observations, columns are actions).
q_table

array([[0.53144084, 0.59048982, 0.5904898 , 0.53144083],
       [0.53144083, 0.        , 0.65609978, 0.59048979],
       [0.59048978, 0.72899977, 0.59048975, 0.65609977],
       [0.65609975, 0.        , 0.59048968, 0.59048972],
       [0.5904898 , 0.6560998 , 0.        , 0.53144083],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.8099998 , 0.        , 0.65609959],
       [0.        , 0.        , 0.        , 0.        ],
       [0.65609977, 0.        , 0.72899979, 0.5904898 ],
       [0.65609972, 0.80999958, 0.80999983, 0.        ],
       [0.72899946, 0.89999989, 0.        , 0.72899948],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.80999789, 0.89999967, 0.72899862],
       [0.80998849, 0.89999865, 0.99999996, 0.80999712],
       [0.        , 0.        , 0.        , 0.        ]])

In [7]:
def test():
    observation = env.reset()
    for t in range(100):
        env.render()
        action = exploitation_action(observation)
        cur_ob = observation
        observation, reward, done, info = env.step(action)
        if done : 
            print('Episode ended after {} timesteps...'.format(t+1))
            break

    env.close()

In [8]:
# Run one rendered episode that follows the greedy policy from q_table.
test()


SFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
FFFH
HFFG
Episode ended after 6 timesteps...
