In [1]:
"""
Model-free Control for OpenAI FrozenLake env (https://gym.openai.com/envs/FrozenLake-v0/)
Bolei Zhou for IERG6130 course example
"""
import gym,sys,numpy as np
from gym.envs.registration import register

In [2]:
# Run-time switches read by the later cells.

# True  -> use the deterministic (non-slippery) FrozenLake variant.
# False -> use the standard slippery FrozenLake.
no_slippery = True

# When True, the last evaluation episode is rendered step by step.
render_last = True

In [3]:
# -- hyperparameters --
# Training budget.
num_epis_train = 10000   # number of training episodes
num_iter = 100           # maximum number of steps taken per training episode

# Q-learning update settings.
learning_rate = 0.01     # step size applied to the temporal-difference error
discount = 0.8           # weight of the bootstrapped next-state value

# Exploration.
eps = 0.3                # probability of picking a uniformly random action

In [4]:
# Build the environment selected by the `no_slippery` flag and bind it to `env`.
if no_slippery:
    # the simplified frozen lake without slippery (so the transition is deterministic)
    try:
        register(
            id='FrozenLakeNotSlippery-v0',
            entry_point='gym.envs.toy_text:FrozenLakeEnv',
            kwargs={'map_name' : '4x4', 'is_slippery': False},
            max_episode_steps=1000,
            # NOTE(review): threshold/optimum look like they were copied from the
            # slippery task's registration -- confirm they are meant for this variant.
            reward_threshold=0.78, # optimum = .8196
        )
    except gym.error.Error:
        # Re-running this cell in a live kernel registers the same id twice;
        # Gym versions that reject duplicate ids raise gym.error.Error.
        # The existing registration is identical, so keep it and carry on.
        pass
    env = gym.make('FrozenLakeNotSlippery-v0')
else:
    # the standard slippery frozen lake
    env = gym.make('FrozenLake-v0')

In [5]:
# Tabular action-value estimates: one row per state, one column per action,
# all initialised to zero.
q_learning_table = np.zeros(
    shape=(env.observation_space.n, env.action_space.n)
)

In [6]:
# -- training the agent ----
# Tabular Q-learning with an epsilon-greedy behaviour policy: run
# `num_epis_train` episodes of at most `num_iter` steps each and update
# `q_learning_table` in place after every transition.
for epis in range(num_epis_train):
    state = env.reset()
    # The step counter is named `step_idx` (not `iter`) so that the builtin
    # iter() is not shadowed in the notebook's global namespace.
    for step_idx in range(num_iter):
        # With probability `eps` explore uniformly; otherwise act greedily
        # with respect to the current table.
        if np.random.uniform(0, 1) < eps:
            action = np.random.choice(env.action_space.n)
        else:
            action = np.argmax(q_learning_table[state, :])
        state_new, reward, done, _ = env.step(action)

        # Q-learning update: move Q(s, a) towards the one-step target
        # r + discount * max_a' Q(s', a') by a fraction `learning_rate`.
        td_target = reward + discount * np.max(q_learning_table[state_new, :])
        td_error = td_target - q_learning_table[state, action]
        q_learning_table[state, action] = q_learning_table[state, action] + learning_rate * td_error

        state = state_new
        if done:
            break

In [7]:
# Greedy action per state (row-wise argmax of the table) ...
print(q_learning_table.argmax(axis=1))
# ... followed by the table itself, rounded to 6 decimals.
print(q_learning_table.round(6))

[1 0 0 0 1 0 1 0 2 2 1 0 0 2 2 0]
[[2.34263e-01 3.27677e-01 1.30567e-01 2.37921e-01]
 [2.27968e-01 0.00000e+00 2.43700e-03 1.91240e-02]
 [3.23450e-02 9.41700e-03 0.00000e+00 9.00000e-05]
 [0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00]
 [2.76927e-01 4.09598e-01 0.00000e+00 2.20370e-01]
 [0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00]
 [0.00000e+00 4.86645e-01 0.00000e+00 1.17000e-03]
 [0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00]
 [3.50856e-01 0.00000e+00 5.11999e-01 2.81964e-01]
 [3.27977e-01 4.42630e-01 6.40000e-01 0.00000e+00]
 [4.05087e-01 8.00000e-01 0.00000e+00 2.42706e-01]
 [0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00]
 [0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00]
 [0.00000e+00 1.11932e-01 7.64365e-01 1.06050e-01]
 [4.31344e-01 6.78247e-01 1.00000e+00 5.19312e-01]
 [0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00]]


In [8]:
# Banner stating which environment variant the evaluation below runs on.
if no_slippery:
    print('---Frozenlake without slippery move-----')
else:
    print('---Standard frozenlake------------------')

---Frozenlake without slippery move-----


In [9]:
# Evaluate the learned table with a purely greedy policy (no random actions):
# run `num_episode` episodes of at most 100 steps and count in `rewards` how
# many of them finish with a final reward of 1.
num_episode = 500
rewards = 0
for epi in range(num_episode):
    # Rendering is limited to the final episode, and only if `render_last` is set.
    show_episode = (epi == num_episode - 1) and render_last
    s = env.reset()
    for _ in range(100):
        action = np.argmax(q_learning_table[s, :])
        state_new, reward_episode, done_episode, _ = env.step(action)
        if show_episode:
            env.render()
        s = state_new
        if not done_episode:
            continue
        # Episode finished: it counts as a success only when the last reward is 1.
        rewards += int(reward_episode == 1)
        break

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Right)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


In [10]:
# Fraction of evaluation episodes that ended in success, to 3 decimals.
print('---Success rate=%.3f' % (float(rewards) / num_episode))
print('-------------------------------')

---Success rate=1.000
-------------------------------
