In [1]:
import numpy as np
import gym
import random

In [2]:
env = gym.make("FrozenLake-v0")

In [3]:
action_size = env.action_space.n
state_size = env.observation_space.n

In [4]:
qtable = np.zeros((state_size, action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


## Step 3: Create the hyperparameters ⚙️
- Here, we'll specify the hyperparameters

In [5]:
total_episodes = 10000        # Total episodes
learning_rate = 0.7           # Learning rate
max_steps = 99                # Max steps per episode
gamma = 0.95                  # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.005             # Exponential decay rate for exploration prob

## Step 4: The Q learning algorithm 🧠
- Now we implement the Q learning algorithm:
<img src="qtable_algo.png" alt="Q algo"/>

In [22]:
rewards = []
for episode in range(total_episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        #epsln greedy
        exp_exp_tradeoff = random.uniform(0, 1)

        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state,:])

        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])
        total_rewards += reward
        
        state = new_state
        
        # If done (if we're dead) : finish episode
        if done == True: 
            break
        
    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) 
    rewards.append(total_rewards)

print ("Score over time: " +  str(sum(rewards)/total_episodes))
print(qtable)

Score over time: 0.4787
[[2.72742580e-01 1.27848450e-01 1.22873512e-01 5.16037924e-02]
 [5.27903276e-02 2.66535868e-03 8.94477376e-05 1.24592669e-01]
 [1.57056775e-02 1.48950382e-02 2.11678421e-02 1.04147267e-01]
 [1.43219842e-02 1.33516320e-03 3.19049461e-03 8.57053708e-02]
 [3.04885157e-01 6.80459921e-02 7.76183517e-02 4.53572917e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.69328258e-01 2.56490430e-05 1.37598051e-03 1.45701969e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.26361020e-02 1.76019221e-01 2.29292409e-02 4.45198318e-01]
 [6.95565449e-02 6.79868413e-01 1.46928900e-01 7.44936402e-02]
 [7.27866122e-01 2.98787760e-03 4.82809151e-02 5.22869164e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.01583336e-02 2.42672989e-01 8.78891923e-01 5.90993041e-02]
 [4.46421602e-01 9.94378294e-01 3.73231518e-01 4.73709175e-01]
 [0.00000000e+00 0.00000000e+00

In [7]:
env.reset()

for episode in range(5):
    state = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])
        
        new_state, reward, done, info = env.step(action)
        
        if done:
            # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
            env.render()
            
            # We print the number of step it took.
            print("Number of steps", step)
            break
        state = new_state
env.close()

****************************************************
EPISODE  0
****************************************************
EPISODE  1
  (Up)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 44
****************************************************
EPISODE  2
  (Up)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 34
****************************************************
EPISODE  3
  (Up)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 14
****************************************************
EPISODE  4
  (Up)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 22


In [27]:
qtable = np.load('frozenlake50k.npy')

In [28]:
qtable

array([[1.19275933e-02, 1.06638167e-02, 1.29004182e-01, 1.23703189e-02],
       [5.66596921e-03, 2.19786584e-02, 5.56448552e-03, 1.52687872e-01],
       [7.53786289e-03, 1.06542242e-02, 8.59945709e-03, 1.57668984e-01],
       [1.12627468e-02, 2.20352946e-03, 1.54872854e-04, 1.08232483e-01],
       [9.74109065e-02, 4.31822782e-02, 4.18546005e-03, 7.49370363e-05],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.39567155e-01, 2.93500747e-06, 5.92865031e-07, 3.34320506e-08],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.42141545e-03, 2.57195005e-02, 5.01695509e-03, 5.25938181e-01],
       [8.91795382e-04, 6.62975536e-01, 1.35455681e-03, 7.86434344e-04],
       [8.46053756e-01, 1.35333901e-04, 3.62208966e-04, 2.44142604e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.89565276e-02, 2.41107199e-03, 7.47268178e

In [31]:
from IPython.display import clear_output
import time
r = 0
_ = env.reset()
for i in range(2):
    r = 0
    _ = env.reset()
    for step in range(max_steps):        
            # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])
        new_state, reward, done, info = env.step(action)    
        r += reward
        env.render()
        print(i,step)
        time.sleep(0.3)
        if done:
                # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
            env.render()  
            print(r)
            time.sleep(2)
#             time.sleep(0.1)
                # We print the number of step it took.
            print("Number of steps", step)
            break
        clear_output(wait=True)
        state = new_state
    

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
1 78
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
1.0
Number of steps 78


In [29]:
qtable

array([[3.14991432e-01, 1.27364740e-01, 1.00949066e-01, 1.00788304e-01],
       [1.47916905e-02, 1.15377539e-02, 2.06704590e-05, 1.07740871e-01],
       [1.04253938e-02, 1.18793387e-02, 9.20682999e-02, 7.20686071e-02],
       [1.00683982e-02, 7.79418609e-03, 9.48760992e-04, 6.99295907e-02],
       [5.41013124e-01, 3.50847315e-03, 4.74362771e-02, 2.33429349e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [8.22613501e-04, 6.77050935e-05, 2.86096554e-02, 2.70650396e-07],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.43960236e-02, 9.56940555e-03, 1.64824200e-02, 6.28427806e-01],
       [6.12204323e-02, 7.86358745e-01, 4.22077988e-02, 2.98263135e-02],
       [9.24924486e-01, 5.09685525e-03, 2.17881203e-02, 1.97336607e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.66685499e-02, 1.91222312e-01, 9.22602536e