In [None]:
!pip install numpy
!pip install gym

import numpy as np
import gym
import random



The goal of this game is to go from the starting state (S) to the goal state (G) by walking only on frozen tiles (F) and avoiding holes (H). However, the ice is slippery, so you won't always move in the direction you intend (the environment is stochastic).

In [None]:
# Seed every random number generator used below so that a fresh
# "Restart & Run All" reproduces the same training run.
SEED = 42
random.seed(SEED)        # drives the explore/exploit draw (random.uniform)
np.random.seed(SEED)     # numpy's global generator

env = gym.make("FrozenLake-v0")  # selecting an environment (4x4 slippery grid)
env.seed(SEED)                   # randomness of the environment's transitions
env.action_space.seed(SEED)      # randomness of env.action_space.sample()

action_size = env.action_space.n
state_size = env.observation_space.n

# Create our Q-table: one row per state, one column per action
# (16x4 for this 4x4 map), initialised to zero.
qtable = np.zeros((state_size, action_size))
print(qtable)


[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [None]:
# --- Training schedule ---
total_episodes = 20000    # how many episodes the agent is trained for
max_steps = 99            # upper bound on steps taken inside one episode

# --- Q-learning update ---
learning_rate = 0.7       # weight given to each new temporal-difference error
gamma = 0.95              # discount applied to the best next-state value

# --- Epsilon-greedy exploration ---
max_epsilon = 1.0         # exploration probability at the start of training
min_epsilon = 0.01        # floor that the exploration probability decays towards
decay_rate = 0.005        # exponential decay rate, applied per episode
epsilon = max_epsilon     # current exploration rate; begins fully exploratory

In [None]:
# Q-learning training loop: plays `total_episodes` episodes with an
# epsilon-greedy policy, updates `qtable` after every step and records the
# summed reward of each episode in `rewards`.
rewards = []

for episode in range(total_episodes):
    # Start a new episode with cleared per-episode bookkeeping.
    state = env.reset()
    step, done, total_rewards = 0, False, 0

    for step in range(max_steps):
        # One uniform draw per step decides between exploiting and exploring.
        exp_exp_tradeoff = random.uniform(0, 1)
        exploit = exp_exp_tradeoff > epsilon

        if exploit:
            # Greedy: the action with the highest Q-value in the current state.
            action = np.argmax(qtable[state, :])
        else:
            # Explore: a randomly sampled action.
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Temporal-difference update:
        #   Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        best_next_value = np.max(qtable[new_state, :])
        td_error = reward + gamma * best_next_value - qtable[state, action]
        qtable[state, action] += learning_rate * td_error

        total_rewards += reward
        state = new_state

        # The environment reported the episode as finished.
        if done:
            break

    # Decay epsilon exponentially towards min_epsilon: less exploration is
    # needed as training progresses.
    exploration_span = max_epsilon - min_epsilon
    epsilon = min_epsilon + exploration_span * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)


# Average reward per episode over the whole training run, then the learned table.
print("Score over time: " + str(sum(rewards) / total_episodes))
print(qtable)

Score over time: 0.4965
[[1.52325107e-01 2.63747971e-02 2.41794324e-02 9.16267454e-02]
 [8.83244265e-03 4.26863631e-04 4.17398217e-03 7.90928077e-02]
 [7.26867495e-03 1.05946802e-02 1.22299848e-02 3.98676648e-02]
 [4.46339989e-03 3.10979017e-03 4.29980150e-03 3.10512658e-02]
 [2.29772571e-01 1.79279259e-02 8.10617061e-03 1.65250650e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.77113437e-04 2.43558141e-04 6.23321404e-02 4.16837622e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.51027527e-02 1.75680346e-01 5.46257610e-02 4.91842297e-01]
 [8.28924645e-03 6.96406732e-01 1.50638216e-02 7.70968664e-03]
 [5.44523671e-01 3.40919657e-03 3.47419719e-02 3.47792479e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.84523242e-01 6.75633205e-02 7.42142532e-01 6.50774891e-02]
 [1.10953258e-01 9.24253552e-01 2.86890502e-01 2.31024235e-01]
 [0.00000000e+00 0.00000000e+00

In [None]:
# Evaluate the learned policy: play 5 episodes acting greedily on `qtable`
# (no exploration, no learning) and report how each episode ended.
# Every episode resets the environment itself, so no extra reset is needed first.
for episode in range(5):
    state = env.reset()
    done = False
    print("--------------------------------------------------------------")
    print("EPISODE ", episode)

    for step in range(max_steps):
        # Always take the action with the highest learned Q-value.
        action = np.argmax(qtable[state, :])
        new_state, reward, done, info = env.step(action)

        if done:
            # Render only the final frame, showing where the agent ended up.
            env.render()
            # State 15 is the bottom-right tile of the 4x4 map, i.e. the goal (G).
            if new_state == 15:
                print("WE ARE THE CHAMPIONS")
            else:
                print("FATALITY")

            # `step` is a 0-based loop index, so the number of moves made is step + 1.
            print("Number of steps", step + 1)

            break
        state = new_state

# The environment is deliberately not closed here: later cells keep using `env`.

--------------------------------------------------------------
EPISODE  0
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
WE ARE THE CHAMPIONS
Number of steps 11
--------------------------------------------------------------
EPISODE  1
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
FATALITY
Number of steps 50
--------------------------------------------------------------
EPISODE  2
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
WE ARE THE CHAMPIONS
Number of steps 20
--------------------------------------------------------------
EPISODE  3
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
WE ARE THE CHAMPIONS
Number of steps 56
--------------------------------------------------------------
EPISODE  4
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
WE ARE THE CHAMPIONS
Number of steps 36


Trying the algorithm with a different learning rate

In [None]:
# --- Training schedule ---
total_episodes = 20000    # how many episodes the agent is trained for
max_steps = 99            # upper bound on steps taken inside one episode

# --- Q-learning update ---
learning_rate = 0.5       # weight given to each new temporal-difference error
gamma = 0.95              # discount applied to the best next-state value

# --- Epsilon-greedy exploration ---
max_epsilon = 1.0         # exploration probability at the start of training
min_epsilon = 0.01        # floor that the exploration probability decays towards
decay_rate = 0.005        # exponential decay rate, applied per episode
epsilon = max_epsilon     # current exploration rate; begins fully exploratory

In [None]:
# Q-learning training loop for the second experiment (different learning rate).
#
# Start from a fresh, all-zero Q-table. Without this reset the run would keep
# refining the table already trained by the previous experiment, so the two
# learning rates could not be compared fairly.
qtable = np.zeros((state_size, action_size))
rewards = []  # summed reward of each training episode

for episode in range(total_episodes):
    state = env.reset()
    done = False
    total_rewards = 0

    for step in range(max_steps):
        # One uniform draw per step decides between exploiting and exploring.
        exp_exp_tradeoff = random.uniform(0, 1)

        if exp_exp_tradeoff > epsilon:
            # Exploit: the action with the highest Q-value in the current state.
            action = np.argmax(qtable[state, :])
        else:
            # Explore: a randomly sampled action.
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Temporal-difference update:
        #   Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        qtable[state, action] = qtable[state, action] + learning_rate * (
            reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action]
        )

        total_rewards += reward
        state = new_state

        # The environment reported the episode as finished.
        if done:
            break

    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)


# Average reward per episode over the whole training run, then the learned table.
print("Score over time: " + str(sum(rewards) / total_episodes))
print(qtable)

Score over time: 0.5225
[[0.09644137 0.0741405  0.19985092 0.07449617]
 [0.01066086 0.08292338 0.02025739 0.15208378]
 [0.17990775 0.02987693 0.04371614 0.04332038]
 [0.0020905  0.03419568 0.01713777 0.0426111 ]
 [0.25061562 0.08440413 0.04324309 0.08592872]
 [0.         0.         0.         0.        ]
 [0.00701813 0.00689937 0.34375864 0.01407261]
 [0.         0.         0.         0.        ]
 [0.01156982 0.10289315 0.15081127 0.33689173]
 [0.14082371 0.50368531 0.17084432 0.13095966]
 [0.70770846 0.06172032 0.00804478 0.05001747]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.15243704 0.33754092 0.57491194 0.00806185]
 [0.31412203 0.88745135 0.41258074 0.42848649]
 [0.         0.         0.         0.        ]]


In [None]:
# Evaluate the learned policy: play 5 episodes acting greedily on `qtable`
# (no exploration, no learning) and report how each episode ended.
# Every episode resets the environment itself, so no extra reset is needed first.
for episode in range(5):
    state = env.reset()
    done = False
    print("--------------------------------------------------------------")
    print("EPISODE ", episode)

    for step in range(max_steps):
        # Always take the action with the highest learned Q-value.
        action = np.argmax(qtable[state, :])
        new_state, reward, done, info = env.step(action)

        if done:
            # Render only the final frame, showing where the agent ended up.
            env.render()
            # State 15 is the bottom-right tile of the 4x4 map, i.e. the goal (G).
            if new_state == 15:
                print("WE ARE THE CHAMPIONS")
            else:
                print("FATALITY")

            # `step` is a 0-based loop index, so the number of moves made is step + 1.
            print("Number of steps", step + 1)

            break
        state = new_state
env.close()

--------------------------------------------------------------
EPISODE  0
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
WE ARE THE CHAMPIONS
Number of steps 14
--------------------------------------------------------------
EPISODE  1
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
WE ARE THE CHAMPIONS
Number of steps 18
--------------------------------------------------------------
EPISODE  2
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
FATALITY
Number of steps 14
--------------------------------------------------------------
EPISODE  3
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
FATALITY
Number of steps 7
--------------------------------------------------------------
EPISODE  4
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
WE ARE THE CHAMPIONS
Number of steps 19
