In [1]:
import numpy as np
import gym
import random
import time


import IPython
import IPython.display
from IPython.display import clear_output

In [4]:
# Create the FrozenLake environment that the cells below query for its
# state/action space sizes and step through during training and playback.
env = gym.make("FrozenLake-v0")

In [10]:
# Dimensions of the Q-table come straight from the environment:
# one row per discrete state, one column per discrete action.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# Every Q(state, action) estimate starts at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))

print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [12]:
# Game's Ruleset
# State Description     Reward
# S     Start             0
# F     Frozen Surface    0
# H     Hole - GameOver   0
# G     Goal - GameOver   1

In [33]:
# Training budget: number of episodes and the step cap inside each episode.
num_episodes = 10_000
max_steps_per_episode = 100

# Coefficients of the Q-value update: how strongly a new estimate replaces the
# old one, and how much the best next-state value is discounted.
learning_rate = 0.01
discount_rate = 0.99

# Epsilon-greedy schedule: the exploration rate starts at its maximum and is
# decayed exponentially towards the minimum by the training loop.
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate


In [34]:
rewards_all_episodes = []

# Start every run of this cell from scratch. The loop below updates q_table in
# place and overwrites exploration_rate, so without these resets a re-run would
# silently continue from the previous run's table with exploration already
# decayed, while rewards_all_episodes (reset above) would look like a fresh run.
q_table = np.zeros_like(q_table)
exploration_rate = max_exploration_rate

# NOTE(review): no random seed is set in the visible cells, so the learned
# table and the reported rewards vary from run to run.

# Q-learning algorithm
for episode in range(num_episodes):
    state = env.reset()  # start a new episode

    done = False  # overwritten by env.step; True once the episode has ended
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):

        # Exploration-exploitation trade-off: draw a number in [0, 1] and
        # exploit only when it exceeds the current exploration rate.
        exploration_rate_threshold = random.uniform(0, 1)

        if exploration_rate_threshold > exploration_rate:
            # Exploit: greedily take the highest-valued action for this state.
            action = np.argmax(q_table[state, :])
        else:
            # Explore: take a randomly sampled action.
            action = env.action_space.sample()

        # Apply the action and observe the resulting state and reward.
        new_state, reward, done, info = env.step(action)

        # Q-learning update: blend the old estimate with the observed reward
        # plus the discounted best value reachable from the new state.
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        state = new_state  # continue the episode from the state just reached
        rewards_current_episode += reward

        if done:  # the episode is over
            break

    # Exponentially decay the exploration rate towards its minimum as the
    # episode index grows.
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate*episode)

    # Record the total reward collected in this episode.
    rewards_all_episodes.append(rewards_current_episode)

# Report the average reward per block of 1000 episodes.
# The blocks are cut by slicing rather than np.split: np.split needs an equal
# division (it raises when the episode count is not a multiple of 1000) and was
# being handed a float section count. A shorter final block is averaged over
# its own length.
episodes_per_chunk = 1000
rewards_array = np.array(rewards_all_episodes)
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_chunk]
    for start in range(0, len(rewards_array), episodes_per_chunk)
]

count = 0  # number of episodes covered so far; used as the row label

print("AVG Reward per 1000s Eps (~Winning Rate)")
for r in rewards_per_thousand_episodes:
    count += len(r)
    # mean() divides once, avoiding the rounding noise that summing 1000
    # pre-divided values produced (e.g. 0.06300000000000004).
    print(count, ":", str(r.mean()))

# Print the Q-table
print("Q-Table")
print(q_table)

AVG Reward per 1000s Eps (~Winning Rate)
1000 : 0.06300000000000004
2000 : 0.21300000000000016
3000 : 0.4200000000000003
4000 : 0.6130000000000004
5000 : 0.6570000000000005
6000 : 0.6930000000000005
7000 : 0.6590000000000005
8000 : 0.6730000000000005
9000 : 0.7110000000000005
10000 : 0.6660000000000005
Q-Table
[[0.5357752  0.52250493 0.5239079  0.52218561]
 [0.32965524 0.35389848 0.3147426  0.50010783]
 [0.43062433 0.43036291 0.42452065 0.46778721]
 [0.30090278 0.34029703 0.30133387 0.45450189]
 [0.55105784 0.38803851 0.36786375 0.36740855]
 [0.         0.         0.         0.        ]
 [0.3064856  0.21187465 0.30578149 0.16162041]
 [0.         0.         0.         0.        ]
 [0.41572713 0.39799189 0.38257207 0.57885499]
 [0.46686112 0.61847357 0.46790598 0.42872343]
 [0.5757802  0.50032777 0.38518384 0.32920903]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.4653914  0.5715849  0.73263064 0.52783442]
 [0.74193296 0.86765785 0.8053

In [36]:
# Replay three episodes, always taking the highest-valued action from q_table,
# and redraw the environment in place after every step.
for episode in range(3):
    state = env.reset()
    done = False
    print("-----------------Episode(",episode,")-------------------")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Show the state the agent is about to act from.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        # Purely greedy policy: no exploration during playback.
        action = np.argmax(q_table[state, :])
        new_state, reward, done, info = env.step(action)

        if done:
            # Show the final frame and the outcome message, hold them briefly,
            # then wipe the output before the next episode.
            clear_output(wait=True)
            env.render()
            outcome = (
                "******You reached the Goal!!!*******"
                if reward == 1
                else "******You fell through the hole!!!*******"
            )
            print(outcome)
            time.sleep(3)
            clear_output(wait=True)
            break

        state = new_state

clear_output(wait=True)
print("The end")
env.close()
            
            

The end
