In [14]:
# -*- coding: utf-8 -*-
"""
Created on Thu May 14 17:21:04 2020

@author: KRC
"""

import numpy as np
import gym
import random
import time
from IPython.display import clear_output

env = gym.make("FrozenLake-v0",is_slippery=True) #Stochastic when slippery, else deterministic

action_space_size = env.action_space.n
state_space_size = env.observation_space.n

q_table = np.zeros((state_space_size, action_space_size))

num_episodes = 10000
max_steps_per_episode = 100

learning_rate = 0.1
discount_rate = 0.99

exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.002

rewards_all_episodes = []



### Training

In [15]:
for episode in range(num_episodes):
    state = env.reset()
    done = False
    rewards_current_episode = 0
    
    for step in range(max_steps_per_episode): 
      exploration_rate_threshold = random.uniform(0, 1)
      
      if exploration_rate_threshold > exploration_rate:
          action = np.argmax(q_table[state,:]) 
      else:
          action = env.action_space.sample()

      new_state, reward, done, info = env.step(action)

      q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
      learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

      state = new_state
      rewards_current_episode += reward 

      if done == True: 
          break

    exploration_rate = min_exploration_rate + \
    (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate*episode)

    rewards_all_episodes.append(rewards_current_episode)




### Q-Table

In [16]:
rewards_per_thosand_episodes = np.array_split(np.array(rewards_all_episodes),num_episodes/1000)
count = 1000    

for r in rewards_per_thosand_episodes:
    print(count, ":",str(sum(r/1000)))
    count += 1000

print("\n\n********Q-table********\n")
print(q_table)


1000 : 0.03000000000000002
2000 : 0.4230000000000003
3000 : 0.6490000000000005
4000 : 0.6740000000000005
5000 : 0.6460000000000005
6000 : 0.6760000000000005
7000 : 0.6650000000000005
8000 : 0.6850000000000005
9000 : 0.6740000000000005
10000 : 0.6550000000000005


********Q-table********

[[0.50330531 0.46973102 0.4625756  0.45643163]
 [0.2331282  0.16489422 0.16040824 0.42066935]
 [0.36789212 0.27160271 0.25504602 0.23417668]
 [0.03830253 0.05152301 0.03508598 0.30402531]
 [0.52823113 0.4215968  0.35141014 0.29822365]
 [0.         0.         0.         0.        ]
 [0.12203111 0.14814948 0.28970537 0.04970642]
 [0.         0.         0.         0.        ]
 [0.35130342 0.33265985 0.25837806 0.56202216]
 [0.39816056 0.63523028 0.49715926 0.39637035]
 [0.5895792  0.37127417 0.32074235 0.31021129]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.47880518 0.6385495  0.73190019 0.54054984]
 [0.67398595 0.86421289 0.65860101 0.7149813 ]
 [0.  

### Agent plays Frozen Lake by playing the best action from each state according to the Q-table

In [21]:
#remove the comment clear output to see action in real time

for episode in range(3):
    state = env.reset()
    done = False
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):        
        #clear_output(wait=True)
        env.render()
        print("Reward: {:.2f}".format(reward))
        print(info)
        time.sleep(0.3)

        action = np.argmax(q_table[state,:])        
        new_state, reward, done, info = env.step(action)
        
        if done:
            #clear_output(wait=True)
            env.render()
            print("Reward: {:.2f}".format(reward))
            print(info)
            if reward == 1:
                print("****You reached the goal!****")
                time.sleep(3)
            else:
                print("****You fell through a hole!****")
                time.sleep(3)
            #clear_output(wait=True)
            break            
            
        state = new_state
        
env.close()

*****EPISODE  1 *****





[41mS[0mFFF
FHFH
FFFH
HFFG
Reward: 1.00
{'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Left)
SFFF
[41mF[

  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Reward: 0.00
{'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
Reward: 0.00