In [5]:
import gym
import numpy as np
import random
import time
from IPython.display import clear_output

In [6]:
# Create the FrozenLake environment used by every later cell.
env = gym.make("FrozenLake-v0")

In [11]:
# Sizes of the discrete state and action spaces, read from the environment.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

In [12]:
# Q-table: one row per state, one column per action, all values start at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))

In [21]:
# Training length: number of episodes and the step cap inside each episode.
max_steps_per_episode = 100
num_episodes = 10_000

# Q-learning update parameters.
discount_rate = 0.99
learning_rate = 0.1

# Epsilon-greedy schedule: the exploration rate starts at the maximum and is
# decayed exponentially per episode towards the minimum by the training loop.
max_exploration_rate = 1
min_exploration_rate = 0.001
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate

In [22]:
# Q-learning training loop.
#
# Start every run of this cell from a clean slate. Without these resets,
# re-executing the cell would keep training the table left by the previous run
# and would begin with the fully decayed exploration rate, while the reward
# history below is reset, making the reported averages misleading.
q_table = np.zeros_like(q_table)
exploration_rate = max_exploration_rate
rewards_all_episodes = []

for episode in range(num_episodes):
  state = env.reset()
  done = False
  rewards_current_episode = 0

  for step in range(max_steps_per_episode):

    # Exploration-exploitation trade-off: draw a uniform number in [0, 1];
    # exploit the current best-known action when it exceeds the exploration
    # rate, otherwise take a random action.
    exploration_rate_threshold = random.uniform(0,1)
    if exploration_rate_threshold > exploration_rate:
      action = np.argmax(q_table[state,:])
    else:
      action = env.action_space.sample()
    new_state, reward, done, info = env.step(action)

    # Update Q(s, a): blend the old estimate with the learned value
    # (immediate reward plus discounted best value of the next state),
    # weighted by the learning rate.
    q_table[state, action] = q_table[state,action]*(1-learning_rate) + learning_rate *(reward + discount_rate* np.max(q_table[new_state,:]))

    state = new_state
    rewards_current_episode += reward
    if done:
      break

  # Exploration rate decay: applied once per episode (not per step), moving
  # exponentially from the maximum rate towards the minimum rate.
  exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate*episode)
  rewards_all_episodes.append(rewards_current_episode)

    

In [23]:
# Calculate and print the average reward per thousand episodes.
# Integer division keeps the section count an int, which is what np.split
# expects for "number of equal sections".
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes), num_episodes // 1000)
count = 1000
print('average reward per thousand episodes')
for r in rewards_per_thousand_episodes:
  # r.mean() averages the block in NumPy instead of summing a thousand
  # pre-divided values with the Python builtin, which accumulated rounding
  # noise (e.g. 0.04500000000000003 instead of 0.045).
  print(count, ':', str(r.mean()))
  count += 1000


# Print updated Q-table
print('Q-table')
print(q_table)


average reward per thousand episodes
1000 : 0.04500000000000003
2000 : 0.21100000000000016
3000 : 0.4020000000000003
4000 : 0.5900000000000004
5000 : 0.6890000000000005
6000 : 0.6710000000000005
7000 : 0.7080000000000005
8000 : 0.7290000000000005
9000 : 0.7260000000000005
10000 : 0.7450000000000006
Q-table
[[0.56506556 0.48012166 0.48104739 0.48680785]
 [0.41199821 0.35048139 0.28740836 0.5210417 ]
 [0.41082484 0.40819537 0.40962314 0.49702218]
 [0.32773965 0.33348709 0.2542463  0.47702364]
 [0.58417088 0.28823446 0.4044961  0.40863996]
 [0.         0.         0.         0.        ]
 [0.16716443 0.15085248 0.33267606 0.10216576]
 [0.         0.         0.         0.        ]
 [0.44701534 0.46475379 0.40021172 0.64821434]
 [0.31977207 0.71162708 0.43565127 0.29513377]
 [0.68099165 0.39100202 0.33551247 0.29803679]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.44353644 0.55508376 0.78549535 0.47330635]
 [0.72733433 0.8774639  0.69869269

In [26]:
# Replay three episodes with the learned greedy policy, redrawing the grid
# in place after every move.
for episode in range(3):
  state = env.reset()
  done = False
  print(f"episode{episode+1}")
  time.sleep(1)

  for step in range(max_steps_per_episode):
    # Show the current board, replacing the previous frame.
    clear_output(wait = True)
    env.render()
    time.sleep(0.3)

    # Always pick the highest-valued action for the current state.
    action = q_table[state,:].argmax()
    new_state, reward, done, info = env.step(action)

    if not done:
      state = new_state
      continue

    # Episode finished: draw the final board and report how it ended.
    clear_output(wait = True)
    env.render()
    print('the end' if reward == 1 else 'fall into a hole')
    time.sleep(0.3)
    clear_output(wait = True)
    break

env.close()


  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
the end


array([0, 1, 1, ..., 0, 0, 0])

In [25]:
# Display the Q-table as the cell's output value (rows are states, columns are actions).
q_table

array([[0.56506556, 0.48012166, 0.48104739, 0.48680785],
       [0.41199821, 0.35048139, 0.28740836, 0.5210417 ],
       [0.41082484, 0.40819537, 0.40962314, 0.49702218],
       [0.32773965, 0.33348709, 0.2542463 , 0.47702364],
       [0.58417088, 0.28823446, 0.4044961 , 0.40863996],
       [0.        , 0.        , 0.        , 0.        ],
       [0.16716443, 0.15085248, 0.33267606, 0.10216576],
       [0.        , 0.        , 0.        , 0.        ],
       [0.44701534, 0.46475379, 0.40021172, 0.64821434],
       [0.31977207, 0.71162708, 0.43565127, 0.29513377],
       [0.68099165, 0.39100202, 0.33551247, 0.29803679],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.44353644, 0.55508376, 0.78549535, 0.47330635],
       [0.72733433, 0.8774639 , 0.69869269, 0.77608202],
       [0.        , 0.        , 0.        , 0.        ]])