In [1]:
import gym
import numpy as np
import time
from IPython.display import clear_output

In [2]:
# --- Training schedule ---
# Number of training episodes, and the hard cap on steps inside one episode.
NUM_EPISODES = 10000
MAX_STEPS_PER_EPISODE = 1_000

# --- Q-learning update ---
# Weight given to the new estimate versus the old Q-value on every update.
LEARNING_RATE = 0.1
# Multiplier applied to the best next-state Q-value when bootstrapping.
DISCOUNT_RATE = 0.99

# --- Epsilon-greedy exploration ---
# The exploration rate starts at MAX and is decayed exponentially (per episode,
# at EXPLORATION_DECAY_RATE) towards MIN.
MAX_EXPLORATION_RATE = 1
MIN_EXPLORATION_RATE = 0.01
EXPLORATION_DECAY_RATE = 0.001

In [3]:
# Environment rendered as ANSI text so that frames can be shown with print().
env = gym.make('FrozenLake-v1', render_mode='ansi')

# Tabular Q-function: one row per state, one column per action, all zeros.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n
q_table = np.zeros(shape=(state_space_size, action_space_size))

# Current epsilon for epsilon-greedy action selection; starts at its maximum.
exploration_rate = MAX_EXPLORATION_RATE

In [8]:
def run_q_learning() -> list[float]:
  """Train the module-level ``q_table`` with tabular Q-learning.

  Runs ``NUM_EPISODES`` episodes on the module-level ``env``. Actions are
  chosen epsilon-greedily using the module-level ``exploration_rate``, and the
  one-step Q-learning update is applied after every transition. After each
  episode ``exploration_rate`` is recomputed as an exponential decay from
  ``MAX_EXPLORATION_RATE`` towards ``MIN_EXPLORATION_RATE``.

  Side effects:
    Mutates ``q_table`` in place and rebinds the global ``exploration_rate``.

  Returns:
    One entry per episode: the sum of the rewards collected in that episode.
  """
  global exploration_rate
  episodes_reward: list[float] = []
  for episode in range(NUM_EPISODES):
    # reset() returns (initial observation, info); only the observation is used.
    state, _info = env.reset()
    rewards_current_episode = 0
    for _step in range(MAX_STEPS_PER_EPISODE):
      # Exploit the current estimates when the draw exceeds epsilon, else explore.
      if np.random.uniform(0, 1) > exploration_rate:
        action = np.argmax(q_table[state, :])
      else:
        action = env.action_space.sample()

      new_state, reward, done, truncated, _info = env.step(action)

      # Blend the old estimate with the bootstrapped one-step target.
      td_target = reward + DISCOUNT_RATE * np.max(q_table[new_state, :])
      q_table[state, action] = (
        q_table[state, action] * (1 - LEARNING_RATE) + LEARNING_RATE * td_target
      )

      state = new_state
      rewards_current_episode += reward

      # The episode is over when the environment terminates OR when it is
      # truncated (e.g. by a step limit); stepping past either is invalid.
      if done or truncated:
        break

    exploration_rate = MIN_EXPLORATION_RATE + \
      (MAX_EXPLORATION_RATE - MIN_EXPLORATION_RATE) * np.exp(-EXPLORATION_DECAY_RATE * episode)
    episodes_reward.append(rewards_current_episode)

  return episodes_reward

In [5]:
def run_game(num_episodes: int = 3):
  """Replay episodes with the greedy policy from ``q_table`` and render them.

  For each episode the module-level ``env`` is reset and then stepped with the
  action that has the highest Q-value for the current state. Every frame from
  ``env.render()`` is printed after clearing the previous cell output, with
  short sleeps so that the animation is watchable.

  Args:
    num_episodes: How many episodes to replay. Defaults to 3.
  """
  for episode in range(num_episodes):
    # reset() returns (initial observation, info); only the observation is used.
    state, _info = env.reset()
    print('Episode', episode + 1)
    time.sleep(1)
    for _step in range(MAX_STEPS_PER_EPISODE):
      clear_output(wait=True)
      print(env.render())
      time.sleep(.3)

      action = np.argmax(q_table[state, :])
      new_state, reward, done, truncated, _info = env.step(action)

      # Termination is checked before truncation: if the goal is reached on the
      # very step the time limit expires, both flags are set and the episode
      # must still be reported as a success.
      if done:
        clear_output(wait=True)
        print(env.render())
        if reward == 1:
          print('The goal is reached')
        else:
          print('The goal is failed')
        time.sleep(3)
        clear_output(wait=True)
        break

      # Ran out of time without reaching a terminal state.
      if truncated:
        print('The goal is failed')
        break

      state = new_state

In [6]:
def show_score(episodes_reward: list[float], chunk_size: int = 1000):
  """Print the average reward per block of episodes, then the Q-table.

  The rewards are cut into consecutive blocks of ``chunk_size`` episodes based
  on the length of ``episodes_reward`` itself; a shorter final block is
  allowed. Each printed line holds the index of the last episode in the block
  and the mean reward over that block. The module-level ``q_table`` is printed
  afterwards.

  Args:
    episodes_reward: Total reward of each episode, in training order.
    chunk_size: Number of episodes averaged per printed line. Defaults to 1000.

  Raises:
    ValueError: If ``chunk_size`` is not positive.
  """
  if chunk_size <= 0:
    raise ValueError('chunk_size must be a positive integer')

  rewards = np.asarray(episodes_reward, dtype=float)
  for start in range(0, rewards.size, chunk_size):
    chunk = rewards[start:start + chunk_size]
    # Sum first, divide once: avoids the rounding noise of adding many
    # pre-divided terms (e.g. 0.04500000000000003 instead of 0.045).
    print(start + chunk.size, str(float(chunk.mean())))
  print('\nq_tabble:')
  print(q_table)

In [9]:
# Train the agent, keep the per-episode rewards, then print the score summary
# and the learned q_table.
episodes_reward = run_q_learning()
show_score(episodes_reward)

1000 0.04500000000000003
2000 0.18500000000000014
3000 0.4040000000000003
4000 0.5810000000000004
5000 0.7140000000000005
6000 0.7260000000000005
7000 0.7810000000000006
8000 0.7630000000000006
9000 0.7440000000000005
10000 0.7300000000000005

q_tabble:
[[0.44480623 0.44410118 0.44397135 0.44388917]
 [0.29842049 0.22497557 0.31134099 0.42998582]
 [0.40460213 0.41073472 0.39858591 0.41879196]
 [0.26056392 0.31724715 0.21872503 0.41409896]
 [0.45458627 0.37479793 0.43156364 0.35981271]
 [0.         0.         0.         0.        ]
 [0.32835878 0.15183088 0.1750311  0.13329   ]
 [0.         0.         0.         0.        ]
 [0.40084182 0.35989647 0.35737617 0.47854796]
 [0.39420656 0.53519484 0.38353808 0.4023924 ]
 [0.52739983 0.35525754 0.35967594 0.37845706]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.36341002 0.51800079 0.63148473 0.55408558]
 [0.71457    0.84612721 0.71651533 0.70365685]
 [0.         0.         0.         0.    

In [12]:
# Replay episodes greedily from the current q_table, printing each rendered frame.
run_game()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m

The goal is reached


In [None]:
# Close the environment now that training and replay are finished.
env.close()