In [2]:
import numpy as np
import gymnasium as gym
import random
import time
from IPython.display import clear_output

In [3]:
env = gym.make('FrozenLake-v1', render_mode='ansi')

## Create the Q-Table
We create the Q-Table and initialize all the Q-values to zero for each state-action pair.
The number of rows in the table is equivalent to the size of the **state space** (or **observation space**) and the number of columns is equivalent to the size of the **action space**.

In [5]:
action_space_size = env.action_space.n
observation_space_size = env.observation_space.n
print(action_space_size)
print(observation_space_size)

4
16


In [6]:
q_table = np.zeros((observation_space_size, action_space_size))
q_table
# Now, we have the q_table filled with zeros where the rows correspond to each state of the environment and each columns correspond to the possible actions

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

## Initialise the Q-Learning Parameters (Hyperparameters)

We need to initialise all the parameter used mainly:
- The number of maximum episodes we are going to train the agent on
- The maximum number of step per episode
- The learning rate (which is **alpha**)
- The discount rate (which is **gamma**)
- The exploration rate (which is **epsilon**)
- The minimum and maximum exploration rate (epsilon). Because epsilon is a number between zero and one, we set the it to be (0.01 <= eps <= 1)
- The exploration decay rate (**epsilon decay**). This corresponds to the decay we substract to the exploration rate each episodes.

In [7]:
num_episodes = 10000
max_steps_per_episode = 100

learning_rate = 0.1 # Alpha. We choose 0.1 as an arbitrary value
discount_rate = 0.99 # Gamma. We choose 0.99 as an arbitrary value

exploration_rate = 1 # Epsilon. First epsilon value is always 1 because at the start, we don't have any information useful in our Q-Table (all zeros)
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_rate_decay = 0.01 # We choose 0.001 to decay the exploration rate, again it is arbitrary.

## Q-Learning Algorithm
Here, we code the Q-Learning algorithm training loop which will update the Q-Table according thanks to the Q-Learning formula and the others hyperparameters.

Q-Value formula:
$$\begin{equation*} q^{new}\left( s,a\right) =\left( 1-\alpha \right) ~\underset{\text{old value} }{\underbrace{q\left( s,a\right) }\rule[-0.05in]{0in}{0.2in} \rule[-0.05in]{0in}{0.2in}\rule[-0.1in]{0in}{0.3in}}+\alpha \overset{\text{ learned value}}{\overbrace{\left(
                                                R_{t+1}+\gamma \max_{a^{^{\prime }}}q\left( s^{\prime },a^{\prime }\right) \right) }} \end{equation*}$$

This formula is used to update the Q-Values in the Q-Table.

In [8]:
rewards_all_episodes = [] # Just store the sequential list of reward for each episodes for further statistics calculation

for episode in range(num_episodes):
    state, _ = env.reset()
    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        # We choose to either explore the environment or exploit the known Q_value. This exploration depends on the current exploration rate which gives just the probability for the agent to explore or exploit.
        exploration_rate_threshold = random.uniform(0, 1) # Generate a random number between 0 and 1 (0 and 1 are included)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state]) # argmax return the indices of the maximum values
        else:
            action = env.action_space.sample()

        # We can now take the action
        new_state, reward, terminated, truncated, info = env.step(action)

        # After the action has been taken, we needs to update the Q-value for that state-action par in the Q-table.
        # This is done using the Q-Value formula.
        # Alpha = learning_rate
        # Gamma = discount_rate
        q_table[state, action] = (1 - learning_rate) * q_table[state, action] + learning_rate * (reward + discount_rate * np.max(q_table[new_state]))

        state = new_state #  The new state is now the current state
        rewards_current_episode += reward

        if terminated:
            break

    # After this episode, we need to decay the exploration rate using the Exponential decay formula:
    # https://en.wikipedia.org/wiki/Exponential_decay
    # The exponential decay formula just means that the exploration rate decreases or decays at a rate proportional to its current value.
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_rate_decay * episode)

    rewards_all_episodes.append(rewards_current_episode)

## Statistics calculation
At the end of the algorithm, we want to calculate the average reward per thousand episodes

In [9]:
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes), num_episodes/1000)
print("Average reward per thousand episodes:\n")
count = 1000
for r in rewards_per_thousand_episodes:
    print(count, ": ", str(sum(r/1000)))
    count += 1000

Average reward per thousand episodes:

1000 :  0.0
2000 :  0.0
3000 :  0.0
4000 :  0.0
5000 :  0.0
6000 :  0.0
7000 :  0.0
8000 :  0.0
9000 :  0.0
10000 :  0.0


## Test the trained agent on a game of Frozen Lake


In [11]:
test_env = gym.make('FrozenLake-v1', render_mode='human')
for episode in range(3):
    state, _ = test_env.reset()
    for step in range(max_steps_per_episode):
        clear_output(wait=True)
        print(test_env.render())
        time.sleep(0.3)

        # Take the best action from the Q-Table
        action = np.argmax(q_table[state])
        new_state, reward, terminated, truncated, info = test_env.step(action)

        if terminated:
            clear_output(wait=True)
            print(test_env.render())
            if reward == 1:
                print("The agent reached the goal!")
                time.sleep(3)
            else:
                print("The agent fell through a hole!")
                time.sleep(3)
                clear_output(wait=True)
            break

        state = new_state

test_env.close()

None
The agent fell through a hole!
