**1. Try a number of games**

In [81]:
import gymnasium as gym
import time

# Play two rendered games with a uniformly random policy.
env = gym.make("FrozenLake-v1", render_mode="human")
i = 0  # number of games finished so far

try:
    observation, info = env.reset()

    while i < 2:
        # Random action; replace this line to plug in a real policy.
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)

        # Pause so each move can be followed on screen.
        time.sleep(1)

        # A game is over when the env reports termination or truncation.
        if terminated or truncated:
            observation, info = env.reset()
            i += 1
            print(f"Finish game number: {i}")
finally:
    # Close the environment even if the cell is interrupted or a step fails.
    env.close()

print("Finish all the games.")

Finish game number: 1
Finish game number: 2
Finish all the games.


**2. Training**

In [82]:
import numpy as np
import gymnasium as gym

In [83]:
# Environment used for training, created without a render mode.
env = gym.make("FrozenLake-v1")

# Sizes of the discrete observation and action spaces.
n_observations, n_actions = env.observation_space.n, env.action_space.n

In [84]:
# Q-table with one row per observation and one column per action,
# every entry starting at zero.
Q_table = np.zeros(shape=(n_observations, n_actions))
Q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

**Define the parameters and hyperparameters**

In [85]:
# Number of training episodes to run.
n_episodes = 10000

# Maximum number of steps allowed in a single episode.
max_iter_episode = 100

# Initial exploration probability (1 = always pick a random action).
exploration_proba = 1

# Decay rate of the exponential exploration schedule.
exploration_decreasing_decay = 0.001

# Lower bound on the exploration probability.
min_exploration_proba = 0.01

# Discount factor applied to future rewards.
gamma = 0.99

# Learning rate of the Q-table update.
lr = 0.1

To evaluate the agent's training, we record the total reward it collects from the environment in each episode.

In [86]:
# Total reward of each training episode, appended in episode order.
rewards_per_episode = []

**Main Loop**

In [87]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads the hyperparameters, `env`, `Q_table` and `rewards_per_episode`
# defined in the cells above; updates `Q_table` in place and appends one
# total reward per episode to `rewards_per_episode`.
for e in range(n_episodes):
    # Start a new episode; reset() returns (observation, info).
    current_state, _ = env.reset()
    done = False

    # Sum of the rewards collected during this episode.
    total_episode_reward = 0

    for i in range(max_iter_episode):
        # Epsilon-greedy action selection: with probability
        # `exploration_proba` take a random action (explore), otherwise
        # take the action with the highest current Q-value (exploit).
        if np.random.uniform(0, 1) < exploration_proba:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_table[current_state, :])

        # step() returns (observation, reward, terminated, truncated, info).
        next_state, reward, terminated, truncated, _ = env.step(action)

        # TD target: bootstrap from the best next-state value unless the
        # transition ended in a terminal state, which has no future return.
        if terminated:
            target = reward
        else:
            target = reward + gamma * np.max(Q_table[next_state, :])

        # Q-learning update: move the estimate towards the target by `lr`.
        Q_table[current_state, action] = (
            (1 - lr) * Q_table[current_state, action] + lr * target
        )
        total_episode_reward = total_episode_reward + reward

        # The episode is over on termination OR truncation (time limit);
        # the environment must be reset before stepping again.
        done = terminated or truncated
        if done:
            break
        current_state = next_state

    # Exponentially decay the exploration probability with the episode
    # index, never going below `min_exploration_proba`.
    exploration_proba = max(
        min_exploration_proba,
        np.exp(-exploration_decreasing_decay * e),
    )
    rewards_per_episode.append(total_episode_reward)

**3. Testing:**

In [88]:
# Print the mean episode reward over consecutive windows of `ps_size`
# training episodes.
ps_size = 1000
print(f"Mean reward per {ps_size} episodes")
for i in range(n_episodes // ps_size):
    window = rewards_per_episode[ps_size * i:ps_size * (i + 1)]
    # Label each window by its last episode number, derived from `ps_size`
    # so the label stays correct if the window size is changed.
    print((i + 1) * ps_size, ": mean episode reward: ", np.mean(window))

Mean reward per 1000 episodes
1000 : mean episode reward:  0.036
2000 : mean episode reward:  0.231
3000 : mean episode reward:  0.45
4000 : mean episode reward:  0.624
5000 : mean episode reward:  0.693
6000 : mean episode reward:  0.67
7000 : mean episode reward:  0.683
8000 : mean episode reward:  0.682
9000 : mean episode reward:  0.657
10000 : mean episode reward:  0.682


In [91]:
# Display the Q-table after training (rows: observations, columns: actions).
Q_table

array([[0.58526136, 0.50505667, 0.4884826 , 0.49338984],
       [0.40840329, 0.34154234, 0.33024599, 0.47870712],
       [0.42178925, 0.44170914, 0.42226984, 0.45424909],
       [0.24642001, 0.29195009, 0.33660252, 0.44258713],
       [0.60794649, 0.4350159 , 0.44969407, 0.35416925],
       [0.        , 0.        , 0.        , 0.        ],
       [0.20538788, 0.19301332, 0.2358002 , 0.10694198],
       [0.        , 0.        , 0.        , 0.        ],
       [0.42418241, 0.44314333, 0.42257733, 0.64523714],
       [0.4353566 , 0.69782982, 0.36059143, 0.44166379],
       [0.62100963, 0.40957889, 0.43523118, 0.2338013 ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.41595712, 0.59270378, 0.79210743, 0.51557045],
       [0.7596728 , 0.90422247, 0.79680574, 0.79045736],
       [0.        , 0.        , 0.        , 0.        ]])

In [89]:
import gymnasium as gym
import time

# Play ten rendered games following the greedy policy read from `Q_table`.
env = gym.make("FrozenLake-v1", render_mode="human")
e = 0  # number of games finished so far

try:
    observation, info = env.reset()

    while e < 10:
        # Greedy policy: pick the action with the highest Q-value
        # for the current observation.
        action = np.argmax(Q_table[observation, :])
        observation, reward, terminated, truncated, info = env.step(action)

        # Pause so each move can be followed on screen.
        time.sleep(1)

        # A game is over when the env reports termination or truncation;
        # `reward` still holds the reward of that final step when printed.
        if terminated or truncated:
            observation, info = env.reset()
            e += 1
            print(f"Finish game number: {e} - Reward: {reward}")
finally:
    # Close the environment even if the cell is interrupted or a step fails.
    env.close()

print("Finish all the games.")

Finish game number: 1 - Reward: 1.0
Finish game number: 2 - Reward: 1.0
Finish game number: 3 - Reward: 1.0
Finish game number: 4 - Reward: 1.0
Finish game number: 5 - Reward: 1.0
Finish game number: 6 - Reward: 1.0
Finish game number: 7 - Reward: 1.0
Finish game number: 8 - Reward: 0.0
Finish game number: 9 - Reward: 1.0
Finish game number: 10 - Reward: 1.0
Finish all the games.
