In [1]:
import random

import imageio
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
from tqdm.notebook import tqdm

from implementation import SoftmaxPolicy, Qlearning, DynaQ
from utils import evaluate_agent, record_video

In [2]:
# Initialise the environment. This single `env` object is reused by the
# training, evaluation and video-recording cells further down, so it is
# deliberately NOT closed at the end of this cell.
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=True, render_mode="rgb_array")

# env.reset(seed=...) seeds the environment's own RNG only; the action space
# has a separate RNG that must be seeded for action_space.sample() to be
# reproducible across kernel restarts.
env.action_space.seed(42)

# Reset the environment to generate the first observation
observation, info = env.reset(seed=42)
for _ in range(100):
    # Random policy used as a smoke test; this is where a real policy would go
    action = env.action_space.sample()

    # Step (transition) through the environment with the action, receiving the
    # next observation, the reward and whether the episode terminated or truncated
    observation, reward, terminated, truncated, info = env.step(action)
    # With render_mode="rgb_array" render() returns the frame; it is discarded here
    env.render()

    # If the episode has ended then we can reset to start a new episode
    if terminated or truncated:
        observation, info = env.reset()

# NOTE: no env.close() here -- closing would release the environment's
# resources while later cells still step and render this same `env`.
# Close it once, after the last cell that uses it.

# Implementing the Q-learning method

### Initialise the Q-table

In [3]:
# Sizes of the discrete observation and action spaces of `env`
state_space, action_space = env.observation_space.n, env.action_space.n

print("There are ", state_space, " possible states")
print("There are ", action_space, " possible actions")

There are  16  possible states
There are  4  possible actions


### Learning 

<h5>Learning parameters</h5>

In [6]:
# --- Training -------------------------------------------------------------
learning_rate = 0.7             # learning rate handed to the training call
n_training_episodes = 300_000   # total number of training episodes

# --- Evaluation -----------------------------------------------------------
n_eval_episodes = 1_000         # total number of test episodes

# --- Environment ----------------------------------------------------------
env_id = "FrozenLake-v1"        # name of the environment
gamma = 0.95                    # discounting rate
max_steps = 99                  # cap on the number of steps per episode
eval_seed = []                  # evaluation seed(s) of the environment (empty list here)

# --- Exploration ----------------------------------------------------------
max_epsilon, min_epsilon = 1.0, 0.05   # exploration probability: initial value, lower bound
decay_rate = 5e-4                      # exponential decay rate of the exploration probability

In [7]:
# Seed the global RNGs immediately before training so the random draws made
# during learning are repeatable when this cell is re-run.
# NOTE(review): assumes SoftmaxPolicy / Qlearning sample through the global
# `random` and/or `np.random` generators -- confirm against implementation.py.
train_seed = 42
random.seed(train_seed)
np.random.seed(train_seed)

# Train a Q-learning agent that selects actions with a softmax policy;
# train() returns the learned Q-table.
policy = SoftmaxPolicy()
qlearning = Qlearning(env, policy)
Q_frozenlake = qlearning.train(n_training_episodes, max_steps, learning_rate, gamma)

  0%|          | 0/300000 [00:00<?, ?it/s]

In [11]:
# Show the Q-table returned by qlearning.train().
# Presumably one row per state and one column per action (16 x 4) -- confirm.
Q_frozenlake

array([[4.76057590e-01, 4.64429274e-01, 4.70817212e-01, 4.75698962e-01],
       [3.74581527e-02, 4.69703158e-01, 1.24284116e-02, 4.77547575e-01],
       [4.43446173e-01, 4.57659679e-01, 5.72009568e-01, 4.62357700e-01],
       [3.97828892e-01, 4.52244595e-01, 1.25669982e-01, 4.31205002e-01],
       [5.36925895e-01, 5.21625565e-01, 1.71845055e-01, 4.56848119e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.58468519e-01, 1.91032576e-01, 3.76238061e-02, 3.05369918e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.55352278e-01, 5.86001428e-01, 4.59113721e-02, 6.09123747e-01],
       [1.80520226e-01, 6.52822068e-01, 4.90040369e-01, 1.96392604e-01],
       [7.15383264e-01, 5.72564648e-02, 6.05134553e-03, 1.22522214e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.39727821e-01, 6.87962115e-02, 7.33774619e

In [12]:
# Run the trained Q-table through the evaluation helper and report the
# mean and standard deviation of the reward it returns.
mean_reward, std_reward = evaluate_agent(
    env,
    max_steps,
    n_eval_episodes,
    Q_frozenlake,
    eval_seed,
)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

  0%|          | 0/1000 [00:00<?, ?it/s]

Mean_reward=0.55 +/- 0.50


In [13]:
# Hand the environment and the learned Q-table to the video helper.
# Presumably rolls out the agent in `env` and writes the frames to
# 'record.mp4' in the working directory -- confirm against utils.record_video.
record_video(env, Q_frozenlake, 'record.mp4')