<h2> In this notebook we show how to use our algorithms to solve other environments (such as "FrozenLake", "Taxi", etc.)</h2>

In [9]:
import random
import imageio
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
from tqdm.notebook import tqdm

from robotq.implementation import SoftmaxPolicy, Qlearning # import the policy and the Q-learning agent
from robotq.utils import evaluate_agent, record_video

<h5> Load the environment</h5>

In [5]:
# Initialise the environment
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=True, render_mode="rgb_array")
# env = gym.make("Taxi-v3", render_mode="rgb_array")

# Smoke test: drive the environment with uniformly random actions for 100 steps
# to check that it steps and renders correctly before any learning happens.

# Reset the environment to generate the first observation
observation, info = env.reset(seed=42)
for _ in range(100):
    # this is where you would insert your policy
    action = env.action_space.sample()

    # step (transition) through the environment with the action
    # receiving the next observation, reward and if the episode has terminated or truncated
    observation, reward, terminated, truncated, info = env.step(action)
    # With render_mode="rgb_array" this returns the frame; it is discarded here.
    env.render()

    # If the episode has ended then we can reset to start a new episode
    if terminated or truncated:
        observation, info = env.reset()

# NOTE: `env` is intentionally NOT closed here. The same instance is reused by
# the following cells (size inspection, training, evaluation, video recording),
# and an environment must not be used after `close()`. Call `env.close()` only
# once you are completely done with the notebook.

<h5> Environment size</h5>

In [6]:
# Read the sizes of the observation and action spaces via their `.n` attribute.
state_space, action_space = env.observation_space.n, env.action_space.n

print("There are ", state_space, " possible states")
print("There are ", action_space, " possible actions")

There are  16  possible states
There are  4  possible actions


### Learning 

<h5> Learning parameters

In [7]:
# --- Training ---
n_training_episodes = 30_000  # how many episodes the agent is trained for
learning_rate = 0.7           # learning rate passed to the training call

# --- Evaluation ---
n_eval_episodes = 1_000       # how many episodes the trained agent is tested on
# Seed(s) handed to evaluate_agent.
# NOTE(review): presumably a list of per-episode seeds where an empty list means
# "unseeded" - confirm against robotq.utils.evaluate_agent.
eval_seed = []

# --- Environment ---
# Keep in sync with the gym.make(...) call above; also used to name the output video.
env_id = "FrozenLake-v1"
max_steps = 99                # upper bound on steps per episode
gamma = 0.95                  # discount factor

In [8]:
# Build a Q-learning agent for `env` driven by a softmax policy, then train it
# with the hyper-parameters defined in the previous cell.
# NOTE(review): `train` presumably returns the learned Q-table as an array indexed
# [state, action] - confirm against robotq.implementation.Qlearning.
policy = SoftmaxPolicy()
qlearning = Qlearning(env, policy)
Q_table = qlearning.train(n_training_episodes, max_steps, learning_rate, gamma)

  0%|          | 0/30000 [00:00<?, ?it/s]

In [12]:
# Display the learned Q-table.
# NOTE(review): the saved output of this cell has shape (500, 6), which does not
# match the 16 states / 4 actions printed for the FrozenLake environment earlier in
# this notebook. The output looks stale (possibly from a Taxi-v3 run); restart the
# kernel and run all cells to refresh it.
Q_table

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 2.75200369,  3.94947757,  2.75200369,  3.94947757,  5.20997639,
        -5.05052243],
       [ 7.93349184,  9.40367562,  7.93349184,  9.40367562, 10.9512375 ,
         0.40367562],
       ...,
       [10.9512375 , 12.58025   , 10.9512375 ,  9.40367562,  1.9512375 ,
         1.9512375 ],
       [ 5.20997639,  6.53681725,  5.20997639,  6.53681725, -3.79002361,
        -3.79002361],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ]], shape=(500, 6))

<h5> Evaluate trained agent

In [6]:
# Evaluate our Agent
# evaluate_agent returns the mean and the standard deviation of the reward
# (presumably the per-episode total over n_eval_episodes episodes, each capped at
# max_steps steps - confirm against robotq.utils.evaluate_agent).
mean_reward, std_reward = evaluate_agent(env, max_steps, n_eval_episodes, Q_table, eval_seed)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

  0%|          | 0/1000 [00:00<?, ?it/s]

Mean_reward=0.17 +/- 0.38


<h5> Demonstration

In [8]:
# Record a demonstration video; the third argument is "<env_id>.mp4".
# NOTE(review): presumably the agent acts according to Q_table and the third
# argument is the output path - confirm against robotq.utils.record_video.
record_video(env, Q_table, f'{env_id}.mp4')