In [None]:
# %pip install -q gymnasium  # uncomment to install the dependency into the kernel's environment

In [None]:
import numpy as np

import matplotlib.pyplot as plt

import matplotlib.animation
# Global matplotlib styling applied to every figure drawn in this notebook.
plt.rc('font', size=14)
plt.rc('axes', labelsize=14, titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
# Use matplotlib's 'jshtml' representation when an animation is displayed.
plt.rc('animation', html='jshtml')

# Notebook-wide settings read by the cells below.
seed = 12345  # used for NumPy's global RNG and as the default environment reset seed
slippery = False  # forwarded to gym.make as is_slippery
interval = 200  # default `interval` handed to FuncAnimation by plot_animation

# Seeds NumPy's global RNG, which policy() draws from for its epsilon check.
np.random.seed(seed)

In [None]:
def plot_animation(frames, repeat=False, interval=interval):
    """Build a matplotlib animation that plays back a sequence of images.

    Parameters
    ----------
    frames : sequence of images accepted by ``plt.imshow``; must be non-empty
        because the first frame initialises the image artist.
    repeat : bool
        Forwarded to ``FuncAnimation``; whether playback loops.
    interval : number
        Forwarded to ``FuncAnimation`` as the delay between frames.

    Returns
    -------
    matplotlib.animation.FuncAnimation
    """
    figure = plt.figure()
    image_artist = plt.imshow(frames[0])
    plt.axis('off')
    animation = matplotlib.animation.FuncAnimation(
        figure,
        update_scene,
        frames=len(frames),
        fargs=(frames, image_artist),
        repeat=repeat,
        interval=interval,
    )
    # Close the current figure so only the animation, not a static plot, is shown.
    plt.close()
    return animation

def update_scene(num, frames, patch):
    """Animation callback: show frame number ``num`` on the image artist.

    Parameters
    ----------
    num : int
        Index of the frame to display.
    frames : indexable collection of images.
    patch : object exposing ``set_data`` (the artist created by ``imshow``).

    Returns
    -------
    The same ``patch`` object, after its data has been replaced.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return patch

def render_env(e):
    """Display the current rendering of environment ``e`` without axis ticks.

    ``e.render()`` must return something ``plt.imshow`` can draw.
    """
    snapshot = e.render()
    plt.imshow(snapshot)
    # An empty tick list removes the tick marks and labels on that axis.
    plt.xticks([])
    plt.yticks([])
    plt.show()

In [None]:
import gymnasium as gym

In [None]:
# Create the FrozenLake environment with image rendering and show its start state.
env = gym.make('FrozenLake-v1', render_mode='rgb_array', is_slippery=slippery)
env.reset(seed=seed)
# Neither np.random.seed() nor reset(seed=...) seeds the action space's own RNG,
# which policy() uses for exploratory actions via action_space.sample().
# Seed it explicitly so training is reproducible from a fresh kernel.
env.action_space.seed(seed)
# Reuse the shared helper instead of repeating the imshow/ticks/show sequence.
render_env(env)

In [None]:
# Training hyperparameters, passed to qlearning() in the training cell below.
num_episodes = 25000  # number of training episodes
max_steps_per_episode = 400  # supplied as qlearning's max_steps argument
alpha = 0.1  # learning rate: step size of each Q-value update
gamma = 0.9  # discount factor applied to the next state's best Q-value
eps = 0.15  # exploration probability for the epsilon-greedy policy

In [None]:
def policy(e, Q, state, epsilon):
    """Epsilon-greedy action selection.

    Parameters
    ----------
    e : environment whose ``action_space.sample()`` supplies exploratory actions.
    Q : array indexed as ``Q[state]`` giving one value per action.
    state : index of the current state.
    epsilon : float
        Probability of exploring instead of acting greedily.

    Returns
    -------
    A sampled action when exploring, otherwise the index of the largest
    value in ``Q[state]`` (ties resolve to the first maximum).
    """
    # One uniform draw from NumPy's global RNG decides explore vs. exploit.
    explore = np.random.random() < epsilon
    if explore:
        return e.action_space.sample()
    return np.argmax(Q[state])

In [None]:
def qlearning(e, alpha, gamma, num_episodes, max_steps, epsilon, eseed = seed):
    """Learn a tabular action-value function with epsilon-greedy Q-learning.

    Parameters
    ----------
    e : environment following the Gymnasium API used here: discrete
        ``observation_space.n`` / ``action_space.n``, ``reset(seed=...)``
        returning ``(state, info)`` and ``step(action)`` returning
        ``(next_state, reward, terminated, truncated, info)``.
    alpha : float
        Learning rate of the Q-value update.
    gamma : float
        Discount factor applied to the best next-state value.
    num_episodes : int
        Number of training episodes.
    max_steps : int
        Upper bound on the number of steps taken in one episode.
    epsilon : float
        Exploration probability handed to ``policy``.
    eseed : optional
        Seed passed to the first ``reset`` only.

    Returns
    -------
    numpy.ndarray of shape ``(n_states, n_actions)`` holding the learned Q-values.
    """
    qtable = np.zeros((e.observation_space.n, e.action_space.n))  # Q-values start at zero

    for episode in range(num_episodes):
        # Seed only the first reset: re-seeding every episode would replay the
        # same environment randomness each time (relevant when transitions are
        # stochastic), while a single seed still keeps the run reproducible.
        if episode == 0:
            state, _info = e.reset(seed=eseed)
        else:
            state, _info = e.reset()

        for _step in range(max_steps):
            action = policy(e, qtable, state, epsilon)
            next_state, reward, terminated, truncated, _info = e.step(action)

            # A terminal state has no future return, so do not bootstrap from it.
            # A truncated (time-limited) episode still bootstraps as usual.
            future_value = 0.0 if terminated else gamma * np.max(qtable[next_state])
            qtable[state, action] += alpha * (reward + future_value - qtable[state, action])

            # Stop on either end-of-episode signal instead of stepping a finished env.
            if terminated or truncated:
                break
            state = next_state

    return qtable

In [None]:
# Train the agent on `env` with the hyperparameters defined above.
Q = qlearning(
    e=env,
    alpha=alpha,
    gamma=gamma,
    num_episodes=num_episodes,
    max_steps=max_steps_per_episode,
    epsilon=eps,
)

In [None]:
def test_agent(e, qtable, eseed=seed):
    """Run one greedy episode on ``e`` and collect the rendered frames.

    Parameters
    ----------
    e : environment following the Gymnasium API used here: ``reset(seed=...)``
        returning ``(state, info)``, ``step(action)`` returning
        ``(next_state, reward, terminated, truncated, info)``, and ``render()``.
    qtable : array indexed as ``qtable[state]`` giving one value per action.
    eseed : optional
        Seed passed to ``e.reset``.

    Returns
    -------
    list
        The frame rendered after the reset followed by one frame per step,
        up to and including the step that ended the episode.

    Notes
    -----
    The loop ends only when the environment reports ``terminated`` or
    ``truncated``, so ``e`` must eventually raise one of those flags.
    """
    # Operate on the environment that was passed in, not on a notebook global.
    state, _info = e.reset(seed=eseed)
    images = [e.render()]
    while True:
        action = np.argmax(qtable[state])  # always exploit: no exploration at test time
        state, _reward, terminated, truncated, _info = e.step(action)
        images.append(e.render())
        if terminated or truncated:
            break

    return images

In [None]:
# Roll out the greedy policy from the learned Q-table and keep the rendered frames.
frames = test_agent(env, Q)

In [None]:
# Build the animation of the recorded episode; as the cell's last expression it is displayed.
plot_animation(frames)