In [1]:
import gym
import random
import numpy as np

In [2]:
class DoubleQLearning:
    """Tabular Double Q-learning agent backed by two value tables.

    Indexing the agent with a state (``agent[state]``) yields the element-wise
    sum of both tables' rows for that state, one entry per action.
    """

    def __init__(self, n_states: int, n_actions: int) -> None:
        """Create two zero-initialised ``(n_states, n_actions)`` tables."""
        self.n_states = n_states
        self.n_actions = n_actions

        table_shape = (n_states, n_actions)
        self.table_1 = np.zeros(table_shape)
        self.table_2 = np.zeros(table_shape)

    def __getitem__(self, state: int) -> list[float]:
        """Return the summed action values of ``state`` (a NumPy row vector)."""
        combined_row = self.table_1[state] + self.table_2[state]
        return combined_row

    def update_values(
        self, state: int, next_state: int, action: int, alpha: float, reward: float, gamma: float
    ) -> None:
        """Apply one Double Q-learning update to a randomly chosen table.

        A single draw from NumPy's global RNG decides which table learns. The
        learning table picks the best next action; the *other* table supplies
        the value of that action for the bootstrap target.

        Args:
            state: Index of the state the action was taken in.
            next_state: Index of the state that was reached.
            action: Index of the action that was taken.
            alpha: Learning rate.
            reward: Reward received for the transition.
            gamma: Discount factor applied to the bootstrap value.
        """
        if np.random.random() < 0.5:
            learner, evaluator = self.table_1, self.table_2
        else:
            learner, evaluator = self.table_2, self.table_1

        # Action selection and action evaluation use different tables.
        best_next_action = np.argmax(learner[next_state])
        td_target = reward + gamma * evaluator[next_state][best_next_action]
        learner[state, action] += alpha * (td_target - learner[state][action])


class QLearning:
    """Tabular Q-learning agent with a single ``(n_states, n_actions)`` table.

    ``agent[state]`` returns the row of action values stored for ``state``.
    """

    def __init__(self, n_states: int, n_actions: int) -> None:
        """Store the problem dimensions and zero-initialise the value table."""
        self.n_states = n_states
        self.n_actions = n_actions

        table_shape = (n_states, n_actions)
        self.table = np.zeros(table_shape)

    def __getitem__(self, state: int) -> list[float]:
        """Return the action values of ``state`` (a view into the table)."""
        row = self.table[state]
        return row

    def update_values(
        self, state: int, next_state: int, action: int, alpha: float, reward: float, gamma: float
    ) -> None:
        """Move ``Q(state, action)`` towards ``reward + gamma * max Q(next_state, .)``.

        Args:
            state: Index of the state the action was taken in.
            next_state: Index of the state that was reached.
            action: Index of the action that was taken.
            alpha: Learning rate.
            reward: Reward received for the transition.
            gamma: Discount factor applied to the bootstrap value.
        """
        best_next_value = np.max(self.table[next_state])
        td_error = reward + gamma * best_next_value - self.table[state, action]
        self.table[state, action] += alpha * td_error


class Sarsa:
    """Tabular Expected SARSA agent.

    Instead of bootstrapping from one sampled next action, the update uses the
    expected value of the next state under an epsilon-greedy policy over the
    agent's own value table.

    ``agent[state]`` returns the row of action values stored for ``state``.
    """

    def __init__(self, n_states: int, n_actions: int, epsilon: float = 0.1) -> None:
        """Store the problem dimensions and zero-initialise the value table.

        Args:
            n_states: Number of discrete states.
            n_actions: Number of discrete actions.
            epsilon: Exploration rate of the epsilon-greedy policy assumed when
                computing the expected next-state value. Defaults to ``0.1``,
                the value that used to be hard-coded in ``update_values``.
        """
        self.n_states = n_states
        self.n_actions = n_actions
        self.epsilon = epsilon

        self.table = np.zeros((n_states, n_actions))

    def __getitem__(self, state: int) -> list[float]:
        """Return the action values of ``state`` (a view into the table)."""
        return self.table[state]

    def update_values(
        self, state: int, next_state: int, action: int, alpha: float, reward: float, gamma: float
    ) -> None:
        """Apply one Expected SARSA update to ``Q(state, action)``.

        Args:
            state: Index of the state the action was taken in.
            next_state: Index of the state that was reached.
            action: Index of the action that was taken.
            alpha: Learning rate.
            reward: Reward received for the transition.
            gamma: Discount factor applied to the expected next-state value.
        """
        expected_next_value = self.calculate_average_qvalue(
            self.table[next_state], epsilon=self.epsilon
        )

        # The bootstrap term must be discounted; the previous version accepted
        # ``gamma`` but never applied it.
        td_target = reward + gamma * expected_next_value
        self.table[state, action] += alpha * (td_target - self.table[state, action])

    def calculate_average_qvalue(self, values, epsilon=0):
        """Return the expected value of ``values`` under an epsilon-greedy policy.

        Every action receives probability ``epsilon / n_actions``; the remaining
        ``1 - epsilon`` is shared equally among all actions tied for the maximum.

        Args:
            values: Action values of one state (any 1-D sequence of numbers).
            epsilon: Exploration rate of the policy; ``0`` means purely greedy.

        Returns:
            The probability-weighted average of ``values`` as a float.

        Raises:
            ValueError: If ``values`` is empty.
        """
        values = np.asarray(values, dtype=float)
        n_actions = values.size

        greedy_mask = values == values.max()
        exploration_probability = epsilon / n_actions

        probabilities = np.full(n_actions, exploration_probability)
        # Ties for the maximum split the greedy probability mass evenly.
        probabilities[greedy_mask] += (1 - epsilon) / greedy_mask.sum()

        return float(probabilities @ values)

In [3]:
def train_agent(agent, n_games: int = 5000, alpha: float = 0.7, gamma: float = 0.9) -> None:
    """Train ``agent`` in place on the deterministic 8x8 FrozenLake environment.

    Behaviour policy: act greedily whenever the current state has at least one
    strictly positive action value, otherwise sample a random action from the
    environment's action space.

    Args:
        agent: Object supporting ``agent[state]`` (action values of a state) and
            ``agent.update_values(state, next_state, action, alpha, reward, gamma)``.
        n_games: Number of training episodes.
        alpha: Learning rate forwarded to ``agent.update_values``.
        gamma: Discount factor forwarded to ``agent.update_values``.
    """
    env = gym.make(
        "FrozenLake-v1",
        map_name="8x8",
        is_slippery=False,
    )

    try:
        for episode in range(n_games):
            state, _ = env.reset()
            done = False

            while not done:
                if np.max(agent[state]) > 0:
                    action = np.argmax(agent[state])
                else:
                    action = env.action_space.sample()

                # ``step`` reports termination and truncation separately; an
                # episode is over when either flag is set. Binding only the
                # first flag would keep stepping past a truncation.
                new_state, reward, terminated, truncated, _ = env.step(action)
                agent.update_values(state, new_state, action, alpha, reward, gamma)
                state = new_state
                done = terminated or truncated

            if episode % 500 == 0:
                print(f"Game {episode} ended with reward = {reward}.")
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()

In [4]:
def test_agent(agent, n_games: int = 5) -> None:
    env = gym.make(
        "FrozenLake-v1",
        render_mode="human",
        map_name="8x8",
        is_slippery=False,
    )

    total_reward = 0
    for _ in range(n_games):
        done = False
        state, _ = env.reset()

        while not done:
            action = np.argmax(agent[state])
            new_state, reward, done, *_ = env.step(action)
            state = new_state
            total_reward += reward

    print(f"Average reward score {n_games} games: {total_reward / n_games * 100}%")
    env.close()

In [None]:
# Problem dimensions of the 8x8 grid world and the shared RNG seed.
size = 8 * 8
n_actions = 4
seed = 42

q_table = QLearning(size, n_actions)
sarsa = Sarsa(size, n_actions)
double_q_table = DoubleQLearning(size, n_actions)

# (banner, agent) pairs, trained in this order.
training_runs = (
    ("Training Q-Learning agent...", q_table),
    ("\nTraining sarsa agent...", sarsa),
    ("\nTraining Double Q-Learning agent...", double_q_table),
)

for banner, agent in training_runs:
    print(banner)
    # Reseed NumPy's and Python's global RNGs so each run starts from the
    # same random state.
    np.random.seed(seed)
    random.seed(seed)
    train_agent(agent)

In [8]:
# Evaluate the trained `sarsa` agent using the default number of test games.
test_agent(sarsa)

  if not isinstance(terminated, (bool, np.bool8)):


Average reward score 5 games: 100.0%
