In [1]:
# Pinned to the gymnasium release this notebook was run against; the %pip
# magic installs into the environment of the running kernel.
%pip install gymnasium==0.29.1


Collecting gymnasiumNote: you may need to restart the kernel to use updated packages.

  Using cached gymnasium-0.29.1-py3-none-any.whl (953 kB)
Collecting farama-notifications>=0.0.1
  Using cached Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [6]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

# Seed every source of randomness the experiments below draw from, so that
# Restart Kernel -> Run All reproduces the same learning curves.
SEED = 42
np.random.seed(SEED)  # drives the epsilon-greedy coin flips (np.random.uniform)

env = gym.make("Taxi-v3")
env.action_space.seed(SEED)  # drives env.action_space.sample() for exploratory actions
env.reset(seed=SEED)  # seeds the environment's own RNG; later reset() calls keep using it


In [7]:
def monte_carlo(env, num_episodes, gamma=0.9, epsilon=0.1):
    """First-visit Monte Carlo control with an epsilon-greedy behaviour policy.

    Args:
        env: Environment with discrete ``observation_space.n`` and
            ``action_space.n``, a ``reset()`` returning ``(state, info)`` and a
            ``step(action)`` returning
            ``(next_state, reward, terminated, truncated, info)``.
        num_episodes: Number of episodes to generate and learn from.
        gamma: Discount factor applied when accumulating returns.
        epsilon: Probability of taking a random action instead of the greedy one.

    Returns:
        A tuple ``(Q, cumulative_rewards)`` where ``Q`` is the
        ``[n_states, n_actions]`` action-value table (mean first-visit return
        per pair) and ``cumulative_rewards`` is the list of undiscounted
        reward totals, one entry per episode.
    """
    table_shape = [env.observation_space.n, env.action_space.n]
    Q = np.zeros(table_shape)
    returns_sum = np.zeros(table_shape)
    returns_count = np.zeros(table_shape)
    cumulative_rewards = []

    for _ in range(num_episodes):
        state = env.reset()[0]
        episode = []
        done = False
        total_reward = 0

        while not done:
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state])
            next_state, reward, terminated, truncated, _ = env.step(action)
            episode.append((state, action, reward))
            state = next_state
            total_reward += reward
            # An episode ends on a terminal state OR when it is cut off
            # (truncated); ignoring the latter lets an episode run unbounded.
            done = terminated or truncated

        cumulative_rewards.append(total_reward)

        # Index of the first time step at which each (state, action) occurs.
        first_visit = {}
        for t, (s, a, _) in enumerate(episode):
            first_visit.setdefault((s, a), t)

        # Walk the episode backwards so G is the discounted return from step t.
        G = 0
        for t in range(len(episode) - 1, -1, -1):
            s, a, r = episode[t]
            G = gamma * G + r
            # First-visit rule: only the earliest occurrence of a pair in the
            # episode contributes its return to the running average.
            if first_visit[(s, a)] == t:
                returns_sum[s, a] += G
                returns_count[s, a] += 1
                Q[s, a] = returns_sum[s, a] / returns_count[s, a]

    return Q, cumulative_rewards


In [8]:
def q_learning(env, num_episodes, alpha=0.1, gamma=0.9, epsilon=0.1):
    """Tabular Q-learning with an epsilon-greedy behaviour policy.

    Args:
        env: Environment with discrete ``observation_space.n`` and
            ``action_space.n``, a ``reset()`` returning ``(state, info)`` and a
            ``step(action)`` returning
            ``(next_state, reward, terminated, truncated, info)``.
        num_episodes: Number of episodes to run.
        alpha: Learning rate applied to each temporal-difference error.
        gamma: Discount factor for the bootstrapped next-state value.
        epsilon: Probability of taking a random action instead of the greedy one.

    Returns:
        A tuple ``(Q, cumulative_rewards)`` where ``Q`` is the
        ``[n_states, n_actions]`` action-value table and
        ``cumulative_rewards`` is the list of undiscounted reward totals,
        one entry per episode.
    """
    Q = np.zeros([env.observation_space.n, env.action_space.n])
    cumulative_rewards = []

    for _ in range(num_episodes):
        state = env.reset()[0]
        done = False
        total_reward = 0

        while not done:
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state])
            next_state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward

            if terminated:
                # Nothing follows a terminal state, so there is no future
                # value to bootstrap from.
                td_target = reward
            else:
                # Also used on truncation: the episode was cut off, but the
                # next state still has value under the greedy policy.
                td_target = reward + gamma * np.max(Q[next_state])
            Q[state, action] += alpha * (td_target - Q[state, action])

            state = next_state
            # End on a terminal state OR a cut-off; ignoring the latter lets
            # an episode run unbounded.
            done = terminated or truncated

        cumulative_rewards.append(total_reward)

    return Q, cumulative_rewards


In [None]:
# Train both agents on the same environment for the same number of episodes,
# then overlay their per-episode reward totals on one set of axes.
num_episodes = 3000

Q_mc, mc_rewards = monte_carlo(env, num_episodes)
Q_ql, ql_rewards = q_learning(env, num_episodes)

fig, ax = plt.subplots()
ax.plot(mc_rewards, label='Monte Carlo')
ax.plot(ql_rewards, label='Q-Learning')
ax.set_xlabel('Episode')
ax.set_ylabel('Cumulative Reward')
ax.set_title('Monte Carlo vs Q-Learning on Taxi-v3')
ax.legend()
plt.show()
