In [26]:
import gymnasium as gym


In [27]:
import gymnasium as gym
import numpy as np
import random


In [34]:
# Taxi environment used by every later cell. The "ansi" render mode makes
# env.render() return the frame as text, which the demo cell prints.
env = gym.make(
    "Taxi-v3",
    render_mode="ansi",
)

In [29]:
# Q-table: one row per state, one column per action, every estimate starting
# at zero. The sizes are read from the environment's observation/action spaces.
state_space, action_space = env.observation_space.n, env.action_space.n
Q = np.zeros(shape=(state_space, action_space))

In [31]:
# Hyperparameters for the Q-learning run.

# Training budget: number of episodes, and the per-episode step cap.
episodes = 10_000
max_steps = 100

# Temporal-difference update: step size and discount on the next state's value.
alpha = 0.1
gamma = 0.6

# Epsilon-greedy exploration: starting probability of a random action, the
# factor it is multiplied by after every episode, and the floor it stops at.
epsilon = 0.1
epsilon_decay = 0.999
min_epsilon = 0.01

In [32]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Each episode runs until the environment reports termination or truncation,
# or until max_steps is reached, whichever comes first.
for ep in range(episodes):
    state = env.reset()[0]
    done = False
    total_reward = 0  # return accumulated over this episode

    for step in range(max_steps):
        # Explore with probability epsilon, otherwise act greedily on Q.
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state])

        next_state, reward, done, truncated, info = env.step(action)

        # TD target: a terminal transition has no future return, so only
        # bootstrap from next_state when the episode did not terminate.
        # A truncated episode is merely cut short, so it still bootstraps.
        future_value = 0.0 if done else np.max(Q[next_state])
        Q[state, action] += alpha * (reward + gamma * future_value - Q[state, action])

        state = next_state
        total_reward += reward

        # Stop on truncation as well as termination: after either signal the
        # environment must be reset before it is stepped again.
        if done or truncated:
            break

    # Anneal exploration once per episode, never dropping below min_epsilon.
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    if (ep+1) % 500 == 0:
        print(f"Episode {ep+1}/{episodes} completed.")
print("\nTraining completed!")

Episode 500/10000 completed.
Episode 1000/10000 completed.
Episode 1500/10000 completed.
Episode 2000/10000 completed.
Episode 2500/10000 completed.
Episode 3000/10000 completed.
Episode 3500/10000 completed.
Episode 4000/10000 completed.
Episode 4500/10000 completed.
Episode 5000/10000 completed.
Episode 5500/10000 completed.
Episode 6000/10000 completed.
Episode 6500/10000 completed.
Episode 7000/10000 completed.
Episode 7500/10000 completed.
Episode 8000/10000 completed.
Episode 8500/10000 completed.
Episode 9000/10000 completed.
Episode 9500/10000 completed.
Episode 10000/10000 completed.

Training completed!


In [None]:
# Evaluate the greedy policy (no exploration) and report the mean episode return.
# The episode count lives in one variable so the loop, the divisor and the
# printed message cannot drift apart.
n_eval_episodes = 100
total_rewards = 0
for ep in range(n_eval_episodes):
    state = env.reset()[0]
    done = False
    truncated = False
    ep_reward = 0
    # Also stop on truncation: a greedy policy that never reaches the goal
    # would otherwise keep this loop running forever.
    while not (done or truncated):
        action = np.argmax(Q[state])
        state, reward, done, truncated, info = env.step(action)
        ep_reward += reward
    total_rewards += ep_reward

print(f"Average reward over {n_eval_episodes} test episodes: {total_rewards/n_eval_episodes:.2f}")

In [35]:
# Demo: roll out the greedy policy for at most 50 steps, printing the rendered
# frame after every move, then release the environment.
state = env.reset()[0]
done = False
steps = 0
for steps in range(1, 51):  # steps ends up equal to the number of moves made
    action = np.argmax(Q[state])
    state, reward, done, truncated, info = env.step(action)
    print(env.render())
    if done:
        break

env.close()

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)

+---------+
|R: | : :[34;1mG[0m|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)

+---------+
|R: | : :[34;1m[43mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)

+---------+
|R: | : :[42mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Pickup)

+---------+
|R: | : :G|
| : | : :[42m_[0m|
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (South)

+---------+
|R: | : :G|
| : | :[