# Reinforcement Learning (Part B)
B. Solve the Taxi problem using reinforcement learning, where the agent acts as a taxi
driver that picks up a passenger at one location and then drops the passenger off at
their destination.

In [1]:
import numpy as np
import gymnasium as gym

In [2]:
# Build the Taxi-v3 environment. The "ansi" render mode makes env.render()
# hand back the grid as text, which can then be printed inside the notebook.
env = gym.make(
    "Taxi-v3",
    render_mode="ansi",
)

In [3]:
# Q-table: one row per discrete state, one column per discrete action,
# with every entry starting at zero.
state_size, action_size = env.observation_space.n, env.action_space.n
Q = np.zeros(shape=(state_size, action_size))

In [4]:
# Hyperparameters used by the Q-learning cells below:
#   alpha    - learning rate (step size of each Q-value update)
#   gamma    - discount factor applied to the estimated future value
#   epsilon  - probability of taking a random (exploratory) action
#   episodes - number of training episodes to run
alpha, gamma, epsilon = 0.1, 0.6, 0.1
episodes = 10_000

In [5]:
# Training loop: tabular Q-learning driven by an epsilon-greedy behaviour policy.
for episode in range(episodes):
    state, _ = env.reset()
    done = False

    while not done:
        # Epsilon-greedy: explore with probability epsilon, otherwise exploit.
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # explore = random action
        else:
            action = np.argmax(Q[state])        # exploit = best known action

        # Gymnasium's step() reports two distinct end-of-episode flags:
        # `terminated` (the task itself finished) and `truncated` (the episode
        # was cut short from outside the task, e.g. by a step limit).
        next_state, reward, terminated, truncated, info = env.step(action)

        # Q-learning update:
        #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        # A terminal state has no future return, so only bootstrap from
        # Q[next_state] when the episode has not terminated. A truncated episode
        # still bootstraps because the task itself was not finished.
        future_value = 0.0 if terminated else np.max(Q[next_state])
        Q[state, action] = Q[state, action] + alpha * (
            reward + gamma * future_value - Q[state, action]
        )

        state = next_state

        # Either flag ends the episode; the environment has to be reset before
        # it is stepped again, which the outer loop does.
        done = terminated or truncated

print("Training finished!\n")

Training finished!



In [6]:
# ---------------- Testing the trained agent ----------------
# Roll out a single episode with the greedy policy (always the highest-valued
# action in the Q-table) and print the rendered grid after every step.
state, _ = env.reset()
print(env.render())   # Initial state

done = False
steps = 0
while not done:
    action = np.argmax(Q[state])  # Best action according to the learned Q-table
    state, reward, terminated, truncated, info = env.step(action)

    # Stop on truncation as well as termination. If the greedy policy never
    # reaches the goal, `terminated` never becomes True, so checking it alone
    # would leave this loop running forever.
    done = terminated or truncated

    # Print environment after each step
    print(env.render())

    steps += 1

+---------+
|R: | : :G|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+


+---------+
|R: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)

+---------+
|R: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)

+---------+
|R: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)

+---------+
|R: | : :G|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (South)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |[35mB[0m: |
+---------+
  (South)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[42mY[0m| : |[35mB[0m: |
+---------+
  (Pickup)

+---------+
|R: | : :G|
| : | :

In [7]:
# Summarise the evaluation episode, then release the environment.
# Note: `reward` holds whatever the most recent env.step() call returned, i.e.
# the reward of the final step only, not the total accumulated over the episode.
episode_summary = f"Episode finished in {steps} steps with reward {reward}"
print(episode_summary)
env.close()

Episode finished in 15 steps with reward 20
