In [None]:
import gymnasium as gym
import numpy as np
import random

In [34]:
# Build the Taxi-v3 environment. With render_mode="ansi", env.render() hands
# back the grid as text, which the demo cell prints frame by frame.
env = gym.make(
    "Taxi-v3",
    render_mode="ansi",
)

In [29]:
# Initialize Q-table
# Both spaces expose their size through `.n`; the table has one row per state
# and one column per action, and every entry starts at zero.
action_space = env.action_space.n
state_space = env.observation_space.n
Q = np.zeros(shape=(state_space, action_space), dtype=float)

In [53]:
# --- Q-learning update ---
alpha = 0.1            # Learning rate: step size applied to each TD error
gamma = 0.95           # Discount factor applied to the next state's best Q-value

# --- Epsilon-greedy exploration schedule ---
epsilon = 1.0          # Initial probability of taking a random action
min_epsilon = 0.01     # Lower bound that epsilon is never decayed below
epsilon_decay = 0.995  # Multiplicative decay applied to epsilon after each episode

# --- Training budget ---
episodes = 12_000      # Number of training episodes
max_steps = 200        # Maximum number of steps taken within one episode

In [54]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
#
# Reads `env`, `Q` and the hyperparameters (alpha, gamma, epsilon,
# epsilon_decay, min_epsilon, episodes, max_steps) defined in earlier cells.
# Q is updated in place and the module-level `epsilon` is decayed once per
# episode, so re-running this cell continues training from the current Q and
# epsilon. Re-run the Q-table and hyperparameter cells first for a fresh run.
for ep in range(episodes):
    state, info = env.reset()
    done = False
    total_reward = 0

    for step in range(max_steps):
        # Explore with probability epsilon, otherwise exploit the current table.
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(Q[state]))

        # env.step reports two distinct end-of-episode flags: `terminated`
        # (a real terminal state was reached) and `truncated` (the episode
        # was cut short, e.g. by a time limit).
        next_state, reward, terminated, truncated, info = env.step(action)

        #Update Q-value
        # A terminal transition has no future return, so the bootstrap term is
        # dropped. A truncated episode is only cut short, so it still
        # bootstraps from the next state's best action value.
        best_next = 0.0 if terminated else np.max(Q[next_state])
        td_target = reward + gamma * best_next
        Q[state, action] += alpha * (td_target - Q[state, action])

        state = next_state
        total_reward += reward

        # Stop on either flag: the environment must be reset before it is
        # stepped again once an episode has terminated or been truncated.
        done = terminated or truncated
        if done:
            break

    # Decay exploration once per episode, never going below min_epsilon.
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    if (ep+1) % 500 == 0:
        print(f"Episode {ep+1}/{episodes} completed.")
print("\nTraining completed!")

Episode 500/12000 completed.
Episode 1000/12000 completed.
Episode 1500/12000 completed.
Episode 2000/12000 completed.
Episode 2500/12000 completed.
Episode 3000/12000 completed.
Episode 3500/12000 completed.
Episode 4000/12000 completed.
Episode 4500/12000 completed.
Episode 5000/12000 completed.
Episode 5500/12000 completed.
Episode 6000/12000 completed.
Episode 6500/12000 completed.
Episode 7000/12000 completed.
Episode 7500/12000 completed.
Episode 8000/12000 completed.
Episode 8500/12000 completed.
Episode 9000/12000 completed.
Episode 9500/12000 completed.
Episode 10000/12000 completed.
Episode 10500/12000 completed.
Episode 11000/12000 completed.
Episode 11500/12000 completed.
Episode 12000/12000 completed.

Training completed!


In [55]:
#Evaluate
# Roll out the greedy policy (argmax over Q, no exploration, no learning) for
# 100 episodes and report the mean undiscounted return per episode.
max_eval_steps = 200  # Safety limit on steps per evaluation episode
total_rewards = 0
for ep in range(100):
    state, info = env.reset()
    done = truncated = False
    ep_reward = 0
    steps = 0

    # An episode ends when the environment reports terminated/truncated or
    # when the safety limit is hit, whichever comes first.
    while steps < max_eval_steps and not done and not truncated:
        action = np.argmax(Q[state])
        state, reward, done, truncated, info = env.step(action)
        ep_reward += reward
        steps += 1

    total_rewards += ep_reward

print(f"Average reward over 100 test episodes: {total_rewards/100:.2f}")

Average reward over 100 test episodes: 7.75


In [56]:
#Demo
# Play one greedy episode and print the text frame produced after every
# action, stopping when the episode is done or after 50 steps.
state, info = env.reset()
done = False
steps = 0
while steps < 50 and not done:
    action = np.argmax(Q[state])
    state, reward, done, truncated, info = env.step(action)
    frame = env.render()
    print(frame)
    steps += 1

# Release the environment's resources once the rollout has been shown.
env.close()

+---------+
|R: | :[43m [0m:G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (West)

+---------+
|R: | : :G|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)

+---------+
|R: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[42mB[0m: |
+---------+
  (Pickup)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : |[42m_[0m: |
|[35mY[0m| : |B: |
+---------+
  (North)

+---------+
|R: | : :G|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)

+---------+
|R: | : :G|
| : | : : |
| : :