In [2]:
import numpy as np
import gymnasium as gym 
import random

In [40]:
n_episodes = 200000
current_epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.001
decay_rate = 0.0001
Reward_list = []

max_steps = 100
gamma = 0.99
alpha = 0.1
epsilon_max = 1.0
epsilon_min = 0.01
epsilon_decay = 0.001

In [41]:
def print_policy(Q, env, cols, rows):
    def action_to_symbol(action):
        return ['↓', '↑', '→', '←', 'p', 'd'][action]

    policy = np.zeros((rows, cols), dtype=str)
    for state in range(env.observation_space.n):
        if np.sum(Q[state] == 0):
            policy[state // cols, state % cols] = 'o'
        else:
            best_action = np.argmax(Q[state])
            policy[state // cols, state % cols] = action_to_symbol(best_action)
    
    desc = env.unwrapped.desc
    for i in range(rows):
        for j in range(cols):
            if desc[i][j] in b'|':
                policy[i, j] = desc[i][j].decode('utf-8')
    policy[0,0] = 'S'

    print('LEARNED POLICY')
    print()
    for row in policy:
        print(' '.join(row))

In [42]:
env = gym.make('Taxi-v3')

Q = np.zeros((env.observation_space.n, env.action_space.n))
N = np.zeros((env.observation_space.n, env.action_space.n))

def epsilon_greedy_policy(state, epsilon):
    if np.random.random() < epsilon:
        return env.action_space.sample()
    else:
        return np.argmax(Q[state])

total_rewards = []

for episode in range(n_episodes):
    state, info = env.reset()
    epsilon = max(epsilon_min, epsilon_max - (epsilon_max - epsilon_min) * (episode / n_episodes))

    episode_states = []
    episode_actions = []
    episode_rewards = []

    for step in range(max_steps):
        action = epsilon_greedy_policy(state, epsilon)

        next_state, reward, done, _, _ = env.step(action)

        episode_states.append(state)
        episode_actions.append(action)
        episode_rewards.append(reward)

        if done:
            break

        state = next_state

    G = 0
    for t in range(len(episode_states) -1, -1, -1):
        state = episode_states[t]
        action = episode_actions[t]
        G = gamma * G + episode_rewards[t]

        N[state, action] += 1
        Q[state, action] += (alpha * (G - Q[state, action]))
    
    total_rewards.append(sum(episode_rewards))

    if (episode + 1) % 1000 == 0:
        avg_reward = np.mean(total_rewards[-1000:])
        print(f"Episode: {episode + 1}, Average Reward (1000ep): {avg_reward:.2f}")

print('Training Complete')

Episode: 1000, Average Reward (1000ep): -391.62
Episode: 2000, Average Reward (1000ep): -389.46
Episode: 3000, Average Reward (1000ep): -387.92
Episode: 4000, Average Reward (1000ep): -389.66
Episode: 5000, Average Reward (1000ep): -385.15
Episode: 6000, Average Reward (1000ep): -388.55
Episode: 7000, Average Reward (1000ep): -385.45
Episode: 8000, Average Reward (1000ep): -387.34
Episode: 9000, Average Reward (1000ep): -383.06
Episode: 10000, Average Reward (1000ep): -381.17
Episode: 11000, Average Reward (1000ep): -380.45
Episode: 12000, Average Reward (1000ep): -380.25
Episode: 13000, Average Reward (1000ep): -381.25
Episode: 14000, Average Reward (1000ep): -378.51
Episode: 15000, Average Reward (1000ep): -376.75
Episode: 16000, Average Reward (1000ep): -374.76
Episode: 17000, Average Reward (1000ep): -376.70
Episode: 18000, Average Reward (1000ep): -376.01
Episode: 19000, Average Reward (1000ep): -374.14
Episode: 20000, Average Reward (1000ep): -372.00
Episode: 21000, Average Rewar

In [48]:
# Test the learned policy
n_test_episodes = 100
test_rewards = []

for _ in range(n_test_episodes):
    state = env.reset()[0]
    episode_reward = 0
    
    for _ in range(max_steps):
        action = np.argmax(Q[state])
        next_state, reward, done, _, _ = env.step(action)
        episode_reward += reward
        if done:
            break
        state = next_state
    
    test_rewards.append(episode_reward)

print(f"Average reward over {n_test_episodes} test episodes: {np.mean(test_rewards):.2f}")

Average reward over 100 test episodes: -35.87


In [49]:
def action_to_symbol(action):
    return ['↓', '↑', '→', '←', 'p', 'd'][action]

def summarize_policy(Q, env, n=10):
    print("Sample policy (first {} states):".format(n))
    ACTION_MEANINGS = ['South (↓)', 'North (↑)', 'East (→)', 'West (←)', 'Pickup (p)', 'Dropoff (d)']
    for state in range(n):
        best_action = np.argmax(Q[state])
        symbol = action_to_symbol(best_action)
        print(f"State {state}: {symbol} ({ACTION_MEANINGS[best_action]})")

summarize_policy(Q, env, 100)

Sample policy (first 100 states):
State 0: ↓ (South (↓))
State 1: p (Pickup (p))
State 2: p (Pickup (p))
State 3: ↓ (South (↓))
State 4: ↓ (South (↓))
State 5: ↓ (South (↓))
State 6: ↓ (South (↓))
State 7: → (East (→))
State 8: ↓ (South (↓))
State 9: → (East (→))
State 10: ↓ (South (↓))
State 11: ↓ (South (↓))
State 12: → (East (→))
State 13: ↓ (South (↓))
State 14: ↓ (South (↓))
State 15: ↓ (South (↓))
State 16: d (Dropoff (d))
State 17: ↓ (South (↓))
State 18: ↓ (South (↓))
State 19: p (Pickup (p))
State 20: ↓ (South (↓))
State 21: ← (West (←))
State 22: ↓ (South (↓))
State 23: ← (West (←))
State 24: ↓ (South (↓))
State 25: ↓ (South (↓))
State 26: ← (West (←))
State 27: ↓ (South (↓))
State 28: ← (West (←))
State 29: ↓ (South (↓))
State 30: ↓ (South (↓))
State 31: ← (West (←))
State 32: ↓ (South (↓))
State 33: ↓ (South (↓))
State 34: ↓ (South (↓))
State 35: ↓ (South (↓))
State 36: ← (West (←))
State 37: ← (West (←))
State 38: ← (West (←))
State 39: p (Pickup (p))
State 40: ↓ (South (↓

In [50]:
# Simulate one episode with the learned policy
env = gym.make('Taxi-v3', render_mode='human')
n_test_episodes = 3
test_rewards = []
env.reset()
env.render()

for _ in range(n_test_episodes):
    state = env.reset()[0]
    episode_reward = 0
    
    for _ in range(max_steps):
        action = np.argmax(Q[state])
        next_state, reward, done, _, _ = env.step(action)
        env.render()
        episode_reward += reward
        if done:
            break
        state = next_state
    
    test_rewards.append(episode_reward)

print(f"Average reward over {n_test_episodes} test episodes: {np.mean(test_rewards):.2f}")

Average reward over 3 test episodes: 8.00


In [51]:
print_policy(Q,env, 11, 7)

IndexError: index 7 is out of bounds for axis 0 with size 7