In [8]:
import gymnasium as gym
import numpy as np
import imageio.v3 as iio # Library to handle image/video file saving
import time
import os

# --- Part 1: Training the Q-Learning Agent ---
# Tabular Q-learning on Taxi-v3 with an epsilon-greedy behaviour policy.
# Output of this section: `Q_matrix`, indexed as Q_matrix[state, action],
# which the visualization section below reads to act greedily.

# 1. Environment and Q-Table Setup
# No render_mode is passed for training, as we don't need frames yet.
env = gym.make("Taxi-v3")
Q_matrix = np.zeros((env.observation_space.n, env.action_space.n))

# 2. Hyperparameters
LEARNING_RATE = 0.4            # alpha: step size of each Q update
DISCOUNT_FACTOR = 0.6          # gamma: weight of the bootstrapped next-state value
EXPLORATION_RATE = 0.7         # epsilon at the START of training (kept constant; see `epsilon`)
EXPLORATION_DECAY = 0.9995     # multiplicative epsilon decay applied after every episode
MIN_EXPLORATION_RATE = 0.01    # floor so some exploration always remains
NUM_EPISODES = 10000
LOG_EVERY = 1000               # print progress every this many episodes
SEED = 42                      # fixes all randomness so Restart & Run All reproduces the run

# Seed every source of randomness used below: the explore/exploit coin flip,
# the random-action sampler, and (via the first reset) the environment itself.
rng = np.random.default_rng(SEED)
env.action_space.seed(SEED)

# 3. Training Loop
print("--- Starting Training ---")
# Working copy of the exploration rate; the constant above is left untouched
# so re-reading it later still shows the configured starting value.
epsilon = EXPLORATION_RATE
try:
    for episode in range(NUM_EPISODES):
        # Passing the seed only on the first reset initialises the env RNG once;
        # later resets (seed=None) keep drawing from that same stream.
        current_state, _ = env.reset(seed=SEED if episode == 0 else None)
        done = False

        while not done:
            # Epsilon-greedy strategy
            if rng.random() < epsilon:
                action = env.action_space.sample()           # Explore
            else:
                action = np.argmax(Q_matrix[current_state])  # Exploit

            # Take action and observe results
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Q-learning update: Q(s,a) += alpha * (target - Q(s,a)).
            # After a terminal transition there is no future return, so the
            # target is the reward alone; a truncated (time-limited) episode
            # still bootstraps from the next state.
            next_max = 0.0 if terminated else np.max(Q_matrix[next_state])
            td_target = reward + DISCOUNT_FACTOR * next_max
            Q_matrix[current_state, action] += LEARNING_RATE * (
                td_target - Q_matrix[current_state, action]
            )

            current_state = next_state

        epsilon = max(MIN_EXPLORATION_RATE, epsilon * EXPLORATION_DECAY)

        if episode % LOG_EVERY == 0:
            print(f"Episode: {episode}")
finally:
    # Release the environment even if training is interrupted.
    env.close()

print("--- Training Complete ---")


# --- Part 2: Visualization and GIF Generation ---
# Runs one greedy episode with the trained `Q_matrix`, collects an RGB frame
# per step, and writes the frames to an animated GIF.

MAX_VISUALIZATION_STEPS = 100   # hard cap on rendered steps (bounds GIF length)
FRAME_DURATION_MS = 500         # imageio's v3 Pillow backend takes GIF frame time in milliseconds

# 1. Setup for Rendering
# A separate environment instance with 'rgb_array' so render() returns pixel data.
env_render = gym.make("Taxi-v3", render_mode="rgb_array")
total_reward = 0
steps = 0
FRAME_COLLECTION = []

print("\n" + "="*40 + "\n")
print("--- Testing and Generating GIF ---")

try:
    state, info = env_render.reset()
    done = False

    # Capture the initial state frame
    FRAME_COLLECTION.append(env_render.render())

    while not done:
        # 2. Exploit: use the trained Q-matrix to choose the best action
        action = np.argmax(Q_matrix[state])

        # 3. Take the action
        state, reward, terminated, truncated, info = env_render.step(action)
        done = terminated or truncated

        total_reward += reward
        steps += 1

        # 4. Capture the frame after the action is taken
        FRAME_COLLECTION.append(env_render.render())

        # Safety break: stop once exactly MAX_VISUALIZATION_STEPS steps were
        # taken (only reported when the episode has not already finished).
        if not done and steps >= MAX_VISUALIZATION_STEPS:
            print(f"Stopping visualization after {MAX_VISUALIZATION_STEPS} steps to prevent infinite loop.")
            break
finally:
    # Clean up the environment even if the rollout fails part-way; the frames
    # already collected are plain arrays and remain usable afterwards.
    env_render.close()

# 5. Save the collected frames as a GIF
gif_filename = "taxi_driver_path.gif"
try:
    # duration is the display time of each frame in MILLISECONDS; loop=0 repeats forever
    iio.imwrite(gif_filename, FRAME_COLLECTION, duration=FRAME_DURATION_MS, loop=0)
    print(f"\nSuccessfully created visualization: **{gif_filename}**")
except Exception as e:
    # Best-effort: a failed write should not hide the episode summary below.
    print(f"Error saving GIF ({type(e).__name__}): {e}")

# Reported regardless of whether the GIF could be written.
print(f"Episode finished in {steps} steps with a total reward of: {total_reward}")

--- Starting Training ---
Episode: 0
Episode: 1000
Episode: 2000
Episode: 3000
Episode: 4000
Episode: 5000
Episode: 6000
Episode: 7000
Episode: 8000
Episode: 9000
--- Training Complete ---


--- Testing and Generating GIF ---

Successfully created visualization: **taxi_driver_path.gif**
Episode finished in 15 steps with a total reward of: 6


In [4]:
import gymnasium as gym
import numpy as np
import time
from IPython.display import clear_output # For use in Jupyter/Colab notebooks

# --- Part 1: Training the Q-Learning Agent ---
# Tabular Q-learning on Taxi-v3 with an epsilon-greedy behaviour policy.
# Leaves `env` open and `Q_matrix` populated for the visualization section,
# which closes `env` itself.

# 1. Environment and Q-Table Setup
# Training needs no rendering, so no render_mode is passed here; the
# visualization section creates its own rendering environment.
env = gym.make("Taxi-v3")
Q_matrix = np.zeros((env.observation_space.n, env.action_space.n))

# 2. Hyperparameters
LEARNING_RATE = 0.4            # alpha
DISCOUNT_FACTOR = 0.6          # gamma
EXPLORATION_RATE = 0.7         # epsilon (starting value; not mutated during training)
EXPLORATION_DECAY = 0.9995     # per-episode multiplicative decay of epsilon
MIN_EXPLORATION_RATE = 0.01    # lower bound on epsilon
NUM_EPISODES = 10000
LOG_EVERY = 1000               # progress print interval, in episodes
SEED = 42                      # single seed for a reproducible training run

# Seed the coin-flip generator and the action sampler; the environment's own
# RNG is seeded through the first reset() call inside the loop.
rng = np.random.default_rng(SEED)
env.action_space.seed(SEED)

# 3. Training Loop
print("--- Starting Training ---")
epsilon = EXPLORATION_RATE  # decayed working value; the constant stays as configured
for episode in range(NUM_EPISODES):
    # Only the first reset carries the seed; subsequent resets continue the
    # already-seeded environment RNG instead of restarting it.
    reset_seed = SEED if episode == 0 else None
    current_state, _ = env.reset(seed=reset_seed)
    done = False

    while not done:
        # Epsilon-greedy strategy
        explore = rng.random() < epsilon
        if explore:
            action = env.action_space.sample()           # Explore: random action
        else:
            action = np.argmax(Q_matrix[current_state])  # Exploit: max Q-value

        # Take action and observe results
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Q-learning update:
        #   Q(s,a) = (1-lr) * Q(s,a) + lr * (reward + gamma * max_a' Q(s',a'))
        # A terminal transition has no successor value, so the bootstrap term
        # is dropped when `terminated`; truncation still bootstraps.
        old_value = Q_matrix[current_state, action]
        next_max = 0.0 if terminated else np.max(Q_matrix[next_state])
        target = reward + DISCOUNT_FACTOR * next_max
        Q_matrix[current_state, action] = (1 - LEARNING_RATE) * old_value + LEARNING_RATE * target

        current_state = next_state

    # Simple decay for the exploration rate over time
    epsilon = max(MIN_EXPLORATION_RATE, epsilon * EXPLORATION_DECAY)

    if episode % LOG_EVERY == 0:
        print(f"Episode: {episode}")

print("--- Training Complete ---")
print("Q-Table head (State 0-4):")
print(Q_matrix[:5])
print("\n" + "="*40 + "\n")


# --- Part 2: Visualization of the Trained Agent ---
# Plays one greedy episode with the trained `Q_matrix` in a 'human' render
# window, pausing between steps so the moves can be followed.

STEP_DELAY_SECONDS = 0.5  # pause after each step to slow the visualization down

# Re-create the environment with 'human' rendering mode for real-time visualization.
# Note: for Jupyter/Colab, you might need 'rgb_array' and imageio to save a GIF instead.
env_render = gym.make("Taxi-v3", render_mode="human")
total_reward = 0
steps = 0

print("--- Testing and Visualizing Trained Agent ---")

try:
    # Reset the environment for the demonstration
    state, info = env_render.reset()
    done = False

    while not done:
        # 1. Exploit: use the trained Q-matrix to choose the best action
        action = np.argmax(Q_matrix[state])

        # 2. Take the action
        state, reward, terminated, truncated, info = env_render.step(action)
        done = terminated or truncated

        total_reward += reward
        steps += 1

        # 3. No explicit render() call is made here; the original notes state
        #    that 'human' mode displays the window automatically.

        # Optional: slow down the visualization
        time.sleep(STEP_DELAY_SECONDS)

        # Optional: clear output for clean console display (useful for 'ansi' render_mode)
        # clear_output(wait=True)
finally:
    # Clean up both environments even when the rollout is interrupted
    # (e.g. the kernel is stopped during the sleep), so no window is left open.
    env_render.close()
    env.close()

# Expected on-screen result (per the original author's notes): a 5x5 grid in a
# separate window, with the taxi moving to the passenger ('R', 'G', 'Y', 'B'),
# picking them up, and dropping them off at the destination.

print(f"\nEpisode finished in {steps} steps with a total reward of: {total_reward}")

--- Starting Training ---
Episode: 0
Episode: 1000
Episode: 2000
Episode: 3000
Episode: 4000
Episode: 5000
Episode: 6000
Episode: 7000
Episode: 8000
Episode: 9000
--- Training Complete ---
Q-Table head (State 0-4):
[[  0.           0.           0.           0.           0.
    0.        ]
 [ -2.41837019  -2.36395111  -2.41837066  -2.3639511   -2.27325184
  -11.363905  ]
 [ -1.87014402  -1.45030817  -1.87014405  -1.45024209  -0.7504
  -10.45024006]
 [ -2.36395069  -2.27325179  -2.36395106  -2.27324301  -2.1220864
  -11.27325083]
 [ -2.49618753  -2.49661449  -2.49618836  -2.49649829 -11.48632629
  -11.49325828]]


--- Testing and Visualizing Trained Agent ---

Episode finished in 11 steps with a total reward of: 10


In [3]:
# Tabular Q-learning training loop with an epsilon-greedy action choice.
# NOTE(review): `episodes`, `exploration_factor`, `lr`, `discount_factor`, `env`
# and `Q_matrix` are not defined in this cell; it only runs if an earlier cell
# (or leftover kernel state) provides them — confirm this survives Restart & Run All.
# NOTE(review): within the lines visible here `exploration_factor` is never
# decayed and no RNG is seeded — confirm whether that is intentional.
for e in range(episodes):
    # Start a fresh episode; the info dict returned by reset() is discarded.
    current_state, _ = env.reset()
    done = False

    while not done:
        # 1. Choose action: Explore (random) or Exploit (max Q-value)
        if np.random.uniform(0, 1) < exploration_factor:
            action = env.action_space.sample()  # Explore
        else:
            action = np.argmax(Q_matrix[current_state]) # Exploit

        # 2. Take action and observe results
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode ends on either flag returned by step().
        done = terminated or truncated

        # 3. Update the Q-Table using the Q-Learning formula
        # Q(s,a) = (1-lr) * Q(s,a) + lr * (Reward + discount_factor * max(Q(s', a')))
        # The max over Q_matrix[next_state] is used on every step, including the
        # one on which `terminated` is set.
        Q_matrix[current_state, action] = (1.0 - lr) * Q_matrix[current_state, action] + \
                                           lr * (reward + discount_factor * np.max(Q_matrix[next_state]))

        # Advance to the observed next state for the following step.
        current_state = next_state