Reference video: 
https://www.youtube.com/watch?v=9fAnzZ6xzhA&list=PL58zEckBH8fBW_XLPtIPlQ-mkSNNx0tLS&index=4

In [1]:
# Import necessary libraries
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import pickle as pkl

### Training with Q Table

In [21]:
def taxi(name, episodes, is_training=True, render=False, seed=None):
    """Train or replay a tabular Q-learning agent on Gymnasium's ``Taxi-v3``.

    In training mode a zero-initialised Q-table is learned with an
    epsilon-greedy policy and pickled to ``q_tables/q_{name}.pkl`` (the
    directory is created if it does not exist). In evaluation mode that file
    is loaded and followed greedily; the table is neither updated nor saved.

    Args:
        name: Identifier used in the Q-table file name ``q_tables/q_{name}.pkl``.
        episodes: Number of episodes to run.
        is_training: If True, learn and save a new Q-table; if False, load an
            existing one and act greedily.
        render: If True, create the environment with ``render_mode='human'``.
        seed: Optional seed for the exploration RNG and the first environment
            reset. ``None`` (the default) leaves the run unseeded.

    Returns:
        numpy.ndarray of shape ``(episodes,)`` holding the summed reward of
        each episode.

    Raises:
        FileNotFoundError: In evaluation mode, if the Q-table file is missing.
    """
    import os  # local import so the function does not rely on an extra notebook-level import

    q_path = f'q_tables/q_{name}.pkl'

    env = gym.make(
        'Taxi-v3',
        render_mode='human' if render else None
    )

    try:
        n_actions = env.action_space.n

        # Define Q Table: one row per state, one column per action
        if is_training:
            q = np.zeros((env.observation_space.n, n_actions))
        else:
            # NOTE(security): pickle.load can execute arbitrary code; only load
            # Q-tables that were produced by this notebook.
            with open(q_path, 'rb') as f:
                q = pkl.load(f)

        # Define the hyperparameters
        learning_rate_a = 0.9    # alpha or learning rate
        discount_factor_g = 0.9  # gamma or discount factor

        # Define policy parameters: we use the Epsilon Greedy Algorithm
        epsilon = 1                        # 1 = 100% random actions
        epsilon_decay_rate = 0.0001        # linear decay: epsilon hits 0 after 10,000 episodes
        rng = np.random.default_rng(seed)  # drives both the explore/exploit draw and random actions

        # Keep track of the total reward collected in each episode
        rewards_per_episode = np.zeros(episodes)

        # Iterate through all episodes
        for i in range(episodes):
            # Seed the environment on the first reset only; later resets continue its RNG stream
            state = env.reset(seed=seed if i == 0 else None)[0]
            terminated = False  # True once the passenger has been dropped off at the destination
            truncated = False   # True when the environment's step limit is reached
            episode_reward = 0

            while not terminated and not truncated:
                if is_training and rng.random() < epsilon:
                    # Explore. Taxi-v3 actions: 0-south, 1-north, 2-east, 3-west, 4-pickup, 5-dropoff
                    action = int(rng.integers(n_actions))
                else:
                    # Exploit the best known action for this state
                    action = int(np.argmax(q[state, :]))

                new_state, reward, terminated, truncated, _ = env.step(action)
                episode_reward += reward

                # Apply the Q Learning formula after taking a step
                if is_training:
                    q[state, action] = q[state, action] + learning_rate_a * (
                        reward + discount_factor_g * np.max(q[new_state, :]) - q[state, action]
                    )

                state = new_state

            # Taxi-v3 rewards are -1 per step, -10 for an illegal pickup/dropoff and
            # +20 for a successful delivery, so record the episode total rather
            # than testing for a reward of exactly 1.
            rewards_per_episode[i] = episode_reward

            # Decrease epsilon after each episode until it reaches 0
            epsilon = max(epsilon - epsilon_decay_rate, 0)

            # Once exploration has stopped, shrink the learning rate to stabilise the Q values
            if epsilon == 0:
                learning_rate_a = 0.0001
    finally:
        # Close the environment even if loading or stepping raised
        env.close()

    # Save the Q Table
    if is_training:
        os.makedirs(os.path.dirname(q_path), exist_ok=True)
        with open(q_path, 'wb') as f:
            pkl.dump(q, f)
        print('Training completed!')

    return rewards_per_episode

In [22]:
# Learn a Q-table over 15,000 episodes; `name` picks the file q_tables/q_taxi.pkl
name = 'taxi'
taxi(name, episodes=15000)

Training completed!


In [23]:
# Replay three rendered episodes with the Q-table saved under `name`
taxi(name, episodes=3, render=True, is_training=False)