In [1]:
import sys
# NOTE(review): hardcoded, machine-specific site-packages path appended to the
# module search path before the third-party imports below. Presumably a
# workaround for a kernel that does not see those packages -- confirm, and
# prefer selecting the right kernel/environment so the notebook is portable.
sys.path.append("/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages")
import gymnasium as gym
import numpy as np
# NOTE(review): seaborn is not referenced in the cells visible here; it may be
# used further down the notebook.
import seaborn
from tqdm import tqdm #progress bar


In [2]:
from collections import defaultdict
import matplotlib.pyplot as plt #drawing plots
from matplotlib.patches import Patch #draw shapes
from IPython.display import clear_output

In [3]:
# Create the Taxi-v3 environment shared by the rest of the notebook.
env = gym.make("Taxi-v3")

In [4]:
# Episode-finished flag, followed by a reset that yields the first observation
# and the accompanying info dict.
done = False
observation, info = env.reset()

In [5]:
# Take a single randomly sampled action and unpack the five values that
# env.step() returns.
action = env.action_space.sample()
observation, reward, terminated, truncated, info = env.step(action)

In [7]:
class TaxiAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    The agent relies on the notebook-level ``env``: it is read lazily to size
    a new Q-table row (``env.action_space.n``) and to sample exploratory
    actions, so ``env`` must be defined before those code paths run.
    """

    def __init__(
        self,
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float = 0.95,
    ) -> None:
        """Initialise the agent with an empty Q-table.

        Args:
            learning_rate: Step size applied to each temporal-difference update.
            initial_epsilon: Starting probability of taking a random action.
            epsilon_decay: Amount subtracted from epsilon by ``decay_epsilon``.
            final_epsilon: Lower bound that epsilon never drops below.
            discount_factor: Discount applied to the best next-state Q-value.
        """
        # Unseen states get a zero vector with one entry per action; the
        # lambda defers the ``env`` lookup until a state is first accessed.
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))
        self.lr = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon
        # Temporal-difference error recorded by every call to ``update``.
        self.training_error = []

    def get_action(self, obs: int) -> int:
        """Return a random action with probability epsilon, else the greedy one.

        Args:
            obs: Current observation, used as the Q-table key.
        """
        # np.random is a module and cannot be called; np.random.random()
        # draws a uniform float in [0, 1).
        if np.random.random() < self.epsilon:
            return env.action_space.sample()
        return int(np.argmax(self.q_values[obs]))

    def update(
        self,
        obs: int,
        action: int,
        reward: float,
        termianted: bool,
        next_obs: int,
    ) -> None:
        """Apply one Q-learning update for the transition just observed.

        Args:
            obs: Observation the action was taken from.
            action: Index of the action taken.
            reward: Reward received for the transition.
            termianted: Whether the episode terminated on this transition.
                The misspelled name is kept so existing keyword callers keep
                working.
            next_obs: Observation reached after the action.
        """
        # Use the argument itself; reading a global named ``terminated`` would
        # pick up stale notebook state or fail on a fresh kernel.
        terminated = termianted
        # A terminal transition has no future value to bootstrap from.
        future_q_value = 0 if terminated else np.max(self.q_values[next_obs])
        current_q_value = self.q_values[obs][action]
        temporal_difference = (
            reward + self.discount_factor * future_q_value - current_q_value
        )
        self.q_values[obs][action] = current_q_value + self.lr * temporal_difference
        self.training_error.append(temporal_difference)

    def decay_epsilon(self) -> None:
        """Reduce epsilon by ``epsilon_decay``, clamped at ``final_epsilon``."""
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)


In [8]:
# Training hyperparameters.
learning_rate = 0.01
n_episodes = 10_000
start_epsilon = 1.0
# Per-episode decrement: epsilon would reach zero after half of the episodes
# if it were not clamped by final_epsilon.
epsilon_decay = start_epsilon / (n_episodes / 2)
final_epsilon = 0.1

In [9]:
# Build the agent from the hyperparameters defined above; discount_factor is
# left at the class default.
agent = TaxiAgent(
    learning_rate=learning_rate, initial_epsilon=start_epsilon,
    epsilon_decay=epsilon_decay, final_epsilon=final_epsilon,
)