In [1]:
import gym
import numpy as np

In [2]:
# Which Gym environment to solve. Options include:
#   "Taxi-v3", "CliffWalking-v0"
env_type = "Taxi-v3"
# Instantiate the environment with no render mode requested
env = gym.make(env_type, render_mode=None)

# Sizes of the discrete observation (state) and action spaces
num_states, num_actions = env.observation_space.n, env.action_space.n

# Tabular action-value function indexed as Q[state, action],
# starting at 0 for every state-action pair
Q = np.zeros((num_states, num_actions))

## Training

In [3]:
# Number of episodes to train on
episodes = 500
# Step size (learning rate) of the Q-learning update
alpha = 0.618
# Discount factor applied to the bootstrapped value of the next state;
# 1.0 keeps the update undiscounted
gamma = 1.0

for episode in range(1, episodes + 1):
    # Return (accumulation of all rewards over this episode)
    G = 0
    terminated, truncated = False, False
    state, info = env.reset()
    # An episode ends on EITHER signal: `terminated` (the task reached a
    # terminal state) or `truncated` (e.g. a step limit was hit). Stepping an
    # environment after either flag is set is outside the Gym API contract.
    while not (terminated or truncated):
        # Purely greedy action selection (ties resolve to the lowest index).
        # NOTE(review): there is no epsilon-exploration; this presumably relies
        # on the zero-initialised Q being optimistic under negative step
        # rewards - confirm for any newly added environment.
        action = np.argmax(Q[state, :])
        state2, reward, terminated, truncated, info = env.step(action)
        # A terminal state has no future value, so do not bootstrap from it;
        # a merely truncated episode still bootstraps from the next state.
        future_value = 0.0 if terminated else np.max(Q[state2, :])
        # Q-learning update: move Q[state, action] toward the TD target
        Q[state, action] += alpha * (reward + gamma * future_value - Q[state, action])
        G += reward
        state = state2

    # Report progress every 10th episode
    if episode % 10 == 0:
        print(f'Episode: {episode} Total Reward: {G}')

Episode: 10 Total Reward: -172
Episode: 20 Total Reward: -232
Episode: 30 Total Reward: -276
Episode: 40 Total Reward: -54
Episode: 50 Total Reward: 1
Episode: 60 Total Reward: -283
Episode: 70 Total Reward: -157
Episode: 80 Total Reward: 1
Episode: 90 Total Reward: -19
Episode: 100 Total Reward: -35
Episode: 110 Total Reward: -100
Episode: 120 Total Reward: -2
Episode: 130 Total Reward: 13
Episode: 140 Total Reward: 12
Episode: 150 Total Reward: -66
Episode: 160 Total Reward: 12
Episode: 170 Total Reward: 11
Episode: 180 Total Reward: -157
Episode: 190 Total Reward: -68
Episode: 200 Total Reward: 8
Episode: 210 Total Reward: 10
Episode: 220 Total Reward: 12
Episode: 230 Total Reward: 5
Episode: 240 Total Reward: 1
Episode: 250 Total Reward: -5
Episode: 260 Total Reward: 10
Episode: 270 Total Reward: 15
Episode: 280 Total Reward: -4
Episode: 290 Total Reward: 6
Episode: 300 Total Reward: 6
Episode: 310 Total Reward: 6
Episode: 320 Total Reward: 8
Episode: 330 Total Reward: -10
Episode:

## Inferencing

In [4]:
# Run the learned greedy policy with on-screen rendering and
# observe its performance
env = gym.make(env_type, render_mode="human")
# Return accumulated over the evaluation episode
G = 0
num_steps = 0
terminated, truncated = False, False

try:
    # Fixed seed so the evaluation episode starts from the same state each run
    state, info = env.reset(seed=42)
    # Stop on `truncated` as well as `terminated`: if the greedy policy has
    # not converged it can cycle between states forever, and only the
    # environment's truncation signal would end the episode.
    while not (terminated or truncated):
        action = np.argmax(Q[state, :])
        state, reward, terminated, truncated, info = env.step(action)
        G += reward
        num_steps += 1
finally:
    # Always release the render window, even if stepping raises or the
    # cell is interrupted
    env.close()

print(f'Total Reward: {G}, Steps Taken: {num_steps}')

Total Reward: 8, Steps Taken: 13
