In [1]:
import gym
import numpy as np
import pandas as pd

In [2]:
# Build the Taxi-v3 environment and draw its current state as a text grid.
env = gym.make('Taxi-v3')
env.render()

+---------+
|R: | : :[35mG[0m|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+



In [3]:
def train_and_evaluate(env, epochs, q_table, learning_rate=0.1, discount_factor=0.5, exploration=0.1):
    """Run epsilon-greedy tabular Q-learning and log per-episode totals.

    Parameters
    ----------
    env
        Environment whose ``reset()`` returns a state index, whose
        ``step(action)`` returns ``(next_state, reward, done, info)``, and
        whose ``action_space.sample()`` returns a random action.
    epochs : int
        Number of episodes to run.
    q_table : numpy.ndarray
        Table indexed as ``q_table[state, action]``. It is updated IN PLACE
        and the same object is returned; pass ``np.copy(q_table)`` if the
        caller needs to keep the pre-training values.
    learning_rate : float, optional
        Weight given to the new estimate in each update (default 0.1).
    discount_factor : float, optional
        Weight of the best next-state value in the update target (default 0.5).
    exploration : float, optional
        Probability of taking a random action instead of the greedy one
        (default 0.1).

    Returns
    -------
    tuple
        ``(results, q_table)`` where ``results`` is a list of
        ``[episode, total_reward, total_steps]`` rows, one per episode.
    """
    results = []

    for episode in range(epochs):
        state = env.reset()
        total_reward = 0
        total_steps = 0
        done = False

        while not done:
            # Epsilon-greedy: explore with probability `exploration`, otherwise exploit.
            if np.random.uniform(0, 1) < exploration:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])

            next_state, reward, done, _ = env.step(action)

            # Blend the old estimate with the one-step bootstrapped target.
            q_table[state, action] = (1 - learning_rate) * q_table[state, action] + learning_rate * (
                        reward + discount_factor * np.max(q_table[next_state, :]))

            state = next_state
            total_reward += reward
            total_steps += 1

        results.append([episode, total_reward, total_steps])

    return results, q_table

In [4]:
observation_space_size = env.observation_space.n
action_space_size = env.action_space.n
q_table = np.zeros((observation_space_size, action_space_size))  # Initialize Q-table

# Training is cumulative: each phase continues from the table the previous
# phase produced. The episode counts are N + 1 so that the reporting loops,
# which sample every N/10 rows, also include row N.
dfs = []
q_table_snapshots = []
for epochs in (1001, 5001, 10001):
    # train_and_evaluate updates the array it receives in place, so give it a
    # copy; otherwise the earlier snapshot would be overwritten by this phase.
    results, q_table = train_and_evaluate(env, epochs, np.copy(q_table))
    df = pd.DataFrame(results, columns=["Epoch", "Reward", "Steps"])
    dfs.append(df)
    q_table_snapshots.append(q_table)

# Independent Q-table snapshots taken at the end of each phase.
q_table_1000, q_table_5000, q_table_10000 = q_table_snapshots
q_table = q_table_10000

df_1000_epochs = dfs[0]
df_5000_epochs = dfs[1]
df_10000_epochs = dfs[2]

In [5]:
# Print every 100th episode of the first training phase as a one-row table.
report_columns = ["Epoch", "Reward", "Steps"]
for position in range(0, len(df_1000_epochs), 100):
    sampled_row = df_1000_epochs.iloc[position]
    print(pd.DataFrame([sampled_row], columns=report_columns).to_string(index=False))

 Epoch  Reward  Steps
     0    -578    200
 Epoch  Reward  Steps
   100    -254    200
 Epoch  Reward  Steps
   200    -202    169
 Epoch  Reward  Steps
   300    -120    105
 Epoch  Reward  Steps
   400     -54     66
 Epoch  Reward  Steps
   500     -83     77
 Epoch  Reward  Steps
   600    -245    200
 Epoch  Reward  Steps
   700    -122    116
 Epoch  Reward  Steps
   800    -113     89
 Epoch  Reward  Steps
   900    -191    158
 Epoch  Reward  Steps
  1000       0     21


In [6]:
# Print every 500th episode of the second training phase as a one-row table.
report_columns = ["Epoch", "Reward", "Steps"]
for position in range(0, len(df_5000_epochs), 500):
    sampled_row = df_5000_epochs.iloc[position]
    print(pd.DataFrame([sampled_row], columns=report_columns).to_string(index=False))

 Epoch  Reward  Steps
     0     -65     50
 Epoch  Reward  Steps
   500     -81     75
 Epoch  Reward  Steps
  1000     -24     45
 Epoch  Reward  Steps
  1500      -1     22
 Epoch  Reward  Steps
  2000      -4     16
 Epoch  Reward  Steps
  2500       8     13
 Epoch  Reward  Steps
  3000      -3     24
 Epoch  Reward  Steps
  3500     -22     25
 Epoch  Reward  Steps
  4000       0     21
 Epoch  Reward  Steps
  4500      11     10
 Epoch  Reward  Steps
  5000      -4     16


In [7]:
# Print every 1000th episode of the third training phase as a one-row table.
report_columns = ["Epoch", "Reward", "Steps"]
for position in range(0, len(df_10000_epochs), 1000):
    sampled_row = df_10000_epochs.iloc[position]
    print(pd.DataFrame([sampled_row], columns=report_columns).to_string(index=False))

 Epoch  Reward  Steps
     0      -1     22
 Epoch  Reward  Steps
  1000       8     13
 Epoch  Reward  Steps
  2000       6     15
 Epoch  Reward  Steps
  3000      -2     14
 Epoch  Reward  Steps
  4000     -15     18
 Epoch  Reward  Steps
  5000      11     10
 Epoch  Reward  Steps
  6000     -19     22
 Epoch  Reward  Steps
  7000       5     16
 Epoch  Reward  Steps
  8000      -5     17
 Epoch  Reward  Steps
  9000       3     18
 Epoch  Reward  Steps
 10000      10     11


In [8]:
# Phase 2: train a fresh Q-table for each discount factor, then compare the greedy policies

In [9]:
def train_and_evaluate(env, epochs, discount_factor, learning_rate=0.1, exploration=0.2):
    """Train a fresh Q-table with epsilon-greedy Q-learning and log each episode.

    Parameters
    ----------
    env
        Environment exposing ``observation_space.n``, ``action_space.n``,
        ``action_space.sample()``, a ``reset()`` that returns a state index,
        and a ``step(action)`` that returns ``(next_state, reward, done, info)``.
    epochs : int
        Number of training episodes.
    discount_factor : float
        Weight of the best next-state value in the update target.
    learning_rate : float, optional
        Weight given to the new estimate in each update (default 0.1).
    exploration : float, optional
        Probability of taking a random action instead of the greedy one
        (default 0.2).

    Returns
    -------
    list
        One ``[episode, total_reward, total_steps, q_table]`` row per episode.
        The fourth element of EVERY row is the same array object (the table
        being trained), not a per-episode snapshot, so after training all
        rows expose the final table.
    """
    observation_space_size = env.observation_space.n
    action_space_size = env.action_space.n

    # Every call starts from an all-zero table.
    q_table = np.zeros((observation_space_size, action_space_size))

    results = []

    for episode in range(epochs):
        state = env.reset()
        total_reward = 0
        total_steps = 0
        done = False

        while not done:
            # Epsilon-greedy: explore with probability `exploration`, otherwise exploit.
            if np.random.uniform(0, 1) < exploration:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])

            next_state, reward, done, _ = env.step(action)

            # Blend the old estimate with the one-step bootstrapped target.
            q_table[state, action] = (1 - learning_rate) * q_table[state, action] + learning_rate * (reward + discount_factor * np.max(q_table[next_state, :]))

            state = next_state
            total_reward += reward
            total_steps += 1

        # Stores a reference to the live table (no copy) to keep the log cheap.
        results.append([episode, total_reward, total_steps, q_table])

    return results

def evaluate_agent(env, q_table, episodes):
    """Roll out the greedy policy of ``q_table`` and average the outcomes.

    For each of ``episodes`` rollouts the environment is reset and the action
    with the highest Q-value for the current state is taken until the
    environment reports ``done``. No exploration and no learning happen here.

    Returns
    -------
    tuple
        ``(avg_reward, avg_steps)``: the mean total reward and the mean
        number of steps over all rollouts.
    """
    rewards_per_episode = []
    steps_per_episode = []

    for _ in range(episodes):
        observation = env.reset()
        reward_sum, step_count, finished = 0, 0, False

        while not finished:
            greedy_action = np.argmax(q_table[observation, :])
            observation, step_reward, finished, _ = env.step(greedy_action)
            reward_sum += step_reward
            step_count += 1

        rewards_per_episode.append(reward_sum)
        steps_per_episode.append(step_count)

    return np.mean(rewards_per_episode), np.mean(steps_per_episode)

In [10]:
# Phase 2 setup: a new environment, the episode budget, and the discount factors to compare.
env = gym.make('Taxi-v3')
epochs = 10000  # training episodes per discount factor
discount_factors = [0.3, 0.5, 0.9]
results = []  # collects one [discount_factor, avg_reward, avg_steps] row per run

In [11]:
# Start from an empty list in this cell so that re-running it cannot append
# duplicate rows to a list left over from a previous run.
results = []
for discount_factor in discount_factors:
    # Each call trains a new Q-table from zeros; every row of the returned log
    # carries that table as its fourth element, so the last row gives the final one.
    training_results = train_and_evaluate(env, epochs, discount_factor)
    final_q_table = training_results[-1][3]  # Retrieve the final Q-Table after training
    # Score the greedy policy over 10 rollouts.
    avg_reward, avg_steps = evaluate_agent(env, final_q_table, episodes=10)
    results.append([discount_factor, avg_reward, avg_steps])

df_results = pd.DataFrame(results, columns=["Discount Factor", "Average Reward", "Average Steps"])


In [12]:
# Show the summary table: one row per discount factor.
df_results

Unnamed: 0,Discount Factor,Average Reward,Average Steps
0,0.3,-33.2,50.0
1,0.5,7.5,13.5
2,0.9,9.5,11.5
