In [1]:
import gym
import time
import numpy as np
from tqdm import tqdm

import matplotlib.pyplot as plt
from IPython.display import clear_output

In [2]:
env = gym.make('Taxi-v3')

In [3]:
episodes = 10

for episode in range(1, episodes):
    state = env.reset() #Reset the environment to its original state.
    done = False 
    score = 0
    
    while not done:
        env.render() #Render the environment.
        # time.sleep(0.2)
        state, reward, done, info = env.step(env.action_space.sample()) #Pass a random action.
        score += reward
        
        #Clear output.
        clear_output(wait = True)
    print('Episode: {}\n Score: {}'.format(episode, score))
    
env.close()

Episode: 9
 Score: -821


In [14]:
## Create a Q-table.
actions = env.action_space.n
state = env.observation_space.n

print(f'There are {actions} number of possible actions to take.\n')
print(f'There are {state} possible observation states')

#Initialize q table.
q_table = np.zeros((state, actions))

#Create parameters for the q-learning algo.
num_eposides = 1_000_000
max_steps_per_episode = 100

learning_rate = 0.01
discount_rate = 0.99 #Priority for immediate rewards compared to future rewards.
exploration_rate = 1#Probability of exploring over exploitation.
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.01
rewards_all_episodes = []

There are 6 number of possible actions to take.

There are 500 possible observation states


In [15]:
#Q-learning algo.
for episode in tqdm(range(num_eposides)):
    state = env.reset()
    done = False
    rewards_current_episode = 0

    
    for step in range(max_steps_per_episode):
        
        #Exploration vs exploitation tradeoff.
        exploration_threshold = np.random.uniform(0, 1)
        if exploration_threshold > exploration_rate:
            
            #Look in the q table and take the associated action.
            action = np.argmax(q_table[state,:])
            
        else:
            action = env.action_space.sample() #Take random acion in our action space.
            
            
        new_state, reward, done, info = env.step(action)
        
        #Compute the q values for the q table.
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))
        
        state = new_state
        
        rewards_current_episode += reward
        
        if done == True:
            break
        
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
    
    rewards_all_episodes.append(rewards_current_episode)
    
print('***Training finished.***')    

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1000000/1000000 [23:50<00:00, 698.86it/s]

***Training finished.***





In [16]:
q_table

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 4.98492965,  5.98380248,  5.23264329,  5.68127241,  9.6220697 ,
        -1.00982196],
       [ 8.26583817,  8.82253363,  8.11292257,  9.40390082, 14.11880599,
         2.5231171 ],
       ...,
       [-0.77690634, 10.18454092, -0.6602651 , -0.64753761, -0.8976585 ,
        -0.79884103],
       [-2.18565877, -2.18946634, -2.18586309,  7.81124109, -2.20267622,
        -2.29433851],
       [-0.019999  ,  0.14018377,  0.1372446 , 18.3982542 ,  0.03592773,
        -0.1       ]])

In [17]:
#Calculate and print average reward per thousand episodes.
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes), num_eposides / 1000)
count = 1000

for r in rewards_per_thousand_episodes:
    print(count, ':', str(sum(r / 1000)))
    count += 1000

1000 : -141.605999999998
2000 : -128.39399999999858
3000 : -121.72799999999877
4000 : -106.77699999999932
5000 : -93.19499999999964
6000 : -80.27799999999993
7000 : -65.39500000000018
8000 : -52.4090000000001
9000 : -34.752999999999936
10000 : -22.663999999999987
11000 : -14.578000000000033
12000 : -5.969000000000003
13000 : -1.7150000000000134
14000 : 1.4819999999999995
15000 : 3.7229999999999794
16000 : 5.459999999999973
17000 : 5.84499999999998
18000 : 6.6569999999999725
19000 : 6.320999999999979
20000 : 6.996999999999974
21000 : 7.153999999999967
22000 : 7.232999999999967
23000 : 7.348999999999967
24000 : 7.251999999999967
25000 : 7.509999999999965
26000 : 7.240999999999967
27000 : 7.529999999999958
28000 : 7.299999999999961
29000 : 7.548999999999969
30000 : 7.419999999999961
31000 : 7.45099999999996
32000 : 7.468999999999971
33000 : 7.280999999999965
34000 : 7.367999999999967
35000 : 7.405999999999958
36000 : 7.4569999999999625
37000 : 7.560999999999964
38000 : 7.27199999999996
39

In [8]:
#Visualize agent.
for episode in range(10):
    state = env.reset()
    done = False
    
    print(f'Episode is: {episode}')
    time.sleep(1)
    
    
    for step in range(max_steps_per_episode):
        clear_output(wait = True)
        env.render()
        time.sleep(0.4)
        
        action = np.argmax(q_table[state, :])
        
        new_state, reward, done, info = env.step(action)
        
        if done:
            clear_output(wait = True)
            env.render()
            time.sleep(0.4)
            
            if reward == 1:
                print('***Reached goal.***')
                time.sleep(2)
                clear_output(wait = True)
                
            else:
                print('***Failed!***')
                clear_output(wait = True)
                time.sleep(2)
            break
        
        state = new_state
        
env.close()

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)
***Failed!***
