In [42]:
import numpy as np
import gym
import random

In [44]:
env = gym.make('Taxi-v2')

In [45]:
action_size = env.action_space.n
state_size = env.observation_space.n

In [46]:
q_table = np.zeros((state_size, action_size))
q_table

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [47]:
total_episodes = 50000
alpha = 0.9
max_steps = 500
gamma = 0.95

epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.001

In [48]:
rewards = []

for episode in range(total_episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        tradeoff = random.uniform(0,1)
        
        if tradeoff > epsilon:
            action = np.argmax(q_table[state,:])
        else:
            action = env.action_space.sample()
            
        new_state, reward, done, info = env.step(action)
        
        q_table[state, action] = q_table[state, action] + \
                                    alpha * (reward + gamma*np.max(q_table[new_state,:]) - q_table[state, action])
            
        total_rewards += reward
        state = new_state
        
        if done == True:
            break
        
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    rewards.append(total_rewards)
    
print("Score over time: " + str(sum(rewards)/total_episodes))
print(q_table)

Score over time: 3.06962
[[  0.           0.           0.           0.           0.
    0.        ]
 [244.70475208 258.63658114 244.70475208 258.63658114 273.30166436
  249.63658114]
 [273.30166436 288.73859406 273.30166435 288.73859406 304.98799375
  279.73859406]
 ...
 [304.69112122 322.092625   304.95912193 287.51781125 287.45241386
  295.86237587]
 [242.05286163 219.40639763 244.61892931 258.63658114 235.37693873
  235.56950597]
 [349.54906052 326.69000868 347.62702199 379.         339.63686903
  347.24208238]]


In [50]:
env.reset()

for episode in range(500):
    state = env.reset()
    step = 0
    done = False
    
    print("****************************************************")
    print("EPISODE ", episode)
    
    for step in range(max_steps):
        action = np.argmax(q_table[state,:])
        
        new_state, reward, done, info = env.step(action)
        
        if done:
            env.render()
            print("Number of steps: ", step)
            print("Reward is: ", reward)
            break
        state = new_state
        
print("Score over time: " + str(sum(rewards)/total_episodes))
env.close()
    

****************************************************
EPISODE  0
+---------+
|[35m[42mR[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Number of steps:  9
Reward is:  20
****************************************************
EPISODE  1
+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Number of steps:  8
Reward is:  20
****************************************************
EPISODE  2
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
Number of steps:  11
Reward is:  20
****************************************************
EPISODE  3
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
Number of steps:  14
Reward is:  20
****************************************************
EPISODE  4
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m

Reward is:  20
****************************************************
EPISODE  158
+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Number of steps:  11
Reward is:  20
****************************************************
EPISODE  159
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
Number of steps:  12
Reward is:  20
****************************************************
EPISODE  160
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35m[42mY[0m[0m| : |B: |
+---------+
  (Dropoff)
Number of steps:  7
Reward is:  20
****************************************************
EPISODE  161
+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Number of steps:  13
Reward is:  20
****************************************************
EPISODE  162
+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| 

Number of steps:  13
Reward is:  20
****************************************************
EPISODE  301
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
Number of steps:  16
Reward is:  20
****************************************************
EPISODE  302
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35m[42mY[0m[0m| : |B: |
+---------+
  (Dropoff)
Number of steps:  12
Reward is:  20
****************************************************
EPISODE  303
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35m[42mY[0m[0m| : |B: |
+---------+
  (Dropoff)
Number of steps:  14
Reward is:  20
****************************************************
EPISODE  304
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35m[42mY[0m[0m| : |B: |
+---------+
  (Dropoff)
Number of steps:  16
Reward is:  20
****************************************************
EPISODE  305
+---------+
|R: | : :G|
| : : : : 

+---------+
|[35m[42mR[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Number of steps:  9
Reward is:  20
****************************************************
EPISODE  431
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
Number of steps:  13
Reward is:  20
****************************************************
EPISODE  432
+---------+
|[35m[42mR[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Number of steps:  12
Reward is:  20
****************************************************
EPISODE  433
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35m[42mY[0m[0m| : |B: |
+---------+
  (Dropoff)
Number of steps:  12
Reward is:  20
****************************************************
EPISODE  434
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
Number of steps:  15
Re