In [1]:
import numpy as np
import gym
import random

In [2]:
# Build the Taxi environment used by every later cell and draw its current grid.
# NOTE(review): render() is called here before any reset(); the captured output
# shows a grid, so this works on the gym version that produced the notebook —
# confirm on the installed version.
env = gym.make('Taxi-v2')
env.render()

+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | :[43m [0m|
|[34;1mY[0m| : |B: |
+---------+



In [3]:
# Read the number of states and actions from the environment's spaces;
# the Q-table is allocated from these two sizes.
state_size = env.observation_space.n
action_size = env.action_space.n
print(action_size, state_size)

6 500


In [4]:
# Q-table: one row per state, one column per action, every estimate starts at 0.
qtable = np.zeros(shape=(state_size, action_size), dtype=float)

In [5]:
# --- Run lengths -----------------------------------------------------------
max_steps = 99            # cap on steps taken inside a single episode
total_episodes = 50000    # number of training episodes
test_episodes = 100       # number of greedy evaluation episodes

# --- Q-learning update -----------------------------------------------------
gamma = 0.618             # discount applied to the next state's best value
learning_rate = 0.7       # step size of each Q-table update

# --- Epsilon-greedy exploration schedule -----------------------------------
max_epsilon = 1.0         # exploration probability at the start of training
min_epsilon = 0.1         # floor the exploration probability decays towards
decay_rate = 0.1          # exponential decay rate, applied once per episode
epsilon = max_epsilon     # current exploration probability (starts at the max)


In [6]:
# Tabular Q-learning: act epsilon-greedily, update the table after every step,
# and shrink epsilon once per episode.
for episode in range(total_episodes):
    state = env.reset()
    step, done = 0, False

    for step in range(max_steps):
        # One random draw per step decides between exploring and exploiting.
        explore_draw = random.uniform(0, 1)

        if explore_draw <= epsilon:
            # Explore: take whatever action the action space samples.
            action = env.action_space.sample()
        else:
            # Exploit: take the action with the highest current estimate.
            action = np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)

        # Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        best_next_value = np.max(qtable[new_state, :])
        td_error = reward + gamma * best_next_value - qtable[state, action]
        qtable[state, action] += learning_rate * td_error

        state = new_state

        if done:
            break

    # Exponential decay of epsilon from max_epsilon towards min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode+1))

In [7]:
# Display the Q-table after training (rows are states, columns are actions).
qtable

array([[  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [ -1.89494435,  -1.44813002,  -1.89494435,  -1.44813002,
         -0.72512948, -10.44813002],
       [ -0.72512948,   0.4447743 ,  -0.72512948,   0.4447743 ,
          2.33782249,  -8.5552257 ],
       ...,
       [  1.02613511,   5.40100727,   1.95002038,  -0.25512368,
         -9.84115874,  -7.        ],
       [ -2.06406381,  -1.44813002,  -1.94568019,  -1.75250301,
         -9.62466593, -10.89816025],
       [ 18.37380265,  10.34918209,  18.21933513,  31.35602094,
          9.24535897,   7.90399906]])

In [8]:
# Reset the environment and show the state index it returns.
# The evaluation cell below calls env.reset() itself at the start of every
# episode, so this call only serves as an inspection of the return value.
env.reset()

174

In [12]:
# Evaluate the learned policy: act greedily with respect to the Q-table for
# `test_episodes` episodes and report the mean return per episode.
rewards = []

for episode in range(test_episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0

    for step in range(max_steps):
        env.render()
        # Pure exploitation: always take the best-valued action for this state.
        action = np.argmax(qtable[state, :])
        new_state, reward, done, info = env.step(action)
        total_rewards += reward
        if done:
            print('-----Episode Done------')
            break

        state = new_state

    # Record the return of EVERY episode, including ones that ran out of steps
    # without finishing. Appending only on `done` would silently drop failed
    # episodes while still dividing by test_episodes, inflating the score.
    rewards.append(total_rewards)

env.close()
print("Score over time "+ str(sum(rewards)/test_episodes))

+---------+
|R: | : :[35mG[0m|
| : : :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : :[42m_[0m: |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : : :[42m_[0m: |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | :[42m_[0m:[35mG[0m|
| : : : : |
| : : : : |
| 

  (South)
+---------+
|R: | : :G|
| : : : : |
| : : :[42m_[0m: |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: | : :G|
| : : : : |
| : :[42m_[0m: : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: | : :G|
| : : : : |
| :[42m_[0m: : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: | : :G|
| : : : : |
|[42m_[0m: : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
|[42m_[0m| : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35m[42mY[0m[0m| : |B: |
+---------+
  (South)
-----Episode Done------
+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m:[43m [0m|
+---------+

+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : :[43m