In [1]:
import gym
import numpy as np
import random
import math

## Define environment

In [2]:
env = gym.make("Taxi-v3")

In [3]:
env.reset()
env.render()

+---------+
|R: | : :G|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+



## Actions

In [4]:
#Sample actions for exploration:
env.action_space.n, env.observation_space.n
# print(env.step(1))
# env.render()

(6, 500)

In [5]:
state = env.encode(4, 2, 3, 2) 
print("State:", state)
env.s = state
env.render()

State: 454
+---------+
|R: | : :G|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+



## Initialization

In [18]:
num_episodes = 15000 #20000 #60000
gamma = 0.95 #0.99
learning_rate = 0.1 #0.95 #0.85
epsilon = 0.3#1 #0.15 #0.1

# initialize the Q table
Q = np.zeros([env.observation_space.n, env.action_space.n])
Q

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

## Training the Q-table

In [19]:

def train_Qtable(Q, env, num_episodes, epsilon, gamma, lr_rate):
    '''
    function trains Q table with given parameters
    Args:
        Q (numpy array): Q table which will be updated
        env (gym environment)
        num_episodes (int): number of games that will be played during training
        epsilon (int): probability threshold
        gamma (int): discount rate
        lr_rate (int): learning rate
    Returns:
        Q_optimal (numpy array): updated Q table which is converged to optimal
    '''

    Q_old = Q.copy()
    for i in range(num_episodes):
        # define initial state
        state = env.reset()
        done = False
        while done == False:
            # First we select an action:
            if random.uniform(0, 1) < epsilon: # take a random number
                action = env.action_space.sample() # Explore action space
            else:
                action = np.argmax(Q[state,:]) # Exploit learned values
            # Then we perform the action and receive the feedback from the environment
            new_state, reward, done, info = env.step(action)
            # Finally we learn from the experience by updating the Q-value of the selected action
            update = reward + (gamma*np.max(Q[new_state,:])) - Q[state, action]
            Q[state,action] += learning_rate*update 
            if (Q_old == Q).all():
                print("Q table has been converged to optimal in {}th iteration ".format(i))
                return Q
            Q_old = Q.copy()
            state = new_state

    # even if Q table will not converge to optimal return latest updated Q table
    return Q


In [20]:
# train Q table
Q_optimal = train_Qtable(Q, env, num_episodes, epsilon, gamma, learning_rate)

Q table has been converged to optimal in 1481th iteration 


In [21]:
print("Q table with optimal values:\n", Q_optimal )

Q table with optimal values:
 [[ 0.          0.          0.          0.          0.          0.        ]
 [-1.66748262 -1.60343154 -3.42653199 -0.24645887  5.07306024 -8.50226698]
 [ 3.1020709   2.90835883 -0.17783154  1.67353439 10.93790206 -2.6644929 ]
 ...
 [-0.63943171  5.98714514 -1.16298197 -0.10183063 -5.05831399 -3.97650524]
 [-2.56659323 -2.54138504 -2.30179111  1.18526024 -5.2850076  -9.26522599]
 [-0.271       1.00335038  2.65660139 16.25273594  0.143404   -0.15508807]]


## Using the Q-table

In [23]:
def launch_game(Q, env):
    '''
    launch game with optimal Q value
    Args:
        Q (numpy array): Q table with optimal values
        env (gym environment)
    '''

    # define initial state
    state = env.reset()
    env.render()
    done = False
    while done == False:
        # Take the action (index) with the maximum expected discounted future reward given that state
        action = np.argmax(Q[state,:])
        state, reward, done, info = env.step(action)
        env.render()

In [24]:
print("\nlaunch game with optimal Q values\n")
launch_game(Q_optimal, env)


launch game with optimal Q values

+---------+
|[34;1mR[0m: | : :[35mG[0m|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+

+---------+
|[34;1m[43mR[0m[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|[42mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Pickup)
+---------+
|R: | : :[35mG[0m|
|[42m_[0m: | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
|[42m_[0m: : : : |
| | : | : |
|Y| : |B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| :[42m_[0m: : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : :[42m_[0m: : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : :[4