In [1]:
import gym
import random

#Red — 0 , Green — 1, Yellow — 2, and Blue — 3 for pick up
streets = gym.make("Taxi-v3").env #New versions keep getting released; if -v3 doesn't work, try -v2 or -v4
streets.render()

+---------+
|R: | : :[35mG[0m|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+



In [2]:
##Red — 0 , Green — 1, Yellow — 2, and Blue — 3 for pick up
#Each state is defined by a 4 entries tuple: （taxi_row, taxi_col, passenger_location, destination)
initial_state = streets.reset()

streets.s = initial_state
print(streets.render(mode='ansi'))
#State Space:  25 possible taxi positions, 5 possible locations of the passenger
# 25*5*4 = 500

+---------+
|[43mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+




In [3]:
import numpy as np
#Action space:6 --> N,S,E,W, DROP-OFF, PICKUP
#Rewards: CORRECT FINAL DEST. +20, STEP -1, INCORRECT PICK/DROP -10

q_table = np.zeros([streets.observation_space.n, streets.action_space.n]) # 500 , 6
# a 2D array that represent every possible state and action in the virtual space and initialize all of them to 0
learning_rate = 0.1
discount_factor =  0.5 
exploration = 0.1
epochs = 10000
c = 0
avg_re = 0;
avg_step = 0;

for taxi_run in range(epochs): #Start training (the agent plays the number of epochs)
    state = streets.reset()
    done = False
    reward_steps = 0
    steps=0

    while not done:#each epoch/play contains this number of actions, starting from pickup a passenger until drop-off
        steps +=1
       # print("#####",steps)
        random_value = random.uniform(0, 1)
        if (random_value < exploration):
            action = streets.action_space.sample() # Explore a random action
        else:
            action = np.argmax(q_table[state]) # Return the action with the highest q-value
            
        next_state, reward, done, info = streets.step(action) # Do the above action
        
        prev_q = q_table[state, action]
        next_max_q = np.max(q_table[next_state])
        # see RL-2 PPT file --- slide# 5
        new_q = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)
        
        reward_steps += reward
        
        q_table[state, action] = new_q
        #print(streets.render(mode='ansi'))       
        state = next_state
    if taxi_run % 1000 ==0 :
        print('StepNUM:',c , 'Steps:',steps , "Reward:",reward_steps)
        c+=1
print('StepNUM:',c , 'Steps:',steps , "Reward:",reward_steps)


StepNUM: 0 Steps: 1305 Reward: -3228
StepNUM: 1 Steps: 28 Reward: -7
StepNUM: 2 Steps: 23 Reward: -2
StepNUM: 3 Steps: 19 Reward: 2
StepNUM: 4 Steps: 15 Reward: -3
StepNUM: 5 Steps: 14 Reward: 7
StepNUM: 6 Steps: 10 Reward: 11
StepNUM: 7 Steps: 14 Reward: 7
StepNUM: 8 Steps: 9 Reward: 12
StepNUM: 9 Steps: 14 Reward: 7
StepNUM: 10 Steps: 20 Reward: -8


In [4]:
from IPython.display import clear_output
from time import sleep
lengths=[]
for tripnum in range(1, 11):
    state = streets.reset()
   
    done = False
    trip_length = 0
    steps=0 
    while not done and trip_length < 25:
        action = np.argmax(q_table[state])
        next_state, reward, done, info = streets.step(action)
        steps+=1
        clear_output(wait=True)
        print("Trip number " + str(tripnum) + " Step " + str(trip_length))
        print(streets.render(mode='ansi'))
        sleep(.2)
        state = next_state
        trip_length += 1
    lengths.append(trip_length)
    avg_re += reward  
    avg_step += steps
    sleep(.2)
print('AVG_Reward:' ,avg_re/10 ,' AVG_Steps:',avg_step/10)
avg_len=sum(lengths)/10
print(avg_len)

Trip number 10 Step 11
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)

AVG_Reward: 20.0  AVG_Steps: 13.6
13.6
