<a href="https://colab.research.google.com/github/ArrudaJF/Q_Learning_Research/blob/main/Q_Taxi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
#import gym
import numpy as np
import random as rd

#-----------------------------------------------------------------
# Hyperparameters: alpha is the learning rate, gamma is the
# discount factor; epsilon_* and ep_decay_rate drive exploration.
#-----------------------------------------------------------------
alpha, gamma = 0.2, 0.6
epsilon_max, epsilon_min = 0.9, 0.01
ep_decay_rate = 0.1
nA, dim = 6, 5

episodes = 1000

# Action encoding:
#   0 = north, 1 = east, 2 = south, 3 = west, 4 = pickup, 5 = dropoff
#
# actions[a] is the amount added to the integer state index when action a
# is applied. step() only reads the four movement entries; the pickup and
# dropoff entries are simply set to 10.
actions = np.array([-1, dim, 1, -dim, 10, 10], dtype=float)

# Grid of cells; the two special cells are marked with the value 5.
states = np.zeros((dim, dim))

# Each cell (i, j) is identified by the unique integer i + 5*j.
# The agent starts in state 0, and a boolean keeps track of whether
# the agent has picked up the passenger or not.
current_state = 0
passenger = False

# Pickup and dropoff cells; they are fixed for this first part.
# NOTE(review): step() hard-codes pickup at state 19 and dropoff at state 20,
# which is the reverse of the two names below -- confirm which is intended.
states[0, 4] = states[4, 3] = 5
pickup_state = 0 + 5*4
final_state = 4 + 3*5

# One Q-table per phase: before the pickup and while carrying the passenger.
qtable = np.zeros((dim**2, 6))
qtable_passenger = np.zeros_like(qtable)

print(np.shape(qtable))
print(actions)
print(states)

(25, 6)
[-1.  5.  1. -5. 10. 10.]
[[0. 0. 0. 0. 5.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 5. 0.]]


In [3]:
def step(state, action, passenger):
  """Apply one action to the grid world and report the outcome.

  Args:
    state: integer index of the current cell.
    action: action index (0 north, 1 east, 2 south, 3 west, 4 pickup,
      5 dropoff).
    passenger: whether the agent is currently carrying the passenger.

  Returns:
    A tuple (next_state, reward, passenger, done):
      * a movement that does not cross the grid edge shifts the state by
        actions[action] and costs -1;
      * pickup (action 4) in state 19 without a passenger gives +5 and
        sets the passenger flag;
      * dropoff (action 5) in state 20 with a passenger gives +10, clears
        the flag and ends the episode;
      * anything else (blocked move, misplaced pickup/dropoff) leaves the
        state and flag unchanged and costs -20.
  """
  # For every movement action, the states lying on the edge that the move
  # would cross; from those states the move is not allowed.
  edge_states = {
      0: [dim*k for k in range(dim)],
      1: range(dim*(dim - 1), dim*dim),
      2: [dim*k - 1 for k in range(1, dim + 1)],
      3: range(dim),
  }

  if action in edge_states and state not in edge_states[action]:
    moved_to = int(state + actions[action])
    return moved_to, -1, passenger, False

  here = int(state)

  picking_up = action == 4 and state == 19 and passenger == False
  if picking_up:
    return here, 5, True, False

  dropping_off = action == 5 and state == 20 and passenger == True
  if dropping_off:
    return here, 10, False, True

  # Illegal or useless action: stay put and take the penalty.
  return here, -20, passenger, False

In [4]:
def update_Q(current_state, action, next_state, reward, qtable):
  """Update one Q-table entry in place with the Q-learning rule.

      q(s,a) <- (1 - alpha)*q(s,a) + alpha*(reward + gamma*max_a' q(s',a'))

  Here s is current_state, s' is next_state, and the max runs over every
  action available in s'. alpha (learning rate) and gamma (discount
  factor) are read from module level.

  Args:
    current_state: row index of the state the action was taken from.
    action: column index of the action that was taken.
    next_state: row index of the state that was reached.
    reward: reward received for the transition.
    qtable: 2-D array indexed [state, action]; modified in place.
  """
  best_future = np.max(qtable[next_state, :])
  target = reward + gamma*best_future
  previous = qtable[current_state, action]
  qtable[current_state, action] = (1 - alpha)*previous + alpha*target

In [5]:
def exploration_rate(epsilon, epsilon_min, ep_decay_rate):
  """Return the exploration rate after one linear decay step.

  The rate is lowered by ep_decay_rate but never below epsilon_min.
  Previously a value just above the floor (for example 0.05 with a decay
  of 0.1) was returned as a zero or negative rate for one step before
  being reset to the floor on the following call.

  Args:
    epsilon: current exploration rate.
    epsilon_min: lowest exploration rate allowed.
    ep_decay_rate: amount subtracted from epsilon per call.

  Returns:
    The decayed exploration rate, clamped to epsilon_min.
  """
  if epsilon <= epsilon_min:
    return epsilon_min
  # Clamp so a single decay step can never undershoot the floor.
  return max(epsilon_min, epsilon - ep_decay_rate)

In [21]:
def _choose_action(q_row, explore_prob):
  """Pick an action index for a single state.

  A uniform draw above explore_prob selects the greedy action (argmax of
  q_row); otherwise an action is drawn uniformly from the non-greedy ones,
  so an exploratory step never repeats the greedy choice.
  """
  greedy = int(np.argmax(q_row))
  if rd.random() > explore_prob:
    return greedy
  return rd.choice([a for a in range(len(q_row)) if a != greedy])


for _ in range(episodes):
  #-----------------------------------------------------------------------------
  # Reset the agent to a random cell, with no passenger and done = False,
  # before starting another episode.
  #-----------------------------------------------------------------------------
  current_state = rd.randint(0, dim*dim - 1)
  passenger = False
  done = False

  #-----------------------------------------------------------------------------
  # Phase 1: reach the passenger. Only `qtable` is trained here.
  # NOTE(review): epsilon_max is not reset at the start of this phase, so
  # after the first episode it starts from whatever the previous episode
  # left behind -- confirm that this schedule is intended.
  #-----------------------------------------------------------------------------
  for _ in range(100):
    act_taken = _choose_action(qtable[current_state], epsilon_max)

    next_state, reward, passenger, done = step(current_state, act_taken, passenger)
    update_Q(current_state, act_taken, next_state, reward, qtable)
    current_state = next_state
    epsilon_max = exploration_rate(epsilon_max, epsilon_min, ep_decay_rate)

    if passenger:
      break

  # The step budget ran out without a pickup: skip the delivery phase.
  # Running it anyway would train qtable_passenger on transitions taken
  # without a passenger (including a rewarded pickup).
  if not passenger:
    continue

  #-----------------------------------------------------------------------------
  # Phase 2: the passenger was picked up; deliver it. Only
  # `qtable_passenger` is trained here, with exploration restarted.
  #-----------------------------------------------------------------------------
  epsilon_max = 0.9
  for _ in range(100):
    act_taken = _choose_action(qtable_passenger[current_state], epsilon_max)

    next_state, reward, passenger, done = step(current_state, act_taken, passenger)
    update_Q(current_state, act_taken, next_state, reward, qtable_passenger)
    current_state = next_state
    epsilon_max = exploration_rate(epsilon_max, epsilon_min, ep_decay_rate)

    if done:
      break


In [22]:
# Print the greedy policy stored in `qtable` as a grid, one symbol per cell:
# arrows for the four moves, P for pickup and D for dropoff.
policy_symbols = ("^", ">", "v", "<", "P", "D")

for row in range(dim):
  row_symbols = [policy_symbols[np.argmax(qtable[row + dim*col])]
                 for col in range(dim)]
  # Two trailing spaces reproduce the separator after the last symbol plus
  # the single-space line terminator of the cell-by-cell version.
  print(" ".join(row_symbols) + "  ")


v > v v v  
v > > v <  
> > > v <  
v v v v <  
> > > P <  


In [23]:
# Print the greedy policy stored in `qtable_passenger` as a grid, one symbol
# per cell: arrows for the four moves, P for pickup and D for dropoff.
policy_symbols = ("^", ">", "v", "<", "P", "D")

for row in range(dim):
  row_symbols = [policy_symbols[np.argmax(qtable_passenger[row + dim*col])]
                 for col in range(dim)]
  # Two trailing spaces reproduce the separator after the last symbol plus
  # the single-space line terminator of the cell-by-cell version.
  print(" ".join(row_symbols) + "  ")

> > > > D  
> ^ ^ ^ ^  
^ ^ ^ > ^  
^ ^ > ^ ^  
> > > ^ ^  


In [24]:
# Round every entry of the passenger Q-table to 5 decimals, in place.
for cell in np.ndindex(dim*dim, 6):
  qtable_passenger[cell] = round(qtable_passenger[cell], 5)

# Show the action values of a few selected states.
for state_index in (0, 1, 2, 6):
  print(qtable_passenger[state_index])


[-7.44679 -0.98126 -2.5     -7.5     -7.49999 -7.5    ]
[ -2.43488  -1.01719  -2.5     -10.3     -15.76497 -10.29949]
[ -2.11029  -2.5      -2.5     -12.54    -10.07376 -15.7656 ]
[  1.05957  -2.5      -2.5      -2.49219 -15.6276  -18.52339]
