<a href="https://colab.research.google.com/github/Aditya-11/Reinforcement-learning-Experiments/blob/master/taxi_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install gym



In [0]:
import gym
import numpy as np
import random 



The agents (taxi) task will to pick up the passenger and drop him to the destination with least number of timesteps.

There are 6 discrete deterministic actions:
* 0: move down
* 1: move up
* 2: move to the right
* 3: move to the left
* 4: pick up a passenger
* 5: drop-off a passenger

The color coding is as follows:
* blue: passenger
* magenta: destination
* yellow: empty taxi
* green: full taxi
* other letters: locations

In [3]:
env = gym.make('Taxi-v3').env

env.reset()

env.render()

print('Action Space ',env.action_space.n)

print('State Space ',env.observation_space.n)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m:[43m [0m|
+---------+

Action Space  6
State Space  500


In [4]:
# check out position 

state = env.encode(3,1,0,2)

print('state is : ',state)

env.s = state

env.render()

state is :  322
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[35mY[0m| : |B: |
+---------+



In [5]:
# see the reward table for a state P -> state * action 
env.P[328]

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [6]:
# solving the env without reinforcement learning

epochs = 0
penalties = 0
reward = 0

frames = [] # for animation 

done = False

while not done:
  action = env.action_space.sample()
  state ,reward, done, info = env.step(action)

  if reward == -10:
    penalties += 1

  # put each frame into dict

  frames.append(
      {
          'frame' : env.render(mode = 'ansi'),
          'state' : state,
          'action' : action,
          'reward' : reward
      }
  )

  epochs += 1

print ('total time steps taken : ',epochs)
print ('penalties incurred : ' , penalties)

# render the frames

from time import sleep
from IPython.display import clear_output

for i in range(len(frames)):
  clear_output(wait = True)
  print ('Timestep : ',i+1)
  print (frames[i]['frame'])
  print ('state -> ',frames[i]['state'])
  print ('action -> ',frames[i]['action'])
  print ('reward -> ',frames[i]['reward'])
  sleep(0.1)



Timestep :  211
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)

state ->  410
action ->  5
reward ->  20


bellman equation to update the value


Q(state,action)←(1−α)Q(state,action)+α(reward+γmaxaQ(next state,all actions))

In [31]:
# Using Q learning and Bellman equation to train the agent

import math

q_table = np.zeros((env.observation_space.n,env.action_space.n,))

# no of episodes to be trained

no_episodes = 135000

# hyper parameters

learning_rate = 0.88
discount_rate = 0.93
#decay_rate = 0.0001

epsilon = 1.0
epsilon_decay = 0.000001
min_epsilon = 0.00005

epochs = 0
penalties = 0

from IPython.display import clear_output

for i in range(no_episodes):
  state = env.reset()

  penalties , reward = 0,0

  done = False

  while not done:
    if random.uniform(0,1) < epsilon:
      action = env.action_space.sample()
    else:
      action = np.argmax(q_table[state , :])

    next_state , reward , done , info = env.step(action)

    if (reward == -10):
      penalties += 1

    next_max = np.max(q_table[next_state])

    q_table[state,action] = (1 - learning_rate) * q_table[state,action] + learning_rate * (reward + discount_rate * next_max)

    state = next_state

    epochs += 1

    if (epsilon > min_epsilon):
      epsilon = epsilon -  epsilon_decay
    else:
      epsilon = epsilon 
  
  if (i%100==0):
    clear_output(wait=True)
    print(f"Episode: {i}")
    print("epsilom : ", epsilon)
    #print(env.render())


print(env.render())

print("Average Epochs per episode : {}".format(epochs/no_episodes))

print("Average Penalties per episode : {}".format(penalties/no_episodes))



print(q_table)


Episode: 134900
epsilom :  4.999999208169932e-05
+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
None
Average Epochs per episode : 17.036585185185185
Average Penalties per episode : 0.0
[[ 0.          0.          0.          0.          0.          0.        ]
 [ 1.14640728  2.30796482  1.14640728  2.30796482  3.55695142 -6.69203518]
 [ 6.34402985  7.89680629  6.34402985  7.89680629  9.56645838 -1.10319371]
 ...
 [ 9.56645838 11.3617832   9.56645838  7.89680629  0.56645838  0.56645838]
 [ 3.55695142  4.89994776  3.55695142  4.89994776 -5.44304858 -5.44304858]
 [15.368      13.29224    15.368      17.6         6.368       6.368     ]]


In [35]:
# testing the agent

total_epochs , total_penalties = 0,0
episode = 100


for i in range(episode):
  state = env.reset()
  penalties , reward = 0, 0 
  done = False

  while not done: 
    action = np.argmax(q_table[state,:])
    next_state , reward , done , info = env.step(action)
    if (reward == -10):
      total_penalties =+ 1
    #epochs =+ 1
    state = next_state
    total_epochs += 1

print("Average epochs per episode : {}".format(total_epochs/episode))

print("Average penalties per episode : {}".format(total_penalties/episode))


Average epochs per episode : 13.11
Average penalties per episode : 0.0


In [39]:

# programme illustrates the working of agent


episode = 1

total_epochs = 0

for i in range(episode):
  state = env.reset()
  penalties , reward = 0, 0 
  done = False

  while not done: 
    action = np.argmax(q_table[state,:])
    clear_output(wait=True)
    env.render()
    sleep(1.1)
    next_state , reward , done , info = env.step(action)
    state = next_state
    total_epochs += 1
  print("total_epochs :",total_epochs)



+---------+
|R: | : :[35m[42mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
total_epochs : 12
