In [1]:
import gym
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Create the Taxi environment; later cells use it through the global `env`.
env = gym.make("Taxi-v2")
# Alternative environment, kept disabled for experimentation:
#env = gym.make("FrozenLake-v0")

In [3]:
# Smoke test of the environment API: render the starting board, then take up
# to 10 random actions, rendering the board after each one.
env.reset()
env.render()                 # Show the board right after reset

for _ in range(10):
    # Sample an arbitrary action from the action space and apply it.
    obs, reward, done, info = env.step(env.action_space.sample())
    # obs    - board state reached by this step
    # reward - reward obtained for this step
    # done   - whether the episode is finished
    # info   - diagnostic info
    env.render()             # Show the board after the step
    if done:
        # A finished episode must be reset before stepping again, so stop
        # the demo early instead of stepping a terminated environment.
        break

env.close()
# NOTE(review): `env` is used again by later cells after this close();
# confirm close() is harmless for this environment or move it to the end.

# TODO: check whether env.render() returns a value in addition to printing.

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |B: |
+---------+

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |B: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |B: |
+---------+
  (Pickup)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| | :[43m [0m| : |
|Y| : |B: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| :[43m [0m|B: |
+---------+
  (South)
+---------+
|[35mR[0m:

In [4]:
# Report the size of each discrete space exposed by the environment.
for label, space in (
    ("Num Observation space: {}", env.observation_space),  # number of possible board states
    ("Num Action space: {}", env.action_space),            # number of possible actions
):
    print(label.format(space.n))

Num Observation space: 500
Num Action space: 6


In [5]:
# Draw one random state from the observation space and confirm that the
# space reports it as a member.
x = env.observation_space.sample()
is_valid_state = env.observation_space.contains(x)
print(is_valid_state)

True


In [6]:
# Q-table: one row per state and one column per action, every entry starting at zero.
Q_table = np.zeros(shape=(env.observation_space.n, env.action_space.n))

In [7]:
# --- Q-learning hyperparameters ---
lr = 0.8                      # learning rate used in the Q-table update
gamma = 0.95                  # discount factor applied to the next state's best Q-value
n_episodes = 20000            # number of training episodes
max_steps = 100               # cap on the number of steps per episode

# --- Epsilon-greedy exploration schedule ---
max_epsilon = 1.0             # exploration probability at the start
min_epsilon = 0.01            # floor for the exploration probability
decay_rate = 0.005            # exponential decay rate of the exploration probability
epsilon = max_epsilon         # current exploration rate, starting at the maximum

In [8]:
def eps(episode):
    """Return the exploration rate for the given episode index.

    The value is `min_epsilon` plus the gap `max_epsilon - min_epsilon`
    scaled by `exp(-decay_rate * episode)`; all three settings are read
    from module-level variables.
    """
    span = max_epsilon - min_epsilon
    return min_epsilon + span * np.exp(-decay_rate * episode)

In [9]:
def choose_action(episode, current_state=None):
    """Pick an action with an epsilon-greedy policy.

    Parameters
    ----------
    episode : int
        Episode index; passed to `eps` to obtain the exploration rate.
    current_state : int, optional
        Row of `Q_table` to act from. When omitted, the module-level
        `state` variable is used, which keeps existing one-argument
        calls working unchanged.

    Returns
    -------
    The index of the largest Q-value for the state (exploitation), or a
    random sample from `env.action_space` (exploration).
    """
    # Fall back to the global only when no state was passed. Compare with
    # `is None` because 0 is a valid row index of the Q-table.
    if current_state is None:
        current_state = state

    exp_exp_tradeoff = random.uniform(0, 1)
    epsilon = eps(episode)

    # Draw above epsilon --> exploitation: take the best known action.
    if exp_exp_tradeoff > epsilon:
        return np.argmax(Q_table[current_state, :])

    # Otherwise --> exploration: take a random action.
    return env.action_space.sample()

In [10]:
# Tabular Q-learning: play n_episodes episodes and update Q_table after every step.
rewards = []
for episode in range(n_episodes):
    state = env.reset()      # begin a fresh episode
    steps = 0                # step counter for this episode
    done = False
    total_rewards = 0

    for steps in range(1, max_steps + 1):
        # choose_action is called with the episode only; it reads the
        # module-level `state` assigned in this loop.
        action = choose_action(episode)

        new_state, reward, done, info = env.step(action)
        total_rewards += reward

        # Move Q(s, a) a fraction `lr` of the way toward
        # reward + gamma * max over a' of Q(s', a').
        td_target = reward + gamma * np.max(Q_table[new_state, :])
        Q_table[state, action] += lr * (td_target - Q_table[state, action])

        # Continue from the state just reached.
        state = new_state

        # Stop as soon as the environment reports the episode finished.
        if done:
            break

    # Record this episode's exploration rate in the module-level `epsilon`.
    epsilon = eps(episode)
    rewards.append(total_rewards)

print("Score over time: " + str(sum(rewards) / n_episodes))
print(Q_table)

Score over time: 4.17215
[[  0.           0.           0.           0.           0.
    0.        ]
 [  5.18681361   6.52293747   5.20922427   6.45683264   7.93349184
   -2.46353722]
 [  7.92023525   9.40280489   7.43435732   9.38026858  10.9512375
    0.38169366]
 ...
 [ -1.758464     9.39072747  -1.8432      -2.4050688   -9.984
   -9.999872  ]
 [ -5.42036464   6.4803555   -5.40907952  -5.09176845 -12.01910456
  -13.24976216]
 [ 16.0999995   14.29480474  16.09999999  18.           7.1
    7.09999997]]


In [15]:
# Greedy rollout: play one episode that always takes the action with the
# highest Q-value, rendering the board after every step.
# The reset observation must be stored in `state`, because the first argmax
# below has to start from the freshly reset board, not from a leftover value.
state = env.reset()
env.render()
total_reward = 0
for steps in range(max_steps):
    action = np.argmax(Q_table[state,:])
    state, reward, done, info = env.step(action)
    total_reward += reward
    env.render()
    if done:
        print("Finished Episode")
        print("Total Average Reward: "+str(total_reward))
        break

+---------+
|[43mR[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

+---------+
|R: | : :[35mG[0m|
|[43m [0m: : : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG