In [1]:
import pandas as pd
import numpy as np
import gym
import random

from os import system
from time import sleep
from tqdm import tqdm_notebook
from IPython.display import clear_output


In [2]:
class QLearningAgent():
    """Tabular Q-learning agent for an environment with discrete spaces.

    The agent keeps a table of action values with one row per state and one
    column per action, explores with an epsilon-greedy policy, and updates
    the table with the one-step Q-learning rule.

    Attributes:
        env: The environment the agent interacts with.
        observation_space: ``env.observation_space`` (must expose ``.n``).
        action_space: ``env.action_space`` (must expose ``.n`` and ``.sample()``).
        q_table: ``numpy`` array of shape ``(observation_space.n, action_space.n)``.
        α: Learning rate used in the Q-value update.
        ε: Probability of taking a random action in ``epsilon_greedy``.
        γ: Discount factor applied to the best next-state value.
    """

    def __init__(self, env=None, alpha=0.3, epsilon=0.95, gamma=0.6):
        """Create the agent and a zero-initialised Q-table.

        Args:
            env: Environment to use. When ``None``, ``gym.make('Taxi-v2')``
                is created.
            alpha: Initial learning rate (stored as ``self.α``).
            epsilon: Initial exploration probability (stored as ``self.ε``).
            gamma: Discount factor (stored as ``self.γ``).
        """
        if env is None:
            env = gym.make('Taxi-v2')
        self.env = env
        self.observation_space = env.observation_space
        self.action_space = env.action_space

        # One row per state, one column per action; every value starts at 0.
        self.q_table = np.zeros([self.observation_space.n, self.action_space.n])

        self.α = alpha
        self.ε = epsilon
        self.γ = gamma

    def update_q_value(self, reward, state, action, new_state):
        """Apply one Q-learning update for a single observed transition.

        Args:
            reward: Reward received for taking ``action`` in ``state``.
            state: Index of the state the action was taken in.
            action: Index of the action that was taken.
            new_state: Index of the state reached afterwards.
        """
        current_q = self.q_table[state, action]
        # Target is the reward plus the discounted best value of the next state.
        td_target = reward + self.γ * np.max(self.q_table[new_state])
        self.q_table[state, action] = current_q + self.α * (td_target - current_q)

    def epsilon_greedy(self, state):
        """Pick an action for ``state``: random with probability ε, else greedy.

        Args:
            state: Index of the current state.

        Returns:
            A sampled action from ``action_space`` or the argmax of the
            state's row in the Q-table.
        """
        if self.ε > np.random.rand():
            return self.action_space.sample()
        return np.argmax(self.q_table[state])

    def training(self, epochs=5000):
        """Run ``epochs`` episodes, updating the Q-table after every step.

        Every 500th episode (including episode 0) ε is replaced by ε squared,
        which reduces exploration over time for ε in (0, 1).

        Args:
            epochs: Number of episodes to play.
        """
        for i in tqdm_notebook(range(epochs)):
            done = False
            state = self.env.reset()

            while not done:
                action = self.epsilon_greedy(state)
                next_state, reward, done, _ = self.env.step(action)
                self.update_q_value(reward, state, action, next_state)
                state = next_state
            if i % 500 == 0:
                self.ε *= self.ε

    def run_episode(self, stupid_strategy=False, delay=1):
        """Play and render one episode, printing progress after each step.

        Args:
            stupid_strategy: When true, act by sampling random actions instead
                of following the Q-table.
            delay: Seconds to pause after each rendered step.
        """
        done = False
        state = self.env.reset()
        timestep = 0

        while not done:
            # Clear the previous frame so the board redraws in place.
            clear_output()
            if stupid_strategy:
                action = self.action_space.sample()
            else:
                action = np.argmax(self.q_table[state])
                print(f'action proposed from Q table:{action}')

            state, reward, done, _ = self.env.step(action)
            self.env.render()
            timestep += 1
            # NOTE: the value labelled "maxQ" is the index of the best action
            # (argmax) for the new state, not the maximum Q-value itself.
            print(f'state: {state}, reward: {reward}, maxQ: {np.argmax(self.q_table[state])}')
            sleep(delay)
        print(f'Done! It took {timestep} timesteps!')
                

In [3]:
# Build the agent; with no env argument the constructor creates a 'Taxi-v2' environment.
q_agent = QLearningAgent()

In [4]:
# Train the agent for 5000 episodes, updating its Q-table in place.
q_agent.training(5000)

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




In [5]:
# Play one rendered episode, choosing actions greedily from the Q-table.
q_agent.run_episode()

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
state: 410, reward: 20, maxQ: 0
Done


In [6]:
# Inspect the Q-table: one row per state, one column per action.
q_agent.q_table

array([[  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [ -2.27325184,  -2.1220864 ,  -2.27325184,  -2.1220864 ,
         -1.870144  , -11.1220864 ],
       [ -1.870144  ,  -1.45024   ,  -1.870144  ,  -1.45024   ,
         -0.7504    , -10.45024   ],
       ...,
       [ -0.78816177,   0.416     ,  -0.75052312,  -1.45048939,
         -9.75096754,  -9.7518095 ],
       [ -2.27334986,  -2.12222073,  -2.2732736 ,  -2.1220864 ,
        -11.27326178, -11.27329356],
       [  5.59987562,   2.35999897,   5.59998892,  11.        ,
         -3.40002129,  -3.39999295]])

In [26]:
# Number of discrete states. Read through the agent so this cell does not
# depend on a global `env` that is only created in a later cell.
q_agent.observation_space.n

500

In [2]:
# Random-policy demo: take 40 randomly sampled actions in a fresh Taxi-v2
# environment, redrawing the board and printing the step result each time.
env = gym.make('Taxi-v2')
env.reset()

for i in range(40):
    # Clear the previous frame so the render updates in place in the cell output.
    clear_output()
    a = env.action_space.sample()
    obs, reward, done, info = env.step(a)
    env.render()
    print(f'state: {obs}, reward: {reward}, done: {done}, info: {info}')
    sleep(.5)  # short pause so each frame stays visible
    if done:
        # The episode ended; start a new one and keep stepping.
        env.reset()

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Pickup)
state: 232, reward: -10, done: False, info: {'prob': 1.0}


KeyboardInterrupt: 