In [12]:
import gym
import numpy as np
import time
from IPython import display

In [13]:
# Create the Taxi-v3 environment; the cells below read it through the name `env`.
env = gym.make('Taxi-v3')

In [14]:
# number of states in the (discrete) observation space
ns = env.observation_space.n
# number of actions in the (discrete) action space
na = env.action_space.n
print(f'num of states: {ns}\nnum of actions: {na}')

num of states: 500
num of actions: 6


In [15]:
# Peek at the transition model for state 0, action 4: a list of
# (prob, next_state, reward, done) tuples.
env.P[0][4]

[(1.0, 16, -1, False)]

In [16]:
def value_iteration(gamma = 0.9, max_iter = 1000, transitions = None):
    """Estimate state values for a tabular MDP with value iteration.

    Every sweep applies the backup
    ``V(s) = max_a sum_i p_i * (r_i + gamma * V_prev(s'_i))`` to each state,
    always reading the values of the previous sweep (synchronous update).
    Iteration stops early once two consecutive sweeps are element-wise close
    (``np.allclose`` defaults).

    Parameters
    ----------
    gamma : float
        Discount factor applied to the successor state's value.
    max_iter : int
        Maximum number of sweeps over the state space.
    transitions : mapping or sequence, optional
        Transition model indexed as ``transitions[state][action]``; each entry
        is an iterable of ``(prob, next_state, reward, done)`` tuples. States
        and actions must be numbered from 0, and the number of actions is
        taken from ``transitions[0]``. When omitted, the notebook-level
        ``env.P``, ``ns`` and ``na`` are used.

    Returns
    -------
    numpy.ndarray
        One value per state.

    Notes
    -----
    The ``done`` flag of a transition is not used: the successor's value is
    added even when the transition is marked as terminal.
    """
    if transitions is None:
        # Fall back to the globals defined in the earlier notebook cells.
        transitions, n_states, n_actions = env.P, ns, na
    else:
        n_states = len(transitions)
        n_actions = len(transitions[0]) if n_states else 0

    # init v_values
    v_values = np.zeros(n_states)
    for i in range(max_iter):
        pre_v_values = np.copy(v_values)
        # compute v_value for each state
        for state in range(n_states):
            # Expected return of each action. Taxi has exactly one outcome per
            # (state, action), but summing also covers stochastic models such
            # as FrozenLake, where an action has several outcomes.
            q_values = [
                sum(prob * (reward + gamma * pre_v_values[next_state])
                    for prob, next_state, reward, _done in transitions[state][action])
                for action in range(n_actions)
            ]
            # keep the value of the best action
            v_values[state] = np.max(q_values)
        # check convergence
        if np.allclose(pre_v_values, v_values):
            print(f'convergence after: {i} steps')
            break
    return v_values

In [17]:
def policy_extraction(v_value, gamma, transitions = None):
    """Derive the greedy policy for a table of state values.

    For every state the action with the largest expected return
    ``sum_i p_i * (r_i + gamma * v_value[s'_i])`` is chosen; ties go to the
    lowest action index (``np.argmax`` behaviour).

    Parameters
    ----------
    v_value : sequence of float
        Value of each state, indexable by state number.
    gamma : float
        Discount factor applied to the successor state's value.
    transitions : mapping or sequence, optional
        Transition model indexed as ``transitions[state][action]``; each entry
        is an iterable of ``(prob, next_state, reward, done)`` tuples. States
        and actions must be numbered from 0, and the number of actions is
        taken from ``transitions[0]``. When omitted, the notebook-level
        ``env.P``, ``ns`` and ``na`` are used.

    Returns
    -------
    numpy.ndarray
        Integer array holding the chosen action for each state.
    """
    if transitions is None:
        # Fall back to the globals defined in the earlier notebook cells.
        transitions, n_states, n_actions = env.P, ns, na
    else:
        n_states = len(transitions)
        n_actions = len(transitions[0]) if n_states else 0

    # Builtin `int` instead of the `np.int` alias, which NumPy 1.24 removed.
    policy = np.zeros(n_states, dtype=int)
    # find policy for each state
    for state in range(n_states):
        # Expected return of each action, summed over all possible outcomes.
        # Uses the `v_value` argument rather than any notebook-level variable.
        q_values = [
            sum(prob * (reward + gamma * v_value[next_state])
                for prob, next_state, reward, _done in transitions[state][action])
            for action in range(n_actions)
        ]
        # select the best action
        policy[state] = np.argmax(q_values)
    return policy

In [18]:
# Compute the state values with discount 0.9, allowing at most 1000 sweeps.
v_values = value_iteration(gamma=0.9, max_iter=1000)

convergence after: 116 steps


In [19]:
# Turn the computed values into a policy, using the same discount (0.9)
# that was passed to value_iteration.
policy = policy_extraction(v_values, gamma=0.9)

In [20]:
# Show the action index chosen for each state.
print(policy)

[4 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 3 3 3 3 0 0 0 0 0 0 0 0 0 0 0 0 3
 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 4 4 4 4 0 0 0 0 0 0 0 0 0 5 0 0 1 1 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 2 2 2 2 0 0 0 0 2 2 2 2 1 2 0 2 1 1
 1 1 2 2 2 2 3 3 3 3 2 2 2 2 1 2 3 2 3 3 3 3 1 1 1 1 3 3 3 3 2 2 2 2 3 1 3
 2 3 3 3 3 1 1 1 1 3 3 3 3 0 0 0 0 3 1 3 0 3 3 3 3 1 1 1 1 3 3 3 3 0 0 0 0
 3 1 3 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 0 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 0 1 1 1 1 1 1 1
 1 4 4 4 4 1 1 1 1 1 1 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4 4 1 1 1 5 1
 1 1 1 1 1 1 1 1 1 1 1 3 

In [21]:
def play(env, policy):
    """Roll out one rendered episode, acting according to ``policy``.

    ``env.reset()`` is expected to return the initial state and ``env.step()``
    a 4-tuple ``(next_state, reward, done, info)``. After every step the step
    number is printed and the environment is rendered, followed by a
    half-second pause; the previous frame is cleared before the next one is
    drawn, while the final frame is left on screen.

    Parameters
    ----------
    env : environment exposing ``reset``, ``step`` and ``render``
    policy : indexable by state, giving the action to take

    Returns
    -------
    Sum of the rewards collected during the episode.
    """
    current_state = env.reset()

    # Short pause, then wipe the cell output before the first frame.
    time.sleep(1)
    display.clear_output(wait=True)

    episode_return = 0
    step_count = 0
    while True:
        current_state, reward, finished, _info = env.step(policy[current_state])
        episode_return += reward
        step_count += 1

        print(f'Step {step_count}')
        env.render()
        time.sleep(0.5)

        if finished:
            # Leave the last frame visible.
            break
        display.clear_output(wait=True)

    return episode_return

In [22]:
# Run one rendered episode with the extracted policy and report its total reward.
total_reward = play(env, policy)
print('reward', total_reward)

Step 13
+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
reward 8
