In [1]:
import gym
import numpy as np
import sys
from collections import defaultdict
# Build the Taxi environment that the later cells train on and replay.
env = gym.make("Taxi-v2")

# Report how many discrete actions and states the environment exposes.
for label, space in (('Number of possible actions :', env.action_space),
                     ('Number of possible states :', env.observation_space)):
    print(label, space.n)

Number of possible actions : 6
Number of possible states : 500


In [2]:
def epsilon_greedy_policy(env, Q, state, epsilon):
    """Sample an action for ``state`` from an epsilon-greedy distribution.

    While every action value in ``Q[state]`` is identical there is no greedy
    action to prefer, so the distribution is uniform. Otherwise the first
    maximising action (``np.argmax``) receives probability
    ``1 - epsilon + epsilon / n`` and every other action receives
    ``epsilon / n``, where ``n = env.action_space.n``.

    Args:
        env: object exposing ``action_space.n`` (number of discrete actions).
        Q: mapping from state to a NumPy array of action values. The lookup
            ``Q[state]`` is performed as-is, so a ``defaultdict`` will create
            the entry for a state it has not seen yet.
        state: key into ``Q``.
        epsilon: exploration probability.

    Returns:
        tuple: ``(action, probs)`` where ``action`` is the sampled action
        index and ``probs`` is the NumPy array of action probabilities the
        sample was drawn from (an ndarray in both branches).
    """
    n_actions = env.action_space.n
    action_values = Q[state]

    if (action_values == action_values[0]).all():
        # All estimates tie: no action is "greedy", so explore uniformly.
        probs = np.full(n_actions, 1 / n_actions)
    else:
        # Every action gets the exploration share; the greedy one is then
        # overwritten with the exploitation mass plus its exploration share.
        probs = np.full(len(action_values), epsilon / n_actions)
        probs[np.argmax(action_values)] = 1 - epsilon + (epsilon / n_actions)

    action = np.random.choice(np.arange(n_actions), p=probs)
    return (action, probs)

In [3]:
def expected_sarsa(env, num_episodes, alpha, gamma=1.0, epsilon=0.05):
    """Learn a greedy policy with tabular Expected SARSA.

    For each step the action is sampled with ``epsilon_greedy_policy`` and the
    action value is moved towards
    ``reward + gamma * sum_a pi(a | next_state) * Q[next_state][a]``,
    where ``pi`` is the same epsilon-greedy distribution evaluated at the
    next state.

    Args:
        env: environment whose ``reset()`` returns a state usable as a dict
            key, whose ``step(action)`` returns
            ``(next_state, reward, done, info)``, and which exposes
            ``action_space.n``.
        num_episodes: number of training episodes to run.
        alpha: step size applied to the TD error.
        gamma: discount factor (default 1.0).
        epsilon: exploration probability used for both acting and the
            expectation over next actions (default 0.05).

    Returns:
        dict: maps every state that acquired an entry in the Q-table during
        training to the index of its highest-valued action. States never
        encountered are absent from the dict.
    """
    # Q-table: unseen states start with an all-zero action-value vector.
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    for i_episode in range(1, num_episodes + 1):
        # Progress indicator, rewritten in place every 100 episodes.
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        state = env.reset()
        while True:
            action, _ = epsilon_greedy_policy(env, Q, state, epsilon)
            next_state, reward, done, _ = env.step(action)
            old_Q = Q[state][action]

            # Only the action distribution at next_state is needed for the
            # expectation; the action sampled alongside it is discarded.
            _, next_probs = epsilon_greedy_policy(env, Q, next_state, epsilon)
            expected_next_value = np.dot(next_probs, Q[next_state])

            Q[state][action] = old_Q + alpha * (reward + (gamma * expected_next_value) - old_Q)
            state = next_state
            if done:
                break

    # Greedy policy over everything the Q-table has an entry for.
    policy = {k: np.argmax(v) for k, v in Q.items()}
    return policy

In [4]:
def simulation(env, num_episodes, policy):
    """Replay ``policy`` in ``env`` for ``num_episodes`` episodes and print statistics.

    Each step renders the environment, looks up the action for the current
    state in ``policy`` and applies it. When an episode ends, its length (the
    number of ``env.step`` calls made) and its summed reward are printed; once
    all episodes are done the averages of both are printed.

    Args:
        env: environment providing ``reset()``, ``render()``,
            ``step(action) -> (state, reward, done, info)`` and ``close()``.
        num_episodes: number of episodes to replay.
        policy: mapping indexed as ``policy[state]`` to obtain the action.
            If it is a plain dict, a state missing from it raises ``KeyError``.

    Returns:
        None. Results are reported on stdout only.
    """
    episode_lengths = []
    episode_rewards = []
    try:
        for i_episode in range(num_episodes):
            state = env.reset()
            t = 0
            total_reward = 0
            while True:
                env.render()
                state, reward, done, info = env.step(policy[state])
                t += 1
                total_reward += reward
                if done:
                    # t was already incremented for this step, so it is the
                    # exact number of steps taken in the episode.
                    print("Episode finished after {} timesteps, total reward = {}".format(t, total_reward))
                    episode_lengths.append(t)
                    episode_rewards.append(total_reward)
                    break
    finally:
        # Close once, after the last episode (or on error), instead of
        # closing an environment that the next iteration resets and reuses.
        env.close()

    print("\n")
    print("Number of trials = {}".format(num_episodes))
    print("Average episode termination after {} timesteps following the given policy, average reward = {}".format(
        np.mean(episode_lengths), np.mean(episode_rewards)))

In [5]:
# Train with Expected SARSA (alpha=1 makes each update replace the old
# estimate with the new target), then replay the greedy policy for 10 episodes.
learned_policy = expected_sarsa(env, num_episodes=50000, alpha=1)
simulation(env, num_episodes=10, policy=learned_policy)

Episode 50000/50000+---------+
|[35mR[0m:[43m [0m| : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

+---------+
|[35mR[0m: | : :G|
| :[43m [0m: : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : :[43m [0m: : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|[35m