In [1]:
import gym
import numpy as np
import sys
from collections import defaultdict

# Create the Taxi-v2 environment and show its observation and action spaces,
# one per line.
env = gym.make("Taxi-v2")
print(env.observation_space, env.action_space, sep="\n")

Discrete(500)
Discrete(6)


In [2]:
def epsilon_greedy_policy(epsilon, Q, state):
    """Sample an action from the epsilon-greedy distribution for ``state``.

    The number of actions is taken from the length of ``Q[state]`` rather than
    from a notebook-global environment, so the function depends only on its
    arguments.

    Parameters
    ----------
    epsilon : float
        Exploration rate. Every action receives probability
        ``epsilon / n_actions``; the greedy action additionally receives
        ``1 - epsilon``.
    Q : mapping
        Maps a state to a sequence of action values (one entry per action).
    state : hashable
        State whose action values drive the choice.

    Returns
    -------
    action : int
        Index of the sampled action.
    action_probs : numpy.ndarray
        The probability assigned to each action.
    """
    q_values = Q[state]
    n_actions = len(q_values)

    # Uniform exploration mass, then put the remaining mass on the greedy action
    # (np.argmax breaks ties in favour of the lowest index).
    action_probs = np.ones(n_actions) * (epsilon / n_actions)
    action_probs[np.argmax(q_values)] = 1 - epsilon + (epsilon / n_actions)

    action = np.random.choice(n_actions, p=action_probs)

    return action, action_probs

In [3]:
def _epsilon_greedy_probs(q_values, epsilon):
    """Return the epsilon-greedy action distribution for one row of action values."""
    n_actions = len(q_values)
    probs = np.ones(n_actions) * (epsilon / n_actions)
    probs[np.argmax(q_values)] = 1 - epsilon + (epsilon / n_actions)
    return probs


def expected_sarsa(env, num_episodes, alpha, gamma=1, eps = None):
    """Learn an action-value table with Expected SARSA.

    The update target is
    ``reward + gamma * sum_a pi(a | next_state) * Q[next_state][a]``,
    where ``pi`` is the epsilon-greedy policy evaluated at ``next_state``.

    Parameters
    ----------
    env : object
        Environment exposing ``nA`` (number of actions), ``reset()`` returning
        the initial state, and ``step(action)`` returning
        ``(next_state, reward, done, info)``.
    num_episodes : int
        Number of training episodes.
    alpha : float
        Step size of the update.
    gamma : float, optional
        Discount factor applied to the expected next-state value.
    eps : float or None, optional
        Fixed exploration rate. When ``None`` the rate decays per episode as
        ``1 / (i_episode / 10 + 1)``.

    Returns
    -------
    Q : collections.defaultdict
        Maps each visited state to a numpy array of ``env.nA`` action values.
    policy : dict
        Maps every state present in ``Q`` to the index of its greedy action.
    """
    n_actions = env.nA
    Q = defaultdict(lambda: np.zeros(n_actions))

    for i_episode in range(1, num_episodes+1):
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        # A caller-supplied eps overrides the decaying schedule.
        epsilon = eps if eps is not None else 1/((i_episode/10)+1)

        state = env.reset()
        while True:
            # Behave epsilon-greedily with respect to the current state's values.
            probs = _epsilon_greedy_probs(Q[state], epsilon)
            action = np.random.choice(n_actions, p=probs)
            next_state, reward, done, info = env.step(action)

            # The expectation must be taken under the policy at next_state,
            # not under the distribution that was used to pick `action`.
            next_probs = _epsilon_greedy_probs(Q[next_state], epsilon)
            expected_next_value = np.dot(next_probs, Q[next_state])

            Q[state][action] += alpha*(reward + (gamma*expected_next_value) - Q[state][action])
            state = next_state
            if done:
                break

    policy = {k:np.argmax(v) for k, v in Q.items()}
    return Q, policy

In [4]:
def evaluate_algorithm(algo, env, eps_train, eps_test, alpha, gamma=1, eps = None, window=100):
    """Train with ``algo`` and score the resulting policy over test episodes.

    Episode returns are averaged over consecutive windows of ``window``
    episodes. Every test episode contributes to exactly one window; if
    ``eps_test`` is not a multiple of ``window`` the remaining episodes form a
    final, shorter window.

    Parameters
    ----------
    algo : callable
        Called as ``algo(env, eps_train, alpha, gamma, eps)``; must return
        ``(Q, policy)`` where ``policy`` supports ``.get(state, default)``.
    env : object
        Environment with ``reset()`` returning a state and ``step(action)``
        returning ``(next_state, reward, done, info)``.
    eps_train : int
        Number of training episodes handed to ``algo``.
    eps_test : int
        Number of evaluation episodes; must be at least 1.
    alpha, gamma, eps
        Forwarded unchanged to ``algo``.
    window : int, optional
        Number of episodes per averaging window; must be at least 1.

    Returns
    -------
    average_score : list
        Mean return of each window, in order.
    best_avg_score : float
        Largest entry of ``average_score``.

    Raises
    ------
    ValueError
        If ``eps_test`` or ``window`` is smaller than 1.
    """
    # Fail before the (potentially long) training run rather than after it.
    if eps_test < 1:
        raise ValueError("eps_test must be at least 1")
    if window < 1:
        raise ValueError("window must be at least 1")

    _, policy = algo(env, eps_train, alpha, gamma, eps)

    average_score = []
    window_returns = []

    for _ in range(eps_test):
        state = env.reset()
        episode_return = 0.0
        done = False
        while not done:
            # States missing from the policy fall back to action 0.
            action = policy.get(state, 0)
            state, reward, done, info = env.step(action)
            episode_return += reward

        window_returns.append(episode_return)
        # Close a window only once it holds `window` complete episodes.
        if len(window_returns) == window:
            average_score.append(np.mean(window_returns))
            window_returns = []

    # Keep the trailing episodes instead of silently discarding them.
    if window_returns:
        average_score.append(np.mean(window_returns))

    best_avg_score = np.max(average_score)

    return average_score, best_avg_score

In [5]:
# Train Expected SARSA for 50,000 episodes, then evaluate the learned policy
# over 10,000 test episodes.
avg_score, best_avg_score = evaluate_algorithm(
    expected_sarsa,
    env,
    eps_train=50000,
    eps_test=10000,
    alpha=0.25,
    gamma=1,
)

Episode 50000/50000

In [7]:
# Report the largest of the averaged returns produced by evaluate_algorithm.
print(best_avg_score)

9.13
