In [1]:
import torch
import gym

env = gym.make('Taxi-v3')


def gen_epsilon_greedy_policy(num_action, epsilon):
    def policy_function(state, Q):
        probs = torch.ones(n_action) * epsilon / n_action
        best_action = torch.argmax(Q[state]).item()
        probs[best_action] += 1.0 - epsilon
        action = torch.multinomial(probs, 1).item()
        return action
    return policy_function


from collections import defaultdict

def sarsa(env, gamma, num_episode, alpha):

    num_action = env.action_space.n
    Q = defaultdict(lambda: torch.zeros(n_action))
    for episode in range(num_episode):
        state = env.reset()
        is_done = False
        action = epsilon_greedy_policy(state, Q)
        while not is_done:
            next_state, reward, is_done, info = env.step(action)
            next_action = epsilon_greedy_policy(next_state, Q)
            td_delta = reward + gamma * Q[next_state][next_action] - Q[state][action]
            Q[state][action] += alpha * td_delta
            length_episode[episode] += 1
            total_reward_episode[episode] += reward
            if is_done:
                break
            state = next_state
            action = next_action
    policy = {}
    for state, actions in Q.items():
        policy[state] = torch.argmax(actions).item()
    return Q, policy

gamma = 1

n_episode = 1000


alpha = 0.4

epsilon = 0.01

epsilon_greedy_policy = gen_epsilon_greedy_policy(env.action_space.n, epsilon)

length_episode = [0] * n_episode
total_reward_episode = [0] * n_episode

optimal_Q, optimal_policy = sarsa(env, gamma, num_episode, alpha)

import matplotlib.pyplot as plt
plt.plot(length_episode)
plt.title('Episode length over time')
plt.xlabel('Episode')
plt.ylabel('Length')
plt.show()


plt.plot(total_reward_episode)
plt.title('Episode reward over time')
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.show()



alpha_options = [0.4, 0.5, 0.6]
epsilon_options = [0.1, 0.03, 0.01]
n_episode = 500

for alpha in alpha_options:
    for epsilon in epsilon_options:
        length_episode = [0] * n_episode
        total_reward_episode = [0] * n_episode
        sarsa(env, gamma, n_episode, alpha)
        reward_per_step = [reward/float(step) for reward, step in zip(total_reward_episode, length_episode)]
        print('alpha: {}, epsilon: {}'.format(alpha, epsilon))
        print('Average reward over {} episodes: {}'.format(n_episode, sum(total_reward_episode) / n_episode))
        print('Average length over {} episodes: {}'.format(n_episode, sum(length_episode) / n_episode))
        print('Average reward per step over {} episodes: {}\n'.format(n_episode, sum(reward_per_step) / n_episode))

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

alpha: 0.4, epsilon: 0.1
Average reward over 500 episodes: -82.982
Average length over 500 episodes: 61.838
Average reward per step over 500 episodes: -0.40300468560483965

alpha: 0.4, epsilon: 0.03
Average reward over 500 episodes: -86.306
Average length over 500 episodes: 64.208
Average reward per step over 500 episodes: -0.4152733980617916

alpha: 0.4, epsilon: 0.01
Average reward over 500 episodes: -88.858
Average length over 500 episodes: 66.178
Average reward per step over 500 episodes: -0.44016282222277686

alpha: 0.5, epsilon: 0.1
Average reward over 500 episodes: -76.51
Average length over 500 episodes: 57.718
Average reward per step over 500 episodes: -0.3346038288291968

alpha: 0.5, epsilon: 0.03
Average reward over 500 episodes: -75.574
Average length over 500 episodes: 57.28
Average reward per step over 500 episodes: -0.30377906441183006

alpha: 0.5, epsilon: 0.01
Average reward over 500 episodes: -74.024
Average length over 500 episodes: 56.45
Average reward per step over