In [1]:
from Agent import Agent
from Monitor import interact
import gym
import numpy as np
from collections import defaultdict

In [2]:
# Create the Gym environment that every agent in this notebook is trained on.
# NOTE(review): newer gym releases appear to register this task as 'Taxi-v3' —
# confirm that the installed gym version still provides 'Taxi-v2'.
env = gym.make('Taxi-v2')

## Agent

In [3]:
class Agent:
    """Tabular TD-control agent following an epsilon-greedy policy.

    The update rule is chosen with ``algo``: ``'sarsa'``, ``'sarsamax'``
    (Q-learning) or ``'expected_sarsa'``. Any other value falls back to SARSA.
    """

    def __init__(
        self, 
        algo='sarsa', 
        nA=6, 
        gamma=0.9, 
        alpha=0.03, 
        epsilon_start=1.0, 
        epsilon_stop=0.001, 
        epsilon_decay_rate=0.0005
    ):
        """ Initialize agent.

        Params
        ======
        - algo: update rule used by `step` ('sarsa', 'sarsamax' or 'expected_sarsa')
        - nA: number of actions available to the agent
        - gamma: discount factor applied to the bootstrapped next-state value
        - alpha: step size of the TD update
        - epsilon_start: exploration rate before any decay is applied
        - epsilon_stop: value the exploration rate decays towards
        - epsilon_decay_rate: exponential decay rate per episode index
        """

        self.nA = nA
        # Unseen states start with an all-zero action-value row.
        self.Q = defaultdict(lambda: np.zeros(self.nA))
        self.gamma = gamma
        self.alpha = alpha
        # Action sampled during the last update; None means "nothing cached".
        self.next_action = None
        self.algo = algo
        
        # Epsilon Initialization
        self.epsilon_start = epsilon_start
        self.epsilon_stop = epsilon_stop
        self.epsilon_decay_rate = epsilon_decay_rate


    def __update_rule_Q(self, Qsa, Qsa_next, reward, alpha, gamma):
        """
        Updates the action-value function estimate using the most recent time step

        Returns Qsa moved a fraction `alpha` towards the TD target
        `reward + gamma * Qsa_next`.
        """

        td_target = reward + (gamma * Qsa_next)
        return Qsa + alpha * (td_target - Qsa)


    def __epsilon(self, i_episode):
        """
        Exploration rate for episode `i_episode`: exponential decay from
        `epsilon_start` towards `epsilon_stop`.
        """
        return self.epsilon_stop + (self.epsilon_start - self.epsilon_stop) * np.exp(- self.epsilon_decay_rate * i_episode)
        
        
    def __epsilon_greedy_probs(self, nA, Q_s, i_episode=1):
        """
        Obtains the action probabilities corresponding to epsilon-greedy policy

        Every action receives epsilon / nA; the greedy action (first argmax of
        `Q_s`) additionally receives the remaining 1 - epsilon.
        """

        epsilon = self.__epsilon(i_episode)

        policy_s = np.ones(nA) * epsilon / nA
        policy_s[np.argmax(Q_s)] = 1 - epsilon + (epsilon / nA)

        return policy_s


    def select_action(self, state, i_episode=1):
        """ Given the state, select an action.

        If the previous update cached the action it sampled for the next state,
        that action is returned; otherwise one is sampled epsilon-greedily.

        Params
        ======
        - state: the current state of the environment
        - i_episode: episode index used to compute the decayed epsilon

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """

        # Compare against None explicitly: action 0 is a valid cached action
        # but is falsy, so a plain truthiness test would discard it.
        if self.next_action is not None:
            return self.next_action

        policy_s = self.__epsilon_greedy_probs(self.nA, self.Q[state], i_episode=i_episode)
        return np.random.choice(np.arange(self.nA), p=policy_s)


    def __td_step(self, state, action, reward, next_state, done, i_episode, bootstrap):
        """
        Shared body of the three update rules.

        `bootstrap(q_next, policy_s, next_action)` returns the next-state value
        estimate used in the TD target; it is not called on terminal steps,
        where that estimate is 0.
        """

        if done:
            next_value = 0
            # Reset Next action
            sampled_action = None
        else:
            # Get epsilon-greedy action probabilities and sample the next action
            policy_s = self.__epsilon_greedy_probs(self.nA, self.Q[next_state], i_episode=i_episode)
            sampled_action = np.random.choice(np.arange(self.nA), p=policy_s)
            next_value = bootstrap(self.Q[next_state], policy_s, sampled_action)

        # Update TD estimate of Q
        self.Q[state][action] = self.__update_rule_Q(
            self.Q[state][action],
            next_value,
            reward,
            self.alpha,
            self.gamma
        )

        # Cache the sampled action so that select_action returns it next.
        self.next_action = sampled_action


    def sarsa_step(self, state, action, reward, next_state, done, i_episode=1):
        """
        Update the agent's knowledge, using the most recently sampled tuple using SARSA

        The TD target bootstraps from the value of the action sampled for
        `next_state`.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        - i_episode: episode index used to compute the decayed epsilon
        """

        self.__td_step(
            state, action, reward, next_state, done, i_episode,
            lambda q_next, policy_s, next_action: q_next[next_action]
        )


    def sarsamax_step(self, state, action, reward, next_state, done, i_episode=1):
        """
        Update the agent's knowledge, using the most recently sampled tuple using SARSAMAX - QLearning

        The TD target bootstraps from the largest action value of `next_state`.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        - i_episode: episode index used to compute the decayed epsilon
        """

        self.__td_step(
            state, action, reward, next_state, done, i_episode,
            lambda q_next, policy_s, next_action: np.max(q_next)
        )


    def expected_sarsa_step(self, state, action, reward, next_state, done, i_episode=1):
        """
        Update the agent's knowledge, using the most recently sampled tuple using Expected SARSA

        The TD target bootstraps from the expectation of the action values of
        `next_state` under the epsilon-greedy policy.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        - i_episode: episode index used to compute the decayed epsilon
        """

        self.__td_step(
            state, action, reward, next_state, done, i_episode,
            lambda q_next, policy_s, next_action: np.dot(q_next, policy_s)
        )


    def step(self, state, action, reward, next_state, done, i_episode=1):
        """
        Update the agent's knowledge, using the most recently sampled tuple.

        Dispatches to the update rule named by `self.algo`; unrecognised names
        fall back to SARSA.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        - i_episode: episode index used to compute the decayed epsilon
        """

        update_rules = {
            'sarsa': self.sarsa_step,
            'sarsamax': self.sarsamax_step,
            'expected_sarsa': self.expected_sarsa_step,
        }
        update = update_rules.get(self.algo, self.sarsa_step)
        update(state, action, reward, next_state, done, i_episode=i_episode)

In [4]:
# One agent per TD-control variant, each with the default hyperparameters.
agents = [Agent(algo=algo_name) for algo_name in ('sarsa', 'sarsamax', 'expected_sarsa')]

# Keep the individual handles available for later cells.
sarsa_agent, sarsamax_agent, expected_sarsa_agent = agents

In [5]:
def train_agents(agents, num_episodes=20000):
    """
    Run the training loop for each agent in turn and collect the outcomes.

    Input:
      * agents: Agents to train, processed in the given order
      * num_episodes: Number of episodes each agent is trained for

    Output: A list with one (agent, avg_rewards, best_avg_reward) tuple per
            agent, in the same order as the input.
    """

    def _train_one(current):
        # Announce which algorithm is running before the training starts.
        print('Training Agent: %s' % current.algo)
        rewards, best = interact(env, current, num_episodes=num_episodes)
        return (current, rewards, best)

    return [_train_one(candidate) for candidate in agents]

In [6]:
# Train every agent in `agents` for 20000 episodes each; `perf` holds one
# (agent, avg_rewards, best_avg_reward) tuple per agent, in list order.
perf = train_agents(agents, num_episodes=20000)

Training Agent: sarsa
Episode 20000/20000 || Best average reward 9.0347

Training Agent: sarsamax
Episode 20000/20000 || Best average reward 9.0856

Training Agent: expected_sarsa
Episode 20000/20000 || Best average reward 9.3765

