# In [1]:
import numpy as np
from collections import defaultdict

class Agent:
    """Tabular Q-learning agent with a decaying epsilon-greedy behaviour policy."""

    def __init__(self, nA=6, alpha=0.1, gamma=0.99):
        """ Initialize agent.

        Params
        ======
        - nA: number of actions available to the agent
        - alpha: step size used in the Q-value update
        - gamma: discount factor applied to the next-state value
        """
        self.nA = nA
        # Action-value table; an unseen state starts as an all-zero row of length nA.
        self.Q = defaultdict(lambda: np.zeros(self.nA))
        # One more than the number of select_action calls so far; drives epsilon decay.
        self.iterations = 1
        self.alpha = alpha
        self.gamma = gamma
        # Action probabilities built by the most recent select_action call.
        self.policy = None

    def select_action(self, state):
        """ Given the state, select an action epsilon-greedily.

        Epsilon is 1 / (number of calls so far + 1), so the first call samples
        uniformly and later calls become increasingly greedy.

        Params
        ======
        - state: the current state of the environment

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        epsilon = 1 / self.iterations
        self.iterations += 1

        # Every action gets epsilon / nA; the greedy action gets the remaining mass.
        self.policy = np.ones(self.nA) * epsilon / self.nA
        self.policy[np.argmax(self.Q[state])] = 1 - epsilon + (epsilon / self.nA)

        return np.random.choice(self.nA, p=self.policy)

    def step(self, state, action, reward, next_state, done):
        """ Update the agent's knowledge, using the most recently sampled tuple.

        Applies the Q-learning update
        Q[s][a] += alpha * (reward + gamma * max(Q[s']) - Q[s][a]).
        No action is sampled here: the update only needs the greedy next-state
        value, and sampling would advance the epsilon schedule as a side effect.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        """
        # A finished episode has no future return, so do not bootstrap from next_state.
        next_value = 0.0 if done else np.max(self.Q[next_state])
        td_target = reward + self.gamma * next_value

        current = self.Q[state][action]
        self.Q[state][action] = current + self.alpha * (td_target - current)
        

# In [4]:
import numpy as np
from collections import defaultdict

class Agent:
    """Tabular Expected Sarsa agent with an epsilon-greedy behaviour policy."""

    def __init__(self, nA=6, alpha=1, gamma=.95, epsilon=0.0005):
        """ Initialize agent.

        Params
        ======
        - nA: number of actions available to the agent
        - alpha: step size used in the Q-value update
        - gamma: discount factor applied to the next-state value
        - epsilon: exploration rate passed to epsilon_greedy_probs; if None,
          1 / num_episodes is used instead
        """
        self.nA = nA
        # Action-value table; an unseen state starts as an all-zero row of length nA.
        self.Q = defaultdict(lambda: np.zeros(self.nA))
        # Incremented on every select_action call (so it counts actions, despite the name).
        self.num_episodes = 1
        # Action probabilities built by the most recent select_action call.
        self.policy_s = None
        self.gamma = gamma
        self.alpha = alpha
        self.epsilon = epsilon

    def update_Q(self, Qsa, Qsa_next, reward):
        """ Return the updated action-value estimate for the most recent time step.

        Params
        ======
        - Qsa: current estimate for the (state, action) pair
        - Qsa_next: value used for the next state in the update target
        - reward: last reward received
        """
        return Qsa + (self.alpha * (reward + (self.gamma * Qsa_next) - Qsa))

    def epsilon_greedy_probs(self, Q_s, i_episode, eps=None):
        """ Return epsilon-greedy action probabilities for one state.

        Params
        ======
        - Q_s: action values of the state
        - i_episode: counter used for the 1 / i_episode schedule when eps is None
        - eps: fixed epsilon; when given, i_episode is not used

        Returns
        =======
        - policy_s: array of length nA that sums to 1
        """
        # Only evaluate the decay schedule when it is needed, so a fixed eps
        # works for any i_episode (including 0).
        epsilon = eps if eps is not None else 1.0 / i_episode
        policy_s = np.ones(self.nA) * epsilon / self.nA
        policy_s[np.argmax(Q_s)] = 1 - epsilon + (epsilon / self.nA)
        return policy_s

    def select_action(self, state):
        """ Given the state, select an action.

        Params
        ======
        - state: the current state of the environment

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        self.policy_s = self.epsilon_greedy_probs(self.Q[state], self.num_episodes, self.epsilon)
        self.num_episodes += 1

        return np.random.choice(np.arange(self.nA), p=self.policy_s)

    def step(self, state, action, reward, next_state, done):
        """ Update the agent's knowledge, using the most recently sampled tuple.

        Applies the Expected Sarsa update: the target uses the expectation of
        Q[next_state] under the epsilon-greedy policy for next_state.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        """
        if done:
            # A finished episode has no future return, so do not bootstrap.
            expected_next = 0.0
        else:
            # The expectation must use the policy of next_state; self.policy_s
            # belongs to the state the last action was chosen in (or is None).
            next_q = self.Q[next_state]
            next_probs = self.epsilon_greedy_probs(next_q, self.num_episodes, self.epsilon)
            expected_next = np.dot(next_q, next_probs)

        self.Q[state][action] = self.update_Q(self.Q[state][action], expected_next, reward)

        