# OpenAI Gym Taxi-v3

In [None]:
import numpy as np
from collections import deque, defaultdict
import random, sys, math, gym

In [None]:
class Agent:
    """Tabular Q-learning agent that always acts greedily on its Q-table.

    The action-value table ``Q`` maps each state to a NumPy vector holding
    one estimate per action; rows are created lazily (all zeros) the first
    time a state is looked up.
    """

    def __init__(self, nA=6, epsilon=0.1, alpha=0.8, gamma=0.85, episode=1):
        """Store the hyper-parameters and create an empty Q-table.

        Params
        ======
        - nA: number of actions available to the agent
        - epsilon: exploration probability of an epsilon-greedy policy
          (0 means no randomness); stored, but ``select_action`` below is
          purely greedy and never reads it
        - alpha: step size of the Q-value update
        - gamma: discount rate (0 means only the immediate reward counts)
        - episode: episode counter; stored but not read by any method here

        Author's tuning note: epsilon=0.1, alpha=0.08, gamma=0.85 => 9.435
        (presumably the best average reward reached with those settings).
        """
        self.nA = nA
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.episode = episode
        # Unknown states get a fresh all-zero row on first access.
        self.Q = defaultdict(self._blank_action_values)

    def _blank_action_values(self):
        """Return a zero vector with one entry per action (uses current nA)."""
        return np.zeros(self.nA)

    def select_action(self, state, i_episode=0, num_episodes=0):
        """ Given the state, select an action.

        Params
        ======
        - state: the current state of the environment
        - i_episode: index of the running episode (accepted but unused)
        - num_episodes: total number of episodes (accepted but unused)

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        # Greedy policy: index of the largest estimate; np.argmax resolves
        # ties in favour of the lowest action index.
        # Earlier experiments (epsilon-greedy sampling with self.epsilon and
        # a periodic decay of alpha/gamma every 2000 episodes) are disabled.
        action_values = self.Q[state]
        return np.argmax(action_values)

    def step(self, state, action, reward, next_state, done):
        """ Update the agent's knowledge, using the most recently sampled tuple.

        Applies the Q-learning (sarsamax) rule
            Q[s][a] <- Q[s][a] + alpha * (r + gamma * max(Q[s']) - Q[s][a])

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False); not used,
          the update bootstraps from next_state either way
        """
        action_values = self.Q[state]
        current = action_values[action]
        # Best achievable estimate from the successor state.
        best_next = np.max(self.Q[next_state])
        td_target = reward + (self.gamma * best_next)
        action_values[action] = current + (self.alpha * (td_target - current))

In [None]:
def interact(env, agent, num_episodes=20000, window=100, solved_reward=9.7):
    """Run the agent in the environment and track its moving-average reward.

    Each episode repeatedly asks the agent for an action, applies it to the
    environment, and lets the agent learn from the resulting transition
    until the environment reports ``done``. Once at least ``window``
    episodes have finished, the mean return over the most recent ``window``
    episodes is recorded after every episode. Progress is printed on a
    single, continuously overwritten line.

    Params
    ======
    - env: environment exposing ``reset()`` (returns the initial state) and
      ``step(action)`` (returns ``(next_state, reward, done, info)``)
    - agent: object exposing ``select_action(state, i_episode, num_episodes)``,
      ``step(state, action, reward, next_state, done)`` and the attributes
      ``epsilon``, ``alpha`` and ``gamma`` (only used for the progress line)
    - num_episodes: maximum number of episodes to run
    - window: number of most recent episodes to average over; must be >= 1
    - solved_reward: training stops early once the best average reward
      reaches this value

    Returns
    =======
    - avg_rewards: deque with the moving-average reward after each episode
      from episode ``window`` onwards
    - best_avg_reward: largest moving average seen (``-inf`` if fewer than
      ``window`` episodes were run)

    Raises
    ======
    - ValueError: if ``window`` is smaller than 1
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    # moving averages, one per episode once the window is full
    avg_rewards = deque(maxlen=num_episodes)
    best_avg_reward = -math.inf
    # returns of the most recent `window` episodes
    samp_rewards = deque(maxlen=window)

    for i_episode in range(1, num_episodes + 1):
        state = env.reset()
        samp_reward = 0
        while True:
            action = agent.select_action(state, i_episode, num_episodes)
            next_state, reward, done, _ = env.step(action)
            # learn from the sampled transition before moving on
            agent.step(state, action, reward, next_state, done)
            samp_reward += reward
            state = next_state
            if done:
                samp_rewards.append(samp_reward)
                break

        # Start averaging only once the window is completely filled. The
        # threshold follows `window` instead of a hard-coded 100 so that the
        # average is always taken over exactly `window` episodes.
        if i_episode >= window:
            avg_reward = np.mean(samp_rewards)
            avg_rewards.append(avg_reward)
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward

        print("\rEpisode {} , epsilon = {:.8f}, alpha = {:.5f}, gamma = {:.3f}|| Best average reward {:.3f}".format(
            i_episode, agent.epsilon, agent.alpha, agent.gamma, best_avg_reward), end="")
        sys.stdout.flush()

        # stop early once the task counts as solved
        if best_avg_reward >= solved_reward:
            print('\nEnvironment solved in {} episodes.'.format(i_episode), end="")
            break
        if i_episode == num_episodes:
            print('\n')
    return avg_rewards, best_avg_reward

In [None]:
# Create the Taxi environment and a default-configured agent, then train.
env = gym.make('Taxi-v3')
agent = Agent()
avg_rewards, best_avg_reward = interact(env=env, agent=agent)

Episode 20000 , epsilon = 0.10000000, alpha = 0.80000, gamma = 0.850|| Best average reward 8.640

