# Project 1 - Navigation
---
<font color=black> This notebook implements a DQN agent for Unity's Banana Collector environment, as modified by Udacity.</font>

<font color=red> This code is an adaptation of code provided by Udacity's Deep Reinforcement Learning Nanodegree.</font>


### 1. Import the Necessary Packages

In [None]:
from unityagents import UnityEnvironment
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

### 2. Instantiate the Environment and Agent

Initialize the environment in the code cell below.

In [None]:
# Path of the Unity build to launch, relative to the notebook's working directory.
BANANA_BUILD_PATH = "Banana_Linux/Banana.x86_64"
env = UnityEnvironment(file_name=BANANA_BUILD_PATH)

In [None]:
# Use the first brain the environment exposes as the default one, then
# look up its brain object by name.
default_brain_index = 0
brain_name = env.brain_names[default_brain_index]
brain = env.brains[brain_name]

Initialize the agent in the code cell below.

In [None]:
from navigation_dqn_agent import Agent

# Reset once in training mode so a first observation is available for sizing.
env_info = env.reset(train_mode=True)[brain_name]
state = env_info.vector_observations[0]

# Network dimensions: length of the observation vector and the number of
# actions reported by the brain.
state_size = len(state)
action_size = brain.vector_action_space_size

agent = Agent(state_size=state_size, action_size=action_size, seed=0)

### 3. Train the Agent with DQN

Run the code cell below to train the agent.

In [None]:
def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995,
        solved_score=13.0, checkpoint_path='checkpoint.pth'):
    """Deep Q-Learning training loop.

    Uses the notebook-level names ``env``, ``brain_name`` and ``agent``: every
    episode resets ``env`` in training mode, picks epsilon-greedy actions with
    ``agent.act`` and feeds each transition to ``agent.step``. After the last
    episode the weights of ``agent.qnetwork_local`` are written to
    ``checkpoint_path``.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        solved_score (float): average score over the last 100 episodes at which
            the environment is reported as solved
        checkpoint_path (str): file the local Q-network state dict is saved to

    Returns
    =======
        scores (list): total reward collected in each episode, in order
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    solved = False                     # the "solved" message is printed only once

    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]            # get the current state
        score = 0

        for _ in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]        # send the action to the environment
            next_state = env_info.vector_observations[0]   # get the next state
            reward = env_info.rewards[0]                   # get the reward
            done = env_info.local_done[0]                  # see if episode has finished
            agent.step(state, action, reward, next_state, done)  # makes the agent learn
            state = next_state
            score += reward
            if done:
                break

        scores_window.append(score)        # save most recent score
        scores.append(score)               # save most recent score
        eps = max(eps_end, eps_decay*eps)  # decrease epsilon

        avg_score = np.mean(scores_window)
        # '\r' with end="" keeps overwriting a single progress line; every
        # 100th episode the line is re-printed with a newline so it stays visible.
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score))

        # Solved means an average of at least `solved_score` over a *full*
        # 100-episode window. Without the length check a lucky first episode
        # would report "solved in -99 episodes".
        window_full = len(scores_window) == scores_window.maxlen
        if not solved and window_full and avg_score >= solved_score:
            # i_episode-100 is the number of episodes played before the
            # 100-episode window that reached the target began.
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, avg_score))
            solved = True

    torch.save(agent.qnetwork_local.state_dict(), checkpoint_path)

    return scores

# Train with the default hyperparameters; `scores` holds one total reward per episode.
scores = dqn()

# Trailing 100-episode mean. The value plotted at episode index ii averages
# scores[ii-99 .. ii], so it includes episode ii itself and the last episode
# is part of the final window. The list is empty when fewer than 100 episodes ran.
ROLLING_WINDOW = 100
window = [np.mean(scores[ii-ROLLING_WINDOW+1:ii+1])
          for ii in range(ROLLING_WINDOW-1, len(scores))]

# plot the scores
fig, ax = plt.subplots()
ax.plot(np.arange(len(scores)), scores, label='Episode score')
ax.plot(np.arange(ROLLING_WINDOW-1, len(scores)), window, label='100-episode average')
ax.set_title('DQN training scores')
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
ax.legend()
plt.show()

In [None]:
# Close the Unity environment created earlier in the notebook. Run this last:
# the training cell above still needs `env`.
env.close()