# Load the module

Depending on your OS, download the Udacity environment for the Banana Collector game from the links below. After downloading and extracting the archive, change the `file_name` argument in `env = UnityEnvironment(file_name="Banana.exe")` to the path where you placed the Banana Collector executable.

- Linux: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/Banana_Linux.zip)
- Mac OSX: [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/Banana.app.zip)
- Windows (32-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/Banana_Windows_x86.zip)
- Windows (64-bit): [click here](https://s3-us-west-1.amazonaws.com/udacity-drlnd/P1/Banana/Banana_Windows_x86_64.zip)

In [None]:
%load_ext autoreload
%autoreload 1

In [None]:
from unityagents import UnityEnvironment
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

from collections import deque

%aimport agent, model

from model import QNetwork, DDQNetwork
from agent import Agent_no_soft_update, Agent, PERAgent

In [None]:
# Launch the Unity Banana environment; point file_name at your local build.
env = UnityEnvironment(file_name="Banana.exe")

# The notebook drives the first brain the environment exposes.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# One reset (train_mode=False) gives a first observation to measure.
env_info = env.reset(train_mode=False)[brain_name]

# Sizes handed to the agent constructors in the cells below.
state_size = len(env_info.vector_observations[0])
action_size = brain.vector_action_space_size

In [None]:
def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995, file='checkpoint.pth'):
    """Deep Q-Learning.

    Trains the module-level ``agent`` on the module-level ``env`` using
    epsilon-greedy action selection, printing a running average score.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        file (str): path the local Q-network weights are saved to after the
            last episode, provided the environment was solved at some point

    Returns
    =======
        scores (list): score of every episode, in order
    """
    solved = False
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for _ in range(max_t):
            action = int(agent.act(state, eps)) # you have to convert it to int for windows 10
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        eps = max(eps_end, eps_decay*eps) # decrease epsilon

        # Compute the running average once per episode and reuse it below.
        mean_score = np.mean(scores_window)
        progress = '\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, mean_score)
        print(progress, end="")
        if i_episode % 100 == 0:
            print(progress)

        # Only judge "solved" on a full window: with fewer episodes the mean
        # is not a 100-episode average and `i_episode - 100` would be negative.
        window_full = len(scores_window) == scores_window.maxlen
        if not solved and window_full and mean_score >= 13.0:
            solved = True
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - scores_window.maxlen, mean_score))
    # The checkpoint holds the weights as they are after the final episode,
    # not as they were at the moment the environment was first solved.
    if solved:
        torch.save(agent.qnetwork_local.state_dict(), file)
    return scores


In [None]:
def load_and_play(agent, file):
    """Load saved weights into ``agent`` and play one episode, printing the score.

    Params
    ======
        agent: object exposing ``qnetwork_local`` and ``act(state)``; when it
            is None, 'no agent' is printed and the function returns None
        file: checkpoint path passed to ``torch.load`` (mapped to the CPU)
    """
    if agent is None:
        print('no agent')
        return None

    weights = torch.load(file, map_location= 'cpu')
    agent.qnetwork_local.load_state_dict(weights)

    # Start a fresh episode with train_mode=False and take the first observation.
    observation = env.reset(train_mode=False)[brain_name].vector_observations[0]
    total_reward = 0
    finished = False
    while not finished:
        # Ask the agent for an action and send it to the environment.
        step_info = env.step(int(agent.act(observation)))[brain_name]
        total_reward += step_info.rewards[0]
        finished = step_info.local_done[0]
        observation = step_info.vector_observations[0]

    print("Score: {}".format(total_reward))

# Train

There are three agents in agent.py: Agent_no_soft_update, Agent, and PERAgent. Agent uses experience replay; PERAgent uses prioritized experience replay. Both Agent and PERAgent accept `dueling` and `double` options that you can switch on or off. You can modify the hyperparameters in agent.py and model.py; this notebook reloads those modules automatically, so there is no need to restart the kernel.

The model weights are saved only if the agent solves the environment during training.

In [None]:
# Checkpoint path handed to dqn(); written only if the environment is solved.
file = 'filename.pth'
agent = Agent(state_size, action_size, seed=0, dueling=False, double=False)
scores = dqn(file=file)

# plot the per-episode scores on an explicit axes object
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(np.arange(len(scores)), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

# Replay

In [None]:
# Checkpoint to replay; same file name the Train cell passes to dqn().
file = 'filename.pth'
# NOTE(review): the weights are loaded into an Agent_no_soft_update while the
# Train cell builds an Agent — presumably both share the same Q-network
# layout when dueling=False; confirm against agent.py / model.py.
agent = Agent_no_soft_update(state_size, action_size, seed=0)
# Loads the weights into agent.qnetwork_local, plays one episode, prints the score.
load_and_play(agent, file)

In [None]:
# Close the Unity environment opened at the top of the notebook; run this last.
env.close()