# Navigation project using: DQN, DDQN, Dueling DQN & DDQN

In [1]:
# import the libraries used                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
from unityagents import UnityEnvironment
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
import time
%matplotlib inline
from Agent import agent

In [3]:
# Load the environment. Adjust file_name to point at your local Banana build.
env = UnityEnvironment(file_name="../Banana_Linux/Banana.x86_64")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [4]:
# A single agent navigates the environment.
# The state space has 37 dimensions, which include the agent's velocity and ray-based
# perception of objects in front of the agent.
# The agent has 4 possible actions in each state:
#    0 - walk forward
#    1 - walk backward
#    2 - turn left
#    3 - turn right 

In [5]:
def initialize_env(env, train_mode=True, brain_id=0):
    """Select a brain, reset the environment and report the problem sizes.

    Params
    ======
        env: environment object exposing `brain_names`, `brains` and `reset`
        train_mode (bool): forwarded to `env.reset`
        brain_id (int): index into `env.brain_names` of the brain to use

    Returns
    =======
        tuple: (brain_name, brain, env_info, action_size, state_size) where
        env_info is the reset result for the selected brain and state_size is
        the length of the first agent's observation vector
    """
    # Environments contain brains which decide the actions of their agents;
    # pick the one at position brain_id.
    brain_name = env.brain_names[brain_id]
    brain = env.brains[brain_name]

    # number of actions
    action_size = brain.vector_action_space_size

    # Reset with the caller's train_mode (previously hard-coded to True, which
    # silently ignored the argument).
    env_info = env.reset(train_mode=train_mode)[brain_name]

    # examine the state space of the first (only) agent
    state = env_info.vector_observations[0]
    print('States look like:', state)

    state_size = len(state)
    print('States have length:', state_size)

    return brain_name, brain, env_info, action_size, state_size

In [6]:
def train(env, brain_name, agent, dqn, file_save='checkpoint.pth', n_episodes=700, max_t=1000,
          eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Runs epsilon-greedy episodes against `env`, feeding every transition to
    `agent.step`, and stops early once the mean of the last 100 episode scores
    reaches 13.0. The local network's weights are written to `file_save` only
    when that threshold is reached.

    Params
    ======
        env: environment exposing `reset(train_mode=...)` and `step(action)`
        brain_name (str): key selecting this brain's info in the env results
        agent: object providing `act(state, eps)`, `step(...)` and `qnetwork_local`
        dqn (bool): forwarded unchanged to `agent.step`; the notebook passes
            True for DQN and False for DDQN
        file_save (str): path of the checkpoint written when solved
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon

    Returns
    =======
        tuple: (scores, scores_mean_w) -- the score of every episode, and the
        100-episode window mean sampled every 10 episodes
    """
    scores = []                        # score of each episode
    scores_mean_w = []                 # window mean, sampled every 10 episodes
    scores_window = deque(maxlen=100)  # last 100 scores

    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]        # information about the environment
            next_state = env_info.vector_observations[0]   # get the next state
            reward = env_info.rewards[0]                   # get the reward
            done = env_info.local_done[0]                  # see if episode has finished
            agent.step(state, action, reward, next_state, done, dqn)

            score += reward      # update the score
            state = next_state   # roll over the state to next time step
            if done:             # exit loop if episode finished
                break

        scores_window.append(score)
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        mean_score = np.mean(scores_window)

        # Every multiple of 100 is also a multiple of 10, so a single check
        # reports each milestone exactly once.
        if i_episode % 10 == 0:
            scores_mean_w.append(mean_score)
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, mean_score))

        # Only declare the task solved on a full 100-episode window; otherwise
        # a lucky early episode would stop training and report a negative
        # episode count.
        window_full = len(scores_window) == scores_window.maxlen
        if window_full and mean_score >= 13.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode - 100, mean_score))
            torch.save(agent.qnetwork_local.state_dict(), file_save)
            break

    return scores, scores_mean_w


In [7]:
def Test(env, brain_name, agent, file_name, max_t=1000):
    """Run one episode with weights loaded from a saved checkpoint.

    Params
    ======
        env: environment exposing `reset(train_mode=...)` and `step(action)`
        brain_name (str): key selecting this brain's info in the env results
        agent: object providing `act(state)` and `qnetwork_local`
        file_name (str): checkpoint path passed to `torch.load`
        max_t (int): maximum number of timesteps to run

    Returns
    =======
        list: cumulative score after each step taken
    """
    # load saved weights
    agent.qnetwork_local.load_state_dict(torch.load(file_name))

    scores = []   # cumulative score after each step
    score = 0
    steps = 0     # number of steps actually executed

    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations[0]
    for _ in range(max_t):
        # act() is called without eps, so the agent's default applies
        # (presumably greedy -- confirm in Agent.act)
        action = agent.act(state)
        env_info = env.step(action)[brain_name]    # information about the environment
        state = env_info.vector_observations[0]    # roll over to the next state
        score += env_info.rewards[0]               # update the score
        scores.append(score)
        steps += 1
        if env_info.local_done[0]:
            break

    # Divide by the number of steps taken, not the last loop index: the index
    # is one short and is 0 when the episode ends on its first step.
    average = score / steps if steps else 0.0
    print("Average reward par step:", average)
    return scores

In [8]:
def plot_data(x, y, xlabel='Episode #', ylabel='Score'):
    """Draw y against x as a line on a new single-axes figure and show it.

    xlabel and ylabel are used as the axis captions.
    """
    figure = plt.figure()
    figure.add_subplot(111)   # becomes the current axes that plt.plot draws on
    plt.plot(x, y)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

## Training

In [9]:
# Reset the environment with train_mode=True and read the problem sizes from brain 0.
brain_name, brain, env_info, action_size, state_size = initialize_env(
    env,
    train_mode=True,
    brain_id=0,
)

States look like: [1.         0.         0.         0.         0.84408134 0.
 0.         1.         0.         0.0748472  0.         1.
 0.         0.         0.25755    1.         0.         0.
 0.         0.74177343 0.         1.         0.         0.
 0.25854847 0.         0.         1.         0.         0.09355672
 0.         1.         0.         0.         0.31969345 0.
 0.        ]
States have length: 37


In [None]:
# DQN run: non-dueling agent, trained with dqn=True; weights go to
# 'dqn_checkpoint.pth' if the environment is solved.
dqn_agent = agent(
    state_size,
    action_size,
    duel=False,
    fc1_units=64,
    fc2_units=64,
    seed=0,
)
dqn_scores, dqn_scores_mean_w = train(
    env,
    brain_name,
    dqn_agent,
    dqn=True,
    file_save='dqn_checkpoint.pth',
)

QNetwork(
  (fc1): Linear(in_features=37, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=4, bias=True)
)
Episode 10	Average Score: -0.20
Episode 20	Average Score: 0.20
Episode 30	Average Score: 0.27
Episode 40	Average Score: 0.42
Episode 50	Average Score: 0.62
Episode 60	Average Score: 0.67
Episode 70	Average Score: 0.67
Episode 80	Average Score: 0.72
Episode 90	Average Score: 0.98
Episode 100	Average Score: 1.17
Episode 100	Average Score: 1.17
Episode 110	Average Score: 1.50
Episode 120	Average Score: 1.79
Episode 130	Average Score: 2.16
Episode 140	Average Score: 2.47
Episode 150	Average Score: 2.80
Episode 160	Average Score: 3.08
Episode 170	Average Score: 3.43
Episode 180	Average Score: 3.81
Episode 190	Average Score: 4.02
Episode 200	Average Score: 4.27
Episode 200	Average Score: 4.27
Episode 210	Average Score: 4.55
Episode 220	Average Score: 4.86
Episode 230	Average Score: 5.15
Episode 240	Avera

In [None]:
# Raw score of every DQN training episode.
plot_data(np.arange(len(dqn_scores)), dqn_scores, xlabel='Episode #', ylabel='DQN Score')
# train() samples the window mean at episodes 10, 20, ... so the x-axis must
# start at 10, not 0, to line up with the episode each sample was taken at.
plot_data((np.arange(len(dqn_scores_mean_w)) + 1) * 10, dqn_scores_mean_w,
          xlabel='Episode #', ylabel='DQN Score average (10Ep)')

In [None]:
# DDQN run: same non-dueling architecture, trained with dqn=False; weights go
# to 'ddqn_checkpoint.pth' if the environment is solved.
ddqn_agent = agent(
    state_size,
    action_size,
    duel=False,
    fc1_units=64,
    fc2_units=64,
    seed=0,
)
ddqn_scores, ddqn_scores_mean_w = train(
    env,
    brain_name,
    ddqn_agent,
    dqn=False,
    file_save='ddqn_checkpoint.pth',
)

In [None]:
# Raw score of every DDQN training episode.
plot_data(np.arange(len(ddqn_scores)), ddqn_scores, xlabel='Episode #', ylabel='DDQN Score')
# train() samples the window mean at episodes 10, 20, ... so the x-axis must
# start at 10, not 0, to line up with the episode each sample was taken at.
plot_data((np.arange(len(ddqn_scores_mean_w)) + 1) * 10, ddqn_scores_mean_w,
          xlabel='Episode #', ylabel='DDQN Score average (10Ep)')

## Testing

In [None]:
# Re-read the brain handle and problem sizes before evaluating a saved agent.
brain_name, brain, env_info, action_size, state_size = initialize_env(
    env,
    train_mode=True,
    brain_id=0,
)

In [None]:
# Show the first agent's current observation. Everywhere else in this notebook
# the state is read from env_info.vector_observations; `.state` is not used.
# NOTE(review): confirm against the installed unityagents BrainInfo.
print(env_info.vector_observations[0])

In [None]:
agent = dqn_Agent(state_size, action_size,duel=False, fc1_units=64,fc2_units=64,seed=0)
file_name ='dqn_checkpoint.pth'

In [None]:
scores=Test(env,brain_name,agent,file_name ,max_t=1000)

In [None]:
# Cumulative score over the steps of the evaluation episode.
plot_data(
    np.arange(len(scores)),
    scores,
    xlabel='Step #',
    ylabel='DQN Score',
)


# Fin

In [None]:
# Close the environment now that training and testing are finished.
env.close()