# Navigation


In [1]:
import numpy as np
import pandas as pd
from unityagents import UnityEnvironment

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Start the Unity Banana environment from the local Windows build.
banana_executable = './Banana_Windows_x86_64/Banana.exe'
env = UnityEnvironment(file_name=banana_executable)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


### Examine the State and Action Spaces

The simulation contains a single agent that navigates a large environment.  At each time step, it has four actions at its disposal:
- `0` - walk forward 
- `1` - walk backward
- `2` - turn left
- `3` - turn right

The state space has `37` dimensions and contains the agent's velocity, along with ray-based perception of objects around the agent's forward direction.  A reward of `+1` is provided for collecting a yellow banana, and a reward of `-1` is provided for collecting a blue banana. 

Run the code cell below to print some information about the environment.

In [None]:
# Resolve the default brain in this cell so that it also runs on a fresh
# kernel: `brain_name` and `brain` are needed below.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents in the environment
print('Number of agents:', len(env_info.agents))

# number of actions
action_size = brain.vector_action_space_size
print('Number of actions:', action_size)

# examine the state space (first entry of the vector observations)
state = env_info.vector_observations[0]
print('States look like:', state)
state_size = len(state)
print('States have length:', state_size)

### Deep Q-Network training

In [4]:
import torch
from q_network import QNetwork
from replay_buffer import ReplayBuffer, ReplayBufferWeighted
from agent import DQNAgent, DoubleDQNAgent, PrioExpReplayAgent
from dqn import DQN

In [6]:
# Hyperparameters shared by every agent trained in this notebook.
buffer_size = 10 ** 5   # capacity of the replay buffer
batch_size = 64         # number of samples per minibatch
gamma = 0.99            # discount factor
tau = 0.001             # interpolation factor for the soft target update
lr = 0.0005             # optimizer learning rate
update_every = 4        # how often the network is updated

- buffer_size: Size of the replay buffer. Samples are taken from the replay buffer in order to train the Q-network.
- gamma: Discount factor $\gamma$ used in the computation of the TD target, $target = reward + \gamma \cdot \max_a Q(state_{t+1}, a)$; the TD error is then $target - Q(state_t, action_t)$.
- tau, update_every: $\tau$ is a float value used to compute the convex combination of the local Q-network's and the target Q-network's weights (soft update of the target network). The weights are updated every update_every steps to stabilize the learning process.
- lr: Learning rate for the optimizer of the Q-network.
    

In [7]:
def eps_iterator(eps_init, eps_discount, eps_min):
    """Endless generator of multiplicatively decaying epsilon values.

    Every draw multiplies the current epsilon by ``eps_discount`` and clips
    the result from below at ``eps_min``. Note that the first value produced
    is already discounted once (``eps_init * eps_discount``).
    """
    current = eps_init
    while True:
        decayed = current * eps_discount
        # Same selection rule as max(decayed, eps_min).
        current = eps_min if eps_min > decayed else decayed
        yield current
        
def save_qnetwork_from_dqn(dqn, path):
    """Write the state dict of ``dqn.agent.qnetwork_local`` to ``path``."""
    local_weights = dqn.agent.qnetwork_local.state_dict()
    torch.save(local_weights, path)

def load_agent_from_qnetwork(path, state_space_size, action_space_size, **qnetwork_kwargs):
    """Rebuild an agent around Q-network weights stored on disk.

    Args:
        path: File holding a Q-network ``state_dict`` (as written by
            ``save_qnetwork_from_dqn``).
        state_space_size: First positional argument passed to ``QNetwork``.
        action_space_size: Second positional argument passed to ``QNetwork``;
            also passed to the replay buffer.
        **qnetwork_kwargs: Optional keyword arguments forwarded to
            ``QNetwork`` (e.g. ``layer_size1`` / ``layer_size2``) so that the
            rebuilt network can match the architecture that was saved.
            ``load_state_dict`` fails if the shapes differ.

    Returns:
        A ``DQNAgent`` wrapping the restored network (switched to eval mode)
        together with a fresh replay buffer.
    """
    qnetwork_restored = QNetwork(state_space_size, action_space_size, **qnetwork_kwargs)
    # NOTE: torch.load unpickles the file, so only load checkpoints you trust.
    qnetwork_restored.load_state_dict(torch.load(path))
    qnetwork_restored.eval()
    replay_buffer = ReplayBuffer(action_space_size, buffer_size, batch_size, 1337)
    # The notebook imports no class called `Agent`; build a DQNAgent with the
    # same argument order that the training cells use.
    agent = DQNAgent(gamma, tau, batch_size, update_every, qnetwork_restored, replay_buffer)
    return agent

In [8]:
# Use the environment's first brain as the default brain.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset the environment in training mode and keep the first observation vector.
env_info = env.reset(train_mode=True)[brain_name]
state = env_info.vector_observations[0]

In [9]:
# Problem dimensions: length of one observation vector and the brain's
# reported action-space size.
first_observation = env_info.vector_observations[0]
state_space_size = len(first_observation)
action_space_size = brain.vector_action_space_size

#### Standard DQN agent

Plain vanilla implementation of a Deep Q-Network (DQN). The agent is trained with a temporal-difference learning algorithm.
The Q-value function, which maps state-action pairs to expected values, is modeled by a neural network.

#### Check different layer sizes

The Q-network is a simple feed-forward network with two hidden layers and ReLU activations.
Since the banana problem is quite easy and the state space is rather small, this simple network
structure seems appropriate. In the following, the performance of the agent is tested with regard to the
number of neurons. Please note that only a small set of parameters is explored because of the long training time. If more computational resources are available, one could also tune further parameters, such as the batch size, the learning rate, etc.

In [10]:
# Train one vanilla DQN agent per hidden-layer width and collect the scores.
# The second hidden layer always has half as many units as the first one.
max_iter = 500
layer_sizes = [1024, 512, 256, 128, 64]
size_scores = {}
for size in layer_sizes:
    qnetwork = QNetwork(
        state_space_size,
        action_space_size,
        layer_size1=size,
        layer_size2=size // 2,
        lr=lr,
    )
    replay_buffer = ReplayBuffer(action_space_size, buffer_size, batch_size, 1337)
    agent = DQNAgent(gamma, tau, batch_size, update_every, qnetwork, replay_buffer)
    dqn = DQN(env, agent)
    eps_schedule = eps_iterator(0.5, 0.99, 0.001)
    scores_dqn = dqn.train(eps_schedule, max_iter)
    size_scores[size] = scores_dqn


Episode 0	Average Score: 0.00
Episode 100	Average Score: 4.00
Episode 0	Average Score: 0.0040
Episode 100	Average Score: 1.50
Episode 0	Average Score: 2.0040
Episode 100	Average Score: 0.05
Episode 0	Average Score: 2.00700
Episode 100	Average Score: 0.95
Episode 0	Average Score: 1.0070
Episode 100	Average Score: 0.40
Episode 199	Average Score: 0.200

In [None]:
# Plot the score curve of every tested layer-size configuration.
fig, ax = plt.subplots(figsize=(12, 6))
for first_layer, scores in size_scores.items():
    # Legend entry reads e.g. "512, 256 neurons" (first, second hidden layer).
    ax.plot(scores, label='{}, {} neurons'.format(first_layer, first_layer // 2))
ax.set_xlabel('Episode')
ax.set_ylabel('Score')
ax.set_title('DQN scores for different hidden layer sizes')
ax.legend();

Based on this comparison, the network with 512 and 256 neurons in its two hidden layers is used in the following.

In [19]:
# Vanilla DQN: 512/256 hidden units, uniform replay buffer.
qnetwork = QNetwork(
    state_space_size,
    action_space_size,
    layer_size1=512,
    layer_size2=256,
    lr=lr,
)
replay_buffer = ReplayBuffer(action_space_size, buffer_size, batch_size, 1337)
agent = DQNAgent(gamma, tau, batch_size, update_every, qnetwork, replay_buffer)
dqn = DQN(env, agent)

In [20]:
# Epsilon decays by a factor of 0.9925 per draw from an initial 0.5 down to a
# floor of 0.005; the second argument to train() is the iteration budget.
eps_schedule = eps_iterator(0.5, 0.9925, 0.005)
scores_dqn = dqn.train(eps_schedule, 1800)

Episode 0	Average Score: 2.00
Episode 100	Average Score: 1.05
Episode 200	Average Score: 2.90
Episode 300	Average Score: 7.75
Episode 400	Average Score: 12.15
Episode 500	Average Score: 12.70
Episode 538	Average Score: 13.40Problem solved after 539 episodes


In [21]:
# Persist the local Q-network weights of the vanilla DQN agent.
vanilla_weights_path = './vanilla_dqn.pickle'
save_qnetwork_from_dqn(dqn, vanilla_weights_path)

#### Double DQN agent

To avoid an overestimation of Q-values during the early stages of training, two Q-networks are used: one to select the best action and one to estimate the expected Q-value of that action.

In [22]:
# Double DQN: same network and buffer setup as before, different agent class.
qnetwork = QNetwork(
    state_space_size,
    action_space_size,
    layer_size1=512,
    layer_size2=256,
    lr=lr,
)
replay_buffer = ReplayBuffer(action_space_size, buffer_size, batch_size, 1337)
agent = DoubleDQNAgent(gamma, tau, batch_size, update_every, qnetwork, replay_buffer)
dqn = DQN(env, agent)

In [23]:
# Same epsilon schedule and iteration budget as for the vanilla DQN run.
eps_schedule = eps_iterator(0.5, 0.9925, 0.005)
scores_double_dqn = dqn.train(eps_schedule, 1800)

Episode 0	Average Score: 3.00
Episode 100	Average Score: 2.30
Episode 200	Average Score: 5.00
Episode 300	Average Score: 8.200
Episode 400	Average Score: 10.60
Episode 500	Average Score: 12.25
Episode 560	Average Score: 13.65Problem solved after 561 episodes


In [24]:
# Persist the local Q-network weights of the Double DQN agent.
double_weights_path = './double_dqn.pickle'
save_qnetwork_from_dqn(dqn, double_weights_path)

#### Prioritized Experience Replay DQN agent

With prioritized experience replay, samples that lead to a larger change in the Q-values, and are therefore more significant, are replayed more frequently, which leads to more efficient training.

The parameter $\alpha \in [0,1]$ is an interpolation factor for the sampling from the replay buffer: values close to 0 indicate more random uniform sampling, while values close to 1 favour priority sampling. 

Due to the non-uniform sampling, a bias is introduced during the weight update in the optimization step. The parameter $\beta \in [0,1]$ controls how strongly this bias is corrected: values close to 1 correct it (almost) fully, while for values close to 0 (almost) no correction is performed.

In [29]:
# Prioritized experience replay: weighted buffer with alpha = beta = 0.5.
qnetwork = QNetwork(
    state_space_size,
    action_space_size,
    layer_size1=512,
    layer_size2=256,
    lr=lr,
)
replay_buffer = ReplayBufferWeighted(
    action_space_size,
    buffer_size,
    batch_size,
    1337,
    alpha=0.5,
    beta=0.5,
    eps=0.01,
)
agent = PrioExpReplayAgent(gamma, tau, batch_size, update_every, qnetwork, replay_buffer)
dqn = DQN(env, agent)

In [30]:
# Same epsilon schedule and iteration budget as for the other two agents.
eps_schedule = eps_iterator(0.5, 0.9925, 0.005)
scores_prioexpreplay_dqn = dqn.train(eps_schedule, 1800)

Episode 0	Average Score: -2.00
Episode 100	Average Score: 1.40
Episode 200	Average Score: 3.10
Episode 300	Average Score: 3.40
Episode 400	Average Score: 4.60
Episode 446	Average Score: 3.25

KeyboardInterrupt: 

In [None]:
# Persist the local Q-network weights of the prioritized-replay agent.
prio_weights_path = './prio_exp_replay_dqn.pickle'
save_qnetwork_from_dqn(dqn, prio_weights_path)

In [None]:
# Compare the three agents using a 10-episode moving average of their scores.
# Series.resample() only works on a datetime-like index, so a rolling window
# over the plain integer index is used for smoothing instead.
smoothing_window = 10
score_curves = [
    (scores_dqn, 'blue', 'DQN'),
    (scores_double_dqn, 'red', 'Double DQN'),
    (scores_prioexpreplay_dqn, 'green', 'Prio. Exp. Replay DQN'),
]

fig, ax = plt.subplots(figsize=(10, 7))
for scores, color, label in score_curves:
    # min_periods=1 keeps the first few episodes visible in the curve.
    smoothed = pd.Series(scores).rolling(smoothing_window, min_periods=1).mean()
    ax.plot(smoothed, color=color, label=label)
ax.set_xlabel('Episode')
ax.set_ylabel('Score (moving average over {} episodes)'.format(smoothing_window))
ax.legend();

### Conclusion and Outlook

In [None]:
# Restore the vanilla DQN weights written by the save cell above; the previous
# path './trained_model' is never written anywhere in this notebook.
# NOTE(review): the Q-network is rebuilt with QNetwork's default layer sizes;
# confirm they match the 512/256 network that was saved.
agent = load_agent_from_qnetwork('./vanilla_dqn.pickle', state_space_size, action_space_size)

In [None]:
# Run one episode outside of training mode and report the total reward.
env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
state = env_info.vector_observations[0]             # get the current state
score = 0                                           # initialize the score

done = False
while not done:
    # Use the agent loaded in the previous cell instead of `dqn.agent`, which
    # is simply whichever agent was trained last.
    # The second argument is presumably epsilon (0.0 = no exploration);
    # confirm against the agent's act() signature.
    action = agent.act(state, 0.0)                 # select an action
    env_info = env.step(action)[brain_name]        # send the action to the environment
    next_state = env_info.vector_observations[0]   # get the next state
    reward = env_info.rewards[0]                   # get the reward
    done = env_info.local_done[0]                  # see if episode has finished
    score += reward                                # update the score
    state = next_state                             # roll over the state to next time step

print("Score: {}".format(score))

In [None]:
# Shut the Unity environment down once the notebook is finished so that the
# external simulator process does not keep running.
env.close()