# Continuous Control

---

## 1. Import the Necessary Packages

In [1]:
from unityagents import UnityEnvironment
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from ddpg_agent import Agent

## 2. Instantiate the Environment and 20 Agents

In [3]:
# initialize the environment
env = UnityEnvironment(file_name='./Reacher_20.app')

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [4]:
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726624e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [5]:
# initialize agents
agent = Agent(state_size=33, 
              action_size=4, 
              random_seed=2, 
              num_agents=20)

## 3. Train the 20 Agents with DDPG

To amend the `ddpg` code to work for 20 agents instead of 1, here are the modifications I did in `ddpg_agent.py`:

- With each step, each agent adds its experience to a replay buffer shared by all agents (line 61-61).
- At first, the (local) actor and critic networks are updated 20 times in a row (one for each agent), using 20 different samples from the replay buffer as below:
```
def step(self, states, actions, rewards, next_states, dones):
    ...
    # Learn (with each agent), if enough samples are available in memory
    if len(self.memory) > BATCH_SIZE:
        for i in range(self.num_agents):
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)
```
    Then in order to get less aggressive with the number of updates per time step, instead of updating the actor and critic networks __20 times__ at __every timestep__, we amended the code to update the networks __10 times__ after every __20 timesteps__ (line )

In [6]:
def ddpg(n_episodes=1000, max_t=300, print_every=100, 
         num_agents=1):
    """
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        print_every (int): episodes interval to print training scores
        num_agents (int): the number of agents
    """
    scores_deque = deque(maxlen=print_every)
    scores = []
    
    for i_episode in range(1, n_episodes+1):
        # reset the environment
        env_info = env.reset(train_mode=True)[brain_name]
        # get the current state (for each agent)
        states = env_info.vector_observations
        # initialize the scores (for each agent) of the current episode
        scores_i = np.zeros(num_agents)
        for t in range(max_t):
            # select an action (for each agent)
            actions = agent.act(states)
            # send action to the environment
            env_info = env.step(actions)[brain_name]
            # get the next_state, reward, done (for each agent)
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            # store experience and train the agent
            agent.step(states, actions, rewards, next_states, dones, 
                       update_every=20, update_times=10)
            # roll over state to next time step
            states = next_states
            # update the score
            scores_i += rewards
            # exit loop if episode finished
            if np.any(dones):
                break 
        # save average of the most recent scores
        scores_deque.append(scores_i.mean())
        scores.append(scores_i.mean())
        
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)), end="")
        torch.save(agent.actor_local.state_dict(), 'd')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
            
    return scores

In [None]:
scores = ddpg(n_episodes=200, max_t=1000, print_every=20, num_agents=20)

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()
plt.savefig('ddpg_20_agents.png')

Episode 20	Average Score: 0.55
Episode 40	Average Score: 0.64
Episode 60	Average Score: 0.84
Episode 80	Average Score: 0.51
Episode 96	Average Score: 0.55

In [None]:
#env.close()

In [15]:
# load Actor-Critic policy
agent.actor_local.state_dict() = torch.load('checkpoint_actor.pth')
agent.critic_local.state_dict() = torch.load('checkpoint_critic.pth')

scores = ddpg(n_episodes=100, max_t=300, print_every=10, num_agents=20)

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()
plt.savefig('ddpg_20_agents_101to200.png')

SyntaxError: can't assign to function call (<ipython-input-15-d5eec6d1520f>, line 2)