# Continuous Control

---

### Start the Environment


In [None]:
#!pip -q install ./python

 Please select one of the two options below for loading the environment.

In [None]:
from unityagents import UnityEnvironment
import numpy as np

# Two builds of the Reacher environment are available; exactly one path must be active.
# Version 1 (a single agent) -- uncomment this line and comment out the next assignment to use it:
#reacher_binary = '/data/Reacher_One_Linux_NoVis/Reacher_One_Linux_NoVis.x86_64'

# Version 2 (20 agents) -- the build this notebook currently runs with:
reacher_binary = '/data/Reacher_Linux_NoVis/Reacher.x86_64'

# Launch the Unity environment from the selected executable.
env = UnityEnvironment(file_name=reacher_binary)

Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [None]:
# get the default brain: take the first name the environment reports and
# look up the matching brain object (it is queried further down for the
# size of each action)
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

### Let's examine the State and Action Spaces

Run the code cell below to print some information about the environment.

In [None]:
# Restart the simulation (training mode) and keep only this brain's info.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the environment's dimensions up front ...
num_agents = len(env_info.agents)                 # number of agents
action_size = brain.vector_action_space_size      # size of each action
states = env_info.vector_observations             # observations, indexed by agent
state_size = states.shape[1]                      # length of one agent's observation

# ... and report them afterwards, in the same order as before.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
summary = 'There are {} agents. Each observes a state with length: {}'
print(summary.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

### Taking Random Actions in the Environment

In the next code cell, you will learn how to use the Python API to control the agent and receive feedback from the environment.

Note that **in this coding environment, you will not be able to watch the agents while they are training**, so you should pass `train_mode=True` whenever you reset the environment.

In [None]:
# Play one episode with uniformly random behaviour to exercise the Python API.
env_info = env.reset(train_mode=True)[brain_name]      # fresh episode
states = env_info.vector_observations                  # current state of every agent
scores = np.zeros(num_agents)                          # running return of every agent

episode_over = False
while not episode_over:
    # Sample Gaussian actions for every agent and squash them into [-1, 1].
    actions = np.clip(np.random.randn(num_agents, action_size), -1, 1)
    env_info = env.step(actions)[brain_name]           # send all actions to the environment
    next_states = env_info.vector_observations         # resulting state of every agent
    rewards = env_info.rewards                         # reward of every agent
    dones = env_info.local_done                        # per-agent "episode finished" flags
    scores += rewards                                  # accumulate the returns
    states = next_states                               # advance to the next time step
    episode_over = np.any(dones)                       # stop as soon as any agent is done

print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

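When you are finished with the whole notebook (including the training and testing cells below, which still need the environment), you can close it by calling `env.close()`.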

### Training Agent

In [None]:
import numpy as  np
# Seed NumPy's global random generator for reproducibility
np.random.seed(0) 
import sys
import random
# Seed Python's built-in `random` module
random.seed(0)

import torch
# Seed PyTorch's random generator; note that all three seeds are set before
# the agent/noise modules below are imported
torch.manual_seed(0)
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


# Agent, exploration noise, configuration and plotting helper
# (presumably modules shipped next to this notebook -- not visible here)
from agent import Agent
from noise_generator import OUNoise
from config_file import Config
from utils import plot_reward

# IPython magics: load the autoreload extension and switch it to mode 2
%load_ext autoreload
%autoreload 2

In [None]:
# Loading the config: build a Config object and call its display() method
# (presumably prints the configured hyper-parameters -- see config_file)
config = Config()
config.display()

In [None]:
# Hidden-layer widths are read from the config so both can be tuned in one place.
NUM_NEURONS_LAYER1 = config.NUM_NEURONS_LAYER1  # Number of neurons in layer1
NUM_NEURONS_LAYER2 = config.NUM_NEURONS_LAYER2  # Number of neurons in layer2

# Initializing our agent.
# The list is presumably one width per hidden layer (confirm in agent.py).
# Its second entry is NUM_NEURONS_LAYER2; passing NUM_NEURONS_LAYER1 twice
# left the layer-2 setting from the config silently unused.
agent = Agent(state_size,
              action_size,
              [NUM_NEURONS_LAYER1,
               NUM_NEURONS_LAYER2])

# Initializing Ornstein-Uhlenbeck process noise
noise = OUNoise(action_size)

In [None]:
# Train the agent until the 100-episode rolling average reaches the target.
# `deque` is imported here because no earlier cell brings it into scope.
from collections import deque

episodes = 1000                                        # Max no. of episodes
scores = []                                            # mean score (over agents) of each episode
scores_window = deque(maxlen=100)                      # the most recent 100 episode scores

for i_episode in range(1, episodes + 1):

    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    state = env_info.vector_observations               # get the current state
    score = np.zeros(num_agents)                       # scores for each agent

    noise.reset()                                      # reset the exploration noise for the new episode

    step = 0                                           # time step within the episode, passed to the noise

    while True:

        action = agent.act(state)                      # select an action
        action = noise.get_action(action, step)        # add noise to action for exploration

        env_info = env.step(action)[brain_name]        # send the action to the environment

        next_state = env_info.vector_observations      # get the next state
        reward = env_info.rewards                      # get the reward
        done = env_info.local_done                     # see if episode has finished
        agent.step(state, action, reward,              # Update Agent's attributes(Replay buffer/parameters)
                   next_state, done)
        score += reward                                # update the score
        state = next_state                             # roll over the state to next time step

        step += 1

        if np.any(done):                               # exit loop once any agent has finished its episode
            break

    episode_score = np.mean(score)                     # episode score averaged over agents
    scores_window.append(episode_score)                # feeds the rolling average
    scores.append(episode_score)                       # full history, used for plotting

    average_score = np.mean(scores_window)

    # Print first, then flush, so the progress line is shown immediately.
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score), end="")
    sys.stdout.flush()

    if average_score >= 30:

        # Checkpoint on every episode at or above 30 so the file always holds
        # the latest qualifying parameters.
        # NOTE(review): the critic optimizer entry now comes from
        # `agent.optimizer_critic` (it used to duplicate the actor optimizer);
        # confirm that attribute name against agent.py.
        torch.save({'actor_main_network_state_dict' : agent.actor_main.state_dict(),
                    'actor_target_network_state_dict' : agent.actor_target.state_dict(),
                    'critic_main_network_state_dict' : agent.critic_main.state_dict(),
                    'critic_target_network_state_dict' : agent.critic_target.state_dict(),
                    'optimizer_actor_state_dict' : agent.optimizer_actor.state_dict(),
                    'optimizer_critic_state_dict' : agent.optimizer_critic.state_dict()},
                    'agent_state_dict.pt')

        if average_score >= 32:

            print("\n Problem Solved!")

            break

# Per-agent scores of the last episode that was played.
print("Score: {}".format(score))


In [None]:
# Plotting rewards: pass the per-episode mean scores collected by the training loop
plot_reward(scores)

### Testing the agent

In [None]:
# Restore the saved main actor and critic weights (mapped onto the CPU)
# so the agent can be tested over 100 trials.
checkpoint = torch.load('agent_state_dict.pt', map_location="cpu")

for network, key in ((agent.actor_main, 'actor_main_network_state_dict'),
                     (agent.critic_main, 'critic_main_network_state_dict')):
    network.load_state_dict(checkpoint[key])

def test(num_episodes=100):
    """Run the agent for ``num_episodes`` episodes and print its mean score.

    Each episode resets the environment, lets ``agent.act`` pick the actions
    (no exploration noise is added here) until any agent reports that it is
    done, and records the episode score averaged over agents. The mean of
    those per-episode scores is printed once all episodes have run.

    Relies on the notebook-level ``env``, ``brain_name``, ``num_agents`` and
    ``agent`` objects.

    Args:
        num_episodes: Number of evaluation episodes to play.
    """
    from tqdm import tqdm

    all_scores = []

    for _ in tqdm(range(num_episodes)):

        env_info = env.reset(train_mode=True)[brain_name]      # reset the environment
        states = env_info.vector_observations                  # get the current state (for each agent)
        scores = np.zeros(num_agents)                          # initialize the score (for each agent)

        while True:
            actions = agent.act(states)                        # actions chosen by the agent
            env_info = env.step(actions)[brain_name]           # send all actions to the environment
            scores += env_info.rewards                         # update the score (for each agent)
            states = env_info.vector_observations              # roll over states to next time step
            if np.any(env_info.local_done):                    # exit loop if episode finished
                break

        # Record every episode. This append used to sit after the loop, so
        # only the final episode was ever counted in the reported average.
        all_scores.append(np.mean(scores))

    print('Average score over {} episodes (averaged over agents): {}'.format(
        len(all_scores), np.mean(all_scores)))
    
# Evaluate the agent (with the checkpoint weights loaded above) over 100 episodes
test(num_episodes = 100)