# Continuous Control

---



### Start the Environment



The environments corresponding to both versions of the environment are already saved in the Workspace and can be accessed at the file paths provided below.  

Please select one of the two options below for loading the environment.

In [1]:
from unityagents import UnityEnvironment
import numpy as np



In [2]:
# select this option to load version 1 (with a single agent) of the environment
#env = UnityEnvironment(file_name='/data/Reacher_One_Linux_NoVis/Reacher_One_Linux_NoVis.x86_64')

# select this option to load version 2 (with 20 agents) of the environment
env = UnityEnvironment(file_name='Reacher_Windows_x86_64/Reacher.exe')

## If this step gives error. restart kernel and clear output.

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [3]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

### Examine the State and Action Spaces

Run the code cell below to print some information about the environment.

In [5]:
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  7.90150833e+00 -1.00000000e+00
  1.25147629e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -5.22214413e-01]


### Take random actions

In [9]:
#num_agents = 20
env_info = env.reset(train_mode=True)[brain_name]      # reset the environment    
states = env_info.vector_observations                  # get the current state (for each agent)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)

for _ in range(3):
    
    while True:
        actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
        actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
        env_info = env.step(actions)[brain_name]           # send all actions to tne environment
        next_states = env_info.vector_observations         # get next state (for each agent)
        rewards = env_info.rewards                         # get reward (for each agent)
        dones = env_info.local_done                        # see if episode finished
        scores += env_info.rewards                         # update the score (for each agent)
        states = next_states                               # roll over states to next time step
        if np.any(dones):                                  # exit loop if episode finished
            break
    print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

Total score (averaged over agents) this episode: 0.0549999987706542
Total score (averaged over agents) this episode: 0.15849999645724894
Total score (averaged over agents) this episode: 0.3169999929144979


When finished, you can close the environment.

In [None]:
env.close()

In [14]:
fc1_units = 512 #512  #128 ## 64
fc2_units = 256 #256  #64 #32
fc3_units = 64
#fc4 is 16 by default
lr_actor = 1e-4
lr_critic = 1e-4
buffer_size = int(1e6)
batch_size = 128 # 128
Gamma = 0.99
tau = 0.001


In [None]:
import importlib
import model
import Agent

In [15]:
agent = Agent(n_agents = 20,state_size = 33, action_size = 4, random_seed = 2)


In [None]:
from collections import deque



def cont_control(n_episodes = 1000,  print_every = 100 ):
    
    scores = []
    scores_deque = deque(maxlen=100)
    
    for i_episode in range(1,n_episodes+1):
        
        total_r = np.zeros(num_agents)
        #total_rewards = 0
        #state = env.reset()
        env_info = env.reset(train_mode=True)[brain_name]
        agent.reset()
        statesA = env_info.vector_observations
        #state = env_info.vector_observations[0]
        #for t in range(max_t):
        while True:
            
            actionsA = agent.act(statesA)
            env_info = env.step(actionsA)[brain_name]
            next_statesA = env_info.vector_observations
            #next_state, reward, done,_ = env.step(action)
            rewardsA = env_info.rewards
            donesA = env_info.local_done            
            
            agent.step(statesA, actionsA, rewardsA, next_statesA, donesA)
            statesA = next_statesA
            total_r += rewardsA
            if np.any(donesA):
                break
        
        scores.append(total_r)
        scores_deque.append(total_r)
        print('\r Episode:{} \t Average Score: {:.2f} \t Average of last 100 episodes: {:.2f}'\
              .format(i_episode, np.mean(scores), np.mean(scores_deque)), end = "")
        #print(' Episode:{} \t Average Score: {}'.format(i_episode, np.mean(scores_deque)))
        #if i_episode % print_every == 0:
            
            #print('\n Episode:{} \t Average Score: {:.2f}\n'.format(i_episode, np.mean(scores_deque)), end='')
            
        if np.mean(scores_deque) >=30:
            torch.save(agent.actor_local.state_dict(), 'actor_model.pth')
            torch.save(agent.critic_local.state_dict(), 'critic_model.pth')
            print('\n Environment Solved in {} episodes, Average score:{}'.format( (i_episode - 100), np.mean(scores_deque)), end='' )
            #torch.save(agent.actor_local.state_dict(), 'actor_model.pth')
            #torch.save(agent.critic_local.state_dict(), 'critic_model.pth')
            break
            
    
    return scores

                
                
            
            
            
            
    
    
    
    

In [None]:
### This step will train your model

In [None]:
scores = cont_control()

In [None]:
### My model was trained with the output as below

The output above shows the average of last 100 episodes from Episode:5 to Episode:105 which is +30.13

In [None]:
len(scores)


## Plot all 20 agents' scores

In [None]:
import matplotlib.pyplot as plt

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot( range(1,len(scores)+1), scores)
plt.xlabel('# of Episodes')
plt.ylabel('Scores')
plt.show()

In [None]:
### Please see the readme or the report file for this plot.

In [None]:
score_ar = np.asarray(scores)

In [None]:
score_ar.shape

## Mean of 20 agents' scores

In [None]:
score_agent = np.mean(score_ar, axis = 1)

In [None]:
score_agent.shape

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot( range(1,len(score_agent)+1), score_agent)
plt.xlabel('# of Episodes')
plt.ylabel('Scores')
plt.show()

As we can see the model is quiet stable. The score of +30 was achieved in less than 40 episodes and the algorithm hasn't crashed even after twice the number of episodes.

### Now evaluate our trained Agent's Actor and Critic Models on the 20-agent environment.

#### Random actions

In [None]:
env_info = env.reset(train_mode=True)[brain_name]      # reset the environment    
states = env_info.vector_observations                  # get the current state (for each agent)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
while True:
    actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
    actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
    env_info = env.step(actions)[brain_name]           # send all actions to tne environment
    next_states = env_info.vector_observations         # get next state (for each agent)
    rewards = env_info.rewards                         # get reward (for each agent)
    dones = env_info.local_done                        # see if episode finished
    scores += env_info.rewards                         # update the score (for each agent)
    states = next_states                               # roll over states to next time step
    if np.any(dones):                                  # exit loop if episode finished
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

In [None]:
for episode in range(3):
    env_info = env.reset(train_mode=False)[brain_name]        
    states = env_info.vector_observations       
    score = np.zeros(num_agents)               
    
    while True:
        actions = agent.act(states, add_noise=False)                    
        
        env_info = env.step(actions)[brain_name]        
        next_states = env_info.vector_observations     
        rewards = env_info.rewards       
        dones = env_info.local_done
        score += rewards
        states = next_states

        if np.any(dones):                              
            break

    print('Episode: \t{} \tScore: \t{:.2f}'.format(episode, np.mean(score))) 

Very impressive and consistent results where in each episode the Agent's score was over 30.

In [16]:
agent.actor_local.load_state_dict(torch.load('actor_model.pth',map_location= 'cpu'))
agent.critic_local.load_state_dict(torch.load('critic_model.pth', map_location = 'cpu'))


for episode in range(3):
    env_info = env.reset(train_mode=False)[brain_name]        
    states = env_info.vector_observations       
    score = np.zeros(num_agents)               
    
    while True:
        actions = agent.act(states, add_noise=False)                    
        
        env_info = env.step(actions)[brain_name]        
        next_states = env_info.vector_observations     
        rewards = env_info.rewards       
        dones = env_info.local_done
        score += rewards
        states = next_states

        if np.any(dones):                              
            break

    print('Episode: \t{} \tScore: \t{:.2f}'.format(episode, np.mean(score))) 

Episode: 	0 	Score: 	39.11
Episode: 	1 	Score: 	39.08
Episode: 	2 	Score: 	38.89


In [17]:
env.close()