In [None]:
from unityagents import UnityEnvironment
import numpy as np
from ddpg_agent import Agent
import random
import torch
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Create the Unity environment handle from the "Tennis.app" build.
# NOTE(review): the path is relative to the working directory and looks like
# the macOS build name — adjust file_name for other platforms/locations.
env = UnityEnvironment(file_name="Tennis.app")

Environments contain **_brains_**, which are responsible for deciding the actions of their associated agents. Here we take the first available brain and set it as the default brain that we will control from Python.

In [None]:
brain_name = env.brain_names[0]                        # use the first available brain as the default
brain = env.brains[brain_name]
action_size = brain.vector_action_space_size
env_info = env.reset(train_mode=True)[brain_name]      # reset the environment

# Derive the problem dimensions from the environment instead of hard-coding
# them (previously num_agents = 2 and state_size = 24). The observations are
# indexed per agent further down, so the first axis is the agent count and the
# length of one row is the per-agent state size.
num_agents = len(env_info.vector_observations)
state_size = len(env_info.vector_observations[0])

# A single agent object is created here; the training loop feeds it the
# transitions of every agent in the environment.
agent = Agent(state_size=state_size, action_size=action_size, random_seed=11)

def ddpg(n_episodes=1000, max_t=800, print_every=100):
    """Train the module-level ``agent`` on the module-level Unity ``env``.

    Each episode resets the environment and the agent, then steps up to
    ``max_t`` times. At every step the agent picks actions for all agents
    (with exploration noise), and each agent's individual transition is
    passed to ``agent.step``. The episode score is the maximum of the
    per-agent cumulative rewards.

    Args:
        n_episodes: Maximum number of training episodes.
        max_t: Maximum number of environment steps per episode.
        print_every: Size of the rolling-average window, and the interval
            (in episodes) at which a persistent progress line is printed and
            the stop condition is evaluated.

    Returns:
        A tuple ``(scores, average_deque)`` where ``scores`` is the list of
        per-episode scores (max over agents) and ``average_deque`` is the
        list of rolling means over the last ``print_every`` episodes, one
        entry per episode.
    """
    scores_deque = deque(maxlen=print_every)   # rolling window of the most recent episode scores
    scores = []
    average_deque = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]    # reset the environment
        agent.reset()
        scores_a = np.zeros(num_agents)                      # cumulative reward (for each agent)
        states = env_info.vector_observations                # current state (for each agent)
        for _ in range(max_t):
            actions = agent.act(state=states, add_noise=True)  # select an action (for each agent), with noise
            actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
            env_info = env.step(actions)[brain_name]           # send all actions to the environment
            next_states = env_info.vector_observations         # next state (for each agent)
            rewards = env_info.rewards                         # reward (for each agent)
            dones = env_info.local_done                        # episode-finished flag (for each agent)
            scores_a += rewards                                # update the score (for each agent)
            # Feed every agent's own transition to the shared agent object.
            for k in range(num_agents):
                agent.step(state=states[k], action=actions[k],
                           reward=rewards[k], next_state=next_states[k],
                           done=dones[k])
            states = next_states                               # roll over states to next time step
            if np.any(dones):                                  # exit loop if episode finished
                break

        episode_score = np.max(scores_a)
        scores.append(episode_score)
        # Append the scalar episode score, not the whole `scores` list:
        # appending the list made the "rolling" mean equal to the mean over
        # every episode so far, which defeated the maxlen window.
        scores_deque.append(episode_score)
        rolling_mean = np.mean(scores_deque)
        average_deque.append(rolling_mean)
        print('\rScore (max over agents) from episode {}: {},{}'.format(i_episode, episode_score,
                                                                        rolling_mean), end="")
        # Checkpoints are overwritten after every episode.
        torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
        if i_episode % print_every == 0:
            print('\rScore (Average Score) from episode {}: {}'.format(i_episode, rolling_mean))
            # The stop condition is only evaluated on reporting episodes.
            # NOTE(review): 0.52 is presumably a small margin above a 0.5
            # target score — confirm against the project requirements.
            if rolling_mean > 0.52:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, rolling_mean))
                break

    return scores, average_deque

scores, average_deque = ddpg()

# Plot the per-episode score and its rolling average against the same 1-based
# episode axis. Passing both series positionally to a single plot() call drew
# the averages against 0-based indices, shifting that curve by one episode.
fig = plt.figure()
ax = fig.add_subplot(111)
episodes = np.arange(1, len(scores) + 1)
ax.plot(episodes, scores, label='Score (max over agents)')
ax.plot(episodes, average_deque, label='Rolling average')
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
ax.legend()
plt.show()