In [None]:
from unityagents import UnityEnvironment
import numpy as np

In [None]:
# Connect to the "Tennis" Unity build. no_graphics=True is passed through to
# UnityEnvironment (presumably runs the simulator without a render window --
# confirm against the unityagents version in use).
env = UnityEnvironment(
    file_name="Tennis",
    no_graphics=True,
)

In [None]:
# Pick the first brain the environment lists. Its name is the key used below to
# index the per-brain results returned by env.reset() and env.step().
brain_name = env.brain_names[0]
# Brain object for that name; later cells read its vector_action_space_size.
brain = env.brains[brain_name]

In [None]:
# Start a fresh episode in training mode and keep only this brain's info.
env_info = env.reset(train_mode=True)[brain_name]

# Gather the problem dimensions first; they are reused by later cells.
num_agents = len(env_info.agents)              # one entry per agent in the scene
action_size = brain.vector_action_space_size   # length of one agent's action vector
states = env_info.vector_observations          # one row of observations per agent
state_size = states.shape[1]                   # length of one agent's observation

# Report what was found (same order and wording as before).
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

In [None]:
# Sanity check: play five episodes with purely random actions and report the
# best per-agent return of each episode.
for i in range(1, 6):
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)          # running return, one slot per agent

    episode_over = False
    while not episode_over:
        # Standard-normal samples clipped into the [-1, 1] action range.
        actions = np.clip(np.random.randn(num_agents, action_size), -1, 1)
        env_info = env.step(actions)[brain_name]

        scores += env_info.rewards
        states = env_info.vector_observations
        # The episode ends as soon as any agent reports done.
        episode_over = np.any(env_info.local_done)

    print('Score (max over agents) from episode {}: {}'.format(i, np.max(scores)))

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. MADDPG_agent, imported in the next cell) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from MADDPG_agent import MADDPG
import torch
from collections import deque
from matplotlib import pyplot as plt

In [None]:
# --- Training configuration --------------------------------------------------
n_episodes = 6000      # upper bound on the number of training episodes
max_t = 1000           # upper bound on environment steps per episode
SCORE_WINDOW = 100     # number of most recent episodes in the moving average
SOLVED_SCORE = 1.2     # moving-average score at which training stops
SAVE_EVERY = 200       # checkpoint / progress-line interval, in episodes

agent = MADDPG(seed=2, noise_start=0.5, buffer_size=100000, batch_size=1024,
               update_every=2, gamma=0.95, t_stop_noise=30000)

scores = []                                  # per-episode score (max over agents)
scores_deque = deque(maxlen=SCORE_WINDOW)    # sliding window of recent scores
scores_avg = []                              # moving average after each episode

for i_episode in range(1, n_episodes + 1):
    episode_rewards = []                     # one reward vector per step

    # Reset in training mode. The original cell passed train_mode=False, which
    # looks copied from the random-play demo; with no_graphics=True there is
    # nothing to watch, so it only slows every episode down.
    # NOTE(review): confirm against the unityagents docs that train_mode only
    # affects simulation speed/rendering for this build.
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations

    for t in range(max_t):
        action = agent.act(state)

        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations
        rewards_vec = env_info.rewards
        done = env_info.local_done

        # Hand the transition to the agent (presumably stored and used for
        # learning inside MADDPG.step -- see MADDPG_agent).
        agent.step(state, action, rewards_vec, next_state, done)
        state = next_state
        episode_rewards.append(rewards_vec)
        if any(done):
            break

    # Episode score = the larger of the agents' summed rewards.
    episode_reward = np.max(np.sum(np.array(episode_rewards), axis=0))

    scores.append(episode_reward)
    scores_deque.append(episode_reward)
    current_avg_score = np.mean(scores_deque)
    scores_avg.append(current_avg_score)

    # Only declare success once the window holds SCORE_WINDOW episodes;
    # otherwise a few lucky early episodes could end training prematurely.
    solved = (len(scores_deque) == scores_deque.maxlen
              and current_avg_score >= SOLVED_SCORE)

    # In-place progress line, overwritten every episode via '\r'.
    print('\rEpisode {}\tAverage Score: {:.3f}'.format(i_episode, current_avg_score), end="")

    if i_episode % SAVE_EVERY == 0:
        # Persist a progress line and a checkpoint at regular intervals.
        print('\rEpisode {}\tAverage Score: {:.3f}'.format(i_episode, current_avg_score))
        if not solved:
            # When solved, the branch below saves; avoid writing twice.
            agent.save_agents()

    if solved:
        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(i_episode, current_avg_score))
        agent.save_agents()
        break

In [None]:
import matplotlib

# Use larger tick labels on both axes for every figure created from here on.
matplotlib.rcParams.update({
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
})

In [None]:
# Learning curve: raw per-episode scores (blue) with their moving average (red).
fig, ax = plt.subplots(figsize=(15, 7))

episode_numbers = np.arange(1, len(scores) + 1)   # 1-based episode index for the x-axis
ax.plot(episode_numbers, scores, 'b', label='Episode Scores')
ax.plot(episode_numbers, scores_avg, 'r', linewidth=5, label='Avg. score')

ax.set_ylabel('Score', fontsize=18)
ax.set_xlabel('Episode #', fontsize=18)
ax.legend(fontsize=14)
plt.show()

In [None]:
# Close the environment once training and plotting are finished; cells that
# call env.reset()/env.step() should not be re-run after this without
# re-creating `env`.
env.close()