# Watch a trained Agent

In [1]:
from unityagents import UnityEnvironment
import numpy as np
from collections import deque
import torch
import matplotlib.pyplot as plt

In [2]:
# Hyper-parameters handed to the agents; keys are looked up by name downstream.
config = dict(
    # --- replay / learning schedule ---
    BUFFER_SIZE=10 ** 6,        # capacity of the replay buffer
    BATCH_SIZE=256,             # transitions sampled per learning step
    GAMMA=0.99,                 # reward discount factor
    TAU=0.001,                  # interpolation factor for the soft target update
    LR_ACTOR=0.001,             # actor learning rate
    LR_CRITIC=0.001,            # critic learning rate
    WEIGHT_DECAY=0,             # L2 weight decay
    UPDATE_EVERY=1,             # how often the networks are updated
    # --- exploration noise ---
    THETA=0.15,                 # Ornstein-Uhlenbeck process parameter
    SIGMA=0.2,                  # Ornstein-Uhlenbeck / Gaussian noise parameter
    # --- network architecture ---
    hidden_layers=[256, 128],   # widths of the hidden layers
    use_bn=True,                # whether batch normalisation is used
    use_reset=True,             # weight initialisation from the original DDPG paper
    noise="gauss",              # noise type: "gauss" (Gaussian) or "OU" (Ornstein-Uhlenbeck)
)

In [3]:
from maddpg.agents import MultiAgents

# Build the multi-agent container with the hyper-parameters from `config`.
# obs_size=24 matches the environment log (8 observations x 3 stacked frames per agent),
# action_size=2 matches the per-agent action space size reported there.
multiagent = MultiAgents(
    num_agents=2,
    obs_size=24,
    action_size=2,
    config=config,
    seed=0,
)

In [4]:
# Show the local actor network of the first agent; its module repr is the cell output.
multiagent.agents[0].actor_local

Actor(
  (batchnorm_input): BatchNorm1d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm_layers): ModuleList(
    (0): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (hidden_layers): ModuleList(
    (0): Linear(in_features=24, out_features=256, bias=False)
    (1): Linear(in_features=256, out_features=128, bias=False)
  )
  (output): Linear(in_features=128, out_features=2, bias=True)
)

In [5]:
# Show the local critic network of the first agent; its module repr is the cell output.
multiagent.agents[0].critic_local

Critic(
  (bn0): BatchNorm1d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fcs1): Linear(in_features=48, out_features=256, bias=False)
  (fcs2): Linear(in_features=260, out_features=128, bias=True)
  (fcs3): Linear(in_features=128, out_features=1, bias=True)
)

In [6]:
# Restore the trained actor and critic weights of every agent from the checkpoint files.
#
# map_location='cpu' makes torch.load deserialize the tensors onto the CPU, so
# checkpoints that were saved on a GPU machine can also be read on a CPU-only one.
# load_state_dict then copies the values into the already-constructed networks.
#
# Alternative checkpoints saved just after reaching the goal score live under
# 'weights/just_after_goal_score/' as 'cp_actor_from_agent_{i}' and
# 'cp_critic_from_agent_{i}' (referenced without a '.pth' extension in the
# original snippet); substitute those paths below to watch that earlier policy.
for i, agent in enumerate(multiagent.agents):
    actor_state = torch.load('weights/cp_actor_from_agent_{}.pth'.format(i), map_location='cpu')
    critic_state = torch.load('weights/cp_critic_from_agent_{}.pth'.format(i), map_location='cpu')
    agent.actor_local.load_state_dict(actor_state)
    agent.critic_local.load_state_dict(critic_state)

In [2]:
# Launch the Unity Tennis environment from the Windows build (path is relative to the notebook).
env = UnityEnvironment(file_name="Tennis_Windows_x86_64/Tennis.exe")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [3]:
# Get the default brain: take the first brain name the environment exposes
# (the log above reports a single external brain, "TennisBrain") and look up
# its parameter object under that name.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [4]:
# Re-run this cell to start from a fresh initial state before watching the agents.

# Step counter for the upcoming rollout.
step = 0

# Reset the environment in non-training mode and read the first observation.
env_info = env.reset(train_mode=False)[brain_name]
states = env_info.vector_observations       # current observation of every agent

# One score accumulator per agent present in the environment.
num_agents = len(env_info.agents)
scores = np.zeros(num_agents)
print('Number of agents:', num_agents)

Number of agents: 2


In [11]:
# Play one episode with the trained agents, stopping as soon as any agent reports done.
episode_finished = False
while not episode_finished:
    actions = multiagent.act(states)                # one action per agent
    env_info = env.step(actions)[brain_name]        # send all actions to the environment
    next_states = env_info.vector_observations      # observations after the step
    rewards = env_info.rewards                      # per-agent rewards for this step
    dones = env_info.local_done                     # per-agent episode-finished flags
    scores += rewards                               # accumulate per-agent scores
    states = next_states                            # advance to the next time step
    step += 1
    episode_finished = np.any(dones)

print("time step :", step-1)
print("Score: {}".format(np.max(scores)))

time step : 1000
Score: 2.600000038743019


In [12]:
# The environment is intentionally left open here: the "Random Agents" section
# below keeps resetting and stepping the same `env`, so closing it at this point
# makes a top-to-bottom run of the notebook fail. The last cell of the notebook
# closes the environment; run it once you are done.

## Random Agents

In [5]:
# Baseline: let every agent act randomly and report the best per-episode score.
n_episodes = 13                                    # number of random-play episodes
action_size = brain.vector_action_space_size       # per-agent action dimension reported by the brain

for i in range(1, n_episodes + 1):
    env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
    states = env_info.vector_observations                  # get the current state (for each agent)
    scores = np.zeros(num_agents)                          # initialize the score (for each agent)
    while True:
        actions = np.random.randn(num_agents, action_size) # sample a random action (for each agent)
        actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
        env_info = env.step(actions)[brain_name]           # send all actions to the environment
        next_states = env_info.vector_observations         # get next state (for each agent)
        rewards = env_info.rewards                         # get reward (for each agent)
        dones = env_info.local_done                        # see if episode finished
        scores += rewards                                  # update the score (for each agent)
        states = next_states                               # roll over states to next time step
        if np.any(dones):                                  # exit loop if episode finished
            break
    print('Score (max over agents) from episode {}: {}'.format(i, np.max(scores)))

Score (max over agents) from episode 1: 0.0
Score (max over agents) from episode 2: 0.0
Score (max over agents) from episode 3: 0.0
Score (max over agents) from episode 4: 0.0
Score (max over agents) from episode 5: 0.0
Score (max over agents) from episode 6: 0.0
Score (max over agents) from episode 7: 0.09000000171363354
Score (max over agents) from episode 8: 0.0
Score (max over agents) from episode 9: 0.0
Score (max over agents) from episode 10: 0.0
Score (max over agents) from episode 11: 0.09000000171363354
Score (max over agents) from episode 12: 0.0
Score (max over agents) from episode 13: 0.0


In [6]:
# Shut down the Unity environment now that no further cells use it.
env.close()