In [1]:
import sys
import os
import gym
import time
import torch
import torch.nn.functional as F
import numpy as np
from gym.spaces.box import Box
from gym.spaces.discrete import Discrete
from collections import deque
from unityagents import UnityEnvironment
from baselines import bench
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.common.atari_wrappers import FrameStack as FrameStack_
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv, VecEnv






In [2]:
from envs import Task
from envs import OrnsteinUhlenbeckProcess
from envs import LinearSchedule
from config import Config
from ac_model import DeterministicActorCriticNet
from memory import Replay
from ddpg_agent import DDPGAgent
from ddpg_agent import FCBody
from ddpg_agent import TwoLayerFCBodyWithAction

In [3]:
env = UnityEnvironment(file_name='Reacher_Linux/Reacher.x86_64', no_graphics=True)
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)
states = env_info.vector_observations

print('Size of each action:', brain.vector_action_space_size)
action_size = brain.vector_action_space_size
print(states.shape[1])

config = Config()
config.state_dim = states.shape[1]
config.action_dim = brain.vector_action_space_size


INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Number of agents: 20
Size of each action: 4
33


In [6]:
hidden_size1 = 256
hidden_size2 = 128
lr_cr = 2e-4
lr_ac = 2e-4
memory_capacity = 1e8
batch_size = 128
discount = 0.99
config.max_steps = int(1e6)
config.eval_interval = int(1e4)
config.eval_episodes = 20
config.save_interval = 10000
config.discount= discount
config.network_fn = lambda: DeterministicActorCriticNet(
    config.state_dim, config.action_dim,
    actor_body=FCBody(config.state_dim, (hidden_size1,
               hidden_size2), gate=F.relu),
            critic_body=TwoLayerFCBodyWithAction(config.state_dim,
              config.action_dim, (hidden_size1, hidden_size2), gate=F.relu),
           actor_opt_fn=lambda params: torch.optim.Adam(params, lr=lr_ac),
             critic_opt_fn=lambda params: torch.optim.Adam(params, lr=lr_cr))
config.replay_fn = lambda: Replay(memory_size=memory_capacity, batch_size=batch_size)
config.discount = 0.99
config.random_process_fn = lambda: OrnsteinUhlenbeckProcess(size=(config.action_dim, ), std=LinearSchedule(0.2))
config.min_memory_size = 10000
config.target_network_mix = 1e-3
config.DEVICE = 'cuda:0'



In [9]:
t0 = time.time()
n_episodes = 300
learn_updates = 10
train_every = 20
total_steps = 0
scores_window = deque(maxlen=100)  # last 100 scores
all_agents = [[]for x in range(num_agents)]
agents_rewards = [scores_window for x in range(num_agents)]
agents = [DDPGAgent(config) for _ in range(num_agents)]
replay_buffer = Replay(memory_size=memory_capacity, batch_size=batch_size)

# fill replay buffer to minimum
env_info = env.reset(train_mode=True)[brain_name]
states = env_info.vector_observations                  # get the current state (for each agent)
for agent in agents:
    agent.random_process.reset_states()
percent = config.min_memory_size / 100
T = 0
while True:
    T = T + 20
    print("In Progress", T /percent, end='\r')
    actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
    actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1 
    env_info = env.step(actions)[brain_name]
    next_states = env_info.vector_observations         # get next state (for each agent)
    rewards = env_info.rewards                         # get reward (for each agent)
    dones = env_info.local_done                        # see if episode finished
    for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
        replay_buffer.feed([state, action, reward, next_state, int(done)])
    states = next_states
    if replay_buffer.size() >= config.min_memory_size:
        break
    if np.any(dones):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations                  # get the current state (for each agent)
        agent.random_process.reset_states()


In Progress 100.0

In [10]:
replay_buffer.sample(1)

[array([[-1.97168922e+00, -3.12158656e+00, -1.55307007e+00,
          9.41960394e-01, -2.46788085e-01, -5.77390455e-02,
         -2.20164850e-01, -4.24462199e-01,  5.08908868e-01,
         -1.00376630e+00, -3.99100280e+00,  1.22232926e+00,
          2.30739355e+00, -4.08615589e+00, -8.31753635e+00,
         -3.47632790e+00,  9.83702779e-01, -9.87396762e-02,
         -6.63560629e-02, -1.34818882e-01,  3.43628716e+00,
         -2.06503749e+00,  2.79981756e+00,  6.50988388e+00,
          1.42089128e+00, -1.30396957e+01, -7.81648636e+00,
         -1.00000000e+00, -1.70368385e+00,  0.00000000e+00,
          1.00000000e+00,  0.00000000e+00, -7.56323338e-03]]),
 array([[-0.35490269,  0.33631061,  1.        , -0.37761166]]),
 array([0.]),
 array([[-2.08846474e+00, -3.18280888e+00, -1.24459839e+00,
          9.46440339e-01, -2.65142232e-01, -4.97015044e-02,
         -1.77426323e-01, -1.38379896e+00,  2.71570906e-02,
         -6.47327527e-02, -2.41123557e-01, -1.70311463e+00,
          4.4400219

In [11]:
for i_episode in range(1, n_episodes+1):
    scores = np.zeros(num_agents)
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations # get the current state (for each agent)
    for agent in agents:
        agent.random_process.reset_states()
    scores = np.zeros(num_agents)
    while True:
        total_steps +=1
        actions = []
        for state, agent in zip(states, agents):
            action = (agent.network(state))
            action = action.cpu().detach().numpy()
            action += agent.random_process.sample()
            actions.append(action)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations         # get next state (for each agent)
        rewards = env_info.rewards                         # get reward (for each agent)
        dones = env_info.local_done                        # see if episode finished
        scores += env_info.rewards                         # update the score (for each agent)
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            replay_buffer.feed([state, action, reward, next_state, int(done)])
        states = next_states
        if  total_steps % train_every == 0:
            for agent in agents:
                for _ in range(learn_updates):
                    agent.learn(replay_buffer)
        if np.any(dones):
            for score, one_agent, agent_reward in zip(scores, all_agents, agents_rewards):
                one_agent.append(score)
                agent_reward.append(score)
                scores_window.append(np.mean(scores))
                print('\rEpisode {}\t Average Score all: {:.2f} , Score: {:.2f} Time: {:.2f}'.format(i_episode,
                                                                                                     np.mean(scores_window),
                                                                                                     np.mean(scores), time.time() - t0))
            break


Episode 1	 Average Score all: 0.35 , Score: 0.46 Time: 894.69
Episode 1	 Average Score all: 0.45 , Score: 0.46 Time: 894.69
Episode 1	 Average Score all: 0.48 , Score: 0.46 Time: 894.69
Episode 1	 Average Score all: 0.42 , Score: 0.46 Time: 894.69
Episode 1	 Average Score all: 0.44 , Score: 0.46 Time: 894.69
Episode 1	 Average Score all: 0.43 , Score: 0.46 Time: 894.69
Episode 1	 Average Score all: 0.41 , Score: 0.46 Time: 894.69
Episode 1	 Average Score all: 0.41 , Score: 0.46 Time: 894.69
Episode 1	 Average Score all: 0.39 , Score: 0.46 Time: 894.69
Episode 1	 Average Score all: 0.40 , Score: 0.46 Time: 894.69
Episode 1	 Average Score all: 0.42 , Score: 0.46 Time: 894.69
Episode 1	 Average Score all: 0.40 , Score: 0.46 Time: 894.69
Episode 1	 Average Score all: 0.43 , Score: 0.46 Time: 894.69
Episode 1	 Average Score all: 0.42 , Score: 0.46 Time: 894.69
Episode 1	 Average Score all: 0.41 , Score: 0.46 Time: 894.69
Episode 1	 Average Score all: 0.44 , Score: 0.46 Time: 894.69
Episode 

Episode 8	 Average Score all: 0.69 , Score: 0.58 Time: 1615.28
Episode 8	 Average Score all: 0.70 , Score: 0.58 Time: 1615.28
Episode 8	 Average Score all: 0.69 , Score: 0.58 Time: 1615.28
Episode 8	 Average Score all: 0.69 , Score: 0.58 Time: 1615.28
Episode 8	 Average Score all: 0.70 , Score: 0.58 Time: 1615.28
Episode 8	 Average Score all: 0.70 , Score: 0.58 Time: 1615.28
Episode 8	 Average Score all: 0.70 , Score: 0.58 Time: 1615.28
Episode 8	 Average Score all: 0.71 , Score: 0.58 Time: 1615.28
Episode 8	 Average Score all: 0.68 , Score: 0.58 Time: 1615.28
Episode 8	 Average Score all: 0.70 , Score: 0.58 Time: 1615.28
Episode 8	 Average Score all: 0.69 , Score: 0.58 Time: 1615.28
Episode 8	 Average Score all: 0.67 , Score: 0.58 Time: 1615.28
Episode 8	 Average Score all: 0.68 , Score: 0.58 Time: 1615.28
Episode 8	 Average Score all: 0.69 , Score: 0.58 Time: 1615.28
Episode 8	 Average Score all: 0.68 , Score: 0.58 Time: 1615.28
Episode 8	 Average Score all: 0.68 , Score: 0.58 Time: 

Episode 15	 Average Score all: 0.61 , Score: 0.64 Time: 2223.61
Episode 15	 Average Score all: 0.61 , Score: 0.64 Time: 2223.61
Episode 15	 Average Score all: 0.60 , Score: 0.64 Time: 2223.61
Episode 15	 Average Score all: 0.59 , Score: 0.64 Time: 2223.61
Episode 15	 Average Score all: 0.60 , Score: 0.64 Time: 2223.61
Episode 15	 Average Score all: 0.60 , Score: 0.64 Time: 2223.61
Episode 15	 Average Score all: 0.59 , Score: 0.64 Time: 2223.61
Episode 15	 Average Score all: 0.61 , Score: 0.64 Time: 2223.61
Episode 15	 Average Score all: 0.60 , Score: 0.64 Time: 2223.61
Episode 15	 Average Score all: 0.61 , Score: 0.64 Time: 2223.61
Episode 15	 Average Score all: 0.60 , Score: 0.64 Time: 2223.61
Episode 15	 Average Score all: 0.60 , Score: 0.64 Time: 2223.61
Episode 15	 Average Score all: 0.59 , Score: 0.64 Time: 2223.61
Episode 15	 Average Score all: 0.60 , Score: 0.64 Time: 2223.61
Episode 15	 Average Score all: 0.60 , Score: 0.64 Time: 2223.61
Episode 15	 Average Score all: 0.60 , Sc

KeyboardInterrupt: 

In [None]:
def save_and_plot(scores, model_num):
    """ saves the result of the training into the given file
    Args:
        param1 (list): score
         param2 (int):
     """
    for i, score in enumerate(scores):
        fig = plt.figure()
        fig.add_subplot(111)
        plt.plot(np.arange(len(score)), score)
        plt.ylabel('Score')
        plt.xlabel('Episode #')
        plt.savefig('results/model-{}/scores{}.png'.format(model_num, i))

        df = pd.DataFrame({'episode':np.arange(len(score)), 'score':score})
        df.set_index('episode', inplace=True)
        df.to_csv('results/model-{}/scores{}.csv'.format(model_num, i))