Create the trainer

In [2]:
import inspect
import time
from statistics import mean, stdev
from CybORG import CybORG
from CybORG.Agents import B_lineAgent, SleepAgent, GreenAgent
from CybORG.Agents.SimpleAgents.BaseAgent import BaseAgent
from CybORG.Agents.SimpleAgents.BlueReactAgent import BlueReactRemoveAgent
from CybORG.Agents.SimpleAgents.Meander import RedMeanderAgent
from CybORG.Agents.Wrappers.EnumActionWrapper import EnumActionWrapper
from CybORG.Agents.Wrappers.FixedFlatWrapper import FixedFlatWrapper
from CybORG.Agents.Wrappers.OpenAIGymWrapper import OpenAIGymWrapper
from CybORG.Agents.Wrappers.ReduceActionSpaceWrapper import ReduceActionSpaceWrapper
from CybORG.Agents.Wrappers import ChallengeWrapper
import os
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents import ppo
from ray.tune.registry import register_env
from CybORG.Agents.Wrappers.rllib_wrapper import RLlibWrapper
import warnings
import numpy as np
import random
import ray
from collections import deque
warnings.filterwarnings('ignore')

In [3]:
# Number of evaluation episodes run for each (episode length, red agent) pair in evaluate().
MAX_EPS = 50
# Agent name handed to the RLlib wrapper and used for action-space queries.
agent_name = 'Blue'


def wrap(env):
    """Wrap a raw CybORG environment in an RLlibWrapper for the configured agent.

    Args:
        env: the CybORG environment instance to wrap.

    Returns:
        An RLlibWrapper built around ``env`` for the agent named by the
        module-level ``agent_name`` (read at call time, so the constant is the
        single source of truth instead of a second hard-coded "Blue").
    """
    return RLlibWrapper(agent_name=agent_name, env=env)


def evaluate(steps, trainer=None):
    """Roll out a trained policy against each scripted Red agent and print reward statistics.

    For every episode length in ``steps`` and every Red agent in
    (B_lineAgent, RedMeanderAgent, SleepAgent) a fresh Scenario2 simulation is
    created, ``MAX_EPS`` episodes are played, and the mean and standard
    deviation of the per-episode total reward are printed.

    The ``done`` flag returned by the environment is ignored: every episode
    runs for exactly ``num_steps`` steps and is then reset.

    Args:
        steps: iterable of episode lengths (environment steps per episode).
        trainer: object exposing ``compute_single_action(observation)``. When
            omitted, the notebook-level ``trainer`` variable is used, which is
            what the original implementation relied on implicitly.

    Raises:
        NameError: if no trainer is passed and no global ``trainer`` exists.
    """
    if trainer is None:
        # Backward-compatible fallback to the notebook global; fail early and
        # clearly instead of deep inside the rollout loop.
        trainer = globals().get("trainer")
        if trainer is None:
            raise NameError("evaluate() needs a trainer: pass one or define a global `trainer` first")

    # Locate the scenario relative to the installed CybORG package instead of
    # slicing a fixed number of characters off the module path.
    scenario_path = os.path.join(
        os.path.dirname(inspect.getfile(CybORG)), 'Shared', 'Scenarios', 'Scenario2.yaml'
    )

    for num_steps in steps:
        for red_agent in [B_lineAgent, RedMeanderAgent, SleepAgent]:
            cyborg = CybORG(scenario_path, 'sim', agents={'Red': red_agent})
            wrapped_cyborg = wrap(cyborg)

            observation = wrapped_cyborg.reset()

            # The returned action space is not used below; the call is kept in
            # case the wrapper sets up internal state here — TODO confirm and drop.
            wrapped_cyborg.get_action_space(agent_name)

            total_reward = []
            for _episode in range(MAX_EPS):
                episode_reward = 0
                for _step in range(num_steps):
                    action = trainer.compute_single_action(observation)
                    observation, rew, done, info = wrapped_cyborg.step(action)
                    episode_reward += rew
                total_reward.append(episode_reward)
                observation = wrapped_cyborg.reset()

            # mean() needs at least one sample and stdev() at least two; report
            # nan rather than raising StatisticsError for tiny MAX_EPS values.
            avg_reward = mean(total_reward) if total_reward else float('nan')
            reward_spread = stdev(total_reward) if len(total_reward) > 1 else float('nan')
            print(f'Average reward for red agent {red_agent.__name__} and steps {num_steps} is: {avg_reward:.1f} with a standard deviation of {reward_spread:.1f}')


In [4]:
from ray.rllib.offline.json_writer import JsonWriter
from ray.rllib.offline.dataset_writer import DatasetWriter
from ray.rllib.offline.io_context import IOContext
from datetime import datetime
import shutil
import inspect
from ray.tune.registry import register_env

def env_creator(env_config: dict):
    """Build the CybORG Scenario2 simulation wrapped for RLlib.

    The simulation runs with ``B_lineAgent`` as Red and ``GreenAgent`` as
    Green, and is wrapped in an ``RLlibWrapper`` for the "Blue" agent.

    Args:
        env_config: environment config passed in by RLlib. The optional key
            ``"max_steps"`` overrides the ``max_steps`` value given to the
            wrapper; when absent (or when ``env_config`` is empty/None) the
            previous fixed value of 100 is used.

    Returns:
        The wrapped environment instance.
    """
    # Resolve the scenario file relative to the installed CybORG package
    # rather than slicing a fixed number of characters off the module path.
    scenario_path = os.path.join(
        os.path.dirname(inspect.getfile(CybORG)), 'Shared', 'Scenarios', 'Scenario2.yaml'
    )
    max_steps = (env_config or {}).get("max_steps", 100)

    agents = {"Red": B_lineAgent, "Green": GreenAgent}
    cyborg = CybORG(scenario_file=scenario_path, environment='sim', agents=agents)
    return RLlibWrapper(env=cyborg, agent_name="Blue", max_steps=max_steps)

def print_results(results_dict):
    """Print a one-line summary of a training result.

    Args:
        results_dict: mapping that must contain ``"training_iteration"``,
            ``"episode_reward_mean"``, ``"episode_reward_max"`` and
            ``"episode_reward_min"``.

    Raises:
        KeyError: if any of those keys is missing.
    """
    iteration = results_dict["training_iteration"]
    reward_mean, reward_max, reward_min = (
        results_dict[key]
        for key in ("episode_reward_mean", "episode_reward_max", "episode_reward_min")
    )
    # The minimum uses the ' ' sign flag, so non-negative values get a leading space.
    print(f"{iteration:4d} \tr_mean: {reward_mean:.1f} \tr_max: {reward_max:.1f} \tr_min: {reward_min: .1f}")
    
# Register env_creator under the name "CybORG" so a trainer can be built with env="CybORG".
register_env(name="CybORG", env_creator=env_creator)

In [8]:
from ray.rllib.agents.dqn.dqn import DQNTrainer
from ray.rllib.agents.dqn import dqn
import ray

# all_rewards collects one list of per-iteration mean rewards per training run;
# rewards is the list for the run started in this cell.
all_rewards = []
rewards = []

# Start from RLlib's stock DQN settings and override what this experiment changes.
dqn_overrides = {
    "num_gpus": 1,
    "num_workers": 20,
    "framework": "tf2",
    "horizon": 100,
    'lr': 0.00025,
    'train_batch_size': 32,
    'double_q': True,
    "gamma": 0.95,
    'dueling': True,
    'hiddens': [512],
    "model": {
        "fcnet_hiddens": [512],
        "fcnet_activation": "relu",
    },
    # Disabled alternative: curiosity-driven exploration.
    #"exploration_config": {
    #    "type": "Curiosity",
    #    "sub_exploration": "EpsilonGreedy",
    #    "inverse_net_hiddens": [512],
    #    "inverse_net_activation": "relu",
    #    "forward_net_hiddens": [512],
    #    "forward_net_activation": "relu", }
}
dqn_config = dqn.DEFAULT_CONFIG.copy()
dqn_config.update(dqn_overrides)

trainer = DQNTrainer(config=dqn_config, env="CybORG")

# Best mean episode reward seen so far. A checkpoint is only written once the
# mean exceeds this value, so nothing is saved inside the loop until it beats -60.
# A nan mean (as reported on the first iteration) never compares greater.
# NOTE(review): "/Supervisor" is an absolute path at the filesystem root — confirm this is intended.
m = -60
for i in range(500):
    results_dict = trainer.train()
    print_results(results_dict)
    mean_reward = results_dict["episode_reward_mean"]
    rewards.append(mean_reward)
    if mean_reward > m:
        print(trainer.save("/Supervisor"))
        m = mean_reward

all_rewards.append(rewards)
# Always save the final state; as the last expression, the returned path is the cell output.
trainer.save("/Supervisor")



   1 	r_mean: nan 	r_max: nan 	r_min:  nan
   2 	r_mean: -689.2 	r_max: -128.8 	r_min: -1151.7
   3 	r_mean: -689.2 	r_max: -128.8 	r_min: -1151.7
   4 	r_mean: -806.5 	r_max: -128.8 	r_min: -1151.7
   5 	r_mean: -806.5 	r_max: -128.8 	r_min: -1151.7
   6 	r_mean: -814.6 	r_max: -128.8 	r_min: -1152.7
   7 	r_mean: -814.6 	r_max: -128.8 	r_min: -1152.7
   8 	r_mean: -860.8 	r_max: -128.8 	r_min: -1184.8
   9 	r_mean: -860.8 	r_max: -128.8 	r_min: -1184.8
  10 	r_mean: -780.3 	r_max: -78.8 	r_min: -1184.8
  11 	r_mean: -780.3 	r_max: -78.8 	r_min: -1184.8
  12 	r_mean: -700.1 	r_max: -78.8 	r_min: -1184.8
  13 	r_mean: -700.1 	r_max: -78.8 	r_min: -1184.8
  14 	r_mean: -573.6 	r_max: -78.8 	r_min: -1184.8
  15 	r_mean: -573.6 	r_max: -78.8 	r_min: -1184.8
  16 	r_mean: -465.2 	r_max: -78.8 	r_min: -1184.8
  17 	r_mean: -465.2 	r_max: -78.8 	r_min: -1184.8
  18 	r_mean: -316.7 	r_max: -78.8 	r_min: -839.3
  19 	r_mean: -316.7 	r_max: -78.8 	r_min: -839.3
  20 	r_mean: -273.5 	r_max: -204

 165 	r_mean: -90.2 	r_max: -12.8 	r_min: -230.2
 166 	r_mean: -93.4 	r_max: -43.8 	r_min: -230.2
 167 	r_mean: -93.4 	r_max: -43.8 	r_min: -230.2
 168 	r_mean: -89.2 	r_max: -43.8 	r_min: -230.2
 169 	r_mean: -89.2 	r_max: -43.8 	r_min: -230.2
 170 	r_mean: -82.5 	r_max: -40.7 	r_min: -230.2
 171 	r_mean: -82.5 	r_max: -40.7 	r_min: -230.2
 172 	r_mean: -83.9 	r_max: -18.8 	r_min: -230.2
 173 	r_mean: -83.9 	r_max: -18.8 	r_min: -230.2
 174 	r_mean: -88.8 	r_max: -18.8 	r_min: -228.7
 175 	r_mean: -93.7 	r_max: -18.8 	r_min: -239.7
 176 	r_mean: -93.7 	r_max: -18.8 	r_min: -239.7
 177 	r_mean: -94.9 	r_max: -18.8 	r_min: -239.7
 178 	r_mean: -94.9 	r_max: -18.8 	r_min: -239.7
 179 	r_mean: -102.2 	r_max: -18.8 	r_min: -239.7
 180 	r_mean: -102.2 	r_max: -18.8 	r_min: -239.7
 181 	r_mean: -101.9 	r_max: -33.8 	r_min: -239.7
 182 	r_mean: -101.9 	r_max: -33.8 	r_min: -239.7
 183 	r_mean: -100.0 	r_max: -33.8 	r_min: -239.7
 184 	r_mean: -100.0 	r_max: -33.8 	r_min: -239.7
 185 	r_mean: 

 333 	r_mean: -91.8 	r_max: -14.7 	r_min: -233.7
 334 	r_mean: -91.8 	r_max: -14.7 	r_min: -233.7
 335 	r_mean: -98.8 	r_max: -14.7 	r_min: -233.8
 336 	r_mean: -98.8 	r_max: -14.7 	r_min: -233.8
 337 	r_mean: -100.9 	r_max: -14.7 	r_min: -233.8
 338 	r_mean: -100.9 	r_max: -14.7 	r_min: -233.8
 339 	r_mean: -96.6 	r_max: -10.7 	r_min: -233.8
 340 	r_mean: -96.6 	r_max: -10.7 	r_min: -233.8
 341 	r_mean: -95.1 	r_max: -10.7 	r_min: -233.8
 342 	r_mean: -95.1 	r_max: -10.7 	r_min: -233.8
 343 	r_mean: -92.6 	r_max: -10.7 	r_min: -233.8
 344 	r_mean: -92.6 	r_max: -10.7 	r_min: -233.8
 345 	r_mean: -87.1 	r_max: -10.7 	r_min: -225.7
 346 	r_mean: -87.1 	r_max: -10.7 	r_min: -225.7
 347 	r_mean: -92.3 	r_max: -10.7 	r_min: -225.2
 348 	r_mean: -92.3 	r_max: -10.7 	r_min: -225.2
 349 	r_mean: -95.3 	r_max: -15.8 	r_min: -225.2
 350 	r_mean: -93.3 	r_max: -15.8 	r_min: -218.8
 351 	r_mean: -93.3 	r_max: -15.8 	r_min: -218.8
 352 	r_mean: -92.9 	r_max: -17.8 	r_min: -218.8
 353 	r_mean: -92.

 499 	r_mean: -74.8 	r_max: -12.7 	r_min: -234.8
 500 	r_mean: -75.5 	r_max: -12.7 	r_min: -227.8


'/Supervisor/checkpoint_000500/checkpoint-500'

In [10]:
# Train the existing trainer for another 100 iterations, checkpointing whenever
# the mean episode reward beats the best value `m` carried over from the previous run.
for i in range(100):
    results_dict = trainer.train()
    print_results(results_dict)
    mean_reward = results_dict["episode_reward_mean"]
    if mean_reward > m:
        print(trainer.save("/Supervisor"))
        m = mean_reward

 601 	r_mean: -70.8 	r_max: -12.8 	r_min: -222.8
 602 	r_mean: -66.9 	r_max: -10.8 	r_min: -222.8
 603 	r_mean: -66.9 	r_max: -10.8 	r_min: -222.8
 604 	r_mean: -64.1 	r_max: -10.8 	r_min: -222.8
 605 	r_mean: -64.1 	r_max: -10.8 	r_min: -222.8
 606 	r_mean: -65.9 	r_max: -10.8 	r_min: -234.8
 607 	r_mean: -65.9 	r_max: -10.8 	r_min: -234.8
 608 	r_mean: -71.2 	r_max: -10.8 	r_min: -237.8
 609 	r_mean: -71.2 	r_max: -10.8 	r_min: -237.8
 610 	r_mean: -81.1 	r_max: -10.8 	r_min: -237.8
 611 	r_mean: -81.1 	r_max: -10.8 	r_min: -237.8
 612 	r_mean: -83.8 	r_max: -13.8 	r_min: -237.8
 613 	r_mean: -83.8 	r_max: -13.8 	r_min: -237.8
 614 	r_mean: -88.0 	r_max: -12.7 	r_min: -237.8
 615 	r_mean: -88.0 	r_max: -12.7 	r_min: -237.8
 616 	r_mean: -86.0 	r_max: -12.7 	r_min: -237.8
 617 	r_mean: -86.0 	r_max: -12.7 	r_min: -237.8
 618 	r_mean: -85.4 	r_max: -12.7 	r_min: -235.7
 619 	r_mean: -85.4 	r_max: -12.7 	r_min: -235.7
 620 	r_mean: -77.7 	r_max: -12.7 	r_min: -235.7
 621 	r_mean: -77.7 

In [None]:
import time
# Time 100 single-observation inference calls that also read the 'q_values'
# entry from the third element of the returned tuple.
# full_fetch=True is required: without it compute_single_action returns only the
# action, and the [2]['q_values'] subscript fails.
# NOTE(review): loaded_trainer is not defined in the cells visible here — confirm
# the cell that creates it runs first. 52 is presumably the observation size; confirm.
t = time.perf_counter()  # monotonic clock intended for measuring intervals
for i in range(100):
    loaded_trainer.compute_single_action(np.ones(52), full_fetch=True)[2]['q_values']
print(time.perf_counter() - t)

In [None]:
# Fetch the policy object from loaded_trainer and call its model's forward pass
# directly on a dummy all-ones input.
# NOTE(review): loaded_trainer is not defined in the cells visible here — confirm
# the cell that creates it runs first.
p = loaded_trainer.get_policy()
# Positional arguments: an input dict with an 'obs_flat' array of shape (1, 52),
# a second all-ones (1, 52) array, and None.
# NOTE(review): presumably the second and third arguments are the state and
# sequence-length slots and the ones-array is just a placeholder — confirm.
p.model.forward({'obs_flat':np.ones((1,52))}, np.ones((1,52)), None)

In [None]:
import time
# Time 100 single-observation inference calls with full_fetch=True, reading the
# 'q_values' entry from the third element of the returned tuple on each call.
# NOTE(review): loaded_trainer is not defined in the cells visible here — confirm
# the cell that creates it runs first.
t = time.perf_counter()  # monotonic clock intended for measuring intervals
for i in range(100):
    loaded_trainer.compute_single_action(np.ones(52), full_fetch=True)[2]['q_values']
print(time.perf_counter() - t)