In [19]:
from gym.spaces import Box
import numpy as np

from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.models.torch.misc import SlimFC
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.rllib.agents.ppo import ppo


tf1, tf, tfv = try_import_tf()
torch, nn = try_import_torch()

In [20]:
class SupervisorModel(TFModelV2):
    
    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        super(SupervisorModel, self).__init__(
            obs_space, action_space, num_outputs, model_config, name
        )
        # Base of the model for PPO agents
        self.model = FullyConnectedNetwork(
            obs_space, action_space, num_outputs, model_config, name
        )
  
        # Upper DQN for action masking
        #obs = tf.keras.layers.Input(shape=(obs_space.shape[0],), name="obs")
        #hidden_1 = tf.keras.layers.Dense(512, activation=tf.nn.tanh, name="hidden_1")(obs)
        #hidden_2 = tf.keras.layers.Dense(512, activation=tf.nn.tanh, name="hidden_2")(hidden_1)
        #q_values = tf.keras.layers.Dense(action_space.n, activation=None, name="q_values")(hidden_2)
        #self.supervisor_q_vals = tf.keras.Model(inputs=[obs], outputs=q_values)
    
        ppo_config = ppo.DEFAULT_CONFIG.copy()
        ppo_config.update({"num_gpus": 0,"num_workers": 0,
                # Also, use "framework: tf2" for tfe eager execution.
                "framework": "tf2",
                "train_batch_size": batch_size,
                "horizon": 100,
                "gamma": 0.9,
                "model": {
                    "fcnet_hiddens": [512, 512],
                    "fcnet_activation": "relu",
                }})
        self.supervisor = PPOTrainer(config=ppo_config,env="CybORG")
        self.supervisor.restore("b_line_agent/checkpoint_000109/checkpoint-109")

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
    
        action_logits = self.model.forward(input_dict, state, seq_lens)[0]
       # if input_dict[SampleBatch.CUR_OBS].shape[0] > 1:

       #     return action_logits, state
        #q_vals = self.supervisor_q_vals(input_dict[SampleBatch.CUR_OBS])
        q_vals = self.supervisor.compute_single_action(input_dict[SampleBatch.CUR_OBS][0], full_fetch=True, explore=False, training=False)[2]['action_dist_inputs']
        max_q_vals = tf.math.top_k(q_vals, k=20, sorted=False, name=None)
        indices = tf.cast(max_q_vals.indices, dtype=tf.int32).numpy()
       # action_mask = np.zeros(tf.shape(action_logits).numpy(), dtype=int)
       # action_mask[:,indices] = 1
        # Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the
        # avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE].
       # intent_vector = tf.expand_dims(action_embed, 1)
        #action_logits = tf.expand_dims(action_logits, 1)
       # print(action_logits)
        # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS].
      #  action_logits = tf.cast(tf.reduce_sum(action_mask * action_logits, axis=1), dtype=tf.float32)
       # print(action_logits)
        min_mask = np.ones(tf.shape(action_logits).numpy()) * -10000000000
        min_mask[:,indices] = 0
       # print(action_logits)
        action_logits = action_logits + min_mask
        #print(action_logits)
       # print(action_logits)
        return action_logits, state
        
        #action_mask = np.zeros(tf.shape(q_vals).numpy(), dtype=bool)
       # action_mask[indices] = True
       # masked_logits = action_logits[0] * action_mask
       # return tf.expand_dims(masked_logits, axis=0), state
    
    @override(ModelV2)
    def value_function(self):
        print(self.model.value_function())
        return self.model.value_function()

    def q_value_function(self, obs, opponent_obs, opponent_actions):
        return tf.reshape(self.supervisor_q_vals(obs),[-1])

 425 	r_mean: -27.8 	r_max: -11.8 	r_min: -147.7
/Supervisor/checkpoint_000425/checkpoint-425

In [21]:

from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.policy.sample_batch import SampleBatch

class SupervisorPolicy(PPOTFPolicy):
    def __init__(self, observation_space, action_space, config):
        PPOTFPolicy.__init__(self, observation_space, action_space, config)

    @override(PPOTFPolicy)
    def loss(self, model, dist_class, train_batch):
        loss = super().loss
        
        #q_val_s = model.supervisor_q_vals({train_batch[SampleBatch.CUR_OBS]})[0]
        #q_val_ns = model.supervisor_q_vals({train_batch[SampleBatch.NEXT_OBS]})[0]
        
        #action_selection = tf.cast(train_batch[SampleBatch.ACTIONS], dtype=tf.int32)   
        #one_hot_selection = tf.one_hot(action_selection, 1)
        #selected_q = tf.reduce_sum(q_val_s * one_hot_selection, 1)
        #dones = tf.cast(train_batch[SampleBatch.DONES], tf.float32)
        
        #one_hot_max = tf.one_hot(tf.argmax(q_val_ns, 1), 1)
        #ns_max_q = tf.reduce_sum(q_val_ns * one_hot_max, 1)
        #ns_max_q = (1.0 - dones) * ns_max_q
        #Calculate TD error and convert to huber loss
        #target_q = (train_batch[SampleBatch.REWARDS] + policy.config["gamma"] * ns_max_q)
        #td_error = selected_q - tf.stop_gradient(target_q)
        #DQN_Loss = tf.reduce_sum(huber_loss(td_error)) 
        
        return loss #+ DQN_Loss
        

In [22]:
from ray.rllib.agents.ppo.ppo import PPOTrainer
class SVTrainer(PPOTrainer):
    def get_default_policy_class(self, config):
        return SupervisorPolicy

In [23]:
from CybORG import CybORG
from CybORG.Agents import B_lineAgent, SleepAgent, GreenAgent
from CybORG.Agents.SimpleAgents.BaseAgent import BaseAgent
from CybORG.Agents.SimpleAgents.BlueReactAgent import BlueReactRemoveAgent
from CybORG.Agents.SimpleAgents.Meander import RedMeanderAgent
from CybORG.Agents.Wrappers.EnumActionWrapper import EnumActionWrapper
from CybORG.Agents.Wrappers.FixedFlatWrapper import FixedFlatWrapper
from CybORG.Agents.Wrappers.OpenAIGymWrapper import OpenAIGymWrapper
from CybORG.Agents.Wrappers.ReduceActionSpaceWrapper import ReduceActionSpaceWrapper
from CybORG.Agents.Wrappers import ChallengeWrapper
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env
from ray.rllib.agents import ppo
import inspect
from CybORG.Agents.Wrappers.rllib_wrapper import RLlibWrapper

def env_creator(env_config: dict):
    path = str(inspect.getfile(CybORG))
    path = path[:-10] + '/Shared/Scenarios/Scenario2.yaml'
    agents = {"Red": B_lineAgent, "Green": GreenAgent}
    cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
    env = RLlibWrapper(env=cyborg, agent_name="Blue", max_steps=100)
    return env

def print_results(results_dict):
    train_iter = results_dict["training_iteration"]
    r_mean = results_dict["episode_reward_mean"]
    r_max = results_dict["episode_reward_max"]
    r_min = results_dict["episode_reward_min"]
    print(f"{train_iter:4d} \tr_mean: {r_mean:.1f} \tr_max: {r_max:.1f} \tr_min: {r_min: .1f}")

import subprocess
import json
import os

batch_size = 4000
# Set up CybORG
register_env(name="CybORG", env_creator=env_creator)
config = ppo.DEFAULT_CONFIG.copy()

ModelCatalog.register_custom_model("sv_model", SupervisorModel)

from shutil import make_archive
allrewards = []

config.update({"num_gpus": 1,"num_workers": 0,
                # Also, use "framework: tf2" for tfe eager execution.
                "framework": "tf2",
                "train_batch_size": batch_size,
                "horizon": 100,
                "gamma": 0.9,
                "model": {
                    "custom_model": "sv_model",
                    "fcnet_hiddens": [512],
                    "fcnet_activation": "relu",
                },

                }) 
trainer = SVTrainer(config=config, env="CybORG")

reward = []
novel_obs = []
novel_actions = []
for i in range(200):
    results_dict = trainer.train()
    print_results(results_dict)


2022-08-03 14:00:12,162	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: b_line_agent/checkpoint_000109/checkpoint-109
2022-08-03 14:00:12,164	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 109, '_timesteps_total': None, '_time_total': 4080.5770568847656, '_episodes_total': 4360}


tf.Tensor(
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.], shape=(32,), dtype=float32)
tf.Tensor([0.], shape=(1,), dtype=float32)
tf.Tensor(
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.], shape=(32,), dtype=float32)
tf.Tensor([0.], shape=(1,), dtype=float32)
tf.Tensor([7.472682e-05], shape=(1,), dtype=float32)
tf.Tensor([0.0005847], shape=(1,), dtype=float32)
tf.Tensor([0.00165592], shape=(1,), dtype=float32)
tf.Tensor([0.00012702], shape=(1,), dtype=float32)
tf.Tensor([0.00286229], shape=(1,), dtype=float32)
tf.Tensor([0.001107], shape=(1,), dtype=float32)
tf.Tensor([0.00236332], shape=(1,), dtype=float32)
tf.Tensor([0.00053987], shape=(1,), dtype=float32)
tf.Tensor([0.00056258], shape=(1,), dtype=float32)
tf.Tensor([0.00028439], shape=(1,), dtype=float32)
tf.Tensor([0.00169003], shape=(1,), dtype=float32)
tf.Tensor([0.00185006], shape=(1,), dtype=float32)
tf.Tensor([0.00034794], s

tf.Tensor([0.00025557], shape=(1,), dtype=float32)
tf.Tensor([0.00105841], shape=(1,), dtype=float32)
tf.Tensor([0.00025557], shape=(1,), dtype=float32)
tf.Tensor([0.00025557], shape=(1,), dtype=float32)
tf.Tensor([0.00119679], shape=(1,), dtype=float32)
tf.Tensor([-0.0004479], shape=(1,), dtype=float32)
tf.Tensor([0.00119679], shape=(1,), dtype=float32)
tf.Tensor([-1.5479774e-05], shape=(1,), dtype=float32)
tf.Tensor([0.00013697], shape=(1,), dtype=float32)
tf.Tensor([-0.00034953], shape=(1,), dtype=float32)
tf.Tensor([-0.00036583], shape=(1,), dtype=float32)
tf.Tensor([-0.00076596], shape=(1,), dtype=float32)
tf.Tensor([-0.00129069], shape=(1,), dtype=float32)
tf.Tensor([-0.00150775], shape=(1,), dtype=float32)
tf.Tensor([-0.00129069], shape=(1,), dtype=float32)
tf.Tensor([-0.00230315], shape=(1,), dtype=float32)
tf.Tensor([-0.00129069], shape=(1,), dtype=float32)
tf.Tensor([-0.00010949], shape=(1,), dtype=float32)
tf.Tensor([-0.00036583], shape=(1,), dtype=float32)
tf.Tensor([-0.000

tf.Tensor([-0.00160515], shape=(1,), dtype=float32)
tf.Tensor([0.], shape=(1,), dtype=float32)
tf.Tensor([0.00014884], shape=(1,), dtype=float32)
tf.Tensor([0.000502], shape=(1,), dtype=float32)
tf.Tensor([0.00065875], shape=(1,), dtype=float32)
tf.Tensor([0.00150546], shape=(1,), dtype=float32)
tf.Tensor([0.00065875], shape=(1,), dtype=float32)
tf.Tensor([-0.00065047], shape=(1,), dtype=float32)
tf.Tensor([9.822252e-06], shape=(1,), dtype=float32)
tf.Tensor([0.00220026], shape=(1,), dtype=float32)
tf.Tensor([0.00065875], shape=(1,), dtype=float32)
tf.Tensor([0.000502], shape=(1,), dtype=float32)
tf.Tensor([1.536083e-05], shape=(1,), dtype=float32)
tf.Tensor([0.00172862], shape=(1,), dtype=float32)
tf.Tensor([0.00051674], shape=(1,), dtype=float32)
tf.Tensor([-0.00133301], shape=(1,), dtype=float32)
tf.Tensor([-0.00049075], shape=(1,), dtype=float32)
tf.Tensor([-0.00049075], shape=(1,), dtype=float32)
tf.Tensor([-0.00049075], shape=(1,), dtype=float32)
tf.Tensor([-0.00049075], shape=(1

tf.Tensor([0.], shape=(1,), dtype=float32)
tf.Tensor([-0.00013904], shape=(1,), dtype=float32)
tf.Tensor([0.00065875], shape=(1,), dtype=float32)
tf.Tensor([0.0013221], shape=(1,), dtype=float32)
tf.Tensor([0.00095233], shape=(1,), dtype=float32)
tf.Tensor([0.00065875], shape=(1,), dtype=float32)
tf.Tensor([0.], shape=(1,), dtype=float32)
tf.Tensor([0.00083485], shape=(1,), dtype=float32)
tf.Tensor([9.359617e-05], shape=(1,), dtype=float32)
tf.Tensor([0.00092979], shape=(1,), dtype=float32)
tf.Tensor([-0.00071305], shape=(1,), dtype=float32)
tf.Tensor([-0.00035746], shape=(1,), dtype=float32)
tf.Tensor([-0.00097382], shape=(1,), dtype=float32)
tf.Tensor([0.], shape=(1,), dtype=float32)
tf.Tensor([0.00014884], shape=(1,), dtype=float32)
tf.Tensor([0.00058673], shape=(1,), dtype=float32)
tf.Tensor([0.000516], shape=(1,), dtype=float32)
tf.Tensor([-0.00092867], shape=(1,), dtype=float32)
tf.Tensor([0.00126709], shape=(1,), dtype=float32)
tf.Tensor([0.00065875], shape=(1,), dtype=float32)


tf.Tensor([-0.00152146], shape=(1,), dtype=float32)
tf.Tensor([-0.00151188], shape=(1,), dtype=float32)
tf.Tensor([-0.00151188], shape=(1,), dtype=float32)
tf.Tensor([-0.00131849], shape=(1,), dtype=float32)
tf.Tensor([-0.00040527], shape=(1,), dtype=float32)
tf.Tensor([-0.00151188], shape=(1,), dtype=float32)
tf.Tensor([-0.00134044], shape=(1,), dtype=float32)
tf.Tensor([-0.00151188], shape=(1,), dtype=float32)
tf.Tensor([-0.00040527], shape=(1,), dtype=float32)
tf.Tensor([-0.00151188], shape=(1,), dtype=float32)
tf.Tensor([-0.00134044], shape=(1,), dtype=float32)
tf.Tensor([-0.00152146], shape=(1,), dtype=float32)
tf.Tensor([-0.00151188], shape=(1,), dtype=float32)


KeyboardInterrupt: 