In [112]:
from gym.spaces import Box
import numpy as np

from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.models.torch.misc import SlimFC
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.rllib.agents.ppo import ppo
from ray.rllib.models.tf.tf_action_dist import Categorical

tf1, tf, tfv = try_import_tf()
torch, nn = try_import_torch()

In [150]:
class HieracrchyModel(TFModelV2):
    
    def __init__(self, obs_space, action_space, num_outputs, model_config, name):

        super(HieracrchyModel, self).__init__(
            obs_space, action_space, num_outputs, model_config, name)
        # Base of the model for PPO agents
        self.model = FullyConnectedNetwork(
            obs_space, 2, num_outputs, model_config, name
        )

        # Upper DQN for action masking
        #obs = tf.keras.layers.Input(shape=(obs_space.shape[0],), name="obs")
        #hidden_1 = tf.keras.layers.Dense(512, activation=tf.nn.tanh, name="hidden_1")(obs)
        #hidden_2 = tf.keras.layers.Dense(512, activation=tf.nn.tanh, name="hidden_2")(hidden_1)
        #q_values = tf.keras.layers.Dense(action_space.n, activation=None, name="q_values")(hidden_2)
        #self.supervisor_q_vals = tf.keras.Model(inputs=[obs], outputs=q_values)

        ppo_config = ppo.DEFAULT_CONFIG.copy()
        ppo_config.update({"num_gpus": 0,"num_workers": 0,
                # Also, use "framework: tf2" for tfe eager execution.
                "framework": "tf2",
                "train_batch_size": batch_size,
                "horizon": 100,
                "gamma": 0.95,
                "model": {
                    "fcnet_hiddens": [512, 512],
                    "fcnet_activation": "relu",
                }})
        self.b_line = PPOTrainer(config=ppo_config,env="CybORG")
        self.b_line.restore("b_line_agent/checkpoint_000109/checkpoint-109")

        self.meander = PPOTrainer(config=ppo_config,env="CybORG")
        self.meander.restore("supervisor_ppo/checkpoint_000076/checkpoint-76")

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        action_logits = self.model.forward({'obs_flat': input_dict['obs']}, state, seq_lens)[0]
        
        #action_logits = self.model.forward(input_dict, state, seq_lens)[0]
        return action_logits, state
    
    @override(ModelV2)
    def value_function(self):
        return self.model.value_function()

def build_model(policy, obs_space, action_space, config):
    model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        1,
        config["model"],
        name="h_model",
        framework="tf2",
        model_interface=HieracrchyModel
    )
    policy.model_variables = model.variables()
    return model

In [163]:
def action_distribution_fn(policy , model, input_dict, *, explore, is_training, **kwargs):
    print(input_dict)
    logits = policy.model.forward(input_dict, None, None)[0]
    action = tf.random.categorical(logits, 1, dtype=tf.int32)[0]
    print(action)
    logits = policy.model.b_line.compute_single_action(input_dict[SampleBatch.CUR_OBS][0], full_fetch=True, explore=False, training=False)[2]['action_dist_inputs']
    print(logits)
    return tf.expand_dims(logits, 1), Categorical, []
   # logits = policy.model.meander.compute_single_action(input_dict[SampleBatch.CUR_OBS][0], full_fetch=True, explore=False, training=False)[2]['action_dist_inputs']
   # return (tf.expand_dims(logits, 1), Categorical, [])

In [164]:
def build_action_sampler(policy, q_model, input_dict, explore, **kwargs):
    logits = policy.model.forward({'obs_flat': input_dict}, None, None)[0]
    action = tf.random.categorical(logits, 1, dtype=tf.int32)[0]
    print(tf.random.categorical(logits, 1, dtype=tf.int32))
    return action, logits[action[0]]

In [165]:
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy

HierarchyPolicy = PPOTFPolicy.with_updates(
    name="HierarchyPPOPolicy",
    make_model=build_model,
    action_distribution_fn=action_distribution_fn)

In [166]:

from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.policy.sample_batch import SampleBatch

class SupervisorPolicy(PPOTFPolicy):
    def __init__(self, observation_space, action_space, config):
        PPOTFPolicy.__init__(self, observation_space, action_space, config)

    @override(PPOTFPolicy)
    def loss(self, model, dist_class, train_batch):
        loss = super().loss
        
        #q_val_s = model.supervisor_q_vals({train_batch[SampleBatch.CUR_OBS]})[0]
        #q_val_ns = model.supervisor_q_vals({train_batch[SampleBatch.NEXT_OBS]})[0]
        
        #action_selection = tf.cast(train_batch[SampleBatch.ACTIONS], dtype=tf.int32)   
        #one_hot_selection = tf.one_hot(action_selection, 1)
        #selected_q = tf.reduce_sum(q_val_s * one_hot_selection, 1)
        #dones = tf.cast(train_batch[SampleBatch.DONES], tf.float32)
        
        #one_hot_max = tf.one_hot(tf.argmax(q_val_ns, 1), 1)
        #ns_max_q = tf.reduce_sum(q_val_ns * one_hot_max, 1)
        #ns_max_q = (1.0 - dones) * ns_max_q
        #Calculate TD error and convert to huber loss
        #target_q = (train_batch[SampleBatch.REWARDS] + policy.config["gamma"] * ns_max_q)
        #td_error = selected_q - tf.stop_gradient(target_q)
        #DQN_Loss = tf.reduce_sum(huber_loss(td_error)) 
        
        return loss #+ DQN_Loss
    
    

In [167]:
from ray.rllib.agents.ppo.ppo import PPOTrainer
class SVTrainer(PPOTrainer):
    def get_default_policy_class(self, config):
        return HierarchyPolicy

In [168]:
from CybORG import CybORG
from CybORG.Agents import B_lineAgent, SleepAgent, GreenAgent
from CybORG.Agents.SimpleAgents.BaseAgent import BaseAgent
from CybORG.Agents.SimpleAgents.BlueReactAgent import BlueReactRemoveAgent
from CybORG.Agents.SimpleAgents.Meander import RedMeanderAgent
from CybORG.Agents.Wrappers.EnumActionWrapper import EnumActionWrapper
from CybORG.Agents.Wrappers.FixedFlatWrapper import FixedFlatWrapper
from CybORG.Agents.Wrappers.OpenAIGymWrapper import OpenAIGymWrapper
from CybORG.Agents.Wrappers.ReduceActionSpaceWrapper import ReduceActionSpaceWrapper
from CybORG.Agents.Wrappers import ChallengeWrapper
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env
from ray.rllib.agents import ppo
import inspect
from CybORG.Agents.Wrappers.rllib_wrapper import RLlibWrapper

def env_creator(env_config: dict):
    path = str(inspect.getfile(CybORG))
    path = path[:-10] + '/Shared/Scenarios/Scenario2.yaml'
    agents = {"Red": B_lineAgent, "Green": GreenAgent}
    cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
    env = RLlibWrapper(env=cyborg, agent_name="Blue", max_steps=100)
    return env

def print_results(results_dict):
    train_iter = results_dict["training_iteration"]
    r_mean = results_dict["episode_reward_mean"]
    r_max = results_dict["episode_reward_max"]
    r_min = results_dict["episode_reward_min"]
    print(f"{train_iter:4d} \tr_mean: {r_mean:.1f} \tr_max: {r_max:.1f} \tr_min: {r_min: .1f}")

import subprocess
import json
import os

batch_size = 2000
# Set up CybORG
register_env(name="CybORG", env_creator=env_creator)
config = ppo.DEFAULT_CONFIG.copy()

#ModelCatalog.register_custom_model("h_model", HieracrchyModel)

from shutil import make_archive
allrewards = []

config.update({"num_gpus": 1,"num_workers": 0,
                # Also, use "framework: tf2" for tfe eager execution.
                "framework": "tf2",
                "train_batch_size": batch_size,
                "horizon": 100,
                "gamma": 0.9,
                "model": {
                    "fcnet_hiddens": [512],
                    "fcnet_activation": "relu",
                },

                }) 
trainer = SVTrainer(config=config, env="CybORG")

reward = []
novel_obs = []
novel_actions = []
for i in range(200):
    results_dict = trainer.train()
    print_results(results_dict)


2022-08-03 14:39:37,376	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: b_line_agent/checkpoint_000109/checkpoint-109
2022-08-03 14:39:37,376	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 109, '_timesteps_total': None, '_time_total': 4080.5770568847656, '_episodes_total': 4360}
2022-08-03 14:39:37,972	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000076/checkpoint-76
2022-08-03 14:39:37,972	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 76, '_timesteps_total': None, '_time_total': 2702.382201910019, '_episodes_total': 3040}


SampleBatch(32: ['obs', 'new_obs', 'actions', 'prev_actions', 'rewards', 'prev_rewards', 'dones', 'infos', 'eps_id', 'unroll_id', 'agent_index', 't'])
tf.Tensor([0], shape=(1,), dtype=int32)
[ 3.1435278e-01 -1.0851140e+00  2.1183152e+00  2.1210349e-01
  1.2379090e+00  7.9796135e-01 -4.3306494e-01  1.3179013e+00
 -1.6268424e+00  1.5975549e+00  1.5686109e+00  2.1595149e+00
  3.5432547e-01 -2.6048261e-01  2.0117016e+00 -4.0419511e-02
  8.7674385e-01  1.6341534e+00 -2.0381639e+00  1.5786418e+00
  1.9773276e+00 -3.8035417e-01 -3.5461459e-01  2.5708525e+00
  8.6024022e-01  3.3051972e+00  9.4483644e-01 -1.2722969e-01
  8.5048449e-01  3.8812587e+00  1.9359040e+00 -1.7192309e-01
 -2.4845111e+00  7.1764261e-01  2.3846743e+00  9.5787060e-01
 -7.6575726e-01  1.5630621e-01  1.7647816e+00  9.4396478e-01
 -5.6792873e-01  2.5413115e+00  4.3730459e-01  3.4920442e+00
  8.4968156e-01  2.3837469e+00  3.3455942e+00  3.8005364e-01
  1.9753731e+00  3.2737470e+00  9.1209275e-01  1.6775652e+00
  5.7922840e-01 

IndexError: list index out of range