In [1]:
from gym.spaces import Box
import numpy as np

from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.models.torch.misc import SlimFC
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.rllib.agents.ppo import ppo
from ray.rllib.models.tf.tf_action_dist import Categorical
from ray.rllib.models.tf.misc import normc_initializer
from ray.rllib.models.tf.recurrent_net import RecurrentNetwork
import gym

tf1, tf, tfv = try_import_tf()

Option Critic Model

class HieracrchyModel(TFModelV2):
    
    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        super(HieracrchyModel, self).__init__(
            obs_space, action_space, num_outputs, model_config, name
        )
        self.inputs = tf.keras.layers.Input(shape=obs_space.shape, name="observations")
        layer_1 = tf.keras.layers.Dense(
            256,
            name="layer1",
            activation=tf.nn.relu,
            kernel_initializer=normc_initializer(1.0),
        )(self.inputs)
        layer_2 = tf.keras.layers.Dense(
            128,
            name="layer2",
            activation=tf.nn.relu,
            kernel_initializer=normc_initializer(1.0),
        )(layer_1)
        layer_out = tf.keras.layers.Dense(
            num_outputs,
            name="out",
            activation=None,
            kernel_initializer=normc_initializer(0.01),
        )(layer_1)
        value_out = tf.keras.layers.Dense(
            1,
            name="value_out",
            activation=None,
            kernel_initializer=normc_initializer(0.01),
        )(layer_1)
        self.model = tf.keras.Model(self.inputs, [layer_out, value_out])
        
        ppo_config = ppo.DEFAULT_CONFIG.copy()
        ppo_config.update({"num_gpus": 0,"num_workers": 0,
                # Also, use "framework: tf2" for tfe eager execution.
                "framework": "tf2",
                "train_batch_size": 200,
                "horizon": 100,
                "gamma": 0.95,
                "model": {
                    "fcnet_hiddens": [512, 512],
                    "fcnet_activation": "relu",
                }})
        
        b_line = PPOTrainer(config=ppo_config,env="CybORG")
        b_line.restore("b_line_agent/checkpoint_000109/checkpoint-109")
        meander = PPOTrainer(config=ppo_config,env="CybORG")
        meander.restore("supervisor_ppo/checkpoint_000183/checkpoint-183")
        self.sub_agents = [b_line.get_policy().model, meander.get_policy().model]
        
        self.action = np.array([0])
        self.action_logp = np.array([0.])
        self.logits = np.array([[0.,0.]])

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        self.logits, self.value = self.model({'observations': input_dict[SampleBatch.CUR_OBS]})
        self.action = tf.random.categorical(self.logits, 1, dtype=tf.int32)
        self.action = tf.reshape(self.action, self.action.shape[0])
        self.action_logp = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.action, logits=self.logits)
        return self.logits, state
    
    @override(ModelV2)
    def value_function(self):
        return tf.reshape(self.value, [-1])
    
def build_model(policy, obs_space, action_space, config) -> ModelV2:
    return ModelCatalog.get_model_v2(obs_space,
            2,
            2,
            config["model"],
            name="h_model",
            framework="tf2",
            model_interface=HieracrchyModel)

In [185]:
#adapted from https://github.com/ray-project/ray/blob/master/rllib/examples/models/rnn_model.py
class HieracrchyRNNModel(RecurrentNetwork):
    """Example of using the Keras functional API to define a RNN model."""

    def __init__(
        self,
        obs_space,
        action_space,
        num_outputs,
        model_config,
        name,
        hiddens_size=256,
        cell_size=64,
    ):
        super(HieracrchyRNNModel, self).__init__(
            obs_space, action_space, num_outputs, model_config, name
        )
        self.cell_size = cell_size

        # Define input layers
        input_layer = tf.keras.layers.Input(
            shape=(None, obs_space.shape[0]), name="inputs"
        )
        state_in_h = tf.keras.layers.Input(shape=(cell_size,), name="h")
        state_in_c = tf.keras.layers.Input(shape=(cell_size,), name="c")
        seq_in = tf.keras.layers.Input(shape=(), name="seq_in", dtype=tf.int32)

        # Preprocess observation with a hidden layer and send to LSTM cell
        dense1 = tf.keras.layers.Dense(
            hiddens_size, activation=tf.nn.relu, name="dense1"
        )(input_layer)
        lstm_out, state_h, state_c = tf.keras.layers.LSTM(
            cell_size, return_sequences=True, return_state=True, name="lstm"
        )(
            inputs=dense1,
            mask=tf.sequence_mask(seq_in),
            initial_state=[state_in_h, state_in_c],
        )

        # Postprocess LSTM output with another hidden layer and compute values
        logits = tf.keras.layers.Dense(
            self.num_outputs, activation=tf.keras.activations.linear, name="logits"
        )(lstm_out)
        values = tf.keras.layers.Dense(1, activation=None, name="values")(lstm_out)

        # Create the RNN model
        self.rnn_model = tf.keras.Model(
            inputs=[input_layer, seq_in, state_in_h, state_in_c],
            outputs=[logits, values, state_h, state_c],
        )        
        
        ppo_config = ppo.DEFAULT_CONFIG.copy()
        ppo_config.update({"num_gpus": 0,"num_workers": 0,
            "framework": "tf2",
            "model": {
                "fcnet_hiddens": [512, 512],
                "fcnet_activation": "relu",
            }})
        
        b_line = PPOTrainer(config=ppo_config,env="CybORG")
        b_line.restore("b_line_agent/checkpoint_000109/checkpoint-109")
        meander = PPOTrainer(config=ppo_config,env="CybORG")
        meander.restore("supervisor_ppo/checkpoint_000183/checkpoint-183")
        self.sub_agents = [b_line.get_policy().model, meander.get_policy().model]
        
        self.action = np.array([0])
        self.action_logp = np.array([0.])
        self.logits = np.array([[0.,0.]])
        
        
    @override(RecurrentNetwork)
    def forward_rnn(self, inputs, state, seq_lens):
        self.logits, self._value_out, h, c = self.rnn_model([inputs, seq_lens] + state)
        #self.logits = tf.squeeze(self.logits)
        #print(self.logits)
        #self.action = tf.squeeze(tf.random.categorical(self.logits, 1, dtype=tf.int32))
        ##self.action = tf.reshape(self.action, self.action.shape[0])

        #self.action_logp = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        #    labels=self.action, logits=self.logits)
        
        #dist = Categorical(tf.squeeze(self.logits))
        #self.action = dist.sample()
       # print(self.action)
       # self.action_logp = dist.sampled_action_logp()
      #  print(self.action_logp)
        return self.logits, [h, c]

    @override(ModelV2)
    def get_initial_state(self):
        return [
            np.zeros(self.cell_size, np.float32),
            np.zeros(self.cell_size, np.float32),
        ]

    @override(ModelV2)
    def value_function(self):
        return tf.reshape(self._value_out, [-1])
    
def build_model(policy, obs_space, action_space, config) -> ModelV2:
    return ModelCatalog.get_model_v2(obs_space,
            2,
            2,
            config["model"],
            name="h_rnn_model",
            framework="tf2",
            model_interface=HieracrchyRNNModel)


In [186]:
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.agents.ppo.ppo_tf_policy import ppo_surrogate_loss
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.rllib.utils.exploration.stochastic_sampling import StochasticSampling
from gym.spaces.discrete import Discrete
from ray.rllib.models.tf.tf_action_dist import Categorical

def after_action(policy):
    return {
        SampleBatch.ACTIONS: policy.model.action,
        SampleBatch.ACTION_LOGP: policy.model.action_logp,
        SampleBatch.ACTION_DIST_INPUTS: policy.model.logits,
        SampleBatch.VF_PREDS: policy.model.value_function(),
    } 

def loss(self, model, dist_class, train_batch):
    return ppo_surrogate_loss(self, model, Categorical, train_batch)

def action_sampler_fn(self, model, obs_batch, explore, seq_lens, state_batches, 
                      prev_action_batch, prev_reward_batch, is_training):
    self.model.forward({'obs_flat': obs_batch}, state_batches, seq_lens)
    #For tf2
    logits = model.sub_agents[model.action[0]].forward({'obs_flat': obs}, None, None)[0]
    #Annoying bodge for tf1, will need to get smart, if more than than 2 options
    #obs_copy = tf.identity(obs_batch)
    #logits = tf.cond(tf.equal(model.action[0], tf.constant(0)),
    #                 lambda: model.sub_agents[0].forward({'obs_flat': obs_copy}, None, None)[0],
    #                 lambda: model.sub_agents[1].forward({'obs_flat': obs}, None, None)[0])
    
    #return tf.random.categorical(logits, 1, dtype=tf.int32)[0], None
    return tf.math.argmax(logits, axis=1), None

def action_distribution_fn(self, model, input_dict, state_batches, seq_lens, explore, timestep, is_training=False):
    self.model.forward({'obs_flat': input_dict['obs']}, state_batches, seq_lens)
    logits = model.sub_agents[model.action[0]].forward({'obs_flat': input_dict['obs']}, None, None)[0]
    return logits, Categorical, state_batches
    
HierarchyPolicy = PPOTFPolicy.with_updates(
    name="HierarchyPPOPolicy",
    loss_fn=loss,
    #action_sampler_fn=action_sampler_fn,
    #action_distribution_fn=action_distribution_fn,
    make_model=build_model,
    extra_action_out_fn=after_action)

class HieracrchyTrainer(PPOTrainer):
    def get_default_policy_class(self, config):
        return HierarchyPolicy

Additional Option Critic functions

In [187]:
import inspect
import time
from statistics import mean, stdev
from CybORG import CybORG
from CybORG.Agents import B_lineAgent, SleepAgent, GreenAgent
from CybORG.Agents.SimpleAgents.BaseAgent import BaseAgent
from CybORG.Agents.SimpleAgents.BlueReactAgent import BlueReactRemoveAgent
from CybORG.Agents.SimpleAgents.Meander import RedMeanderAgent
from CybORG.Agents.Wrappers.EnumActionWrapper import EnumActionWrapper
from CybORG.Agents.Wrappers.FixedFlatWrapper import FixedFlatWrapper
from CybORG.Agents.Wrappers.OpenAIGymWrapper import OpenAIGymWrapper
from CybORG.Agents.Wrappers.ReduceActionSpaceWrapper import ReduceActionSpaceWrapper
from CybORG.Agents.Wrappers import ChallengeWrapper
import os
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents import ppo
from ray.tune.registry import register_env
from CybORG.Agents.Wrappers.rllib_wrapper import RLlibWrapper
import warnings
warnings.filterwarnings('ignore')

In [188]:
MAX_EPS = 50
agent_name = 'Blue'

def wrap(env):
    return RLlibWrapper(agent_name="Blue", env=env)


def evaluate(steps):
    path = str(inspect.getfile(CybORG))
    path = path[:-10] + '/Shared/Scenarios/Scenario2.yaml'

    #print(f'using CybORG v{cyborg_version}, {scenario}\n')
    for num_steps in steps:
        #for red_agent in [B_lineAgent, RedMeanderAgent, SleepAgent]:
        rs = []
        for red_agent in [RedMeanderAgent, B_lineAgent]:

            cyborg = CybORG(path, 'sim', agents={'Red': red_agent})
            wrapped_cyborg = wrap(cyborg)

            observation = wrapped_cyborg.reset()
            # observation = cyborg.reset().observation

            action_space = wrapped_cyborg.get_action_space(agent_name)
            # action_space = cyborg.get_action_space(agent_name)
            total_reward = []
            actions = []
            for i in range(MAX_EPS):
                r = []
                a = []
                # cyborg.env.env.tracker.render()
                for j in range(num_steps):
                    action = agent.compute_single_action(observation)
                    #action = agent.get_action(observation, action_space)
                    observation, rew, done, info = wrapped_cyborg.step(action)
                    # result = cyborg.step(agent_name, action)
                    r.append(rew)
                    # r.append(result.reward)
                    a.append((str(cyborg.get_last_action('Blue')), str(cyborg.get_last_action('Red'))))
                total_reward.append(sum(r))
                actions.append(a)
                # observation = cyborg.reset().observation
                observation = wrapped_cyborg.reset()
            rs.append(mean(total_reward))
            print(f'Average reward for red agent {red_agent.__name__} at steps {num_steps} is: {mean(total_reward):.1f} with a standard deviation of {stdev(total_reward):.1f}')
    return rs

In [189]:
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env

def print_results(results_dict):
    train_iter = results_dict["training_iteration"]
    r_mean = results_dict["episode_reward_mean"]
    r_max = results_dict["episode_reward_max"]
    r_min = results_dict["episode_reward_min"]
    print(f"{train_iter:4d} \tr_mean: {r_mean:.1f} \tr_max: {r_max:.1f} \tr_min: {r_min: .1f}")
    

class MultiEnv(gym.Env):
    def __init__(self, env_config):
        # pick actual env based on worker and env indexes
        self.env = self.choose_env_for(env_config.worker_index)
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
    def reset(self):
        return self.env.reset()
    def step(self, action):
        return self.env.step(action)   
    def choose_env_for(self, index):
        if index > 20:
            path = str(inspect.getfile(CybORG))
            path = path[:-10] + '/Shared/Scenarios/Scenario2.yaml'
            agents = {"Red": SleepAgent, "Green": GreenAgent}
            cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
            return RLlibWrapper(env=cyborg, agent_name="Blue", max_steps=100)
        elif index % 2 == 0:
            path = str(inspect.getfile(CybORG))
            path = path[:-10] + '/Shared/Scenarios/Scenario2.yaml'
            agents = {"Red": B_lineAgent, "Green": GreenAgent}
            cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
            return RLlibWrapper(env=cyborg, agent_name="Blue", max_steps=100)
        else:
            path = str(inspect.getfile(CybORG))
            path = path[:-10] + '/Shared/Scenarios/Scenario2.yaml'
            agents = {"Red": RedMeanderAgent, "Green": GreenAgent}
            cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
            return RLlibWrapper(env=cyborg, agent_name="Blue", max_steps=100)
        
def env_creator(env_config: dict):
    path = str(inspect.getfile(CybORG))
    path = path[:-10] + '/Shared/Scenarios/Scenario2.yaml'
    agents = {"Red": B_lineAgent, "Green": GreenAgent}
    cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
    env = RLlibWrapper(env=cyborg, agent_name="Blue", max_steps=100)
    return env

register_env("CybORG", env_creator=env_creator)
register_env("multienv", lambda config: MultiEnv(config))

batch_size = 4000
config = ppo.DEFAULT_CONFIG.copy()

ModelCatalog.register_custom_model("h_rnn_model", HieracrchyRNNModel)

config.update({"num_gpus": 1, "num_workers": 0,
               "env": "multienv",
                # Also, use "framework: tf2" for tfe eager execution.
                "framework": "tf2",
                "train_batch_size": batch_size,
                "horizon": 100,
                "sgd_minibatch_size": 100,
                "use_critic": False,
                "use_gae": False,
                "gamma": 0.8,
                "batch_mode":"complete_episodes",
                "model": {
                    "custom_model": "h_rnn_model",
                },

                }) 
agent = HieracrchyTrainer(config=config)

for i in range(200):
    results_dict = agent.train()
    print_results(results_dict)
    #evaluate([100])

2022-08-08 14:15:26,885	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: b_line_agent/checkpoint_000109/checkpoint-109
2022-08-08 14:15:26,886	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 109, '_timesteps_total': None, '_time_total': 4080.5770568847656, '_episodes_total': 4360}
2022-08-08 14:15:27,511	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000183/checkpoint-183
2022-08-08 14:15:27,512	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 183, '_timesteps_total': None, '_time_total': 6626.44996380806, '_episodes_total': 7320}


InvalidArgumentError: logits should be a matrix, got shape [32,1,2] [Op:Multinomial]

In [None]:
ppo.DEFAULT_CONFIG.copy()