Create the trainer

In [1]:
import inspect
import time
from statistics import mean, stdev
from CybORG import CybORG
from CybORG.Agents import B_lineAgent, SleepAgent, GreenAgent
from CybORG.Agents.SimpleAgents.BaseAgent import BaseAgent
from CybORG.Agents.SimpleAgents.BlueReactAgent import BlueReactRemoveAgent
from CybORG.Agents.SimpleAgents.Meander import RedMeanderAgent
from CybORG.Agents.Wrappers.EnumActionWrapper import EnumActionWrapper
from CybORG.Agents.Wrappers.FixedFlatWrapper import FixedFlatWrapper
from CybORG.Agents.Wrappers.OpenAIGymWrapper import OpenAIGymWrapper
from CybORG.Agents.Wrappers.ReduceActionSpaceWrapper import ReduceActionSpaceWrapper
from CybORG.Agents.Wrappers import ChallengeWrapper
import os
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents import ppo
from ray.tune.registry import register_env
from CybORG.Agents.Wrappers.rllib_wrapper import RLlibWrapper
import warnings
from tqdm import tqdm
import numpy as np
import random
import ray
from collections import deque
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'tqdm'

In [None]:
# Number of evaluation episodes run for every (episode length, red agent) pair
# in evaluate().
MAX_EPS = 50
# Name of the agent whose action space evaluate() queries from the wrapper.
agent_name = 'Blue'

def wrap(env):
    """Wrap an environment in ``RLlibWrapper`` for the controlled agent.

    Args:
        env: The environment instance to wrap.

    Returns:
        An ``RLlibWrapper`` built around ``env`` for the agent named by the
        module-level ``agent_name``.
    """
    # Use the module-level constant rather than repeating the "Blue" literal so
    # the wrapper and evaluate() always refer to the same agent.
    return RLlibWrapper(agent_name=agent_name, env=env)


def evaluate(steps):
    """Roll out the global ``trainer`` against several red agents and print reward statistics.

    For every episode length in ``steps`` and for each of ``B_lineAgent``,
    ``RedMeanderAgent`` and ``SleepAgent`` a new Scenario1b simulation is
    created, ``MAX_EPS`` episodes of exactly that many steps are played with
    actions from ``trainer.compute_single_action`` and the mean and standard
    deviation of the per-episode reward sums are printed.

    Args:
        steps: Iterable of episode lengths (environment steps per episode).

    Notes:
        * Relies on a module-level variable named ``trainer`` being defined
          before this function is called.
        * The ``done`` flag returned by ``step`` is not inspected; every
          episode always runs for the full ``num_steps`` steps.
    """
    # Locate the scenario file relative to the directory that holds the CybORG
    # source file instead of chopping a fixed number of characters off the path.
    scenario_path = os.path.join(
        os.path.dirname(str(inspect.getfile(CybORG))),
        'Shared', 'Scenarios', 'Scenario1b.yaml',
    )

    for num_steps in steps:
        for red_agent in [B_lineAgent, RedMeanderAgent, SleepAgent]:
            cyborg = CybORG(scenario_path, 'sim', agents={'Red': red_agent})
            wrapped_cyborg = wrap(cyborg)

            observation = wrapped_cyborg.reset()
            # The result is not used below; the call is kept because the
            # wrapper may initialise internal state here -- TODO confirm.
            action_space = wrapped_cyborg.get_action_space(agent_name)

            total_reward = []
            # Per-episode (blue action, red action) history; collected for
            # inspection while debugging, not used in the printed summary.
            actions = []
            for _ in range(MAX_EPS):
                episode_rewards = []
                episode_actions = []
                for _ in range(num_steps):
                    action = trainer.compute_single_action(observation)
                    observation, rew, done, info = wrapped_cyborg.step(action)
                    episode_rewards.append(rew)
                    episode_actions.append(
                        (
                            str(cyborg.get_last_action(agent_name)),
                            str(cyborg.get_last_action('Red')),
                        )
                    )
                total_reward.append(sum(episode_rewards))
                actions.append(episode_actions)
                observation = wrapped_cyborg.reset()

            # statistics.stdev needs at least two samples; report 0.0 spread
            # for a single episode instead of raising StatisticsError.
            reward_spread = stdev(total_reward) if len(total_reward) > 1 else 0.0
            print(f'Average reward for red agent {red_agent.__name__} and steps {num_steps} is: {mean(total_reward):.1f} with a standard deviation of {reward_spread:.1f}')


In [None]:
from ray.rllib.offline.json_writer import JsonWriter
from ray.rllib.offline.dataset_writer import DatasetWriter
from ray.rllib.offline.io_context import IOContext
from datetime import datetime
import shutil

def env_creator(env_config: dict):
    """Build the CybORG Scenario1b simulation wrapped for RLlib.

    Args:
        env_config: Environment config passed in by the caller; it is not read
            by this function.

    Returns:
        An ``RLlibWrapper`` for the "Blue" agent with ``max_steps=100`` around a
        simulated CybORG environment whose Red agent is ``B_lineAgent`` and
        whose Green agent is ``GreenAgent``.
    """
    # Resolve the scenario file relative to the directory that contains the
    # CybORG source file; this avoids the fixed-width slice ``path[:-10]``,
    # which silently breaks if the module file name ever changes length.
    package_dir = os.path.dirname(str(inspect.getfile(CybORG)))
    path = os.path.join(package_dir, 'Shared', 'Scenarios', 'Scenario1b.yaml')

    agents = {"Red": B_lineAgent, "Green": GreenAgent}
    cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
    return RLlibWrapper(env=cyborg, agent_name="Blue", max_steps=100)

def print_results(results_dict):
    """Print a one-line summary of a training result dictionary.

    Args:
        results_dict: Mapping that provides the keys ``training_iteration``,
            ``episode_reward_mean``, ``episode_reward_max`` and
            ``episode_reward_min``.
    """
    wanted_keys = (
        "training_iteration",
        "episode_reward_mean",
        "episode_reward_max",
        "episode_reward_min",
    )
    # Look the values up in the same order as they appear in the output line.
    train_iter, r_mean, r_max, r_min = (results_dict[key] for key in wanted_keys)
    print(f"{train_iter:4d} \tr_mean: {r_mean:.1f} \tr_max: {r_max:.1f} \tr_min: {r_min: .1f}")
    
# Register env_creator under the name "CybORG" so a trainer can be constructed
# with env="CybORG".
register_env(name="CybORG", env_creator=env_creator)

In [4]:
"""
Deep Q-Networks (DQN, Rainbow, Parametric DQN)
==============================================

This file defines the distributed Trainer class for the Deep Q-Networks
algorithm. See `dqn_[tf|torch]_policy.py` for the definition of the policies.

Detailed documentation:
https://docs.ray.io/en/master/rllib-algorithms.html#deep-q-networks-dqn-rainbow-parametric-dqn
"""  # noqa: E501

import logging
from typing import List, Optional, Type

from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
from ray.rllib.agents.dqn.dqn_torch_policy import DQNTorchPolicy
from ray.rllib.agents.dqn.simple_q import (
    SimpleQTrainer,
    DEFAULT_CONFIG as SIMPLEQ_DEFAULT_CONFIG,
)
from ray.rllib.agents.trainer import Trainer
from ray.rllib.evaluation.worker_set import WorkerSet
from ray.rllib.execution.concurrency_ops import Concurrently
from ray.rllib.execution.metric_ops import StandardMetricsReporting
from ray.rllib.execution.replay_ops import Replay, StoreToReplayBuffer
from ray.rllib.execution.rollout_ops import ParallelRollouts
from ray.rllib.execution.train_ops import (
    TrainOneStep,
    UpdateTargetNetwork,
    MultiGPUTrainOneStep,
)
from ray.rllib.policy.policy import Policy
from ray.rllib.utils.annotations import override
from ray.rllib.utils.deprecation import Deprecated
from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY
from ray.rllib.utils.typing import TrainerConfigDict
from ray.util.iter import LocalIterator

# Module-level logger.
logger = logging.getLogger(__name__)

# fmt: off
# __sphinx_doc_begin__
# DQN defaults: the SimpleQ defaults merged with the DQN-specific settings below.
DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    SIMPLEQ_DEFAULT_CONFIG,
    {
        # === Model ===
        # Number of atoms used to represent the distribution of returns. When
        # this is greater than 1, distributional Q-learning is used and the
        # discrete support is bounded by v_min and v_max.
        "num_atoms": 1,
        "v_min": -10.0,
        "v_max": 10.0,
        # Whether to use noisy networks.
        "noisy": False,
        # Controls the initial value of the noisy nets.
        "sigma0": 0.5,
        # Whether to use dueling DQN.
        "dueling": True,
        # Dense-layer setup for both the advantage branch and the value branch
        # in a dueling architecture.
        "hiddens": [256],
        # Whether to use double DQN.
        "double_q": True,
        # N-step Q-learning.
        "n_step": 1,

        # === Prioritized replay buffer ===
        # If True, a prioritized replay buffer will be used.
        "prioritized_replay": True,
        # Alpha parameter for the prioritized replay buffer.
        "prioritized_replay_alpha": 0.6,
        # Beta parameter for sampling from the prioritized replay buffer.
        "prioritized_replay_beta": 0.4,
        # Final value of beta (by default, we use constant beta=0.4).
        "final_prioritized_replay_beta": 0.4,
        # Time steps over which the beta parameter is annealed.
        "prioritized_replay_beta_annealing_timesteps": 20000,
        # Epsilon to add to the TD errors when updating priorities.
        "prioritized_replay_eps": 1e-6,

        # Callback to run before learning on a multi-agent batch of
        # experiences.
        "before_learn_on_batch": None,

        # The intensity with which to update the model (vs collecting samples
        # from the env). If None, uses the "natural" value of:
        # `train_batch_size` / (`rollout_fragment_length` x `num_workers` x
        # `num_envs_per_worker`).
        # If provided, will make sure that the ratio between ts inserted into
        # and sampled from the buffer matches the given value.
        # Example:
        #   training_intensity=1000.0
        #   train_batch_size=250 rollout_fragment_length=1
        #   num_workers=1 (or 0) num_envs_per_worker=1
        #   -> natural value = 250 / 1 = 250.0
        #   -> will make sure that replay+train op will be executed 4x as
        #      often as rollout+insert op (4 * 250 = 1000).
        # See `calculate_rr_weights` in this file for further details.
        "training_intensity": None,

        # === Parallelism ===
        # Whether to compute priorities on workers.
        "worker_side_prioritization": False,
    },
    _allow_unknown_configs=True,
)
# __sphinx_doc_end__
# fmt: on


def calculate_rr_weights(config: TrainerConfigDict) -> List[float]:
    """Calculate the round-robin weights for the rollout and train steps.

    Args:
        config: Trainer config. Reads ``training_intensity`` and, when that is
            set, ``train_batch_size``, ``rollout_fragment_length``,
            ``num_envs_per_worker`` and ``num_workers``.

    Returns:
        ``[1, 1]`` when ``training_intensity`` is unset (None or 0), otherwise
        ``[1, training_intensity / native_ratio]`` where ``native_ratio`` is
        the train batch size divided by the number of freshly sampled steps.
    """
    if not config["training_intensity"]:
        return [1, 1]

    # Calculate the "native ratio" as:
    # [train-batch-size] / [size of env-rolled-out sampled data]
    # This is to set freshly rollout-collected data in relation to
    # the data we pull from the replay buffer (which also contains old
    # samples).
    # `num_workers` may legitimately be 0; count it as one sampler so the
    # ratio matches the "num_workers=1 (or 0)" example in DEFAULT_CONFIG and
    # no ZeroDivisionError is raised.
    sampling_workers = max(config["num_workers"], 1)
    native_ratio = config["train_batch_size"] / (
        config["rollout_fragment_length"]
        * config["num_envs_per_worker"]
        * sampling_workers
    )

    # Training intensity is specified in terms of
    # (steps_replayed / steps_sampled), so adjust for the native ratio.
    return [1, config["training_intensity"] / native_ratio]


class DQNTrainer_Offline(SimpleQTrainer):
    """DQN trainer variant that always uses the ``DQNTFPolicyOffline`` policy.

    Uses the ``DEFAULT_CONFIG`` defined in this file as its default config and
    builds an execution plan that alternates between storing rollouts in a
    local replay buffer and training on replayed batches.

    NOTE(review): nothing in this class itself reads offline data; the
    "Offline" in the name presumably refers to how it is configured or used
    elsewhere -- confirm.
    """

    @classmethod
    @override(SimpleQTrainer)
    def get_default_config(cls) -> TrainerConfigDict:
        """Return the module-level ``DEFAULT_CONFIG``."""
        return DEFAULT_CONFIG

    @override(SimpleQTrainer)
    def validate_config(self, config: TrainerConfigDict) -> None:
        """Validate ``config`` and raise the rollout length to at least ``n_step``.

        Mutates ``config["rollout_fragment_length"]`` in place.
        """
        # Call super's validation method.
        super().validate_config(config)

        # Update effective batch size to include n-step
        adjusted_rollout_len = max(config["rollout_fragment_length"], config["n_step"])
        config["rollout_fragment_length"] = adjusted_rollout_len

    @override(SimpleQTrainer)
    def get_default_policy_class(
        self, config: TrainerConfigDict
    ) -> Optional[Type[Policy]]:
        """Return ``DQNTFPolicyOffline``; ``config`` is not consulted."""
        return DQNTFPolicyOffline

    @staticmethod
    @override(SimpleQTrainer)
    def execution_plan(
        workers: WorkerSet, config: TrainerConfigDict, **kwargs
    ) -> LocalIterator[dict]:
        """Build the training iterator: store rollouts, then replay and train.

        Args:
            workers: The worker set used for rollouts and training.
            config: The trainer config.
            **kwargs: Must contain ``local_replay_buffer``.

        Returns:
            The iterator produced by ``StandardMetricsReporting`` over the
            round-robin combination of the store and replay/train operations.

        Raises:
            AssertionError: If ``local_replay_buffer`` is missing from
                ``kwargs``.
        """
        assert (
            "local_replay_buffer" in kwargs
        ), "DQN's execution plan requires a local replay buffer."

        # Assign to Trainer, so we can store the MultiAgentReplayBuffer's
        # data when we save checkpoints.
        local_replay_buffer = kwargs["local_replay_buffer"]

        rollouts = ParallelRollouts(workers, mode="bulk_sync")
    
        # We execute the following steps concurrently:
        # (1) Generate rollouts and store them in our local replay buffer.
        # Calling next() on store_op drives this.
        store_op = rollouts.for_each(
            StoreToReplayBuffer(local_buffer=local_replay_buffer)
        )

        def update_prio(item):
            """Push new TD-error based priorities into the replay buffer.

            ``item`` is a ``(samples, info_dict)`` pair; ``info_dict`` is
            returned unchanged. Priorities are only updated when
            ``prioritized_replay`` is enabled in the config.
            """
            samples, info_dict = item
            if config.get("prioritized_replay"):
                prio_dict = {}
                for policy_id, info in info_dict.items():
                    # TODO(sven): This is currently structured differently for
                    #  torch/tf. Clean up these results/info dicts across
                    #  policies (note: fixing this in torch_policy.py will
                    #  break e.g. DDPPO!).
                    # Note: the fallback expression is evaluated eagerly, so
                    # `info` must contain LEARNER_STATS_KEY even when a
                    # top-level "td_error" entry is present.
                    td_error = info.get(
                        "td_error", info[LEARNER_STATS_KEY].get("td_error")
                    )
                    samples.policy_batches[policy_id].set_get_interceptor(None)
                    batch_indices = samples.policy_batches[policy_id].get(
                        "batch_indexes"
                    )
                    # In case the buffer stores sequences, TD-error could
                    # already be calculated per sequence chunk.
                    if len(batch_indices) != len(td_error):
                        T = local_replay_buffer.replay_sequence_length
                        assert (
                            len(batch_indices) > len(td_error)
                            and len(batch_indices) % T == 0
                        )
                        # Keep only the first index of every length-T chunk.
                        batch_indices = batch_indices.reshape([-1, T])[:, 0]
                        assert len(batch_indices) == len(td_error)
                    prio_dict[policy_id] = (batch_indices, td_error)
                local_replay_buffer.update_priorities(prio_dict)
            return info_dict

        # (2) Read and train on experiences from the replay buffer. Every batch
        # returned from the LocalReplay() iterator is passed to TrainOneStep to
        # take a SGD step, and then we decide whether to update the target
        # network.
        # Fall back to an identity function when no callback is configured.
        post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)

        if config["simple_optimizer"]:
            train_step_op = TrainOneStep(workers)
        else:
            train_step_op = MultiGPUTrainOneStep(
                workers=workers,
                sgd_minibatch_size=config["train_batch_size"],
                num_sgd_iter=1,
                num_gpus=config["num_gpus"],
                _fake_gpus=config["_fake_gpus"],
            )

        replay_op = (
            Replay(local_buffer=local_replay_buffer)
            .for_each(lambda x: post_fn(x, workers, config))
            .for_each(train_step_op)
            .for_each(update_prio)
            .for_each(
                UpdateTargetNetwork(workers, config["target_network_update_freq"])
            )
        )

        # Alternate deterministically between (1) and (2).
        # Only return the output of (2) since training metrics are not
        # available until (2) runs.
        train_op = Concurrently(
            [store_op, replay_op],
            mode="round_robin",
            output_indexes=[1],
            round_robin_weights=calculate_rr_weights(config),
        )
        return StandardMetricsReporting(train_op, workers, config)


@Deprecated(
    new="Sub-class directly from `DQNTrainer` and override its methods", error=False
)
class GenericOffPolicyTrainer(DQNTrainer_Offline):
    """Deprecated alias of ``DQNTrainer_Offline`` that adds no behaviour of its own."""

In [5]:
"""TensorFlow policy class used for DQN"""

from typing import Dict

import gym
import numpy as np
import ray
from ray.rllib.agents.dqn.distributional_q_tf_model import DistributionalQTFModel
from ray.rllib.agents.dqn.simple_q_tf_policy import TargetNetworkMixin
from ray.rllib.evaluation.postprocessing import adjust_nstep
from ray.rllib.models import ModelCatalog
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.tf_action_dist import Categorical
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tf_policy import LearningRateSchedule
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.exploration import ParameterNoise
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.numpy import convert_to_numpy
from ray.rllib.utils.tf_utils import (
    huber_loss,
    make_tf_callable,
    minimize_and_clip,
    reduce_mean_ignore_inf,
)
from ray.rllib.utils.typing import ModelGradients, TensorType, TrainerConfigDict

# TensorFlow handles as returned by RLlib's try_import_tf() helper.
tf1, tf, tfv = try_import_tf()

# Names passed to the model catalog for the online and the target Q-model.
Q_SCOPE = "q_func"
Q_TARGET_SCOPE = "target_q_func"

# Importance sampling weights for prioritized replay
PRIO_WEIGHTS = "weights"


class QLoss:
    """Q-learning loss for DQN, in scalar or distributional form.

    After construction the instance exposes:
        td_error: Per-sample error (cross-entropy in the distributional case,
            ``q_t_selected - target`` otherwise).
        loss: Importance-weighted mean loss.
        stats: Dict of summary tensors for reporting.
    """

    def __init__(
        self,
        q_t_selected: TensorType,
        q_logits_t_selected: TensorType,
        q_tp1_best: TensorType,
        q_dist_tp1_best: TensorType,
        importance_weights: TensorType,
        rewards: TensorType,
        done_mask: TensorType,
        gamma: float = 0.99,
        n_step: int = 1,
        num_atoms: int = 1,
        v_min: float = -10.0,
        v_max: float = 0.0,
    ):
        """Build the loss tensors.

        Args:
            q_t_selected: Q-values of the actions that were taken.
            q_logits_t_selected: Logits over atoms for the taken actions
                (only used when ``num_atoms > 1``).
            q_tp1_best: Q-values of the selected next-state actions
                (only used when ``num_atoms == 1``).
            q_dist_tp1_best: Atom probabilities of the selected next-state
                actions (only used when ``num_atoms > 1``).
            importance_weights: Per-sample weights applied to the loss.
            rewards: Rewards; cast to float32 before use.
            done_mask: 1.0 where the transition is terminal, else 0.0.
            gamma: Discount factor.
            n_step: Number of steps the discount is raised to.
            num_atoms: Number of support atoms; > 1 selects the
                distributional loss.
            v_min: Lower bound of the distributional support.
            v_max: Upper bound of the distributional support.
                NOTE(review): the default here is 0.0 whereas DEFAULT_CONFIG
                in this file uses 10.0; build_q_losses always passes the
                config value explicitly -- confirm the default is intended.
        """
        

        if num_atoms > 1:
            # Distributional Q-learning, which corresponds to a cross-entropy
            # loss between the projected target distribution and the
            # predicted logits.

            # Evenly spaced support from v_min to v_max.
            z = tf.range(num_atoms, dtype=tf.float32)
            z = v_min + z * (v_max - v_min) / float(num_atoms - 1)

            # (batch_size, 1) * (1, num_atoms) = (batch_size, num_atoms)
            r_tau = tf.expand_dims(tf.cast(rewards, tf.float32), -1) + gamma ** n_step * tf.expand_dims(
                1.0 - done_mask, -1
            ) * tf.expand_dims(z, 0)
            r_tau = tf.clip_by_value(r_tau, v_min, v_max)
            # Fractional atom index of each shifted support point.
            b = (r_tau - v_min) / ((v_max - v_min) / float(num_atoms - 1))
            lb = tf.floor(b)
            ub = tf.math.ceil(b)
            # indispensable judgement which is missed in most implementations
            # when b happens to be an integer, lb == ub, so pr_j(s', a*) will
            # be discarded because (ub-b) == (b-lb) == 0
            floor_equal_ceil = tf.cast(tf.less(ub - lb, 0.5), tf.float32)

            l_project = tf.one_hot(
                tf.cast(lb, dtype=tf.int32), num_atoms
            )  # (batch_size, num_atoms, num_atoms)
            u_project = tf.one_hot(
                tf.cast(ub, dtype=tf.int32), num_atoms
            )  # (batch_size, num_atoms, num_atoms)
            # Split each atom's probability mass between its lower and upper
            # neighbouring atom.
            ml_delta = q_dist_tp1_best * (ub - b + floor_equal_ceil)
            mu_delta = q_dist_tp1_best * (b - lb)
            ml_delta = tf.reduce_sum(l_project * tf.expand_dims(ml_delta, -1), axis=1)
            mu_delta = tf.reduce_sum(u_project * tf.expand_dims(mu_delta, -1), axis=1)
            m = ml_delta + mu_delta

            # Rainbow paper claims that using this cross entropy loss for
            # priority is robust and insensitive to `prioritized_replay_alpha`
            self.td_error = tf.nn.softmax_cross_entropy_with_logits(
                labels=m, logits=q_logits_t_selected
            )
            self.loss = tf.reduce_mean(
                self.td_error * tf.cast(importance_weights, tf.float32)
            )
            self.stats = {
                # TODO: better Q stats for dist dqn
                "mean_td_error": tf.reduce_mean(self.td_error),
            }
        else:
            # Zero out the bootstrap value for terminal transitions.
            q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

            # compute RHS of bellman equation
            q_t_selected_target = tf.cast(rewards, tf.float32) + gamma ** n_step * q_tp1_best_masked

            # compute the error; the target is treated as a constant and the
            # Huber loss is applied to the error below
            self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
            self.loss = tf.reduce_mean(
                tf.cast(importance_weights, tf.float32) * huber_loss(self.td_error)
            )
            self.stats = {
                "mean_q": tf.reduce_mean(q_t_selected),
                "min_q": tf.reduce_min(q_t_selected),
                "max_q": tf.reduce_max(q_t_selected),
                "mean_td_error": tf.reduce_mean(self.td_error),
            }


class ComputeTDErrorMixin:
    """Mixin that attaches a ``compute_td_error`` callable to the policy.

    The callable lets TD errors be computed from raw arrays, which is what
    worker-side prioritization needs.
    """

    def __init__(self):
        @make_tf_callable(self.get_session(), dynamic_shape=True)
        def compute_td_error(
            obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights
        ):
            # Pair every batch column with its raw input, then convert all of
            # them to tensors in one pass.
            raw_columns = (
                (SampleBatch.CUR_OBS, obs_t),
                (SampleBatch.ACTIONS, act_t),
                (SampleBatch.REWARDS, rew_t),
                (SampleBatch.NEXT_OBS, obs_tp1),
                (SampleBatch.DONES, done_mask),
                (PRIO_WEIGHTS, importance_weights),
            )
            tensor_batch = {
                column: tf.convert_to_tensor(value) for column, value in raw_columns
            }

            # Running the loss refreshes `self.q_loss`, whose TD error is the
            # quantity of interest here.
            build_q_losses(self, self.model, None, tensor_batch)
            return self.q_loss.td_error

        self.compute_td_error = compute_td_error


def build_q_model(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
) -> ModelV2:
    """Create the online Q-model and the target Q-model for DQN.

    Args:
        policy (Policy): The Policy that will use the models.
        obs_space (gym.spaces.Space): The policy's observation space.
        action_space (gym.spaces.Space): The policy's action space; must be
            ``gym.spaces.Discrete``.
        config (TrainerConfigDict): The trainer config. When ``hiddens`` is
            non-empty, ``config["model"]["no_final_linear"]`` is set to True.

    Returns:
        ModelV2: The online Q-model. The target model is not returned; it is
            stored on ``policy.target_model``.

    Raises:
        UnsupportedSpaceException: If ``action_space`` is not discrete.
    """
    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space)
        )

    model_config = config["model"]
    if config["hiddens"]:
        # Use the size of the last fcnet layer, falling back to 256 when the
        # list of fcnet layers is empty.
        num_outputs = ([256] + list(model_config["fcnet_hiddens"]))[-1]
        model_config["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    def _make_model(scope_name):
        """Build one distributional Q-model registered under ``scope_name``."""
        return ModelCatalog.get_model_v2(
            obs_space=obs_space,
            action_space=action_space,
            num_outputs=num_outputs,
            model_config=config["model"],
            framework="tf",
            model_interface=DistributionalQTFModel,
            name=scope_name,
            num_atoms=config["num_atoms"],
            dueling=config["dueling"],
            q_hiddens=config["hiddens"],
            use_noisy=config["noisy"],
            v_min=config["v_min"],
            v_max=config["v_max"],
            sigma0=config["sigma0"],
            # TODO(sven): Move option to add LayerNorm after each Dense
            #  generically into ModelCatalog.
            add_layer_norm=isinstance(
                getattr(policy, "exploration", None), ParameterNoise
            )
            or config["exploration_config"]["type"] == "ParameterNoise",
        )

    # Online network first, then the target network, both with identical
    # settings apart from their name.
    q_model = _make_model(Q_SCOPE)
    policy.target_model = _make_model(Q_TARGET_SCOPE)

    return q_model


def get_distribution_inputs_and_class(
    policy: Policy, model: ModelV2, input_dict: SampleBatch, *, explore=True, **kwargs
):
    """Compute Q-values and return them as inputs for a ``Categorical`` distribution.

    The Q-values are also stored on ``policy.q_values``.

    Returns:
        Tuple of (Q-values, ``Categorical``, empty state-out list).
    """
    outputs = compute_q_values(
        policy, model, input_dict, state_batches=None, explore=explore
    )
    # compute_q_values may hand back a tuple whose first entry holds the values.
    if isinstance(outputs, tuple):
        outputs = outputs[0]
    policy.q_values = outputs

    state_out = []
    return policy.q_values, Categorical, state_out


def build_q_losses(policy: Policy, model, _, train_batch: SampleBatch) -> TensorType:
    """Construct the DQN loss and store it on the policy.

    Besides returning the loss tensor, this sets ``policy.q_loss`` to the
    ``QLoss`` instance and caches the target model's variables on
    ``policy.target_q_func_vars`` the first time it runs.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The online Q-model.
        _: Unused positional argument.
        train_batch (SampleBatch): The training data.

    Returns:
        TensorType: A single loss tensor.
    """
    config = policy.config

    def _q_for(network, obs_column):
        """Evaluate ``network`` on one observation column of the batch."""
        return compute_q_values(
            policy,
            network,
            SampleBatch({"obs": train_batch[obs_column]}),
            state_batches=None,
            explore=False,
        )

    # Online network on the current observations.
    q_t, q_logits_t, q_dist_t, _state_t = _q_for(model, SampleBatch.CUR_OBS)

    # Target network on the next observations.
    q_tp1, q_logits_tp1, q_dist_tp1, _state_tp1 = _q_for(
        policy.target_model, SampleBatch.NEXT_OBS
    )
    if not hasattr(policy, "target_q_func_vars"):
        policy.target_q_func_vars = policy.target_model.variables()

    # Pick out the scores of the actions that were actually taken.
    taken_one_hot = tf.one_hot(
        tf.cast(train_batch[SampleBatch.ACTIONS], tf.int32), policy.action_space.n
    )
    q_t_selected = tf.reduce_sum(q_t * taken_one_hot, 1)
    q_logits_t_selected = tf.reduce_sum(
        q_logits_t * tf.expand_dims(taken_one_hot, -1), 1
    )

    # Choose the next-state action: with double-Q the online network picks it,
    # otherwise the target network does. Its value always comes from the
    # target network.
    if config["double_q"]:
        (
            q_tp1_using_online_net,
            q_logits_tp1_using_online_net,
            q_dist_tp1_using_online_net,
            _state_online,
        ) = _q_for(model, SampleBatch.NEXT_OBS)
        next_action = tf.argmax(q_tp1_using_online_net, 1)
    else:
        next_action = tf.argmax(q_tp1, 1)

    next_one_hot = tf.one_hot(next_action, policy.action_space.n)
    q_tp1_best = tf.reduce_sum(q_tp1 * next_one_hot, 1)
    q_dist_tp1_best = tf.reduce_sum(
        q_dist_tp1 * tf.expand_dims(next_one_hot, -1), 1
    )

    policy.q_loss = QLoss(
        q_t_selected,
        q_logits_t_selected,
        q_tp1_best,
        q_dist_tp1_best,
        train_batch[PRIO_WEIGHTS],
        train_batch[SampleBatch.REWARDS],
        tf.cast(train_batch[SampleBatch.DONES], tf.float32),
        config["gamma"],
        config["n_step"],
        config["num_atoms"],
        config["v_min"],
        config["v_max"],
    )

    return policy.q_loss.loss


def adam_optimizer(
    policy: Policy, config: TrainerConfigDict
) -> "tf.keras.optimizers.Optimizer":
    """Create the Adam optimizer matching the policy's TF framework mode.

    Uses the Keras optimizer for the "tf2"/"tfe" frameworks and the TF1
    ``AdamOptimizer`` otherwise. The learning rate is ``policy.cur_lr`` and
    epsilon is ``config["adam_epsilon"]``.
    """
    uses_keras = policy.config["framework"] in ["tf2", "tfe"]
    optimizer_cls = (
        tf.keras.optimizers.Adam if uses_keras else tf1.train.AdamOptimizer
    )
    return optimizer_cls(learning_rate=policy.cur_lr, epsilon=config["adam_epsilon"])


def clip_gradients(
    policy: Policy, optimizer: "tf.keras.optimizers.Optimizer", loss: TensorType
) -> ModelGradients:
    """Compute clipped gradients of ``loss`` with respect to the Q-model variables.

    The variable list is cached on ``policy.q_func_vars`` on first use and the
    clip value is read from ``policy.config["grad_clip"]``.
    """
    try:
        q_vars = policy.q_func_vars
    except AttributeError:
        # First call: remember the online model's variables on the policy.
        policy.q_func_vars = policy.model.variables()
        q_vars = policy.q_func_vars

    clip_value = policy.config["grad_clip"]
    return minimize_and_clip(optimizer, loss, var_list=q_vars, clip_val=clip_value)


def build_q_stats(policy: Policy, batch) -> Dict[str, TensorType]:
    """Return the learner stats: current learning rate plus the ``QLoss`` stats.

    ``batch`` is accepted for interface compatibility and is not read.
    """
    stats = {"cur_lr": tf.cast(policy.cur_lr, tf.float64)}
    stats.update(policy.q_loss.stats)
    return stats


def setup_mid_mixins(policy: Policy, obs_space, action_space, config) -> None:
    """Initialise the learning-rate schedule and TD-error mixins on ``policy``.

    ``obs_space`` and ``action_space`` are accepted but not used.
    """
    learning_rate, schedule = config["lr"], config["lr_schedule"]
    LearningRateSchedule.__init__(policy, learning_rate, schedule)
    ComputeTDErrorMixin.__init__(policy)


def setup_late_mixins(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
) -> None:
    """Initialise the target-network mixin on ``policy``."""
    init_args = (policy, obs_space, action_space, config)
    TargetNetworkMixin.__init__(*init_args)


def compute_q_values(
    policy: Policy,
    model: ModelV2,
    input_batch: SampleBatch,
    state_batches=None,
    seq_lens=None,
    explore=None,
    is_training: bool = False,
):
    """Run ``model`` on ``input_batch`` and derive Q-values from its outputs.

    Handles the four combinations of dueling / non-dueling and
    distributional (``num_atoms > 1``) / scalar Q-learning.

    Args:
        policy: Policy whose ``config`` selects the variant.
        model: The Q-model to evaluate.
        input_batch: Batch passed to the model.
        state_batches: Optional state inputs; an empty list is used when falsy.
        seq_lens: Optional sequence lengths passed to the model.
        explore: Not read by this function.
        is_training: Not read by this function.

    Returns:
        Tuple ``(value, logits, dist, state)``.
    """
    cfg = policy.config
    distributional = cfg["num_atoms"] > 1

    model_out, state = model(input_batch, state_batches or [], seq_lens)

    q_outputs = model.get_q_value_distributions(model_out)
    if distributional:
        action_scores, z, support_logits_per_action, logits, dist = q_outputs
    else:
        action_scores, logits, dist = q_outputs

    # Without a dueling head the raw action scores are the Q-values.
    if not cfg["dueling"]:
        return action_scores, logits, dist, state

    state_score = model.get_state_value(model_out)

    if not distributional:
        # Scalar dueling: state value plus mean-centred advantages.
        advantage_mean = reduce_mean_ignore_inf(action_scores, 1)
        centered_advantages = action_scores - tf.expand_dims(advantage_mean, 1)
        return state_score + centered_advantages, logits, dist, state

    # Distributional dueling: centre the per-action support logits, add the
    # state score, then turn them into probabilities over the support.
    logits_mean = tf.reduce_mean(support_logits_per_action, 1)
    centered_logits = support_logits_per_action - tf.expand_dims(logits_mean, 1)
    logits = tf.expand_dims(state_score, 1) + centered_logits
    dist = tf.nn.softmax(logits=logits)
    value = tf.reduce_sum(input_tensor=z * dist, axis=-1)
    return value, logits, dist, state


def postprocess_nstep_and_prio(
    policy: Policy, batch: SampleBatch, other_agent=None, episode=None
) -> SampleBatch:
    """Apply n-step adjustment and prioritization weights to a sample batch.

    Args:
        policy: Policy whose config supplies ``n_step``, ``gamma``,
            ``worker_side_prioritization`` and ``prioritized_replay_eps``.
        batch: The batch to post-process; it is modified in place.
        other_agent: Not read by this function.
        episode: Not read by this function.

    Returns:
        The same ``batch`` object.
    """
    cfg = policy.config

    # N-step Q adjustments.
    n_step = cfg["n_step"]
    if n_step > 1:
        adjust_nstep(n_step, cfg["gamma"], batch)

    # Fill in weights of 1.0 when the batch carries none.
    if PRIO_WEIGHTS not in batch:
        batch[PRIO_WEIGHTS] = np.ones_like(batch[SampleBatch.REWARDS])

    # Nothing more to do unless worker-side prioritization applies.
    if batch.count <= 0 or not cfg["worker_side_prioritization"]:
        return batch

    td_inputs = (
        SampleBatch.OBS,
        SampleBatch.ACTIONS,
        SampleBatch.REWARDS,
        SampleBatch.NEXT_OBS,
        SampleBatch.DONES,
        PRIO_WEIGHTS,
    )
    td_errors = policy.compute_td_error(*[batch[column] for column in td_inputs])
    batch[PRIO_WEIGHTS] = (
        np.abs(convert_to_numpy(td_errors)) + cfg["prioritized_replay_eps"]
    )
    return batch


# TF policy class assembled from the functions defined above via RLlib's
# build_tf_policy template.
DQNTFPolicyOffline = build_tf_policy(
    name="DQNTFPolicyOffline",
    # Note: this points at RLlib's stock DQN DEFAULT_CONFIG, not at the
    # DEFAULT_CONFIG defined earlier in this file.
    get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
    make_model=build_q_model,
    action_distribution_fn=get_distribution_inputs_and_class,
    loss_fn=build_q_losses,
    stats_fn=build_q_stats,
    postprocess_fn=postprocess_nstep_and_prio,
    optimizer_fn=adam_optimizer,
    compute_gradients_fn=clip_gradients,
    # Expose the Q-values computed during action selection.
    extra_action_out_fn=lambda policy: {"q_values": policy.q_values},
    # Expose the TD error of the most recently built loss.
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.q_loss.td_error},
    before_loss_init=setup_mid_mixins,
    after_init=setup_late_mixins,
    mixins=[
        TargetNetworkMixin,
        ComputeTDErrorMixin,
        LearningRateSchedule,
    ],
)

In [6]:
from ray.rllib.agents.dqn import dqn
# Show a (shallow) copy of RLlib's stock DQN defaults as the cell output.
dqn.DEFAULT_CONFIG.copy()

{'num_workers': 0,
 'num_envs_per_worker': 1,
 'create_env_on_driver': False,
 'rollout_fragment_length': 4,
 'batch_mode': 'truncate_episodes',
 'gamma': 0.99,
 'lr': 0.0005,
 'train_batch_size': 32,
 'model': {'_use_default_native_models': False,
  '_disable_preprocessor_api': False,
  '_disable_action_flattening': False,
  'fcnet_hiddens': [256, 256],
  'fcnet_activation': 'tanh',
  'conv_filters': None,
  'conv_activation': 'relu',
  'post_fcnet_hiddens': [],
  'post_fcnet_activation': 'relu',
  'free_log_std': False,
  'no_final_linear': False,
  'vf_share_layers': True,
  'use_lstm': False,
  'max_seq_len': 20,
  'lstm_cell_size': 256,
  'lstm_use_prev_action': False,
  'lstm_use_prev_reward': False,
  '_time_major': False,
  'use_attention': False,
  'attention_num_transformer_units': 1,
  'attention_dim': 64,
  'attention_num_heads': 1,
  'attention_head_dim': 32,
  'attention_memory_inference': 50,
  'attention_memory_training': 50,
  'attention_position_wise_mlp_dim': 32,
  '

In [None]:
from ray.rllib.agents.dqn import dqn
import ray

# One entry per training run; each entry is that run's list of per-iteration
# mean episode rewards.
all_rewards = []


# Start from RLlib's DQN defaults and override the settings for this experiment.
dqn_config = dqn.DEFAULT_CONFIG.copy()
dqn_config.update({
    "num_gpus": 1,
    "num_workers": 20,
    "explore": True,
    "framework": "tf",
    "horizon": 100,
    'lr': .00002,
    'train_batch_size': 64,
    'double_q': True,
    'dueling': True,
    'num_atoms': 51,
    'target_network_update_freq': 2000,
    'rollout_fragment_length': 8,
    'replay_buffer_config': {
        '_enable_replay_buffer_api': True,
        'type': 'MultiAgentPrioritizedReplayBuffer',
        'capacity': 50000,
        'prioritized_replay_alpha': 0.6,
        'prioritized_replay_beta': 0.4,
        'prioritized_replay_eps': 1e-06,
        'replay_sequence_length': 1,
    },
    "gamma": 0.9,
    'hiddens': [512],
    "model": {
        "fcnet_hiddens": [512],
        "fcnet_activation": "relu",
    },
    # Evaluation scheduling options are top-level trainer settings.
    # ``evaluation_config`` only carries config overrides for the evaluation
    # workers, so nesting these keys inside it left evaluation switched off.
    'evaluation_parallel_to_training': True,
    'evaluation_num_workers': 2,
    'evaluation_interval': 1,
})
rewards = []
trainer = DQNTrainer_Offline(config=dqn_config, env="CybORG")

# Run the training iterations, printing a summary line and recording the mean
# episode reward after each one.
for i in range(200000):
    results_dict = trainer.train()
    print_results(results_dict)
    rewards.append(results_dict["episode_reward_mean"])
all_rewards.append(rewards)
# NOTE(review): `n` (the output file stem) is not defined in this cell; it must
# be set elsewhere in the notebook before this line runs - confirm.
np.save(n+'.npy', rewards)
trainer.stop()

[2m[36m(RolloutWorker pid=36851)[0m   "class": algorithms.Blowfish,
[2m[36m(RolloutWorker pid=36853)[0m   "class": algorithms.Blowfish,
[2m[36m(RolloutWorker pid=36858)[0m   "class": algorithms.Blowfish,
[2m[36m(RolloutWorker pid=36856)[0m   "class": algorithms.Blowfish,
[2m[36m(RolloutWorker pid=36860)[0m   "class": algorithms.Blowfish,
[2m[36m(RolloutWorker pid=36863)[0m   "class": algorithms.Blowfish,
[2m[36m(RolloutWorker pid=36910)[0m   "class": algorithms.Blowfish,
[2m[36m(RolloutWorker pid=36882)[0m   "class": algorithms.Blowfish,
[2m[36m(RolloutWorker pid=36938)[0m   "class": algorithms.Blowfish,
[2m[36m(RolloutWorker pid=36977)[0m   "class": algorithms.Blowfish,
[2m[36m(RolloutWorker pid=37008)[0m   "class": algorithms.Blowfish,
[2m[36m(RolloutWorker pid=37090)[0m   "class": algorithms.Blowfish,
[2m[36m(RolloutWorker pid=37064)[0m   "class": algorithms.Blowfish,
[2m[36m(RolloutWorker pid=37130)[0m   "class": algorithms.Blowfish,
[2m[

[2m[36m(RolloutWorker pid=37272)[0m   "class": algorithms.Blowfish,
[2m[36m(RolloutWorker pid=37286)[0m   "class": algorithms.Blowfish,
2022-06-27 17:36:57,202	INFO trainable.py:159 -- Trainable.setup took 16.111 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


   1 	r_mean: nan 	r_max: nan 	r_min:  nan
   2 	r_mean: -203.4 	r_max: -154.5 	r_min: -273.3
   3 	r_mean: -203.4 	r_max: -154.5 	r_min: -273.3
   4 	r_mean: -200.3 	r_max: -129.6 	r_min: -273.3
   5 	r_mean: -200.3 	r_max: -129.6 	r_min: -273.3
   6 	r_mean: -204.9 	r_max: -129.6 	r_min: -283.8
   7 	r_mean: -204.9 	r_max: -129.6 	r_min: -283.8
   8 	r_mean: -204.9 	r_max: -127.3 	r_min: -283.8
   9 	r_mean: -204.0 	r_max: -127.1 	r_min: -283.8
  10 	r_mean: -204.0 	r_max: -127.1 	r_min: -283.8
  11 	r_mean: -205.8 	r_max: -127.1 	r_min: -283.8
  12 	r_mean: -205.8 	r_max: -127.1 	r_min: -283.8
  13 	r_mean: -210.8 	r_max: -125.8 	r_min: -283.8
  14 	r_mean: -210.8 	r_max: -125.8 	r_min: -283.8
  15 	r_mean: -210.2 	r_max: -125.8 	r_min: -283.8
  16 	r_mean: -210.2 	r_max: -125.8 	r_min: -283.8
  17 	r_mean: -211.9 	r_max: -110.7 	r_min: -283.8
  18 	r_mean: -210.6 	r_max: -110.7 	r_min: -283.8
  19 	r_mean: -210.6 	r_max: -110.7 	r_min: -283.8
  20 	r_mean: -210.5 	r_max: -110.7 	r_

 165 	r_mean: -76.2 	r_max: -67.3 	r_min: -84.2
 166 	r_mean: -76.2 	r_max: -67.3 	r_min: -84.2
 167 	r_mean: -75.5 	r_max: -67.3 	r_min: -84.2
 168 	r_mean: -74.8 	r_max: -62.9 	r_min: -84.2
 169 	r_mean: -74.8 	r_max: -62.9 	r_min: -84.2
 170 	r_mean: -73.9 	r_max: -62.9 	r_min: -84.2
 171 	r_mean: -73.9 	r_max: -62.9 	r_min: -84.2
 172 	r_mean: -73.1 	r_max: -57.4 	r_min: -149.3
 173 	r_mean: -73.1 	r_max: -57.4 	r_min: -149.3
 174 	r_mean: -70.1 	r_max: -57.1 	r_min: -149.3
 175 	r_mean: -67.6 	r_max: -55.1 	r_min: -149.3
 176 	r_mean: -67.6 	r_max: -55.1 	r_min: -149.3
 177 	r_mean: -64.9 	r_max: -51.1 	r_min: -149.3
 178 	r_mean: -64.9 	r_max: -51.1 	r_min: -149.3
 179 	r_mean: -62.7 	r_max: -51.1 	r_min: -149.3
 180 	r_mean: -62.7 	r_max: -51.1 	r_min: -149.3
 181 	r_mean: -60.5 	r_max: -48.9 	r_min: -124.0
 182 	r_mean: -60.5 	r_max: -48.9 	r_min: -124.0
 183 	r_mean: -60.4 	r_max: -48.9 	r_min: -124.0
 184 	r_mean: -60.9 	r_max: -48.9 	r_min: -124.0
 185 	r_mean: -60.9 	r_max:

 332 	r_mean: -102.9 	r_max: -45.6 	r_min: -223.5
 333 	r_mean: -99.2 	r_max: -45.4 	r_min: -223.5
 334 	r_mean: -99.5 	r_max: -45.4 	r_min: -223.5
 335 	r_mean: -99.5 	r_max: -45.4 	r_min: -223.5
 336 	r_mean: -97.2 	r_max: -45.4 	r_min: -223.5
 337 	r_mean: -97.2 	r_max: -45.4 	r_min: -223.5
 338 	r_mean: -103.1 	r_max: -45.4 	r_min: -216.4
 339 	r_mean: -103.1 	r_max: -45.4 	r_min: -216.4
 340 	r_mean: -100.7 	r_max: -45.4 	r_min: -216.4
 341 	r_mean: -100.7 	r_max: -45.4 	r_min: -216.4
 342 	r_mean: -107.8 	r_max: -45.5 	r_min: -218.2
 343 	r_mean: -107.3 	r_max: -45.5 	r_min: -219.2
 344 	r_mean: -107.3 	r_max: -45.5 	r_min: -219.2
 345 	r_mean: -111.1 	r_max: -45.5 	r_min: -224.2
 346 	r_mean: -111.1 	r_max: -45.5 	r_min: -224.2
 347 	r_mean: -105.5 	r_max: -31.7 	r_min: -224.2
 348 	r_mean: -105.5 	r_max: -31.7 	r_min: -224.2
 349 	r_mean: -106.6 	r_max: -31.7 	r_min: -224.2
 350 	r_mean: -102.2 	r_max: -31.7 	r_min: -224.2
 351 	r_mean: -102.2 	r_max: -31.7 	r_min: -224.2
 352 

 497 	r_mean: -92.0 	r_max: -27.5 	r_min: -222.2
 498 	r_mean: -92.0 	r_max: -27.5 	r_min: -222.2
 499 	r_mean: -97.8 	r_max: -32.7 	r_min: -227.4
 500 	r_mean: -95.0 	r_max: -32.7 	r_min: -227.4
 501 	r_mean: -95.0 	r_max: -32.7 	r_min: -227.4
 502 	r_mean: -100.5 	r_max: -33.6 	r_min: -227.4
 503 	r_mean: -100.5 	r_max: -33.6 	r_min: -227.4
 504 	r_mean: -101.6 	r_max: -33.6 	r_min: -227.4
 505 	r_mean: -101.6 	r_max: -33.6 	r_min: -227.4
 506 	r_mean: -102.6 	r_max: -31.0 	r_min: -227.4
 507 	r_mean: -102.6 	r_max: -31.0 	r_min: -227.4
 508 	r_mean: -102.6 	r_max: -28.4 	r_min: -222.6
 509 	r_mean: -103.7 	r_max: -28.4 	r_min: -222.6
 510 	r_mean: -103.7 	r_max: -28.4 	r_min: -222.6
 511 	r_mean: -98.1 	r_max: -28.4 	r_min: -209.7
 512 	r_mean: -98.1 	r_max: -28.4 	r_min: -209.7
 513 	r_mean: -96.5 	r_max: -28.4 	r_min: -209.7
 514 	r_mean: -96.5 	r_max: -28.4 	r_min: -209.7
 515 	r_mean: -96.8 	r_max: -28.4 	r_min: -214.1
 516 	r_mean: -96.8 	r_max: -28.4 	r_min: -214.1
 517 	r_mea

 663 	r_mean: -99.7 	r_max: -4.7 	r_min: -232.5
 664 	r_mean: -99.7 	r_max: -4.7 	r_min: -232.5
 665 	r_mean: -102.3 	r_max: -4.7 	r_min: -232.5
 666 	r_mean: -102.3 	r_max: -4.7 	r_min: -232.5
 667 	r_mean: -101.4 	r_max: -4.7 	r_min: -232.5
 668 	r_mean: -106.4 	r_max: -4.7 	r_min: -221.5
 669 	r_mean: -106.4 	r_max: -4.7 	r_min: -221.5
 670 	r_mean: -109.3 	r_max: -20.7 	r_min: -223.8
 671 	r_mean: -109.3 	r_max: -20.7 	r_min: -223.8
 672 	r_mean: -104.2 	r_max: -20.7 	r_min: -223.8
 673 	r_mean: -104.2 	r_max: -20.7 	r_min: -223.8
 674 	r_mean: -103.4 	r_max: -22.6 	r_min: -223.8
 675 	r_mean: -104.9 	r_max: -22.6 	r_min: -228.5
 676 	r_mean: -104.9 	r_max: -22.6 	r_min: -228.5
 677 	r_mean: -106.2 	r_max: -22.6 	r_min: -228.5
 678 	r_mean: -106.2 	r_max: -22.6 	r_min: -228.5
 679 	r_mean: -105.9 	r_max: -22.6 	r_min: -228.5
 680 	r_mean: -105.9 	r_max: -22.6 	r_min: -228.5
 681 	r_mean: -108.5 	r_max: -22.6 	r_min: -228.5
 682 	r_mean: -108.5 	r_max: -22.6 	r_min: -228.5
 683 	r_m

 830 	r_mean: -99.6 	r_max: -32.4 	r_min: -217.8
 831 	r_mean: -101.5 	r_max: -32.4 	r_min: -217.8
 832 	r_mean: -101.5 	r_max: -32.4 	r_min: -217.8
 833 	r_mean: -104.6 	r_max: -32.4 	r_min: -217.8
 834 	r_mean: -107.8 	r_max: -30.8 	r_min: -217.8
 835 	r_mean: -107.8 	r_max: -30.8 	r_min: -217.8
 836 	r_mean: -106.2 	r_max: -30.8 	r_min: -217.8
 837 	r_mean: -106.2 	r_max: -30.8 	r_min: -217.8
 838 	r_mean: -109.4 	r_max: -30.8 	r_min: -226.2
 839 	r_mean: -109.4 	r_max: -30.8 	r_min: -226.2
 840 	r_mean: -109.5 	r_max: -30.8 	r_min: -226.2
 841 	r_mean: -109.5 	r_max: -30.8 	r_min: -226.2
 842 	r_mean: -110.0 	r_max: -30.8 	r_min: -226.2
 843 	r_mean: -110.2 	r_max: -34.8 	r_min: -230.6
 844 	r_mean: -110.2 	r_max: -34.8 	r_min: -230.6
 845 	r_mean: -115.7 	r_max: -31.7 	r_min: -230.6
 846 	r_mean: -115.7 	r_max: -31.7 	r_min: -230.6
 847 	r_mean: -117.8 	r_max: -31.7 	r_min: -230.6
 848 	r_mean: -117.8 	r_max: -31.7 	r_min: -230.6
 849 	r_mean: -117.6 	r_max: -31.7 	r_min: -230.6
 

 996 	r_mean: -114.1 	r_max: -34.2 	r_min: -227.4
 997 	r_mean: -116.4 	r_max: -38.2 	r_min: -228.8
 998 	r_mean: -116.4 	r_max: -38.2 	r_min: -228.8
 999 	r_mean: -113.4 	r_max: -38.2 	r_min: -229.5
1000 	r_mean: -112.1 	r_max: -41.1 	r_min: -229.5
1001 	r_mean: -112.1 	r_max: -41.1 	r_min: -229.5
1002 	r_mean: -107.5 	r_max: -40.3 	r_min: -229.5
1003 	r_mean: -107.5 	r_max: -40.3 	r_min: -229.5
1004 	r_mean: -98.0 	r_max: -29.2 	r_min: -229.5
1005 	r_mean: -98.0 	r_max: -29.2 	r_min: -229.5
1006 	r_mean: -92.0 	r_max: -22.8 	r_min: -229.5
1007 	r_mean: -92.0 	r_max: -22.8 	r_min: -229.5
1008 	r_mean: -86.4 	r_max: -22.8 	r_min: -206.8
1009 	r_mean: -78.3 	r_max: -22.8 	r_min: -206.8
1010 	r_mean: -78.3 	r_max: -22.8 	r_min: -206.8
1011 	r_mean: -77.5 	r_max: -22.8 	r_min: -202.1
1012 	r_mean: -77.5 	r_max: -22.8 	r_min: -202.1
1013 	r_mean: -77.3 	r_max: -22.8 	r_min: -202.1
1014 	r_mean: -77.3 	r_max: -22.8 	r_min: -202.1
1015 	r_mean: -75.8 	r_max: -26.6 	r_min: -202.1
1016 	r_mean

In [None]:
# Stop the trainer manually, e.g. after interrupting a training loop before it
# reached its own trainer.stop() call.
trainer.stop()

In [None]:

import matplotlib.pyplot as plt

# Draw the first two reward curves held in `al` on one set of axes.
# NOTE(review): `al` is not defined in the visible cells; presumably a list of
# per-run reward lists - confirm.
fig = plt.figure(figsize=(16, 12))
ax1 = fig.add_subplot(1, 1, 1)
for curve_index, curve_label in enumerate(("GA Data", "PPO Data")):
    ax1.plot(al[curve_index], label=curve_label)
ax1.legend(loc='lower right')
ax1.set_title('Offline DQN')
ax1.set_xlabel('Batch')
ax1.set_ylabel('Reward')
plt.show()

In [None]:
# Run 50,000 more training iterations on the existing `trainer`, printing a
# summary and recording the mean episode reward after each one.
for i in range(int(50000)):
    results_dict=trainer.train()
    print_results(results_dict)
    rewards.append(results_dict["episode_reward_mean"])
# NOTE(review): `rewards` is extended in place above, so if an earlier cell
# already appended this same list to `all_rewards`, this adds a second
# reference to the identical list rather than a new curve - confirm intent.
all_rewards.append(rewards)

In [None]:
import matplotlib.pyplot as plt

# Plot the entry at index 3 of `rewards` on a single set of axes.
# NOTE(review): this assumes `rewards` holds one sequence per run at this
# point - confirm against the cell that populates it.
fig = plt.figure(figsize=(16, 12))
ax1 = fig.add_subplot(1, 1, 1)
ax1.plot(rewards[3], label="")
ax1.legend(loc='lower right')
ax1.set_title('Store Size (B_lineAgent)')
ax1.set_xlabel('Batch')
ax1.set_ylabel('Reward')
plt.show()

In [None]:
import subprocess
# List the "8_data_s" folder inside the directory returned by
# ray._private.utils.get_user_temp_dir(); the completed process is shown as
# the cell output.
listing_dir = os.path.join(ray._private.utils.get_user_temp_dir(), "8_data_s")
result = subprocess.run(['ls', listing_dir], stdout=subprocess.PIPE)
result

In [None]:
import os
import ray
import zipfile

# Unpack the dataset archive (path relative to the working directory) into a
# folder at the filesystem root.
archive_stem = "8_data_s_smallish"
with zipfile.ZipFile(archive_stem + '.zip', mode='r') as zip_ref:
    zip_ref.extractall(path="/8_data_s_smallish")

In [None]:
import subprocess
# Count the entries `ls` reports for the extracted dataset folder.
result = subprocess.run(['ls', "/8_data_s_smallish"], stdout=subprocess.PIPE)
# Decode the raw bytes instead of slicing the repr of the bytes object, which
# mis-splits names containing escaped characters. Empty pieces (after the
# final newline, or when the listing is empty) are not files.
len([entry for entry in os.fsdecode(result.stdout).split('\n') if entry])

In [None]:
import subprocess
# List /85_data_a; `result` is kept as a global because later cells read
# `result.stdout` again.
result = subprocess.run(['ls', "/85_data_a"], stdout=subprocess.PIPE)
# Decode the raw bytes instead of slicing the repr of the bytes object: the
# repr escapes special characters, which would yield paths that do not exist.
listed_names = [
    entry for entry in os.fsdecode(result.stdout).split('\n') if entry
]
# Thin the dataset by deleting every other file (even positions in the listing).
for s in listed_names[::2]:
    os.remove(os.path.join("/85_data_a", s))

In [None]:
# Delete every file named in the `ls` listing held in `result`.
# The names are decoded from the raw bytes; splitting the repr of the bytes
# object left a trailing "'" token that was then passed to os.remove.
for file in os.fsdecode(result.stdout).split('\n'):
    if not file:
        # Empty piece after the final newline (or an empty listing).
        continue
    try:
        os.remove(os.path.join("/85_data_a", file))
    except FileNotFoundError:
        # The listing can be stale: another cell may already have deleted
        # some of these files since `ls` was run.
        continue

In [None]:
# Number of file names in the `ls` listing held in `result`. Decoding the bytes
# and dropping empty pieces avoids counting the trailing "'" token that
# splitting the bytes repr produced (an off-by-one).
len([entry for entry in os.fsdecode(result.stdout).split('\n') if entry])

In [None]:
import subprocess
import json
import os

# Copy the "value" field of every JSON file in /ppo into a same-named file in
# /ppo2, counting the inputs that cannot be parsed.

# The output directory must exist before any file is written into it.
os.makedirs('/ppo2', exist_ok=True)

removed = 0
result = subprocess.run(['ls', "/ppo"], stdout=subprocess.PIPE)
# Decode the raw bytes rather than slicing the repr of the bytes object.
names = [entry for entry in os.fsdecode(result.stdout).split('\n') if entry]
for name in names:
    try:
        # `with` closes each input file; the original leaked one handle per file.
        with open(os.path.join("/ppo", name)) as f:
            obj = json.load(f)
    except ValueError:
        # Not valid JSON (json.JSONDecodeError is a ValueError). The file is
        # only counted here; deleting it from /ppo is left as a manual step.
        removed += 1
        continue
    # Look up the payload before opening the output so that a missing "value"
    # key does not leave an empty file behind.
    value = obj["value"]
    with open(os.path.join('/ppo2', name), 'w') as fp:
        json.dump(value, fp)
# Report against the real number of files; the original used the last loop
# index (one too few, and undefined for an empty directory).
print(f'Removed {removed} files, of {len(names)} files')

In [None]:
# Create the output directory; exist_ok makes the cell safe to re-run
# (os.mkdir raised FileExistsError on the second execution).
os.makedirs('/ppo2', exist_ok=True)