In [1]:
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Notebook used for debugging purposes to train
the DQL agent and then run it one step at a time.
"""

# pylint: disable=invalid-name

'Notebook used for debugging purpose to train the\nthe DQL agent and then run it one step at a time.\n'

In [2]:
import sys
import logging
import gym
import cyberbattle.agents.baseline.learner as learner
import cyberbattle.agents.baseline.agent_wrapper as w
import cyberbattle.agents.baseline.agent_dql as dqla
from cyberbattle.agents.baseline.agent_wrapper import ActionTrackingStateAugmentation, AgentWrapper, Verbosity
# Route log records to stdout and only report messages of severity ERROR
# or higher, each one prefixed with its level name.
logging.basicConfig(
    format="%(levelname)s: %(message)s",
    level=logging.ERROR,
    stream=sys.stdout,
)

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Default run parameters.
# Identifier of the gym environment to instantiate.
gymid = "CyberBattleTiny-v0"
# Number of training episodes passed to the learning run.
training_episode_count = 10
# Number of iterations per episode passed to the learning run.
iteration_count = 150

In [4]:
# Parameters
# Values re-assigned here replace the ones set by the previous cell
# (looks like a papermill-injected parameters cell — TODO confirm).
# training_episode_count is not re-assigned, so its earlier value stays in effect.
iteration_count = 150
gymid = "CyberBattleTiny-v0"


In [5]:
# Instantiate the gym environment selected by `gymid`.
ctf_env = gym.make(gymid)

# Environment bounds handed to the learner and to the state augmentation
# further down; built from the environment's identifiers together with the
# node and credential limits chosen for this notebook.
ep = w.EnvironmentBounds.of_identifiers(
    identifiers=ctf_env.identifiers,
    maximum_node_count=12,
    maximum_total_credentials=10,
)

In [6]:
# Train the Deep Q-learning agent with an epsilon-greedy search and keep the
# result of the run: its 'learner' entry is reused for the step-by-step
# replay later in the notebook.
dqn_learning_run = learner.epsilon_greedy_search(
    title="DQL",
    # --- environment ---
    cyberbattle_gym_env=ctf_env,
    environment_properties=ep,
    # --- policy being trained ---
    learner=dqla.DeepQLearnerPolicy(
        ep=ep,
        # original note: torch default learning rate is 1e-2
        # (the default depends on the optimizer in use — TODO confirm)
        learning_rate=0.01,
        gamma=0.015,
        batch_size=512,
        replay_memory_size=10000,
        target_update=5
    ),
    # --- run length ---
    episode_count=training_episode_count,
    iteration_count=iteration_count,
    # --- exploration schedule ---
    epsilon=0.90,
    epsilon_minimum=0.10,
    epsilon_exponential_decay=5000,
    # --- reporting ---
    verbosity=Verbosity.Quiet,
    render=False,
    plot_episodes_length=False
)

###### DQL
Learning with: episode_count=10,iteration_count=150,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=5000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/10 'DQL' ϵ=0.9000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:   11.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:   22.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:   22.0|last_reward_at:    4|Elapsed Time: 0:00:00||
  state_batch = torch.tensor(states_to_consider).to(device)
Episode 1|Iteration 18|reward:   31.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 18|reward:   31.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 1|Iteration 69|reward: 1031.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 1|Iteration 69|reward: 1031.0|last_reward_at:   69|Elapsed Time: 0:00:00||
Episode 1|Iteration 124|reward: 1031.0|last_reward_at:   69|Elapsed Time: 0:00:00||
Episode 1|Iteration 150|reward: 1031.0|last_reward_at:   69|Elapsed Time: 0:00:00||


  Episode 1 stopped at t=150 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/38 (0.03)
    explore-remote: 2/50 (0.04)
    explore-connect: 1/43 (0.02)
    exploit-local: 0/14 (0.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 0/1 (0.00)
  exploit deflected to exploration: 3
  ## Episode: 2/10 'DQL' ϵ=0.8765, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:   11.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   22.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   22.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 9|reward:   31.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 9|reward:   31.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 2|Iteration 12|reward: 1031.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 2|Iteration 12|reward: 1031.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 2|Iteration 67|reward: 1031.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 2|Iteration 124|reward: 1031.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 2|Iteration 150|reward: 1031.0|last_reward_at:   12|Elapsed Time: 0:00:00||


  Episode 2 stopped at t=150 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/40 (0.02)
    explore-remote: 2/57 (0.03)
    explore-connect: 1/37 (0.03)
    exploit-local: 0/6 (0.00)
    exploit-remote: 0/6 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 7
  ## Episode: 3/10 'DQL' ϵ=0.8536, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:   11.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 3|reward:   22.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 3|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 3|Iteration 8|reward:   31.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 3|Iteration 8|reward:   31.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 3|Iteration 19|reward: 1031.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 3|Iteration 19|reward: 1031.0|last_reward_at:   19|Elapsed Time: 0:00:00||
Episode 3|Iteration 71|reward: 1031.0|last_reward_at:   19|Elapsed Time: 0:00:00||
Episode 3|Iteration 126|reward: 1031.0|last_reward_at:   19|Elapsed Time: 0:00:00||
Episode 3|Iteration 150|reward: 1031.0|last_reward_at:   19|Elapsed Time: 0:00:00||


  Episode 3 stopped at t=150 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/27 (0.04)
    explore-remote: 2/47 (0.04)
    explore-connect: 1/48 (0.02)
    exploit-local: 0/14 (0.00)
    exploit-remote: 0/10 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 2
  ## Episode: 4/10 'DQL' ϵ=0.8313, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 2|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 2|reward:   11.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:   22.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 23|reward:   31.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 23|reward:   31.0|last_reward_at:   23|Elapsed Time: 0:00:00||
Episode 4|Iteration 46|reward:   31.0|last_reward_at:   23|Elapsed Time: 0:00:00||
Episode 4|Iteration 46|reward: 1031.0|last_reward_at:   23|Elapsed Time: 0:00:00||
Episode 4|Iteration 46|reward: 1031.0|last_reward_at:   46|Elapsed Time: 0:00:00||
Episode 4|Iteration 48|reward: 1031.0|last_reward_at:   46|Elapsed Time: 0:00:00||
Episode 4|

  Episode 4 stopped at t=150 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/28 (0.03)
    explore-remote: 2/62 (0.03)
    explore-connect: 1/35 (0.03)
    exploit-local: 0/8 (0.00)
    exploit-remote: 0/13 (0.00)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 9
  ## Episode: 5/10 'DQL' ϵ=0.8097, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 2|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 2|reward:   11.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 3|reward:   22.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 3|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|Iteration 6|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|Iteration 8|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|Iteration 10|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|Iteration 12|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|Iteration 14|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|Iteration 15|reward:   31.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|It

  Episode 5 stopped at t=150 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/31 (0.03)
    explore-remote: 0/58 (0.00)
    explore-connect: 1/53 (0.02)
    exploit-local: 0/0 (NaN)
    exploit-remote: 2/4 (0.33)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 16
  ## Episode: 6/10 'DQL' ϵ=0.7887, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:   11.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 3|reward:   22.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 3|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 6|Iteration 6|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 6|Iteration 10|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 6|Iteration 12|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 6|Iteration 13|reward:   31.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 6|Iteration 13|reward:   31.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 6|Iteration 16|reward:   31.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 6|I

  Episode 6 stopped at t=150 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/45 (0.02)
    explore-remote: 0/64 (0.00)
    explore-connect: 0/33 (0.00)
    exploit-local: 0/1 (0.00)
    exploit-remote: 2/3 (0.40)
    exploit-connect: 1/0 (1.00)
  exploit deflected to exploration: 29
  ## Episode: 7/10 'DQL' ϵ=0.7683, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 7|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 2|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 2|reward:   11.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 7|Iteration 6|reward:   11.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 7|Iteration 10|reward:   11.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 7|Iteration 10|reward:   22.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 7|Iteration 10|reward:   22.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 7|Iteration 16|reward:   22.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 7|Iteration 20|reward:   31.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 7|Iteration 20|reward:   31.0|last_reward_at:   20|Elapsed Time: 0:00:00||
Episode 7|Iteration 24|reward: 1031.0|last_reward_at:   20|Elapsed Time: 0:00:00||
Episode 7

  Episode 7 stopped at t=150 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 0/41 (0.00)
    explore-remote: 1/50 (0.02)
    explore-connect: 0/55 (0.00)
    exploit-local: 1/0 (1.00)
    exploit-remote: 1/0 (1.00)
    exploit-connect: 1/0 (1.00)
  exploit deflected to exploration: 36
  ## Episode: 8/10 'DQL' ϵ=0.7486, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 8|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 2|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 2|reward:   11.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 8|Iteration 4|reward:   22.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 8|Iteration 4|reward:   22.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 8|Iteration 8|reward:   22.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 8|Iteration 9|reward:   31.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 8|Iteration 9|reward:   31.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 8|Iteration 12|reward:   31.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 8|Iteration 14|reward:   31.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 8|Iteration 16|reward:   31.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 8|Ite

  Episode 8 stopped at t=150 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 1/34 (0.03)
    explore-remote: 1/45 (0.02)
    explore-connect: 0/49 (0.00)
    exploit-local: 0/0 (NaN)
    exploit-remote: 1/18 (0.05)
    exploit-connect: 1/0 (1.00)
  exploit deflected to exploration: 18
  ## Episode: 9/10 'DQL' ϵ=0.7294, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 9|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:   11.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 4|reward:   22.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 4|reward:   22.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 9|Iteration 8|reward:   22.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 9|Iteration 10|reward:   22.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 9|Iteration 10|reward:   31.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 9|Iteration 10|reward:   31.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 9|Iteration 13|reward: 1031.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 9|Iteration 13|reward: 1031.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 9|I

  Episode 9 stopped at t=150 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 0/24 (0.00)
    explore-remote: 2/46 (0.04)
    explore-connect: 0/41 (0.00)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/35 (0.00)
    exploit-connect: 1/0 (1.00)
  exploit deflected to exploration: 5
  ## Episode: 10/10 'DQL' ϵ=0.7108, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 10|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:   11.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:   11.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 3|reward:   22.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 3|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 10|Iteration 5|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 10|Iteration 8|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 10|Iteration 10|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 10|Iteration 12|reward:   22.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 10|Iteration 12|reward:   31.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 10|Iteration 12|reward:   31.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 10|Iteration 16|reward:   31.0|last_reward_at:   12|Elapsed Time: 0:00:00||

  Episode 10 stopped at t=150 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 0/26 (0.00)
    explore-remote: 0/59 (0.00)
    explore-connect: 1/38 (0.03)
    exploit-local: 1/0 (1.00)
    exploit-remote: 2/23 (0.08)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 21
simulation ended


In [7]:
# Prepare a fresh episode for the step-by-step run.

# trained learner returned by the learning run
l = dqn_learning_run['learner']

# initial observation of the reset environment
current_o = ctf_env.reset()

# environment wrapper paired with an action-tracking state augmentation
# that is seeded with the initial observation
wrapped_env = AgentWrapper(
    ctf_env,
    ActionTrackingStateAugmentation(ep, current_o),
)

In [8]:
# Use the trained agent to run the steps one by one.
#
# For each step, the learner's `exploit` method suggests the next action for
# the current observation. The pair (explored-node properties bitmap,
# suggested action) is appended to `h` and printed before the action is
# applied to the wrapped environment.

max_steps = 10

# history of (node properties bitmap, suggested action) pairs
h = []
for i in range(max_steps):
    # next action suggested by DQL agent
    _, next_action, _ = l.exploit(wrapped_env, current_o)
    h.append((ctf_env.get_explored_network_node_properties_bitmap_as_numpy(current_o), next_action))
    print(h[-1])
    if next_action is None:
        # the learner has no action to suggest: nothing left to run
        break
    # run the suggested action
    current_o, _, done, _ = wrapped_env.step(next_action)
    if done:
        # The episode is over. A gym environment has to be reset before it
        # is stepped again, so stop here instead of issuing further actions.
        print(f'episode ended after {len(h)} step(s)')
        break

print(f'len: {len(h)}')

(array([[0., 1., 0., 0., 0., 0.]]), {'local_vulnerability': array([0, 0])})
(array([[0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.]]), None)
len: 2


In [9]:
# Display the environment's current state after the steps above.
# NOTE(review): the output recorded for this cell ends with
# "ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed";
# presumably installing nbformat>=4.2.0 in the kernel environment fixes it — TODO confirm.
ctf_env.render()

Unnamed: 0_level_0,status,properties,local_attacks,remote_attacks
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
client,owned,[CLIENT:Win10],[SearchEdgeHistory],[]
Website,discovered,,,[ScanPageSource]


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed