In [1]:
from gym.spaces import Box
import numpy as np

from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.models.torch.misc import SlimFC
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.rllib.agents.ppo import ppo
from ray.rllib.models.tf.tf_action_dist import Categorical
from ray.rllib.models.tf.misc import normc_initializer
from ray.rllib.models.tf.recurrent_net import RecurrentNetwork
import gym

tf1, tf, tfv = try_import_tf()

In [2]:
import inspect
import time
from statistics import mean, stdev
from CybORG import CybORG
from CybORG.Agents import B_lineAgent, SleepAgent, GreenAgent
from CybORG.Agents.SimpleAgents.BaseAgent import BaseAgent
from CybORG.Agents.SimpleAgents.BlueReactAgent import BlueReactRemoveAgent
from CybORG.Agents.SimpleAgents.Meander import RedMeanderAgent
from CybORG.Agents.Wrappers.EnumActionWrapper import EnumActionWrapper
from CybORG.Agents.Wrappers.FixedFlatWrapper import FixedFlatWrapper
from CybORG.Agents.Wrappers.OpenAIGymWrapper import OpenAIGymWrapper
from CybORG.Agents.Wrappers.ReduceActionSpaceWrapper import ReduceActionSpaceWrapper
from CybORG.Agents.Wrappers import ChallengeWrapper
import os
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents import ppo
from ray.tune.registry import register_env
from CybORG.Agents.Wrappers.rllib_wrapper import RLlibWrapper
import warnings
warnings.filterwarnings('ignore')

In [10]:
MAX_EPS = 50
agent_name = 'Blue'

def wrap(env):
    return HierarchyWrapperEval(agent_name="Blue", env=env)

def evaluate(steps):
    path = str(inspect.getfile(CybORG))
    path = path[:-10] + '/Shared/Scenarios/Scenario2.yaml'

    #print(f'using CybORG v{cyborg_version}, {scenario}\n')
    for num_steps in steps:
        #for red_agent in [B_lineAgent, RedMeanderAgent, SleepAgent]:
        rs = []
        for red_agent in [RedMeanderAgent, B_lineAgent]:

            cyborg = CybORG(path, 'sim', agents={'Red': red_agent})
            wrapped_cyborg = wrap(cyborg)

            observation = wrapped_cyborg.reset()

            action_space = wrapped_cyborg.get_action_space(agent_name)
            
            cell_size=256
            state=[np.zeros(cell_size, np.float32),
                   np.zeros(cell_size, np.float32)]

            total_reward = []
            actions = []
            for i in range(MAX_EPS):
                r = []
                a = []
                # cyborg.env.env.tracker.render()
                for j in range(num_steps):
                    action, state, logits = agent.compute_action(observation, state, explore=False)
                    #action = agent.get_action(observation, action_space)
                    observation, rew, done, info = wrapped_cyborg.step(action)
                    # result = cyborg.step(agent_name, action)
                    r.append(rew)
                    # r.append(result.reward)
                    a.append((str(cyborg.get_last_action('Blue')), str(cyborg.get_last_action('Red'))))
                total_reward.append(sum(r))
                actions.append(a)
                # observation = cyborg.reset().observation
                observation = wrapped_cyborg.reset()
            rs.append(mean(total_reward))
            print(f'Average reward for red agent {red_agent.__name__} at steps {num_steps} is: {mean(total_reward):.1f} with a standard deviation of {stdev(total_reward):.1f}')
    return rs

In [4]:
from CybORG.Agents.Wrappers.rllib_wrapper import RLlibWrapper

class HierarchyWrapper(RLlibWrapper):
    def __init__(self, agent_name, env, agent=None, reward_threshold=None, max_steps=None, env_id=None):
        super().__init__(agent_name, env, agent, reward_threshold, max_steps)
        self.action_space = gym.spaces.Discrete(2)
        ppo_config = ppo.DEFAULT_CONFIG.copy()
        ppo_config.update({"num_gpus": 0,"num_workers": 0,
            "framework": "tf2",
            "model": {
                "fcnet_hiddens": [512, 512],
                "fcnet_activation": "relu",
            }})
        
        b_line = PPOTrainer(config=ppo_config,env="CybORG")
        b_line.restore("supervisor_ppo/checkpoint_000394/checkpoint-394")
        meander = PPOTrainer(config=ppo_config,env="CybORG")
        meander.restore("supervisor_ppo/checkpoint_000487/checkpoint-487")
        self.sub_agents = [b_line.get_policy().model, meander.get_policy().model]
        self.prev_obs = self.reset()
        self.env_id = env_id
        
    def step(self, action=None):
        
        reward = -1
        if self.env_id == 0:
            if action == 0:
                reward = 0
        else:
            if action == 1:
                reward = 0
                
        logits = self.sub_agents[action].forward({'obs_flat': np.array([self.prev_obs])}, None, None)[0]
        action = tf.math.argmax(logits, axis=1)[0]
        self.prev_obs, r, done, info = self.env.step(action=action)
        self.step_counter += 1
        if self.max_steps is not None and self.step_counter >= self.max_steps:
            done = True
            self.reset()
            
        return np.float32(self.prev_obs), reward, done, info


In [5]:
from CybORG.Agents.Wrappers.rllib_wrapper import RLlibWrapper

class HierarchyWrapperEval(HierarchyWrapper):
    def __init__(self, agent_name, env, agent=None, reward_threshold=None, max_steps=None, env_id=None):
        super().__init__(agent_name, env, agent, reward_threshold, max_steps)

    def step(self, action=None):

        logits = self.sub_agents[action].forward({'obs_flat': np.array([self.prev_obs])}, None, None)[0]
        action = tf.math.argmax(logits, axis=1)[0]
        self.prev_obs, reward, done, info = self.env.step(action=action)
        self.step_counter += 1
        if self.max_steps is not None and self.step_counter >= self.max_steps:
            done = True
            self.reset()
            
        return np.float32(self.prev_obs), reward, done, info


In [6]:
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env
import time 

def print_results(results_dict):
    train_iter = results_dict["training_iteration"]
    r_mean = results_dict["episode_reward_mean"]
    r_max = results_dict["episode_reward_max"]
    r_min = results_dict["episode_reward_min"]
    eval_dic = results_dict['evaluation']
    e_mean = eval_dic["episode_reward_mean"]
    e_max = eval_dic["episode_reward_max"]
    e_min = eval_dic["episode_reward_min"]
    print(f"{train_iter:4d} \tr_mean: {r_mean:.1f} \tr_max: {r_max:.1f} \tr_min: {r_min: .1f} \te_mean: {e_mean:.1f} \te_max: {e_max:.1f} \te_min: {e_min: .1f}")
    

class MultiEnv(gym.Env):
    def __init__(self, env_config):
        # pick actual env based on worker and env indexes
        self.env = self.choose_env_for(env_config.worker_index)
        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = self.env.observation_space
    def reset(self):
        return self.env.reset()
    def step(self, action):
        return self.env.step(action)   
    def choose_env_for(self, index):
        if index > 40:
            path = str(inspect.getfile(CybORG))
            path = path[:-10] + '/Shared/Scenarios/Scenario2.yaml'
            agents = {"Red": SleepAgent, "Green": GreenAgent}
            cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
            return HierarchyWrapper(env=cyborg, agent_name="Blue", max_steps=100, env_id=2)
        elif index % 2 == 0:
            path = str(inspect.getfile(CybORG))
            path = path[:-10] + '/Shared/Scenarios/Scenario2.yaml'
            agents = {"Red": B_lineAgent, "Green": GreenAgent}
            cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
            return HierarchyWrapper(env=cyborg, agent_name="Blue", max_steps=100, env_id=0)
        else:
            path = str(inspect.getfile(CybORG))
            path = path[:-10] + '/Shared/Scenarios/Scenario2.yaml'
            agents = {"Red": RedMeanderAgent, "Green": GreenAgent}
            cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
            return HierarchyWrapper(env=cyborg, agent_name="Blue", max_steps=100, env_id=1)
        
def env_creator(env_config: dict):
    path = str(inspect.getfile(CybORG))
    path = path[:-10] + '/Shared/Scenarios/Scenario2.yaml'
    agents = {"Red": B_lineAgent, "Green": GreenAgent}
    cyborg = CybORG(scenario_file=path, environment='sim', agents=agents)
    env = RLlibWrapper(env=cyborg, agent_name="Blue", max_steps=100)
    return env

register_env("CybORG", env_creator=env_creator)
register_env("multienv", lambda config: MultiEnv(config))

batch_size = 2000
config = ppo.DEFAULT_CONFIG.copy()

config.update({"num_gpus": 1, "num_workers": 20,
               'num_cpus_per_worker':1,
               "env": "multienv",
                # Also, use "framework: tf2" for tfe eager execution.
                "framework": "tf2",
                "train_batch_size": batch_size,
                "horizon": 100,
                "sgd_minibatch_size": 100,
                "gamma": 0.9,

                "model": {
                    "use_lstm": True,
                   "max_seq_len": 3,
                    "lstm_cell_size": 256,
                    "fcnet_hiddens": [256],
                   "fcnet_activation": "relu",
                },
               
                 'evaluation_interval': 1,
                 'evaluation_duration': 100,
                 'evaluation_duration_unit': 'episodes',
                 'evaluation_parallel_to_training': True,
                 'evaluation_num_workers': 5,
                }) 


In [7]:
agent = PPOTrainer(config=config)

t = time.time()
for i in range(500):
    results_dict = agent.train()
    print_results(results_dict)
    #evaluate([100])
print(time.time()-t)

[2m[36m(RolloutWorker pid=15431)[0m 2022-08-15 15:49:09,415	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000394/checkpoint-394
[2m[36m(RolloutWorker pid=15431)[0m 2022-08-15 15:49:09,416	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 394, '_timesteps_total': None, '_time_total': 13182.43779706955, '_episodes_total': 14180}
[2m[36m(RolloutWorker pid=15431)[0m 2022-08-15 15:49:10,060	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000487/checkpoint-487
[2m[36m(RolloutWorker pid=15431)[0m 2022-08-15 15:49:10,060	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 487, '_timesteps_total': None, '_time_total': 15745.162350177765, '_episodes_total': 17520}


[2m[36m(RolloutWorker pid=15441)[0m 2022-08-15 15:49:10,514	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000394/checkpoint-394
[2m[36m(RolloutWorker pid=15441)[0m 2022-08-15 15:49:10,514	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 394, '_timesteps_total': None, '_time_total': 13182.43779706955, '_episodes_total': 14180}
[2m[36m(RolloutWorker pid=15452)[0m 2022-08-15 15:49:10,566	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000394/checkpoint-394
[2m[36m(RolloutWorker pid=15452)[0m 2022-08-15 15:49:10,566	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 394, '_timesteps_total': None, '_time_total': 13182.43779706955, '_episodes_total': 14180}
[2m[36m(RolloutWorker pid=15458)[0m 2022-08-15 15:49:10,524	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000394/checkpoint-394
[2m[36m(RolloutWorke

[2m[36m(RolloutWorker pid=15437)[0m 2022-08-15 15:49:10,920	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000394/checkpoint-394
[2m[36m(RolloutWorker pid=15437)[0m 2022-08-15 15:49:10,920	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 394, '_timesteps_total': None, '_time_total': 13182.43779706955, '_episodes_total': 14180}
[2m[36m(RolloutWorker pid=15460)[0m 2022-08-15 15:49:10,950	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000394/checkpoint-394
[2m[36m(RolloutWorker pid=15460)[0m 2022-08-15 15:49:10,950	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 394, '_timesteps_total': None, '_time_total': 13182.43779706955, '_episodes_total': 14180}
[2m[36m(RolloutWorker pid=15441)[0m 2022-08-15 15:49:11,194	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000487/checkpoint-487
[2m[36m(RolloutWorke

[2m[36m(RolloutWorker pid=15429)[0m 2022-08-15 15:49:11,415	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000487/checkpoint-487
[2m[36m(RolloutWorker pid=15429)[0m 2022-08-15 15:49:11,416	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 487, '_timesteps_total': None, '_time_total': 15745.162350177765, '_episodes_total': 17520}
[2m[36m(RolloutWorker pid=15439)[0m 2022-08-15 15:49:11,416	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000487/checkpoint-487
[2m[36m(RolloutWorker pid=15439)[0m 2022-08-15 15:49:11,416	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 487, '_timesteps_total': None, '_time_total': 15745.162350177765, '_episodes_total': 17520}
[2m[36m(RolloutWorker pid=15453)[0m 2022-08-15 15:49:11,380	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000487/checkpoint-487
[2m[36m(RolloutWor

2022-08-15 15:49:19,178	INFO trainable.py:159 -- Trainable.setup took 23.679 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(RolloutWorker pid=18099)[0m 2022-08-15 15:49:26,673	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000394/checkpoint-394
[2m[36m(RolloutWorker pid=18099)[0m 2022-08-15 15:49:26,674	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 394, '_timesteps_total': None, '_time_total': 13182.43779706955, '_episodes_total': 14180}
[2m[36m(RolloutWorker pid=18100)[0m 2022-08-15 15:49:26,984	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000394/checkpoint-394
[2m[36m(RolloutWorker pid=18100)[0m 2022-08-15 15:49:26,985	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 394, '_timesteps_total': None, '_time_total': 13182.43779706955, '_episodes_total': 14180}


[2m[36m(RolloutWorker pid=18107)[0m 2022-08-15 15:49:27,981	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000487/checkpoint-487
[2m[36m(RolloutWorker pid=18107)[0m 2022-08-15 15:49:27,981	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 487, '_timesteps_total': None, '_time_total': 15745.162350177765, '_episodes_total': 17520}
[2m[36m(RolloutWorker pid=18102)[0m 2022-08-15 15:49:28,220	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000487/checkpoint-487
[2m[36m(RolloutWorker pid=18102)[0m 2022-08-15 15:49:28,220	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 487, '_timesteps_total': None, '_time_total': 15745.162350177765, '_episodes_total': 17520}


   1 	r_mean: -50.4 	r_max: -43.0 	r_min: -60.0 	e_mean: -50.6 	e_max: -40.0 	e_min: -63.0
   2 	r_mean: -46.2 	r_max: -33.0 	r_min: -60.0 	e_mean: -42.1 	e_max: -30.0 	e_min: -63.0
   3 	r_mean: -39.2 	r_max: -6.0 	r_min: -65.0 	e_mean: -33.0 	e_max: -7.0 	e_min: -74.0
   4 	r_mean: -36.3 	r_max: -5.0 	r_min: -79.0 	e_mean: -25.1 	e_max: -6.0 	e_min: -85.0
   5 	r_mean: -32.5 	r_max: -2.0 	r_min: -79.0 	e_mean: -18.2 	e_max: -2.0 	e_min: -82.0
   6 	r_mean: -25.1 	r_max: -2.0 	r_min: -79.0 	e_mean: -18.8 	e_max: -2.0 	e_min: -76.0
   7 	r_mean: -24.0 	r_max: -2.0 	r_min: -100.0 	e_mean: -25.7 	e_max: -2.0 	e_min: -98.0
   8 	r_mean: -28.4 	r_max: -1.0 	r_min: -100.0 	e_mean: -37.8 	e_max: 0.0 	e_min: -100.0
   9 	r_mean: -32.8 	r_max: 0.0 	r_min: -100.0 	e_mean: -40.3 	e_max: 0.0 	e_min: -100.0
  10 	r_mean: -39.3 	r_max: 0.0 	r_min: -100.0 	e_mean: -40.0 	e_max: 0.0 	e_min: -100.0
  11 	r_mean: -46.5 	r_max: 0.0 	r_min: -100.0 	e_mean: -40.2 	e_max: 0.0 	e_min: -100.0
  12 	r_mean: -

KeyboardInterrupt: 

In [None]:
ppo.DEFAULT_CONFIG

In [8]:
print(agent.save("hierarchy"))

hierarchy/checkpoint_000022/checkpoint-22


In [11]:
config.update({"num_gpus": 0, "num_workers": 0,})
agent = PPOTrainer(config=config, env="multienv")
agent.restore("hierarchy/checkpoint_000022/checkpoint-22")
evaluate([100])

2022-08-15 16:14:17,225	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000394/checkpoint-394
2022-08-15 16:14:17,226	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 394, '_timesteps_total': None, '_time_total': 13182.43779706955, '_episodes_total': 14180}
2022-08-15 16:14:18,093	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000487/checkpoint-487
2022-08-15 16:14:18,094	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 487, '_timesteps_total': None, '_time_total': 15745.162350177765, '_episodes_total': 17520}
2022-08-15 16:14:19,706	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: hierarchy/checkpoint_000022/checkpoint-22
2022-08-15 16:14:19,707	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 22, '_timesteps_total': None, '_time_total': 1400.5436375141144, '_episodes_total': 440}
2022-08-15 16:14:20,954	INFO trainable

[2m[36m(RolloutWorker pid=19906)[0m 2022-08-15 16:14:27,432	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000487/checkpoint-487
[2m[36m(RolloutWorker pid=19906)[0m 2022-08-15 16:14:27,433	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 487, '_timesteps_total': None, '_time_total': 15745.162350177765, '_episodes_total': 17520}
[2m[36m(RolloutWorker pid=19914)[0m 2022-08-15 16:14:27,473	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000487/checkpoint-487
[2m[36m(RolloutWorker pid=19914)[0m 2022-08-15 16:14:27,473	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 487, '_timesteps_total': None, '_time_total': 15745.162350177765, '_episodes_total': 17520}
[2m[36m(RolloutWorker pid=19907)[0m 2022-08-15 16:14:27,468	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000487/checkpoint-487
[2m[36m(RolloutWor

Average reward for red agent RedMeanderAgent at steps 100 is: -26.8 with a standard deviation of 7.1


2022-08-15 16:16:41,435	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000394/checkpoint-394
2022-08-15 16:16:41,436	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 394, '_timesteps_total': None, '_time_total': 13182.43779706955, '_episodes_total': 14180}
2022-08-15 16:16:42,230	INFO trainable.py:588 -- Restored on 172.28.0.2 from checkpoint: supervisor_ppo/checkpoint_000487/checkpoint-487
2022-08-15 16:16:42,231	INFO trainable.py:597 -- Current state after restoring: {'_iteration': 487, '_timesteps_total': None, '_time_total': 15745.162350177765, '_episodes_total': 17520}


Average reward for red agent B_lineAgent at steps 100 is: -17.8 with a standard deviation of 9.0


[-26.84000000000002, -17.821999999999992]

In [18]:
!ls supervisor_ppo

checkpoint_000046  checkpoint_000107  checkpoint_000177  checkpoint_000338
checkpoint_000050  checkpoint_000108  checkpoint_000178  checkpoint_000343
checkpoint_000052  checkpoint_000113  checkpoint_000182  checkpoint_000365
checkpoint_000067  checkpoint_000120  checkpoint_000183  checkpoint_000370
checkpoint_000068  checkpoint_000133  checkpoint_000192  checkpoint_000394
checkpoint_000069  checkpoint_000138  checkpoint_000200  checkpoint_000400
checkpoint_000070  checkpoint_000140  checkpoint_000202  checkpoint_000435
checkpoint_000071  checkpoint_000143  checkpoint_000203  checkpoint_000436
checkpoint_000073  checkpoint_000144  checkpoint_000205  checkpoint_000487
checkpoint_000074  checkpoint_000153  checkpoint_000214  checkpoint_000491
checkpoint_000076  checkpoint_000154  checkpoint_000216  checkpoint_000498
checkpoint_000077  checkpoint_000156  checkpoint_000239  checkpoint_000561
checkpoint_000085  checkpoint_000157  checkpoint_000272  checkpoint_000569
checkpoint_0

In [25]:
rm -r supervisor_ppo/checkpoint_000070

In [None]:
        b_line.restore("supervisor_ppo/checkpoint_000394/checkpoint-394")
        meander = PPOTrainer(config=ppo_config,env="CybORG")
        meander.restore("supervisor_ppo/checkpoint_000487/checkpoint-487")

In [27]:
t = "checkpoint_000046  checkpoint_000107  checkpoint_000177  checkpoint_000338
checkpoint_000050  checkpoint_000108  checkpoint_000178  checkpoint_000343
checkpoint_000052  checkpoint_000113  checkpoint_000182  checkpoint_000365
checkpoint_000067  checkpoint_000120  checkpoint_000183  checkpoint_000370
checkpoint_000068  checkpoint_000133  checkpoint_000192  checkpoint_000394
checkpoint_000069  checkpoint_000138  checkpoint_000200  checkpoint_000400
checkpoint_000070  checkpoint_000140  checkpoint_000202  checkpoint_000435
checkpoint_000071  checkpoint_000143  checkpoint_000203  checkpoint_000436
checkpoint_000073  checkpoint_000144  checkpoint_000205  checkpoint_000487
checkpoint_000074  checkpoint_000153  checkpoint_000214  checkpoint_000491
checkpoint_000076  checkpoint_000154  checkpoint_000216  checkpoint_000498
checkpoint_000077  checkpoint_000156  checkpoint_000239  checkpoint_000561
checkpoint_000085  checkpoint_000157  checkpoint_000272  checkpoint_000569
checkpoint_000098  checkpoint_000165  checkpoint_000318  checkpoint_000571
checkpoint_000099  checkpoint_000169  checkpoint_000325  checkpoint_000576
checkpoint_000104  checkpoint_000175  checkpoint_000326  checkpoint_000577
checkpoint_000106  checkpoint_000176  checkpoint_000327  checkpoint_000668"

SyntaxError: EOL while scanning string literal (<ipython-input-27-3957115643d1>, line 1)

In [35]:

l = os.listdir('supervisor_ppo')
l.remove('checkpoint_000394')
l.remove('checkpoint_000487')
for p in l:
    shutil.rmtree('supervisor_ppo/' +p) 

NameError: name 'shutil' is not defined