# Setting up a Ray cluster with SmartSim

## 1. Start the cluster
We set up a SmartSim experiment, which will handle the launch of the Ray cluster.

First we import the relevant modules.

In [1]:
import numpy as np
import time
import argparse
import os

from ray.tune.progress_reporter import JupyterNotebookReporter
import ray
from ray import tune
import ray.util

from smartsim import Experiment
from smartsim.ext.ray import RayCluster

NUM_WORKERS = 2
alloc=None
launcher='slurm'

In [2]:
exp = Experiment("ray-cluster", launcher=launcher)
cluster = RayCluster(name="ray-cluster", run_args={}, path='', ray_args={"num-cpus": 48},
                     launcher=launcher, workers=NUM_WORKERS, alloc=alloc, batch=True)

In [3]:
if cluster.batch:
    cluster.head_model.batch_settings.add_preamble( ["source ~/.bashrc", "conda activate smartsim"])
    if NUM_WORKERS:
        cluster.worker_model.batch_settings.add_preamble ( ["source ~/.bashrc", "conda activate smartsim"])

In [4]:
exp.generate(cluster, overwrite=True)

09:13:23 horizon SmartSim[9197] INFO Working in previously created experiment


In [5]:
exp.start(cluster, block=False, summary=False)

09:13:32 horizon SmartSim[9197] INFO Ray cluster launched on nodes: ['nid00000', 'nid00002', 'nid00001']


## 2. Start the ray driver script

In [11]:
#ray.client(cluster.head_model.address +":10001").connect()
#ray.redis_password=cluster._ray_password
job_config = ray.job_config.JobConfig(runtime_env={"redis_password": cluster._ray_password}, worker_env={"redis_password": cluster._ray_password})
#ray.util.connect(cluster.head_model.address+":10001")
#print(cluster.head_model.address)
#print("ray://"+cluster.head_model.address+":10001")
ray.init("ray://"+cluster.head_model.address+":10001", _redis_password=cluster._ray_password)
#ray.client("ray://"+cluster.head_model.address+":10001").connect()


RuntimeError: Unexpected keyword argument(s) for Ray Client: _redis_password

In [6]:
print('''This cluster consists of
    {} nodes in total
    {} CPU resources in total
'''.format(len(ray.nodes()), ray.cluster_resources()['CPU']))

This cluster consists of
    16 nodes in total
    768.0 CPU resources in total



In [7]:
tune.run(
    "PPO",
    stop={"episode_reward_max": 200},
    config={
        "framework": "torch",
        "env": "CartPole-v0",
        "num_gpus": 0,
        "lr": tune.grid_search(np.linspace (0.001, 0.01, 100).tolist()),
        "log_level": "ERROR",
    },
    local_dir="/lus/scratch/arigazzi/ray_local/",
    verbose=0,
    fail_fast=True,
    log_to_file=True,
)

[2m[36m(pid=70525)[0m 2021-07-27 09:50:23,517	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=70529)[0m 2021-07-27 09:50:23,531	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=70533)[0m 2021-07-27 09:50:23,514	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=70545)[0m 2021-07-27 09:50:23,510	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=70546)[0m 2021-07-27 09:50:23,608	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=70539)[0m 2021-07-27 09:50:23,709	INFO trainer.py:696 -- Cur

<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7feec024b250>

In [9]:

tune.run(
    "PPO",
    stop={"episode_reward_max": 200},
    config={
        "framework": "torch",
        "env": "CartPole-v0",
    #    "num_gpus": 0,
        "lr": tune.grid_search(np.linspace (0.001, 0.01, 200).tolist()),
        "log_level": "ERROR",
    #    "num_cpus_per_worker": 1,
    },
    local_dir="/lus/scratch/arigazzi/ray_local/",
    verbose=1,
    #fail_fast=True,
    #log_to_file=True,
    #progress_reporter = JupyterNotebookReporter(True),
)

[2m[36m(pid=3481)[0m == Status ==
[2m[36m(pid=3481)[0m Memory usage on this node: 6.5/187.6 GiB
[2m[36m(pid=3481)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=3481)[0m Resources requested: 3.0/336 CPUs, 0/0 GPUs, 0.0/381.81 GiB heap, 0.0/167.63 GiB objects
[2m[36m(pid=3481)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=3481)[0m Number of trials: 200/200 (199 PENDING, 1 RUNNING)
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m 


[2m[36m(pid=29257)[0m Instructions for updating:
[2m[36m(pid=29257)[0m non-resource variables are not supported in the long term
[2m[36m(pid=29227)[0m Instructions for updating:
[2m[36m(pid=29227)[0m non-resource variables are not supported in the long term
[2m[36m(pid=29261)[0m Instructions for updating:
[2m[36m(pid=29261)[0m non-resource variables are not supported in the long term
[2m[36m(pid=29263)[0m Instructions for updating:
[2m[36m(pid=29263)[0m non-resource variables are not supported in the long term
[2m[36m(pid=29259)[0m Instructions for updating:
[2m[36m(pid=29259)[0m non-resource variables are not supported in the long term
[2m[36m(pid=29264)[0m Instructions for updating:
[2m[36m(pid=29264)[0m non-resource variables are not supported in the long term
[2m[36m(pid=29256)[0m Instructions for updating:
[2m[36m(pid=29256)[0m non-resource variables are not supported in the long term
[2m[36m(pid=29253)[0m Instructions for updating:
[2

[2m[36m(pid=3481)[0m == Status ==
[2m[36m(pid=3481)[0m Memory usage on this node: 14.2/187.6 GiB
[2m[36m(pid=3481)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=3481)[0m Resources requested: 336.0/336 CPUs, 0/0 GPUs, 0.0/381.81 GiB heap, 0.0/167.63 GiB objects
[2m[36m(pid=3481)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=3481)[0m Number of trials: 200/200 (88 PENDING, 112 RUNNING)
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m 


[2m[36m(pid=36702)[0m 2021-07-20 14:55:17,184	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=36707)[0m 2021-07-20 14:55:17,184	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=29254)[0m Instructions for updating:
[2m[36m(pid=29254)[0m non-resource variables are not supported in the long term
[2m[36m(pid=46436)[0m Instructions for updating:
[2m[36m(pid=46436)[0m non-resource variables are not supported in the long term
[2m[36m(pid=36710)[0m 2021-07-20 14:55:17,784	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=36700)[0m 2021-07-20 14:55:17,871	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -

[2m[36m(pid=3481)[0m == Status ==
[2m[36m(pid=3481)[0m Memory usage on this node: 31.6/187.6 GiB
[2m[36m(pid=3481)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=3481)[0m Resources requested: 336.0/336 CPUs, 0/0 GPUs, 0.0/381.81 GiB heap, 0.0/167.63 GiB objects
[2m[36m(pid=3481)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=3481)[0m Number of trials: 200/200 (88 PENDING, 112 RUNNING)
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m 


[2m[36m(pid=11983)[0m 2021-07-20 14:55:39,843	INFO trainable.py:101 -- Trainable.setup took 16.704 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(pid=11977)[0m 2021-07-20 14:55:40,140	INFO trainable.py:101 -- Trainable.setup took 16.893 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(pid=11980)[0m 2021-07-20 14:55:40,614	INFO trainable.py:101 -- Trainable.setup took 17.272 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(pid=11981)[0m 2021-07-20 14:55:40,937	INFO trainable.py:101 -- Trainable.setup took 17.588 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(pid=11992)[0m 2021-07-20 14:55:40,975	INFO trainable.py:101 -- Trainable.setup took 17.569 second

[2m[36m(pid=3481)[0m == Status ==
[2m[36m(pid=3481)[0m Memory usage on this node: 31.7/187.6 GiB
[2m[36m(pid=3481)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=3481)[0m Resources requested: 336.0/336 CPUs, 0/0 GPUs, 0.0/381.81 GiB heap, 0.0/167.63 GiB objects
[2m[36m(pid=3481)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=3481)[0m Number of trials: 200/200 (88 PENDING, 112 RUNNING)
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m == Status ==
[2m[36m(pid=3481)[0m Memory usage on this node: 31.9/187.6 GiB
[2m[36m(pid=3481)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=3481)[0m Resources requested: 336.0/336 CPUs, 0/0 GPUs, 0.0/381.81 GiB heap, 0.0/167.63 GiB objects
[2m[36m(pid=3481)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=3481)[0m Number of trials: 200/200 (88 PENDING, 112 RUNNING)
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m == Status ==
[2m[36m(p

[2m[36m(pid=42404)[0m Instructions for updating:
[2m[36m(pid=42404)[0m non-resource variables are not supported in the long term
[2m[36m(pid=42404)[0m 2021-07-20 14:55:59,449	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=3481)[0m == Status ==
[2m[36m(pid=3481)[0m Memory usage on this node: 32.5/187.6 GiB
[2m[36m(pid=3481)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=3481)[0m Resources requested: 336.0/336 CPUs, 0/0 GPUs, 0.0/381.81 GiB heap, 0.0/167.63 GiB objects
[2m[36m(pid=3481)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=3481)[0m Number of trials: 200/200 (83 PENDING, 112 RUNNING, 5 TERMINATED)
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m 


[2m[36m(pid=54407)[0m Instructions for updating:
[2m[36m(pid=54407)[0m non-resource variables are not supported in the long term
[2m[36m(pid=42476)[0m Instructions for updating:
[2m[36m(pid=42476)[0m non-resource variables are not supported in the long term
[2m[36m(pid=54478)[0m Instructions for updating:
[2m[36m(pid=54478)[0m non-resource variables are not supported in the long term
[2m[36m(pid=42477)[0m Instructions for updating:
[2m[36m(pid=42477)[0m non-resource variables are not supported in the long term
[2m[36m(pid=19280)[0m Instructions for updating:
[2m[36m(pid=19280)[0m non-resource variables are not supported in the long term
[2m[36m(pid=19291)[0m Instructions for updating:
[2m[36m(pid=19291)[0m non-resource variables are not supported in the long term
[2m[36m(pid=54407)[0m 2021-07-20 14:56:03,131	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m

[2m[36m(pid=3481)[0m == Status ==
[2m[36m(pid=3481)[0m Memory usage on this node: 32.6/187.6 GiB
[2m[36m(pid=3481)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=3481)[0m Resources requested: 333.0/336 CPUs, 0/0 GPUs, 0.0/381.81 GiB heap, 0.0/167.63 GiB objects
[2m[36m(pid=3481)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=3481)[0m Number of trials: 200/200 (75 PENDING, 111 RUNNING, 14 TERMINATED)
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m 


[2m[36m(pid=43513)[0m Instructions for updating:
[2m[36m(pid=43513)[0m non-resource variables are not supported in the long term
[2m[36m(pid=43610)[0m Instructions for updating:
[2m[36m(pid=43610)[0m non-resource variables are not supported in the long term
[2m[36m(pid=43587)[0m Instructions for updating:
[2m[36m(pid=43587)[0m non-resource variables are not supported in the long term
[2m[36m(pid=43615)[0m Instructions for updating:
[2m[36m(pid=43615)[0m non-resource variables are not supported in the long term
[2m[36m(pid=19434)[0m Instructions for updating:
[2m[36m(pid=19434)[0m non-resource variables are not supported in the long term
[2m[36m(pid=43400)[0m 2021-07-20 14:56:06,728	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=43401)[0m 2021-07-20 14:56:06,731	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': '

[2m[36m(pid=3481)[0m == Status ==
[2m[36m(pid=3481)[0m Memory usage on this node: 31.5/187.6 GiB
[2m[36m(pid=3481)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=3481)[0m Resources requested: 336.0/336 CPUs, 0/0 GPUs, 0.0/381.81 GiB heap, 0.0/167.63 GiB objects
[2m[36m(pid=3481)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=3481)[0m Number of trials: 200/200 (62 PENDING, 112 RUNNING, 26 TERMINATED)
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m 


[2m[36m(pid=44721)[0m Instructions for updating:
[2m[36m(pid=44721)[0m non-resource variables are not supported in the long term
[2m[36m(pid=44714)[0m Instructions for updating:
[2m[36m(pid=44714)[0m non-resource variables are not supported in the long term
[2m[36m(pid=43514)[0m 2021-07-20 14:56:11,147	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=44676)[0m Instructions for updating:
[2m[36m(pid=44676)[0m non-resource variables are not supported in the long term
[2m[36m(pid=44723)[0m Instructions for updating:
[2m[36m(pid=44723)[0m non-resource variables are not supported in the long term
[2m[36m(pid=44721)[0m 2021-07-20 14:56:12,517	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=20981)[0m 2021-07-20 14:56:12,523	INFO trainer.py:694 -- Current log_level 

[2m[36m(pid=3481)[0m == Status ==
[2m[36m(pid=3481)[0m Memory usage on this node: 27.9/187.6 GiB
[2m[36m(pid=3481)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=3481)[0m Resources requested: 336.0/336 CPUs, 0/0 GPUs, 0.0/381.81 GiB heap, 0.0/167.63 GiB objects
[2m[36m(pid=3481)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=3481)[0m Number of trials: 200/200 (31 PENDING, 112 RUNNING, 57 TERMINATED)
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m 


[2m[36m(pid=21895)[0m 2021-07-20 14:56:15,974	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=56086)[0m 2021-07-20 14:56:16,064	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=21838)[0m 2021-07-20 14:56:16,046	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=21885)[0m 2021-07-20 14:56:16,138	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=21928)[0m 2021-07-20 14:56:16,156	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=56026)[0m Instructions for updating:
[2m[36m(pid=56026)[0

[2m[36m(pid=3481)[0m == Status ==
[2m[36m(pid=3481)[0m Memory usage on this node: 31.5/187.6 GiB
[2m[36m(pid=3481)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=3481)[0m Resources requested: 333.0/336 CPUs, 0/0 GPUs, 0.0/381.81 GiB heap, 0.0/167.63 GiB objects
[2m[36m(pid=3481)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=3481)[0m Number of trials: 200/200 (23 PENDING, 111 RUNNING, 66 TERMINATED)
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m 


[2m[36m(pid=23714)[0m Instructions for updating:
[2m[36m(pid=23714)[0m non-resource variables are not supported in the long term
[2m[36m(pid=23727)[0m Instructions for updating:
[2m[36m(pid=23727)[0m non-resource variables are not supported in the long term
[2m[36m(pid=45715)[0m Instructions for updating:
[2m[36m(pid=45715)[0m non-resource variables are not supported in the long term
[2m[36m(pid=45713)[0m Instructions for updating:
[2m[36m(pid=45713)[0m non-resource variables are not supported in the long term
[2m[36m(pid=45685)[0m Instructions for updating:
[2m[36m(pid=45685)[0m non-resource variables are not supported in the long term
[2m[36m(pid=45720)[0m Instructions for updating:
[2m[36m(pid=45720)[0m non-resource variables are not supported in the long term
[2m[36m(pid=23721)[0m Instructions for updating:
[2m[36m(pid=23721)[0m non-resource variables are not supported in the long term
[2m[36m(pid=23715)[0m Instructions for updating:
[2

[2m[36m(pid=3481)[0m == Status ==
[2m[36m(pid=3481)[0m Memory usage on this node: 23.1/187.6 GiB
[2m[36m(pid=3481)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=3481)[0m Resources requested: 291.0/336 CPUs, 0/0 GPUs, 0.0/381.81 GiB heap, 0.0/167.63 GiB objects
[2m[36m(pid=3481)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=3481)[0m Number of trials: 200/200 (97 RUNNING, 103 TERMINATED)
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m 


[2m[36m(pid=46294)[0m Instructions for updating:
[2m[36m(pid=46294)[0m non-resource variables are not supported in the long term
[2m[36m(pid=46293)[0m Instructions for updating:
[2m[36m(pid=46293)[0m non-resource variables are not supported in the long term
[2m[36m(pid=46334)[0m Instructions for updating:
[2m[36m(pid=46334)[0m non-resource variables are not supported in the long term
[2m[36m(pid=46327)[0m Instructions for updating:
[2m[36m(pid=46327)[0m non-resource variables are not supported in the long term
[2m[36m(pid=46291)[0m Instructions for updating:
[2m[36m(pid=46291)[0m non-resource variables are not supported in the long term
[2m[36m(pid=46273)[0m Instructions for updating:
[2m[36m(pid=46273)[0m non-resource variables are not supported in the long term
[2m[36m(pid=46329)[0m Instructions for updating:
[2m[36m(pid=46329)[0m non-resource variables are not supported in the long term
[2m[36m(pid=46238)[0m Instructions for updating:
[2

[2m[36m(pid=3481)[0m == Status ==
[2m[36m(pid=3481)[0m Memory usage on this node: 23.7/187.6 GiB
[2m[36m(pid=3481)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=3481)[0m Resources requested: 276.0/336 CPUs, 0/0 GPUs, 0.0/381.81 GiB heap, 0.0/167.63 GiB objects
[2m[36m(pid=3481)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=3481)[0m Number of trials: 200/200 (92 RUNNING, 108 TERMINATED)
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m 


[2m[36m(pid=56736)[0m 2021-07-20 14:56:31,936	INFO trainable.py:101 -- Trainable.setup took 10.682 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(pid=26536)[0m Instructions for updating:
[2m[36m(pid=26536)[0m non-resource variables are not supported in the long term
[2m[36m(pid=26607)[0m Instructions for updating:
[2m[36m(pid=26607)[0m non-resource variables are not supported in the long term
[2m[36m(pid=58928)[0m Instructions for updating:
[2m[36m(pid=58928)[0m non-resource variables are not supported in the long term
[2m[36m(pid=45684)[0m 2021-07-20 14:56:32,394	INFO trainable.py:101 -- Trainable.setup took 10.138 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(pid=26558)[0m Instructions for updating:
[2m[36m(pid=26558)[0m non-resource variables are not supported in the long term
[2m[36m(pid=

[2m[36m(pid=3481)[0m == Status ==
[2m[36m(pid=3481)[0m Memory usage on this node: 26.6/187.6 GiB
[2m[36m(pid=3481)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=3481)[0m Resources requested: 261.0/336 CPUs, 0/0 GPUs, 0.0/381.81 GiB heap, 0.0/167.63 GiB objects
[2m[36m(pid=3481)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=3481)[0m Number of trials: 200/200 (87 RUNNING, 113 TERMINATED)
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m == Status ==
[2m[36m(pid=3481)[0m Memory usage on this node: 26.6/187.6 GiB
[2m[36m(pid=3481)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=3481)[0m Resources requested: 261.0/336 CPUs, 0/0 GPUs, 0.0/381.81 GiB heap, 0.0/167.63 GiB objects
[2m[36m(pid=3481)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=3481)[0m Number of trials: 200/200 (87 RUNNING, 113 TERMINATED)
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m 
[2m[36m(pid=3481)[0m == Status ==
[2m

[2m[36m(pid=3481)[0m 2021-07-20 14:57:18,575	INFO tune.py:549 -- Total run time: 127.53 seconds (127.05 seconds for the tuning loop).


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7ffff0dd5160>

## 3. Stop cluster and release allocation

In [6]:
if alloc:
    slurm.release_allocation(alloc)

14:15:33 nid00000 smartsim.launcher.slurm.slurm[49585] INFO Releasing allocation: 1461745
14:15:33 nid00000 smartsim.launcher.util.shell[49585] DEBUG Executing Popen cmd: /opt/slurm/20.11.5/bin/scancel 1461745
14:15:33 nid00000 smartsim.launcher.slurm.slurm[49585] INFO Successfully freed allocation 1461745
14:15:34 nid00000 smartsim.launcher.taskManager[49585] DEBUG Removing Task 49675
14:15:34 nid00000 smartsim.launcher.taskManager[49585] DEBUG Sleeping, no tasks to monitor


In [6]:
exp.stop(cluster)

11:31:07 nid00054 SmartSim[37275] INFO Stopping model workers with job name workers-CD42DFIIKH5D
11:31:07 nid00054 SmartSim[37275] INFO Stopping model head with job name head-CD42DC9I3OQO


In [10]:
analysis = tune.run(
    "PPO",
    stop={"episode_reward_max": 200},
    config={
        "framework": "torch",
        "env": "CartPole-v0",
        "lr": tune.grid_search([0.001, 0.002, 0.003, 0.004, 0.005]),
    },
    progress_reporter = JupyterNotebookReporter(True)
)

<IPython.core.display.HTML object>


[2m[36m(pid=110180)[0m Instructions for updating:
[2m[36m(pid=110180)[0m non-resource variables are not supported in the long term
[2m[36m(pid=40586)[0m Instructions for updating:
[2m[36m(pid=40586)[0m non-resource variables are not supported in the long term
[2m[36m(pid=110180)[0m 2021-05-25 10:04:22,637	INFO trainer.py:694 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=42854)[0m Instructions for updating:
[2m[36m(pid=42854)[0m non-resource variables are not supported in the long term
[2m[36m(pid=40586)[0m 2021-05-25 10:04:22,978	INFO trainer.py:694 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=34269)[0m Instructions for updating:
[2m[36m(pid=34269)[0m non-resource variables are not supported in the long term
[2m[36m(pid=2297)[0m Instructions for updating:
[2m[36m(pid=2297)[0m non-resource v

[2K[36m(pid=27027)[0m [2K
[2m[36m(pid=27027)[0m <IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>


[2m[36m(pid=27027)[0m 2021-05-25 10:04:54,258	INFO tune.py:549 -- Total run time: 34.48 seconds (34.15 seconds for the tuning loop).
[2m[36m(pid=42854)[0m 2021-05-25 10:04:54,221	ERROR worker.py:382 -- SystemExit was raised from the worker
[2m[36m(pid=42854)[0m Traceback (most recent call last):
[2m[36m(pid=42854)[0m   File "python/ray/_raylet.pyx", line 495, in ray._raylet.execute_task
[2m[36m(pid=42854)[0m   File "python/ray/_raylet.pyx", line 505, in ray._raylet.execute_task
[2m[36m(pid=42854)[0m   File "python/ray/_raylet.pyx", line 449, in ray._raylet.execute_task.function_executor
[2m[36m(pid=42854)[0m   File "/lus/scratch/arigazzi/anaconda3/envs/smartsim/lib/python3.8/site-packages/ray/_private/function_manager.py", line 556, in actor_method_executor
[2m[36m(pid=42854)[0m     return method(__ray_actor, *args, **kwargs)
[2m[36m(pid=42854)[0m   File "/lus/scratch/arigazzi/anaconda3/envs/smartsim/lib/python3.8/site-packages/ray/actor.py", line 1001, in __

'A'