# Setting up a Ray cluster with SmartSim

## 1. Start the cluster
We set up a SmartSim experiment, which will handle the launch of the Ray cluster.

First we import the relevant modules.

In [1]:
import numpy as np
import time
import argparse
import os

from ray.tune.progress_reporter import JupyterNotebookReporter
import ray
from ray import tune
import ray.util

from smartsim import Experiment
from smartsim.ray import RayCluster

# Number of Ray workers requested from RayCluster (passed as `workers=`).
NUM_WORKERS = 0

# Allocation handle handed to RayCluster; None means this notebook obtains no
# allocation and the release step at the end is skipped.
alloc = None
# To run inside a Slurm allocation instead, first import `slurm` (it is not
# imported by this notebook -- presumably `from smartsim import slurm`; confirm
# for your SmartSim version) and then uncomment:
# alloc = slurm.get_allocation(nodes=1+NUM_WORKERS, time="12:00:00", options={"ntasks": str(1+NUM_WORKERS), "partition": "spider", "C": "V100"})

In [2]:
# Commands registered as the batch preamble of the head (and worker) jobs;
# presumably they run before the Ray processes start -- confirm in SmartSim docs.
BATCH_PREAMBLE = ("source ~/.bashrc", "conda activate smartsim")

# The experiment handles the launch; the cluster object describes the Ray head
# plus NUM_WORKERS workers.
exp = Experiment("ray-cluster", launcher='local')
cluster = RayCluster(
    name="ray-cluster",
    run_args={},
    path='',
    launcher='local',
    workers=NUM_WORKERS,
    alloc=alloc,
    batch=True,
    ray_num_cpus=38,
)

if cluster.batch:
    # A fresh list per call, so head and workers never share one list object.
    cluster.head_model.batch_settings.add_preamble(list(BATCH_PREAMBLE))
    if NUM_WORKERS:
        # worker_model is only touched when workers were actually requested.
        cluster.worker_model.batch_settings.add_preamble(list(BATCH_PREAMBLE))

# Generate the experiment files for the cluster, overwriting earlier ones.
exp.generate(cluster, overwrite=True)

08:02:32 nid00000 SmartSim[113648] INFO Working in previously created experiment


In [3]:
# Launch the cluster through the experiment.
# block=False presumably returns without waiting for the cluster to finish and
# summary=False presumably skips the pre-launch summary -- confirm in SmartSim docs.
exp.start(
    cluster,
    block=False,
    summary=False,
)

08:02:33 nid00000 SmartSim[113648] INFO Ray cluster launched.


## 2. Start the Ray driver script

In [4]:
# Attach this notebook to the cluster head as a Ray client on port 10001.
head_client_address = cluster.head_model.address + ":10001"
ray.util.connect(head_client_address)

# Learning rates for the grid search: evenly spaced points in [0.001, 0.01].
# NOTE(review): the saved output of this cell reports 100 trials, while this
# grid has 2 points -- the output looks stale relative to the code.
learning_rates = np.linspace(0.001, 0.01, 2).tolist()

ppo_config = {
    "framework": "torch",
    "env": "CartPole-v0",
    "num_gpus": 0,
    "lr": tune.grid_search(learning_rates),
    "log_level": "ERROR",
}

# Kept as the cell's last expression so the returned ExperimentAnalysis is
# still displayed as the cell output.
tune.run(
    "PPO",
    stop={"episode_reward_max": 200},
    config=ppo_config,
    local_dir="/lus/scratch/arigazzi/ray_local/",
    verbose=1,
    fail_fast=True,
    log_to_file=True,
)

[2m[36m(pid=113950)[0m Instructions for updating:
[2m[36m(pid=113950)[0m non-resource variables are not supported in the long term


[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 5.7/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (100 PENDING)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=113984)[0m Instructions for updating:
[2m[36m(pid=113984)[0m non-resource variables are not supported in the long term
[2m[36m(pid=113980)[0m Instructions for updating:
[2m[36m(pid=113980)[0m non-resource variables are not supported in the long term
[2m[36m(pid=113992)[0m Instructions for updating:
[2m[36m(pid=113992)[0m non-resource variables are not supported in the long term
[2m[36m(pid=113981)[0m Instructions for updating:
[2m[36m(pid=113981)[0m non-resource variables are not supported in the long term
[2m[36m(pid=113990)[0m Instructions for updating:
[2m[36m(pid=113990)[0m non-resource variables are not supported in the long term
[2m[36m(pid=114200)[0m Instructions for updating:
[2m[36m(pid=114200)[0m non-resource variables are not supported in the long term
[2m[36m(pid=114120)[0m Instructions for updating:
[2m[36m(pid=114120)[0m non-resource variables are not supported in the long term
[2m[36m(pid=114209)[0m Instructions fo

[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 10.3/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 36.0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (88 PENDING, 12 RUNNING)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=114575)[0m Instructions for updating:
[2m[36m(pid=114575)[0m non-resource variables are not supported in the long term
[2m[36m(pid=114588)[0m Instructions for updating:
[2m[36m(pid=114588)[0m non-resource variables are not supported in the long term
[2m[36m(pid=114473)[0m Instructions for updating:
[2m[36m(pid=114473)[0m non-resource variables are not supported in the long term
[2m[36m(pid=113989)[0m Instructions for updating:
[2m[36m(pid=113989)[0m non-resource variables are not supported in the long term
[2m[36m(pid=113988)[0m Instructions for updating:
[2m[36m(pid=113988)[0m non-resource variables are not supported in the long term
[2m[36m(pid=113985)[0m Instructions for updating:
[2m[36m(pid=113985)[0m non-resource variables are not supported in the long term
[2m[36m(pid=113986)[0m Instructions for updating:
[2m[36m(pid=113986)[0m non-resource variables are not supported in the long term
[2m[36m(pid=114106)[0m Instructions fo

[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 13.9/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 33.0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (88 PENDING, 11 RUNNING, 1 TERMINATED)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=114011)[0m Instructions for updating:
[2m[36m(pid=114011)[0m non-resource variables are not supported in the long term
[2m[36m(pid=1764)[0m Instructions for updating:
[2m[36m(pid=1764)[0m non-resource variables are not supported in the long term
[2m[36m(pid=1773)[0m Instructions for updating:
[2m[36m(pid=1773)[0m non-resource variables are not supported in the long term
[2m[36m(pid=1767)[0m Instructions for updating:
[2m[36m(pid=1767)[0m non-resource variables are not supported in the long term
[2m[36m(pid=1769)[0m Instructions for updating:
[2m[36m(pid=1769)[0m non-resource variables are not supported in the long term
[2m[36m(pid=114011)[0m 2021-06-04 08:03:39,309	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=1771)[0m Instructions for updating:
[2m[36m(pid=1771)[0m non-resource variables are not supported in the long term
[2m[36m(pi

[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 13.9/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 33.0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (76 PENDING, 11 RUNNING, 13 TERMINATED)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=1814)[0m Instructions for updating:
[2m[36m(pid=1814)[0m non-resource variables are not supported in the long term
[2m[36m(pid=1814)[0m 2021-06-04 08:03:56,427	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=3533)[0m Instructions for updating:
[2m[36m(pid=3533)[0m non-resource variables are not supported in the long term
[2m[36m(pid=3537)[0m Instructions for updating:
[2m[36m(pid=3537)[0m non-resource variables are not supported in the long term
[2m[36m(pid=3546)[0m Instructions for updating:
[2m[36m(pid=3546)[0m non-resource variables are not supported in the long term
[2m[36m(pid=3530)[0m Instructions for updating:
[2m[36m(pid=3530)[0m non-resource variables are not supported in the long term
[2m[36m(pid=3542)[0m Instructions for updating:
[2m[36m(pid=3542)[0m non-resource variables are not supported in the long term
[2m[36m(pid=3532

[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 14.0/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 33.0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (64 PENDING, 11 RUNNING, 25 TERMINATED)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=3559)[0m Instructions for updating:
[2m[36m(pid=3559)[0m non-resource variables are not supported in the long term
[2m[36m(pid=3559)[0m 2021-06-04 08:04:15,725	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 13.7/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 33.0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (60 PENDING, 11 RUNNING, 29 TERMINATED)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=5467)[0m Instructions for updating:
[2m[36m(pid=5467)[0m non-resource variables are not supported in the long term
[2m[36m(pid=5466)[0m Instructions for updating:
[2m[36m(pid=5466)[0m non-resource variables are not supported in the long term
[2m[36m(pid=5526)[0m Instructions for updating:
[2m[36m(pid=5526)[0m non-resource variables are not supported in the long term
[2m[36m(pid=5557)[0m Instructions for updating:
[2m[36m(pid=5557)[0m non-resource variables are not supported in the long term
[2m[36m(pid=5550)[0m Instructions for updating:
[2m[36m(pid=5550)[0m non-resource variables are not supported in the long term
[2m[36m(pid=5549)[0m Instructions for updating:
[2m[36m(pid=5549)[0m non-resource variables are not supported in the long term
[2m[36m(pid=5572)[0m Instructions for updating:
[2m[36m(pid=5572)[0m non-resource variables are not supported in the long term
[2m[36m(pid=5590)[0m Instructions for updating:
[2m[36m(pid=5590

[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 14.0/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 33.0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (52 PENDING, 11 RUNNING, 37 TERMINATED)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=6717)[0m Instructions for updating:
[2m[36m(pid=6717)[0m non-resource variables are not supported in the long term
[2m[36m(pid=6717)[0m 2021-06-04 08:04:34,440	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 13.4/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 33.0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (50 PENDING, 11 RUNNING, 39 TERMINATED)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=7352)[0m Instructions for updating:
[2m[36m(pid=7352)[0m non-resource variables are not supported in the long term
[2m[36m(pid=7353)[0m Instructions for updating:
[2m[36m(pid=7353)[0m non-resource variables are not supported in the long term
[2m[36m(pid=7416)[0m Instructions for updating:
[2m[36m(pid=7416)[0m non-resource variables are not supported in the long term
[2m[36m(pid=7415)[0m Instructions for updating:
[2m[36m(pid=7415)[0m non-resource variables are not supported in the long term
[2m[36m(pid=7416)[0m 2021-06-04 08:04:39,476	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=7414)[0m Instructions for updating:
[2m[36m(pid=7414)[0m non-resource variables are not supported in the long term
[2m[36m(pid=7410)[0m Instructions for updating:
[2m[36m(pid=7410)[0m non-resource variables are not supported in the long term
[2m[36m(pid=7417

[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 14.0/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 33.0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (40 PENDING, 11 RUNNING, 49 TERMINATED)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=7657)[0m Instructions for updating:
[2m[36m(pid=7657)[0m non-resource variables are not supported in the long term
[2m[36m(pid=7657)[0m 2021-06-04 08:04:53,446	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 14.0/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 33.0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (37 PENDING, 11 RUNNING, 52 TERMINATED)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=9401)[0m Instructions for updating:
[2m[36m(pid=9401)[0m non-resource variables are not supported in the long term
[2m[36m(pid=9399)[0m Instructions for updating:
[2m[36m(pid=9399)[0m non-resource variables are not supported in the long term
[2m[36m(pid=9407)[0m Instructions for updating:
[2m[36m(pid=9407)[0m non-resource variables are not supported in the long term
[2m[36m(pid=9491)[0m Instructions for updating:
[2m[36m(pid=9491)[0m non-resource variables are not supported in the long term
[2m[36m(pid=9484)[0m Instructions for updating:
[2m[36m(pid=9484)[0m non-resource variables are not supported in the long term
[2m[36m(pid=9442)[0m Instructions for updating:
[2m[36m(pid=9442)[0m non-resource variables are not supported in the long term
[2m[36m(pid=9401)[0m 2021-06-04 08:04:57,500	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=9436

[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 14.0/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 33.0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (28 PENDING, 11 RUNNING, 61 TERMINATED)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=9962)[0m Instructions for updating:
[2m[36m(pid=9962)[0m non-resource variables are not supported in the long term
[2m[36m(pid=9962)[0m 2021-06-04 08:05:12,247	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=11363)[0m Instructions for updating:
[2m[36m(pid=11363)[0m non-resource variables are not supported in the long term


[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 11.9/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 33.0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (22 PENDING, 11 RUNNING, 67 TERMINATED)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=11413)[0m Instructions for updating:
[2m[36m(pid=11413)[0m non-resource variables are not supported in the long term
[2m[36m(pid=11412)[0m Instructions for updating:
[2m[36m(pid=11412)[0m non-resource variables are not supported in the long term
[2m[36m(pid=11428)[0m Instructions for updating:
[2m[36m(pid=11428)[0m non-resource variables are not supported in the long term
[2m[36m(pid=11363)[0m 2021-06-04 08:05:15,644	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=11478)[0m Instructions for updating:
[2m[36m(pid=11478)[0m non-resource variables are not supported in the long term
[2m[36m(pid=11425)[0m Instructions for updating:
[2m[36m(pid=11425)[0m non-resource variables are not supported in the long term
[2m[36m(pid=11429)[0m Instructions for updating:
[2m[36m(pid=11429)[0m non-resource variables are not supported in the long term
[2m

[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 14.0/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 33.0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (16 PENDING, 11 RUNNING, 73 TERMINATED)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=11864)[0m Instructions for updating:
[2m[36m(pid=11864)[0m non-resource variables are not supported in the long term
[2m[36m(pid=11864)[0m 2021-06-04 08:05:31,038	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 13.5/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 33.0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (14 PENDING, 11 RUNNING, 75 TERMINATED)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=13493)[0m Instructions for updating:
[2m[36m(pid=13493)[0m non-resource variables are not supported in the long term
[2m[36m(pid=13489)[0m Instructions for updating:
[2m[36m(pid=13489)[0m non-resource variables are not supported in the long term
[2m[36m(pid=13511)[0m Instructions for updating:
[2m[36m(pid=13511)[0m non-resource variables are not supported in the long term
[2m[36m(pid=13564)[0m Instructions for updating:
[2m[36m(pid=13564)[0m non-resource variables are not supported in the long term
[2m[36m(pid=13489)[0m 2021-06-04 08:05:35,203	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=13565)[0m Instructions for updating:
[2m[36m(pid=13565)[0m non-resource variables are not supported in the long term
[2m[36m(pid=13515)[0m Instructions for updating:
[2m[36m(pid=13515)[0m non-resource variables are not supported in the long term
[2m

[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 14.0/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 33.0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (4 PENDING, 11 RUNNING, 85 TERMINATED)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=14625)[0m Instructions for updating:
[2m[36m(pid=14625)[0m non-resource variables are not supported in the long term
[2m[36m(pid=14625)[0m 2021-06-04 08:05:49,759	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 13.8/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 33.0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (1 PENDING, 11 RUNNING, 88 TERMINATED)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=15204)[0m Instructions for updating:
[2m[36m(pid=15204)[0m non-resource variables are not supported in the long term
[2m[36m(pid=15203)[0m Instructions for updating:
[2m[36m(pid=15203)[0m non-resource variables are not supported in the long term
[2m[36m(pid=15261)[0m Instructions for updating:
[2m[36m(pid=15261)[0m non-resource variables are not supported in the long term
[2m[36m(pid=15261)[0m 2021-06-04 08:05:54,607	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=15339)[0m Instructions for updating:
[2m[36m(pid=15339)[0m non-resource variables are not supported in the long term
[2m[36m(pid=15344)[0m Instructions for updating:
[2m[36m(pid=15344)[0m non-resource variables are not supported in the long term
[2m[36m(pid=15339)[0m 2021-06-04 08:05:55,670	INFO trainer.py:694 -- Current log_level is ERROR. For more information, set 'log_level': '

[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 8.8/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 9.0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (3 RUNNING, 97 TERMINATED)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m == Status ==
[2m[36m(pid=113950)[0m Memory usage on this node: 7.5/187.6 GiB
[2m[36m(pid=113950)[0m Using FIFO scheduling algorithm.
[2m[36m(pid=113950)[0m Resources requested: 0/38 CPUs, 0/0 GPUs, 0.0/120.3 GiB heap, 0.0/55.55 GiB objects
[2m[36m(pid=113950)[0m Result logdir: /lus/scratch/arigazzi/ray_local/PPO
[2m[36m(pid=113950)[0m Number of trials: 100/100 (100 TERMINATED)
[2m[36m(pid=113950)[0m 
[2m[36m(pid=113950)[0m 


[2m[36m(pid=113950)[0m 2021-06-04 08:06:06,750	INFO tune.py:549 -- Total run time: 174.44 seconds (171.10 seconds for the tuning loop).


Instructions for updating:
non-resource variables are not supported in the long term


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fffd8e51b50>

In [5]:

# Query the connected cluster for its size, then report it.
node_count = len(ray.nodes())
cpu_total = ray.cluster_resources()['CPU']

print(f'''This cluster consists of
    {node_count} nodes in total
    {cpu_total} CPU resources in total
''')

This cluster consists of
    1 nodes in total
    38.0 CPU resources in total



## 3. Stop the cluster and release the allocation

In [5]:
# Give the Slurm allocation back, but only if one was obtained earlier
# (`alloc` stays None when the notebook runs without an allocation).
# NOTE(review): this runs before the cell that calls exp.stop(cluster);
# consider stopping the cluster first -- confirm the intended order.
if alloc:
    # `slurm` is not imported by the notebook's import cell, so without this
    # import the call below raises NameError as soon as `alloc` is set.
    from smartsim import slurm

    slurm.release_allocation(alloc)

In [6]:
# Stop the Ray cluster that was launched through the experiment.
exp.stop(cluster)

The autoscaler failed with the following error:
Terminated with signal 15
  File "/lus/scratch/arigazzi/anaconda3/envs/smartsim/lib/python3.8/site-packages/ray/_private/monitor.py", line 376, in <module>
    monitor.run()
  File "/lus/scratch/arigazzi/anaconda3/envs/smartsim/lib/python3.8/site-packages/ray/_private/monitor.py", line 284, in run
    self._run()
  File "/lus/scratch/arigazzi/anaconda3/envs/smartsim/lib/python3.8/site-packages/ray/_private/monitor.py", line 202, in _run
    time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)



08:11:09 nid00000 SmartSim[113648] INFO Stopping model head with job name head-CBUUR75HNRCV


In [9]:

# NOTE(review): in notebook order this cell comes after exp.stop(cluster), and
# its saved output is dated earlier than the outputs above -- it looks like a
# leftover from a previous session and may not survive Restart & Run All.

# Finer learning-rate sweep: 50 evenly spaced points in [0.001, 0.01].
lr_grid = np.linspace(0.001, 0.01, 50).tolist()

# Options left disabled in this run: "num_gpus": 0 and "num_cpus_per_worker": 1
# in the config; fail_fast=True and log_to_file=True in tune.run.
sweep_config = {
    "framework": "torch",
    "env": "CartPole-v0",
    "lr": tune.grid_search(lr_grid),
    "log_level": "ERROR",
}

notebook_reporter = JupyterNotebookReporter(True)

# Last expression of the cell, so the ExperimentAnalysis is displayed.
tune.run(
    "PPO",
    stop={"episode_reward_max": 200},
    config=sweep_config,
    local_dir="/lus/scratch/arigazzi/ray_local/",
    verbose=1,
    progress_reporter=notebook_reporter,
)

<IPython.core.display.HTML object>


[2m[36m(pid=39779)[0m Instructions for updating:
[2m[36m(pid=39779)[0m non-resource variables are not supported in the long term
[2m[36m(pid=39781)[0m Instructions for updating:
[2m[36m(pid=39781)[0m non-resource variables are not supported in the long term
[2m[36m(pid=113189)[0m Instructions for updating:
[2m[36m(pid=113189)[0m non-resource variables are not supported in the long term
[2m[36m(pid=113188)[0m Instructions for updating:
[2m[36m(pid=113188)[0m non-resource variables are not supported in the long term
[2m[36m(pid=39780)[0m Instructions for updating:
[2m[36m(pid=39780)[0m non-resource variables are not supported in the long term
[2m[36m(pid=39774)[0m Instructions for updating:
[2m[36m(pid=39774)[0m non-resource variables are not supported in the long term
[2m[36m(pid=39770)[0m Instructions for updating:
[2m[36m(pid=39770)[0m non-resource variables are not supported in the long term
[2m[36m(pid=113191)[0m Instructions for updating

<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
[2K[36m(pid=27027)[0m [2K
[2m[36m(pid=27027)[0m <IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>


[2m[36m(pid=27027)[0m 2021-05-25 10:02:22,533	INFO tune.py:549 -- Total run time: 55.35 seconds (55.19 seconds for the tuning loop).


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fd356fd07c0>

In [10]:
# Explicit learning rates to try, one trial per value.
trial_learning_rates = [0.001, 0.002, 0.003, 0.004, 0.005]

grid_config = {
    "framework": "torch",
    "env": "CartPole-v0",
    "lr": tune.grid_search(trial_learning_rates),
}

# Keep the result so the trials can be inspected afterwards; no local_dir is
# given here, so Tune chooses the results directory itself.
analysis = tune.run(
    "PPO",
    stop={"episode_reward_max": 200},
    config=grid_config,
    progress_reporter=JupyterNotebookReporter(True),
)

<IPython.core.display.HTML object>


[2m[36m(pid=110180)[0m Instructions for updating:
[2m[36m(pid=110180)[0m non-resource variables are not supported in the long term
[2m[36m(pid=40586)[0m Instructions for updating:
[2m[36m(pid=40586)[0m non-resource variables are not supported in the long term
[2m[36m(pid=110180)[0m 2021-05-25 10:04:22,637	INFO trainer.py:694 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=42854)[0m Instructions for updating:
[2m[36m(pid=42854)[0m non-resource variables are not supported in the long term
[2m[36m(pid=40586)[0m 2021-05-25 10:04:22,978	INFO trainer.py:694 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=34269)[0m Instructions for updating:
[2m[36m(pid=34269)[0m non-resource variables are not supported in the long term
[2m[36m(pid=2297)[0m Instructions for updating:
[2m[36m(pid=2297)[0m non-resource v

[2K[36m(pid=27027)[0m [2K
[2m[36m(pid=27027)[0m <IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>


[2m[36m(pid=27027)[0m 2021-05-25 10:04:54,258	INFO tune.py:549 -- Total run time: 34.48 seconds (34.15 seconds for the tuning loop).
[2m[36m(pid=42854)[0m 2021-05-25 10:04:54,221	ERROR worker.py:382 -- SystemExit was raised from the worker
[2m[36m(pid=42854)[0m Traceback (most recent call last):
[2m[36m(pid=42854)[0m   File "python/ray/_raylet.pyx", line 495, in ray._raylet.execute_task
[2m[36m(pid=42854)[0m   File "python/ray/_raylet.pyx", line 505, in ray._raylet.execute_task
[2m[36m(pid=42854)[0m   File "python/ray/_raylet.pyx", line 449, in ray._raylet.execute_task.function_executor
[2m[36m(pid=42854)[0m   File "/lus/scratch/arigazzi/anaconda3/envs/smartsim/lib/python3.8/site-packages/ray/_private/function_manager.py", line 556, in actor_method_executor
[2m[36m(pid=42854)[0m     return method(__ray_actor, *args, **kwargs)
[2m[36m(pid=42854)[0m   File "/lus/scratch/arigazzi/anaconda3/envs/smartsim/lib/python3.8/site-packages/ray/actor.py", line 1001, in __