# Setting up a Ray cluster with SmartSim

## 1. Start the cluster
We set up a SmartSim experiment, which will handle the launch of the Ray cluster.

First, we import the relevant modules.

In [2]:
import numpy as np
import time
import argparse
import os

from ray.tune.progress_reporter import JupyterNotebookReporter
import ray
from ray import tune
import ray.util

from smartsim import Experiment
from smartsim.ray import RayCluster

# Number of Ray worker nodes requested in addition to the head node.
NUM_WORKERS = 3

# By default no allocation is requested here, so `alloc` stays None.
# The commented-out line below shows how one could be obtained explicitly;
# note that `slurm` is not imported by the import cell above and would have
# to be brought into scope first.
alloc = None
#alloc=slurm.get_allocation(nodes=1+NUM_WORKERS, time="12:00:00", options={"ntasks": str(1+NUM_WORKERS), "partition": "spider", "C": "V100"})

In [3]:
# Shell lines added to the batch-script preamble so that batch jobs run inside
# the "smartsim" conda environment.
conda_preamble = ["source ~/.bashrc", "conda activate smartsim"]

# The experiment drives generation and launch of the cluster through Slurm.
exp = Experiment("ray-cluster", launcher='slurm')

# Ray cluster description: a head model plus NUM_WORKERS workers.
cluster = RayCluster(
    name="ray-cluster",
    run_args={},
    path='',
    launcher='slurm',
    workers=NUM_WORKERS,
    alloc=alloc,
    batch=False,
    ray_num_cpus=38,
)

# The preamble is only extended for batch launches; with batch=False above
# this branch is presumably skipped.
if cluster.batch:
    cluster.head_model.batch_settings._preamble += conda_preamble
    # The worker settings are only touched when workers were requested.
    if NUM_WORKERS:
        cluster.worker_model.batch_settings._preamble += conda_preamble

# Generate the cluster's files, overwriting any left from a previous run.
exp.generate(cluster, overwrite=True)

13:10:43 nid00000 SmartSim[76197] INFO Working in previously created experiment


In [4]:
# Launch the Ray cluster through the experiment. Per the flags below the call
# is non-blocking and no launch summary is requested.
exp.start(
    cluster,
    block=False,
    summary=False,
)

13:10:52 nid00000 SmartSim[76197] INFO Ray cluster launched on nodes: ['nid00000', 'nid00003', 'nid00002', 'nid00001']


## 2. Start the Ray driver script

In [5]:
# Port of the Ray client server on the head node (10001 is Ray's default
# client-server port).
RAY_CLIENT_PORT = 10001

# Directory where Tune stores its results. The original, cluster-specific path
# remains the default; set TUNE_LOCAL_DIR to run the notebook elsewhere.
tune_local_dir = os.environ.get("TUNE_LOCAL_DIR", "/lus/scratch/arigazzi/ray_local/")

# Attach this notebook, as a Ray client, to the head node of the cluster.
ray.util.connect(f"{cluster.head_model.address}:{RAY_CLIENT_PORT}")

# Grid search over 100 evenly spaced learning rates for PPO on CartPole-v0;
# each trial stops once its maximum episode reward reaches 200.
tune.run(
    "PPO",
    stop={"episode_reward_max": 200},
    config={
        "framework": "torch",
        "env": "CartPole-v0",
        "num_gpus": 0,
        "lr": tune.grid_search(np.linspace(0.001, 0.01, 100).tolist()),
        "log_level": "ERROR",
    },
    local_dir=tune_local_dir,
    verbose=0,
    fail_fast=True,
    log_to_file=True,
)

[2m[36m(pid=100789)[0m Instructions for updating:
[2m[36m(pid=100789)[0m non-resource variables are not supported in the long term
[2m[36m(pid=100875)[0m Instructions for updating:
[2m[36m(pid=100875)[0m non-resource variables are not supported in the long term
[2m[36m(pid=100817)[0m Instructions for updating:
[2m[36m(pid=100817)[0m non-resource variables are not supported in the long term
[2m[36m(pid=100873)[0m Instructions for updating:
[2m[36m(pid=100873)[0m non-resource variables are not supported in the long term
[2m[36m(pid=100874)[0m Instructions for updating:
[2m[36m(pid=100874)[0m non-resource variables are not supported in the long term
[2m[36m(pid=100810)[0m Instructions for updating:
[2m[36m(pid=100810)[0m non-resource variables are not supported in the long term
[2m[36m(pid=100815)[0m Instructions for updating:
[2m[36m(pid=100815)[0m non-resource variables are not supported in the long term
[2m[36m(pid=100805)[0m Instructions fo

Instructions for updating:
non-resource variables are not supported in the long term


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fffd8e2c610>

## 3. Stop cluster and release allocation

In [5]:
# Release the Slurm allocation, but only if one was obtained explicitly
# (alloc is None by default, in which case there is nothing to release).
if alloc:
    # `slurm` is not imported by the import cell, so without this import the
    # branch raises NameError as soon as `alloc` is set.
    # NOTE(review): import path assumed for this SmartSim version — confirm.
    from smartsim import slurm

    slurm.release_allocation(alloc)

In [6]:
# Stop the jobs of the Ray cluster (workers and head) launched by this experiment.
exp.stop(cluster)

13:19:07 nid00000 SmartSim[76197] INFO Stopping model workers with job name workers-CBMJ1SJQUDGS
13:19:07 nid00000 SmartSim[76197] INFO Stopping model head with job name head-CBMJ1QLG5SN9


In [9]:

# NOTE(review): in notebook order this cell comes after the cluster has been
# stopped; it presumably needs an active Ray connection — confirm before
# re-running top to bottom.

# Directory where Tune stores its results. The original, cluster-specific path
# remains the default; set TUNE_LOCAL_DIR to run the notebook elsewhere.
tune_local_dir = os.environ.get("TUNE_LOCAL_DIR", "/lus/scratch/arigazzi/ray_local/")

# Shorter variant of the first sweep: 50 instead of 100 learning rates, with
# verbose=1 and a notebook progress reporter. num_gpus, num_cpus_per_worker,
# fail_fast and log_to_file are deliberately left unset here.
tune.run(
    "PPO",
    stop={"episode_reward_max": 200},
    config={
        "framework": "torch",
        "env": "CartPole-v0",
        "lr": tune.grid_search(np.linspace(0.001, 0.01, 50).tolist()),
        "log_level": "ERROR",
    },
    local_dir=tune_local_dir,
    verbose=1,
    # overwrite=True: each progress update replaces the previous cell output.
    progress_reporter=JupyterNotebookReporter(overwrite=True),
)

<IPython.core.display.HTML object>


[2m[36m(pid=39779)[0m Instructions for updating:
[2m[36m(pid=39779)[0m non-resource variables are not supported in the long term
[2m[36m(pid=39781)[0m Instructions for updating:
[2m[36m(pid=39781)[0m non-resource variables are not supported in the long term
[2m[36m(pid=113189)[0m Instructions for updating:
[2m[36m(pid=113189)[0m non-resource variables are not supported in the long term
[2m[36m(pid=113188)[0m Instructions for updating:
[2m[36m(pid=113188)[0m non-resource variables are not supported in the long term
[2m[36m(pid=39780)[0m Instructions for updating:
[2m[36m(pid=39780)[0m non-resource variables are not supported in the long term
[2m[36m(pid=39774)[0m Instructions for updating:
[2m[36m(pid=39774)[0m non-resource variables are not supported in the long term
[2m[36m(pid=39770)[0m Instructions for updating:
[2m[36m(pid=39770)[0m non-resource variables are not supported in the long term
[2m[36m(pid=113191)[0m Instructions for updating

<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
[2K[36m(pid=27027)[0m [2K
[2m[36m(pid=27027)[0m <IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>


[2m[36m(pid=27027)[0m 2021-05-25 10:02:22,533	INFO tune.py:549 -- Total run time: 55.35 seconds (55.19 seconds for the tuning loop).


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fd356fd07c0>

In [10]:
# Learning rates covered by the grid search.
learning_rates = [0.001, 0.002, 0.003, 0.004, 0.005]

# Trainer configuration shared by every trial of the sweep.
ppo_config = {
    "framework": "torch",
    "env": "CartPole-v0",
    "lr": tune.grid_search(learning_rates),
}

# Run the sweep and keep the returned ExperimentAnalysis for later inspection.
analysis = tune.run(
    "PPO",
    stop={"episode_reward_max": 200},
    config=ppo_config,
    progress_reporter=JupyterNotebookReporter(True),
)

<IPython.core.display.HTML object>


[2m[36m(pid=110180)[0m Instructions for updating:
[2m[36m(pid=110180)[0m non-resource variables are not supported in the long term
[2m[36m(pid=40586)[0m Instructions for updating:
[2m[36m(pid=40586)[0m non-resource variables are not supported in the long term
[2m[36m(pid=110180)[0m 2021-05-25 10:04:22,637	INFO trainer.py:694 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=42854)[0m Instructions for updating:
[2m[36m(pid=42854)[0m non-resource variables are not supported in the long term
[2m[36m(pid=40586)[0m 2021-05-25 10:04:22,978	INFO trainer.py:694 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=34269)[0m Instructions for updating:
[2m[36m(pid=34269)[0m non-resource variables are not supported in the long term
[2m[36m(pid=2297)[0m Instructions for updating:
[2m[36m(pid=2297)[0m non-resource v

[2K[36m(pid=27027)[0m [2K
[2m[36m(pid=27027)[0m <IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>


[2m[36m(pid=27027)[0m 2021-05-25 10:04:54,258	INFO tune.py:549 -- Total run time: 34.48 seconds (34.15 seconds for the tuning loop).
[2m[36m(pid=42854)[0m 2021-05-25 10:04:54,221	ERROR worker.py:382 -- SystemExit was raised from the worker
[2m[36m(pid=42854)[0m Traceback (most recent call last):
[2m[36m(pid=42854)[0m   File "python/ray/_raylet.pyx", line 495, in ray._raylet.execute_task
[2m[36m(pid=42854)[0m   File "python/ray/_raylet.pyx", line 505, in ray._raylet.execute_task
[2m[36m(pid=42854)[0m   File "python/ray/_raylet.pyx", line 449, in ray._raylet.execute_task.function_executor
[2m[36m(pid=42854)[0m   File "/lus/scratch/arigazzi/anaconda3/envs/smartsim/lib/python3.8/site-packages/ray/_private/function_manager.py", line 556, in actor_method_executor
[2m[36m(pid=42854)[0m     return method(__ray_actor, *args, **kwargs)
[2m[36m(pid=42854)[0m   File "/lus/scratch/arigazzi/anaconda3/envs/smartsim/lib/python3.8/site-packages/ray/actor.py", line 1001, in __