# Setting up a Ray cluster with SmartSim

## 1. Start the cluster
We set up a SmartSim experiment, which will handle the launch of the Ray cluster.

First we import the relevant modules and request a Slurm allocation with one node for the Ray head plus one node per worker.

In [1]:
from smartsim import Experiment, slurm
from smartsim.ray import RayCluster

# Number of Ray worker nodes; the allocation below requests one extra node
# in addition to these (1 + NUM_WORKERS in total).
NUM_WORKERS = 3

alloc = slurm.get_allocation(
    nodes=1 + NUM_WORKERS,
    time="12:00:00",
    options={
        "ntasks": str(1 + NUM_WORKERS),
        "partition": "spider",
        "C": "V100",
    },
)

11:29:54 osprey.us.cray.com SmartSim[139114] INFO Allocation successful with Job ID: 247529


In [2]:
# Experiment object used below to generate, start and stop the Ray cluster.
exp = Experiment("ray-cluster", launcher="slurm")

# Ray cluster description: NUM_WORKERS workers, launched with Slurm inside
# the allocation requested earlier (batch=False).
cluster = RayCluster(
    name="ray-cluster",
    run_args={"time": "06:00:00", "unbuffered": None},
    path="",
    launcher="slurm",
    workers=NUM_WORKERS,
    alloc=alloc,
    batch=False,
    ray_num_cpus=36,
)

# When the cluster is configured as a batch launch, give the head (and the
# workers, if any) a preamble that sets up the conda environment; presumably
# these lines are prepended to the generated batch scripts -- confirm.
if cluster.batch:
    env_preamble = ("source ~/.bashrc", "conda activate smartsim")
    # Separate list objects per model, so editing one never affects the other.
    cluster.head_model.batch_settings._preamble = list(env_preamble)
    if NUM_WORKERS:
        cluster.worker_model.batch_settings._preamble = list(env_preamble)

exp.generate(cluster, overwrite=True)

11:29:54 osprey.us.cray.com SmartSim[139114] INFO Working in previously created experiment


In [3]:
# Launch the Ray cluster through the experiment. block=False and summary=False
# are passed; presumably the call returns without waiting for the jobs to
# finish and skips the pre-launch summary -- confirm against the SmartSim docs.
exp.start(cluster, block=False, summary=False)

11:30:01 osprey.us.cray.com SmartSim[139114] INFO Ray cluster launched on nodes: ['spider-0006', 'spider-0007', 'spider-0009', 'spider-0008']


## 2. Start the Ray driver script

In [8]:
# Hand the ppo_tune.py template to the cluster as a Ray job.
# NOTE(review): absolute, user-specific path -- adjust to your own checkout.
cluster.start_ray_job('/lus/scratch/arigazzi/smartsim-dev/SmartSim/tutorials/05_starting_ray/templates/ppo_tune.py')

In [30]:
# Hand the ppo_train.py template to the cluster as a Ray job.
# NOTE(review): absolute, user-specific path -- adjust to your own checkout.
cluster.start_ray_job('/lus/scratch/arigazzi/smartsim-dev/SmartSim/tutorials/05_starting_ray/templates/ppo_train.py')

In [4]:
# Hand the mnist_pytorch_trainable.py template to the cluster as a Ray job.
# NOTE(review): absolute, user-specific path -- adjust to your own checkout.
cluster.start_ray_job('/lus/scratch/arigazzi/smartsim-dev/SmartSim/tutorials/05_starting_ray/templates/mnist_pytorch_trainable.py')

## 3. Stop cluster and release allocation

In [32]:
# Stop the Ray cluster jobs that were launched through the experiment.
exp.stop(cluster)

11:29:32 osprey.us.cray.com SmartSim[85487] INFO Stopping model workers with job name workers-CBFNVCH1UMPX
11:29:32 osprey.us.cray.com SmartSim[85487] INFO Stopping model head with job name head-CBFNVAHRSZHG


In [33]:
# Release the Slurm allocation obtained at the start of the notebook, if one
# is held, so the nodes are returned to the scheduler.
if alloc:
    slurm.release_allocation(alloc)

11:29:34 osprey.us.cray.com SmartSim[85487] INFO Releasing allocation: 247528
11:29:34 osprey.us.cray.com SmartSim[85487] INFO Successfully freed allocation 247528


In [13]:
import ray
from ray import tune
import ray.util
import time
import numpy as np
import argparse
import os
from ray.tune.progress_reporter import JupyterNotebookReporter

# Connect this kernel to the Ray head node (port 10001) at most once.
# The flag is looked up defensively because it may not exist yet on a fresh
# kernel (a bare `connected` would raise NameError), and it is only set after
# connect() returns, so a failed attempt can be retried by re-running the cell.
if not globals().get("connected", False):
    ray.util.connect(cluster.head_model.address + ":10001")
    connected = True
    print("connected")

# Progress reporter that renders trial status inside the notebook.
reporter = JupyterNotebookReporter(overwrite=False)

# Grid-search PPO on CartPole over 100 evenly spaced learning rates in
# [0.001, 0.01]; the stop criterion is keyed on episode_reward_max = 200.
tune.run(
    "PPO",
    stop={"episode_reward_max": 200},
    config={
        "framework": "torch",
        "env": "CartPole-v0",
        "num_gpus": 0,
        "lr": tune.grid_search(np.linspace(0.001, 0.01, 100).tolist()),
        "log_level": "ERROR",
        "num_cpus_per_worker": 1,
        "num_cpus_for_driver": 1,
    },
    # NOTE(review): absolute, user-specific results directory -- adjust as needed.
    local_dir="/lus/scratch/arigazzi/ray_local/",
    verbose=1,
    fail_fast=True,
    progress_reporter=reporter,
    log_to_file=True,
)

[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>
[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>


[2m[36m(pid=37955, ip=10.10.2.74)[0m 2021-05-17 12:26:25,775	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=37962, ip=10.10.2.74)[0m 2021-05-17 12:26:25,839	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=37966, ip=10.10.2.74)[0m 2021-05-17 12:26:26,000	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=37969, ip=10.10.2.74)[0m 2021-05-17 12:26:26,011	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=67573)[0m 2021-05-17 12:26:26,062	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=6

[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>
[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>
[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>


[2m[36m(pid=39482, ip=10.10.2.74)[0m 2021-05-17 12:27:05,611	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=39484, ip=10.10.2.74)[0m 2021-05-17 12:27:05,634	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>


[2m[36m(pid=39518, ip=10.10.2.74)[0m 2021-05-17 12:27:05,995	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=38673, ip=10.10.2.73)[0m 2021-05-17 12:27:06,049	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=38692, ip=10.10.2.73)[0m 2021-05-17 12:27:06,297	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=38690, ip=10.10.2.73)[0m 2021-05-17 12:27:06,317	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=38705, ip=10.10.2.73)[0m 2021-05-17 12:27:06,569	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>


[2m[36m(pid=40505, ip=10.10.2.74)[0m 2021-05-17 12:27:20,178	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=40509, ip=10.10.2.74)[0m 2021-05-17 12:27:20,218	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=40510, ip=10.10.2.74)[0m 2021-05-17 12:27:20,251	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=40528, ip=10.10.2.74)[0m 2021-05-17 12:27:20,427	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=40564, ip=10.10.2.74)[0m 2021-05-17 12:27:20,650	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>


[2m[36m(pid=71868)[0m 2021-05-17 12:27:25,443	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=71870)[0m 2021-05-17 12:27:25,703	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=71942)[0m 2021-05-17 12:27:26,957	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=71941)[0m 2021-05-17 12:27:28,586	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=71938)[0m 2021-05-17 12:27:28,589	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=72027)[0m 2021-05-17 12:27:28,626	INFO trainer.py:696 -- Cur

[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>


[2m[36m(pid=72771)[0m 2021-05-17 12:27:30,320	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=72772)[0m 2021-05-17 12:27:30,333	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=72773)[0m 2021-05-17 12:27:30,383	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=38119, ip=10.10.2.75)[0m 2021-05-17 12:27:36,625	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>
[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>
[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>


[2m[36m(pid=38975, ip=10.10.2.75)[0m 2021-05-17 12:27:49,432	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>
[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>


[2m[36m(pid=42704, ip=10.10.2.74)[0m 2021-05-17 12:27:59,588	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=42705, ip=10.10.2.74)[0m 2021-05-17 12:27:59,562	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=40991, ip=10.10.2.73)[0m 2021-05-17 12:28:00,574	INFO trainer.py:696 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>
[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>
[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>
[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>
[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>
[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>
[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>
[2m[36m(pid=35264)[0m <IPython.core.display.HTML object>


[2m[36m(pid=35264)[0m 2021-05-17 12:28:44,158	INFO tune.py:549 -- Total run time: 144.96 seconds (144.14 seconds for the tuning loop).


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis object at 0x7fd06f0b2050>


In [4]:
# Tracks whether this kernel has already connected to the Ray head; the Ray
# Tune cell checks it before calling ray.util.connect.
# NOTE(review): this cell sits below the cell that reads the flag, so it has
# to be executed first when running the notebook by hand.
connected=False

In [7]:
from IPython.display import display, HTML