# Setting up a Ray cluster with SmartSim

## 1. Start the cluster
We set up a SmartSim experiment, which will handle the launch of the Ray cluster.

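First, we import the relevant modules and set the two configuration values used below: the number of workers and the allocation to use (`None` when no allocation is supplied).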

In [6]:
from pathlib import Path

from smartsim import Experiment, slurm
from smartsim.ray import RayCluster

# Configuration consumed by the RayCluster constructor further down.
NUM_WORKERS = 0  # forwarded as `workers=`
alloc = None     # forwarded as `alloc=`; None means no allocation id is supplied here

In [7]:
# The experiment object is what generates, starts and stops the cluster below.
exp = Experiment("ray-cluster", launcher='slurm')

# Slurm-style options handed to RayCluster as `run_args`.
# "exclusive" is a flag without a value, hence None.
slurm_run_args = {
    "partition": "ccm_queue",
    "time": "06:00:00",
    "exclusive": None,
    "hint": "nomultithread",
}

cluster = RayCluster(
    name="ray-cluster",
    run_args=slurm_run_args,
    path='',
    launcher='slurm',
    workers=NUM_WORKERS,
    alloc=alloc,
    batch=False,
    ray_num_cpus=12,
)

# Only when the cluster reports batch mode: set the shell lines used as the
# head model's batch preamble (module/conda setup followed by ulimit tweaks).
# NOTE(review): this writes to the private `_preamble` attribute — check
# whether the installed SmartSim version offers a public setter.
if cluster.batch:
    cluster.head_model.batch_settings._preamble = [
        "module load ccm",
        "source ~/.bashrc",
        "conda activate smartsim",
        "ulimit -s unlimited",
        "ulimit -u unlimited",
        "ulimit -n unlimited",
        "ulimit -c unlimited",
        "ulimit -a",
    ]

# Worker-side counterpart, left disabled in the notebook:
# if NUM_WORKERS:
#     cluster.worker_model.batch_settings._preamble = ["source ~/.bashrc", "conda activate smartsim"]

# Write out the experiment files for the cluster, replacing any earlier output.
exp.generate(cluster, overwrite=True)

03:00:24 nid00152 SmartSim[9315] INFO Working in previously created experiment


In [8]:
# Launch the Ray cluster through the experiment; the keyword flags are the
# same ones the notebook has always passed (block=False, summary=False).
exp.start(
    cluster,
    block=False,
    summary=False,
)

03:00:34 nid00152 SmartSim[9315] INFO Ray cluster launched on nodes: ['nid00152']


## 2. Start the Ray driver script

In [9]:
# Locate the driver script relative to the current working directory instead of
# a user-specific absolute path, so the tutorial is not tied to one machine.
# Assumes the notebook is run from the tutorial folder that contains
# `templates/` — adjust `driver_script` if it is launched from elsewhere.
driver_script = Path("templates") / "ppo_tune.py"

# An absolute path is still what gets passed to the cluster.
cluster.start_ray_job(str(driver_script.resolve()))

## 3. Stop cluster and release allocation

In [5]:
# Stop the Ray cluster that was launched through this experiment.
exp.stop(cluster)

03:00:18 nid00152 SmartSim[9315] INFO Stopping model head with job name head-CBBY9A6N8OND


In [9]:
# Release the Slurm allocation only when one was supplied in the configuration
# cell; with `alloc = None` there is nothing to free and the call is skipped,
# so this cell is safe under "Run All".
if alloc is not None:
    slurm.release_allocation(alloc)

05:51:53 osprey.us.cray.com SmartSim[131434] INFO Releasing allocation: 230194
05:51:53 osprey.us.cray.com SmartSim[131434] INFO Successfully freed allocation 230194


In [15]:
# Show the Slurm queue to check which jobs are still listed.
!squeue

   JOBID     USER ACCOUNT           NAME  ST REASON    START_TIME                TIME  TIME_LEFT NODES CPUS
 1288493    kshry  (null)   check-pstree   R None      2021-05-12T13:10:51       0:06      59:54     1  112
 1288067 arigazzi  (null)    interactive   R None      2021-05-12T11:31:15    1:39:42   10:20:18     5  180
 1212692     ssuj  (null)          aries  PD ReqNodeNo N/A                       0:00    1:00:00     1    1
 1212693     ssuj  (null)          aries  PD ReqNodeNo N/A                       0:00    1:00:00     1    1
 1244466     vers  (null)    xtmemtester  PD ReqNodeNo N/A                       0:00      20:00     1    1
 1246955     vers  (null)    xtmemtester  PD ReqNodeNo N/A                       0:00      40:00     1    1
 1256833     vers  (null)    xtmemtester  PD ReqNodeNo N/A                       0:00      40:00     1    1
 1261674     vers  (null)    xtmemtester  PD ReqNodeNo N/A                       0:00      40:00     1    1
 1267800     vers  (null)   

In [6]:
# NOTE(review): the job id below is hard-coded; replace it with the id of the
# job you actually want to cancel (e.g. as listed by `squeue`) before running.
!scancel 1277211
