# Setting up a Ray cluster with SmartSim

## 1. Start the cluster
We set up a SmartSim experiment, which will handle the launch of the Ray cluster.

First we import the relevant modules and request a Slurm allocation with one node for the Ray head plus one node per worker.

In [1]:
from smartsim import Experiment, slurm
from smartsim.ray import RayCluster

# Number of workers requested for the Ray cluster (passed as `workers=` below).
NUM_WORKERS = 0

# The allocation asks for one extra node (and one extra task) on top of the
# workers — presumably for the Ray head; confirm against the cluster setup.
node_count = 1 + NUM_WORKERS
alloc = slurm.get_allocation(
    nodes=node_count,
    time="12:00:00",
    options={
        "ntasks": str(node_count),
        "partition": "spider",
        "C": "V100",
    },
)

17:12:08 osprey.us.cray.com SmartSim[84734] INFO Allocation successful with Job ID: 243912


In [3]:
# Experiment used to generate, launch and stop the Ray cluster via Slurm.
exp = Experiment("ray-cluster", launcher='slurm')

cluster = RayCluster(
    name="ray-cluster",
    run_args={"time": "06:00:00", "unbuffered": None},
    path='',
    launcher='slurm',
    workers=NUM_WORKERS,
    alloc=alloc,
    batch=False,
    ray_num_cpus=56,
)

# In batch mode, every batch script gets a preamble that sources the shell
# profile and activates the `smartsim` conda environment. The worker model is
# only touched when workers were requested.
if cluster.batch:
    model_names = ["head_model"] + (["worker_model"] if NUM_WORKERS else [])
    for model_name in model_names:
        # A fresh list per model, so the two preambles are never aliased.
        getattr(cluster, model_name).batch_settings._preamble = [
            "source ~/.bashrc",
            "conda activate smartsim",
        ]

exp.generate(cluster, overwrite=True)

17:12:30 osprey.us.cray.com SmartSim[84734] INFO Working in previously created experiment


In [7]:
# Start the Ray cluster with block=False and summary=False
# (presumably: do not wait for completion and skip the launch summary).
exp.start(
    cluster,
    block=False,
    summary=False,
)

RuntimeError: Could not find Ray cluster head address.

## 2. Start the Ray driver script

In [8]:
# Driver script for the PPO Tune example, submitted as a Ray job on the cluster.
ppo_tune_script = '/lus/scratch/arigazzi/smartsim-dev/SmartSim/tutorials/05_starting_ray/templates/ppo_tune.py'
cluster.start_ray_job(ppo_tune_script)

In [30]:
# Driver script for the PPO training example, submitted as a Ray job on the cluster.
ppo_train_script = '/lus/scratch/arigazzi/smartsim-dev/SmartSim/tutorials/05_starting_ray/templates/ppo_train.py'
cluster.start_ray_job(ppo_train_script)

In [4]:
# Driver script for the MNIST PyTorch trainable example, submitted as a Ray job
# on the cluster.
mnist_script = '/lus/scratch/arigazzi/smartsim-dev/SmartSim/tutorials/05_starting_ray/templates/mnist_pytorch_trainable.py'
cluster.start_ray_job(mnist_script)

## 3. Stop cluster and release allocation

In [6]:
# Stop the Ray cluster that was launched through this experiment.
exp.stop(cluster)

TypeError: can only concatenate str (not "NoneType") to str

In [6]:
# Release the Slurm allocation obtained at the top of the notebook; skipped
# when `alloc` is falsy, i.e. there is no allocation to free.
if alloc:
    slurm.release_allocation(alloc)

17:09:49 osprey.us.cray.com SmartSim[35040] INFO Releasing allocation: 243910
17:09:49 osprey.us.cray.com SmartSim[35040] INFO Successfully freed allocation 243910


In [6]:
# List the Slurm queue to check whether any jobs from this notebook are still running.
!squeue

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            242403     bdw18  Chpl-ep  chapelu  R       6:04     16 prod-[0001-0016]
            242404     bdw18 head-CBC arigazzi  R       2:57      1 prod-0017
            242394     clx28 sstsim.x visharma  R      40:31     32 prod-[0065-0096]


In [7]:
# Cancel a leftover job by its Slurm job ID. The ID is hardcoded from the
# squeue listing above — update it before re-running this cell.
!scancel 242404


11:26:17 osprey.us.cray.com SmartSim[75449] INFO head(242404): Failed


In [None]:
import ray
from ray import tune
import ray.util
import time
import numpy as np
import argparse
import os
from ray.tune.progress_reporter import JupyterNotebookReporter

# Attach this notebook to the Ray cluster as a Ray client.
# NOTE(review): 10001 is Ray's default client-server port — confirm the head
# node was started with that port.
head_address = cluster.head_model.address
if head_address is None:
    # Without this guard the concatenation below fails with the opaque
    # 'can only concatenate str (not "NoneType") to str' TypeError.
    raise RuntimeError(
        "Ray head address is unknown (cluster.head_model.address is None); "
        "start the cluster successfully with exp.start(cluster) before connecting."
    )
ray.util.connect(head_address + ":10001")
print("connected")

# Notebook progress reporter configured with overwrite=True and at most
# 10 progress rows.
reporter = JupyterNotebookReporter(overwrite=True, max_progress_rows=10)

print("initialized")
# Run PPO on CartPole-v0, stopping each trial once the maximum episode reward
# reaches 200. fail_fast=True aborts the whole run on the first trial error.
tune.run(
    "PPO",
    stop={"episode_reward_max": 200},
    config={
        "framework": "torch",
        "env": "CartPole-v0",
        "num_gpus": 0,
        # Grid search over 1000 evenly spaced learning rates in [0.001, 0.01],
        # i.e. 1000 trials.
        "lr": tune.grid_search(np.linspace(0.001, 0.01, 1000).tolist()),
        "log_level": "ERROR",
        "num_cpus_per_worker": 1,
        "num_cpus_for_driver": 1,
    },
    local_dir="/lus/scratch/arigazzi/ray_local/",
    verbose=3,
    fail_fast=True,
    progress_reporter=reporter,
    log_to_file=True,
)