In [1]:
from smartsim import Experiment
from smartsim.database import SlurmOrchestrator
from smartsim.settings import SrunSettings
from smartsim import slurm

def launch_cluster_orc(experiment, port, alloc):
    """Just spin up a database cluster, check the status
       and tear it down"""

    db = SlurmOrchestrator(port=port,
                            db_nodes=1,
                            batch=False,
                            alloc=alloc,
                            interface="ib0")
    

    # generate directories for output files
    # pass in objects to make dirs for
    experiment.generate(db, overwrite=True)

    # start the database on interactive allocation
    experiment.start(db, block=True)

    return db

def create_uploader(experiment, alloc, nodes=1, tasks_per_node=1):

    srun = SrunSettings(exe="python",
                        exe_args="data_uploader.py",
                        env_vars={"PYTHONUNBUFFERED": "1"},
                        alloc=alloc)
    srun.set_nodes(nodes)
    srun.set_tasks_per_node(tasks_per_node)

    uploader = experiment.create_ensemble("uploader", replicas=2, run_settings=srun)

    # create directories for the output files and copy
    # scripts to execution location inside newly created dir
    # only necessary if its not an executable (python is executable here)
    uploader.attach_generator_files(to_copy=["./torch/data_uploader.py"])
    experiment.generate(uploader, overwrite=True)
    return uploader


def create_trainer(experiment, alloc):

    srun = SrunSettings(exe="python",
                        exe_args="training_service.py",
                        env_vars={"PYTHONUNBUFFERED": "1"},
                        alloc=alloc)
    srun.set_tasks(1)

    trainer = experiment.create_model("trainer", srun)

    # create directories for the output files and copy
    # scripts to execution location inside newly created dir
    # only necessary if its not an executable (python is executable here)
    trainer.attach_generator_files(to_copy="./torch/training_service.py")
    experiment.generate(trainer, overwrite=True)
    return trainer


def create_trainer_hvd(experiment, alloc, nodes=1, tasks_per_node=1):

    srun = SrunSettings(exe="python",
                        exe_args="training_service_hvd.py",
                        env_vars={"PYTHONUNBUFFERED": "1"},
                        alloc=alloc)
    srun.set_nodes(nodes)
    srun.set_tasks_per_node(tasks_per_node)

    trainer = experiment.create_model("trainer", srun)

    # create directories for the output files and copy
    # scripts to execution location inside newly created dir
    # only necessary if its not an executable (python is executable here)
    trainer.attach_generator_files(to_copy="./torch/training_service_hvd.py")
    experiment.generate(trainer, overwrite=True)
    return trainer

In [2]:
alloc = slurm.get_allocation(nodes=4, time="03:00:00", options={"constraint": "V100", "partition": "spider"})

10:35:49 osprey.us.cray.com SmartSim[139345] INFO Allocation successful with Job ID: 529595


In [3]:
exp = Experiment("launch_streaming", launcher="slurm")

db_port = 6780

# start the database
db = launch_cluster_orc(exp, db_port, alloc)
uploader_model = create_uploader(exp, alloc, 1, 8)
uploader_model.enable_key_prefixing()
exp.start(uploader_model, block=False, summary=False)
trainer_model = create_trainer(exp, alloc)
for uploader in uploader_model.entities:
    trainer_model.register_incoming_entity(uploader)

exp.start(trainer_model, block=True, summary=False)

# shutdown the database because we don't need it anymore
exp.stop(db)

print(exp.summary())

10:35:49 osprey.us.cray.com SmartSim[139345] INFO Working in previously created experiment
10:36:22 osprey.us.cray.com SmartSim[139345] INFO Working in previously created experiment
10:36:29 osprey.us.cray.com SmartSim[139345] INFO Working in previously created experiment
10:36:37 osprey.us.cray.com SmartSim[139345] INFO uploader_0(529595.1): New
10:36:37 osprey.us.cray.com SmartSim[139345] INFO uploader_1(529595.2): New
10:36:37 osprey.us.cray.com SmartSim[139345] INFO trainer(529595.3): New
10:36:42 osprey.us.cray.com SmartSim[139345] INFO uploader_0(529595.1): Running
10:36:42 osprey.us.cray.com SmartSim[139345] INFO uploader_1(529595.2): Running
10:36:42 osprey.us.cray.com SmartSim[139345] INFO trainer(529595.3): Running
10:36:47 osprey.us.cray.com SmartSim[139345] INFO uploader_0(529595.1): Running
10:36:47 osprey.us.cray.com SmartSim[139345] INFO uploader_1(529595.2): Running
10:36:47 osprey.us.cray.com SmartSim[139345] INFO trainer(529595.3): Running
10:36:52 osprey.us.cray.com 

KeyboardInterrupt: 

In [6]:
exp.stop(db, uploader_model, trainer_model)

10:49:58 osprey.us.cray.com SmartSim[139345] INFO Stopping model trainer with job name trainer-CG5TCJCPLF95
10:49:58 osprey.us.cray.com SmartSim[139345] INFO Stopping model uploader_0 with job name uploader_0-CG5TCGCL1IF0
10:49:58 osprey.us.cray.com SmartSim[139345] INFO Stopping model uploader_1 with job name uploader_1-CG5TCGCL2IEP
10:49:58 osprey.us.cray.com SmartSim[139345] INFO Stopping model orchestrator_0 with job name orchestrator_0-CG5TC100ILIU


In [7]:
exp = Experiment("launch_streaming_hvd", launcher="slurm")

db_port = 6780

# start the database
db = launch_cluster_orc(exp, db_port, alloc)
uploader_model = create_uploader(exp, alloc, 1, 16)
uploader_model.enable_key_prefixing()
exp.start(uploader_model, block=False, summary=False)
trainer_model = create_trainer_hvd(exp, alloc, 1, 8)
for uploader in uploader_model.entities:
    trainer_model.register_incoming_entity(uploader)

exp.start(trainer_model, block=True, summary=False)

# shutdown the database because we don't need it anymore
exp.stop(db)

print(exp.summary())

10:49:58 osprey.us.cray.com SmartSim[139345] INFO Working in previously created experiment
10:50:32 osprey.us.cray.com SmartSim[139345] INFO Working in previously created experiment
10:50:38 osprey.us.cray.com SmartSim[139345] INFO Working in previously created experiment
10:50:47 osprey.us.cray.com SmartSim[139345] INFO uploader_0(529595.9): New
10:50:47 osprey.us.cray.com SmartSim[139345] INFO uploader_1(529595.10): New
10:50:47 osprey.us.cray.com SmartSim[139345] INFO trainer(529595.11): New
10:50:52 osprey.us.cray.com SmartSim[139345] INFO uploader_0(529595.9): Running
10:50:52 osprey.us.cray.com SmartSim[139345] INFO uploader_1(529595.10): Running
10:50:52 osprey.us.cray.com SmartSim[139345] INFO trainer(529595.11): Running
10:50:57 osprey.us.cray.com SmartSim[139345] INFO uploader_0(529595.9): Running
10:50:57 osprey.us.cray.com SmartSim[139345] INFO uploader_1(529595.10): Running
10:50:57 osprey.us.cray.com SmartSim[139345] INFO trainer(529595.11): Running
10:51:02 osprey.us.cra

KeyboardInterrupt: 

In [8]:
slurm.release_allocation(alloc)

13:44:03 osprey.us.cray.com SmartSim[104531] INFO Releasing allocation: 528402
13:44:03 osprey.us.cray.com SmartSim[104531] INFO Successfully freed allocation 528402
13:44:13 osprey.us.cray.com SmartSim[104531] INFO uploader_0(528402.9): Cancelled
13:44:13 osprey.us.cray.com SmartSim[104531] INFO uploader_1(528402.10): Cancelled
13:44:13 osprey.us.cray.com SmartSim[104531] INFO trainer(528402.11): Cancelled
13:44:13 osprey.us.cray.com SmartSim[104531] INFO orchestrator_0(528402.8): Cancelled
