Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatic launcher detection #120

Merged
merged 9 commits into from
Jan 12, 2022
12 changes: 9 additions & 3 deletions smartsim/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import time
import os.path as osp
import time
from os import getcwd
from pprint import pformat

Expand All @@ -38,7 +38,7 @@
from .generation import Generator
from .settings import settings
from .utils import get_logger
from .utils.helpers import colorize, init_default
from .utils.helpers import colorize, detect_launcher, init_default
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is a user ever going to want to use this function? If so, we may consider creating a new module that's more user-facing, as the helpers are going to be moved to the _core directory.

I think it's possible a user may want to inspect the value returned by detect_launcher for specific use cases. I could go either way on this one. If you don't think so, just leave it.

If you do think we should have a module, names?

expUtils?
wlmUtils?
wlm?

I could see the last one being good if we end up moving the slurm (allocation) api to the wlm module.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, definitely wlm is the one which makes sense to me. I think you're right, users might want to include detect_launcher explicitly in their code.


logger = get_logger(__name__)

Expand All @@ -65,7 +65,9 @@ def __init__(self, name, exp_path=None, launcher="local"):
:param exp_path: path to location of ``Experiment`` directory if generated
:type exp_path: str, optional
:param launcher: type of launcher being used, options are "slurm", "pbs",
"cobalt", "lsf", or "local". Defaults to "local"
"cobalt", "lsf", or "local". If set to "auto",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Period instead of comma

an attempt will be made to find an available launcher on the system.
Defaults to "local"
:type launcher: str, optional
"""
self.name = name
Expand All @@ -76,6 +78,10 @@ def __init__(self, name, exp_path=None, launcher="local"):
raise NotADirectoryError("Experiment path provided does not exist")
exp_path = osp.abspath(exp_path)
self.exp_path = init_default(osp.join(getcwd(), name), exp_path, str)

if launcher == "auto":
launcher = detect_launcher()

self._control = Controller(launcher=launcher)
self._launcher = launcher.lower()

Expand Down
2 changes: 1 addition & 1 deletion smartsim/launcher/util/shell.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import time
from subprocess import PIPE, Popen, TimeoutExpired
from subprocess import PIPE, TimeoutExpired

import psutil

Expand Down
17 changes: 9 additions & 8 deletions smartsim/settings/cobaltSettings.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,15 @@ def __init__(
:param batch_args: extra batch arguments, defaults to None
:type batch_args: dict[str, str], optional
"""
super().__init__("qsub",
batch_args=batch_args,
nodes=nodes,
account=account,
queue=queue,
time=time,
**kwargs)

super().__init__(
"qsub",
batch_args=batch_args,
nodes=nodes,
account=account,
queue=queue,
time=time,
**kwargs,
)

def set_walltime(self, walltime):
"""Set the walltime of the job
Expand Down
22 changes: 12 additions & 10 deletions smartsim/settings/lsfSettings.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def set_tasks_per_rs(self, tasks_per_rs):

def set_tasks_per_node(self, tasks_per_node):
"""Set the number of tasks per resource set.

This function is an alias for `set_tasks_per_rs`.

:param tasks_per_node: number of tasks per resource set
Expand All @@ -138,13 +138,13 @@ def set_hostlist(self, host_list):

This function is only available to unify LSFSettings
to other WLM settings classes.

"""
pass

def set_cpus_per_task(self, cpus_per_task):
"""Set the number of cpus per tasks.

This function is an alias for `set_cpus_per_rs`.

:param cpus_per_task: number of cpus per resource set
Expand Down Expand Up @@ -335,13 +335,15 @@ def __init__(
kwargs.pop("account", None)
else:
project = kwargs.pop("account", None)

super().__init__("bsub",
batch_args=batch_args,
nodes=nodes,
account=project,
time=time,
**kwargs)

super().__init__(
"bsub",
batch_args=batch_args,
nodes=nodes,
account=project,
time=time,
**kwargs,
)

if smts:
self.set_smts(smts)
Expand Down
16 changes: 9 additions & 7 deletions smartsim/settings/pbsSettings.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,15 @@ def __init__(
:type batch_args: dict[str, str], optional
"""

super().__init__("qsub",
batch_args=batch_args,
nodes=nodes,
account=account,
queue=queue,
time=time,
**kwargs)
super().__init__(
"qsub",
batch_args=batch_args,
nodes=nodes,
account=account,
queue=queue,
time=time,
**kwargs,
)
self.resources = init_default({}, resources, dict)

self._ncpus = ncpus
Expand Down
21 changes: 17 additions & 4 deletions smartsim/settings/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from ..error import SmartSimError
from ..utils.helpers import is_valid_cmd
from ..utils.helpers import detect_launcher, is_valid_cmd
from . import *


Expand All @@ -36,7 +36,8 @@ def create_batch_settings(

See Experiment.create_batch_settings for details

:param launcher: launcher for this experiment
:param launcher: launcher for this experiment. If set to 'auto',
an attempt will be made to find an available launcher on the system
:type launcher: str
:param nodes: number of nodes for batch job, defaults to 1
:type nodes: int, optional
Expand All @@ -60,6 +61,9 @@ def create_batch_settings(
"lsf": BsubBatchSettings,
}

if launcher == "auto":
launcher = detect_launcher()

if launcher == "local":
raise SmartSimError("Local launcher does not support batch workloads")

Expand All @@ -68,7 +72,12 @@ def create_batch_settings(
try:
batch_class = by_launcher[launcher]
batch_settings = batch_class(
nodes=nodes, time=time, batch_args=batch_args, queue=queue, account=account, **kwargs
nodes=nodes,
time=time,
batch_args=batch_args,
queue=queue,
account=account,
**kwargs,
)
return batch_settings

Expand All @@ -91,7 +100,8 @@ def create_run_settings(

See Experiment.create_run_settings docstring for more details

:param launcher: launcher to create settings for
:param launcher: launcher to create settings for. If set to 'auto',
an attempt will be made to find an available launcher on the system
:type launcher: str
:param run_command: command to run the executable
:type run_command: str
Expand Down Expand Up @@ -124,6 +134,9 @@ def create_run_settings(
"lsf": ["jsrun", "mpirun"],
}

if launcher == "auto":
launcher = detect_launcher()

def _detect_command(launcher):
if launcher in by_launcher:
for cmd in by_launcher[launcher]:
Expand Down
14 changes: 8 additions & 6 deletions smartsim/settings/slurmSettings.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,12 +224,14 @@ def __init__(self, nodes=None, time="", account=None, batch_args=None, **kwargs)
:param batch_args: extra batch arguments, defaults to None
:type batch_args: dict[str, str], optional
"""
super().__init__("sbatch",
batch_args=batch_args,
nodes=nodes,
account=account,
time=time,
**kwargs)
super().__init__(
"sbatch",
batch_args=batch_args,
nodes=nodes,
account=account,
time=time,
**kwargs,
)

def set_walltime(self, walltime):
"""Set the walltime of the job
Expand Down
35 changes: 34 additions & 1 deletion smartsim/utils/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import socket
from os import environ
from shutil import which
from subprocess import run

import psutil

Expand Down Expand Up @@ -73,7 +74,7 @@ def init_default(default, init_value, expected_type=None):
def expand_exe_path(exe):
"""Takes an executable and returns the full path to that executable

:param exe: exectable or file
:param exe: executable or file
:type exe: str
"""

Expand Down Expand Up @@ -182,3 +183,35 @@ def cat_arg_and_value(arg_name, value):
return " ".join(("-" + arg_name, str(value)))
else:
return "=".join(("--" + arg_name, str(value)))


def detect_launcher():
    """Detect the launcher (workload manager) available on this system.

    A launcher is considered available when all of its command-line tools
    are found on ``PATH``. PBS and Cobalt both provide ``qsub``, so they are
    told apart by looking for their name in the output of ``qsub --version``.

    Precedence: PBS, Cobalt, LSF, Slurm, local.

    :return: one of "pbs", "cobalt", "lsf", "slurm" or "local"
    :rtype: str
    """
    # Imported here because only this function needs the exception type.
    from subprocess import SubprocessError

    def _all_on_path(commands):
        # True only if every command in ``commands`` resolves on PATH.
        return all(which(command) for command in commands)

    if _all_on_path(("qsub", "qstat", "qdel")):
        try:
            qsub_version = run(
                ["qsub", "--version"],
                shell=False,
                capture_output=True,
                encoding="utf-8",
                # Do not let a stuck qsub block launcher detection forever.
                timeout=10,
            )
        except (OSError, SubprocessError, UnicodeError):
            # qsub could not be executed, timed out, or printed bytes that
            # are not valid UTF-8: treat it as neither PBS nor Cobalt and
            # keep probing the remaining launchers instead of failing.
            version_text = ""
        else:
            version_text = (qsub_version.stdout or "").lower()

        if "pbs" in version_text:
            return "pbs"
        if "cobalt" in version_text:
            return "cobalt"

    if _all_on_path(("bsub", "jsrun", "jslist", "bjobs", "bkill")):
        return "lsf"

    slurm_commands = (
        "sacct",
        "srun",
        "salloc",
        "sbatch",
        "scancel",
        "sstat",
        "sinfo",
    )
    if _all_on_path(slurm_commands):
        return "slurm"

    return "local"
2 changes: 1 addition & 1 deletion tests/on_wlm/test_launch_orc_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def test_incoming_entities(fileutils, wlmutils):
launcher = wlmutils.get_test_launcher()
if launcher != "slurm":
pytest.skip("Test only runs on systems with Slurm as WLM")

exp_name = "test-incoming-entities"
exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
test_dir = fileutils.make_test_dir(exp_name)
Expand Down
8 changes: 6 additions & 2 deletions tests/on_wlm/test_simple_base_settings_on_wlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
def test_simple_model_on_wlm(fileutils, wlmutils):
launcher = wlmutils.get_test_launcher()
if launcher not in ["pbs", "slurm", "cobalt", "lsf"]:
pytest.skip("Test only runs on systems with LSF, PBSPro, Slurm, or Cobalt as WLM")
pytest.skip(
"Test only runs on systems with LSF, PBSPro, Slurm, or Cobalt as WLM"
)

exp_name = "test-simplebase-settings-model-launch"
exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
Expand All @@ -44,7 +46,9 @@ def test_simple_model_on_wlm(fileutils, wlmutils):
def test_simple_model_stop_on_wlm(fileutils, wlmutils):
launcher = wlmutils.get_test_launcher()
if launcher not in ["pbs", "slurm", "cobalt", "lsf"]:
pytest.skip("Test only runs on systems with LSF, PBSPro, Slurm, or Cobalt as WLM")
pytest.skip(
"Test only runs on systems with LSF, PBSPro, Slurm, or Cobalt as WLM"
)

exp_name = "test-simplebase-settings-model-stop"
exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
Expand Down
19 changes: 8 additions & 11 deletions tests/test_batch_settings.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,13 @@
from smartsim.settings import (
QsubBatchSettings,
SbatchSettings,
BsubBatchSettings
)
from smartsim.settings import BsubBatchSettings, QsubBatchSettings, SbatchSettings
from smartsim.settings.settings import create_batch_settings


def test_create_pbs_batch():
pbs_batch = create_batch_settings(
"pbs", nodes=1, time="10:00:00", queue="default", account="myproject", ncpus=10
) # test that kwargs make it to class init
args = pbs_batch.format_batch_args()
assert(isinstance(pbs_batch, QsubBatchSettings))
assert isinstance(pbs_batch, QsubBatchSettings)
assert args == [
"-l select=1:ncpus=10",
"-l place=scatter",
Expand All @@ -33,7 +30,7 @@ def test_create_sbatch():
) # test that kwargs from
# pbs don't affect slurm (I'm thinking this will be common)

assert(isinstance(slurm_batch, SbatchSettings))
assert isinstance(slurm_batch, SbatchSettings)
assert slurm_batch.batch_args["partition"] == "default"
args = slurm_batch.format_batch_args()
assert args == [
Expand All @@ -52,10 +49,10 @@ def test_create_bsub():
"lsf",
nodes=1,
time="10:00:00",
account="myproject", # test that account is set
account="myproject", # test that account is set
queue="default",
batch_args=batch_args
batch_args=batch_args,
)
assert(isinstance(bsub, BsubBatchSettings))
assert isinstance(bsub, BsubBatchSettings)
args = bsub.format_batch_args()
assert(args == ['-core_isolation', '-nnodes 1', '-q default'])
assert args == ["-core_isolation", "-nnodes 1", "-q default"]
1 change: 1 addition & 0 deletions tests/test_configs/smartredis/consumer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
import torch
import torch.nn as nn

from smartredis import Client

if __name__ == "__main__":
Expand Down
1 change: 1 addition & 0 deletions tests/test_configs/smartredis/producer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
import torch
import torch.nn as nn

from smartredis import Client


Expand Down
5 changes: 5 additions & 0 deletions tests/test_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,8 @@ def test_summary(fileutils):
assert m.type == row["Entity-Type"]
assert 0 == int(row["RunID"])
assert 0 == int(row["Returncode"])


def test_launcher_detection(wlmutils):
exp = Experiment("test-launcher-detection", launcher="auto")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about the case where we run the local test suite on a system with Slurm? Does this still pass?

Copy link
Collaborator Author

@al-rigazzi al-rigazzi Jan 12, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Spartee I changed my mind on this a couple of times. My current opinion is that we keep this test, but we add an exception in case wlmutils.get_test_launcher()=="local" and there is another launcher on the system. This way, at least, the function is always tested, but the result is ignored on clusters/supercomputers, if there is a launcher and you have set SMARTSIM_TEST_LAUNCHER=local.

assert exp._launcher == wlmutils.get_test_launcher()
3 changes: 2 additions & 1 deletion tests/test_smartredis.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,9 @@

shouldrun = True
try:
import smartredis
import torch

import smartredis
except ImportError:
shouldrun = False

Expand Down