In [None]:
%load_ext autoreload
%autoreload 2

import sys
import os
import joblib

import pandas as pd
import numpy as np

# Add the parent directory to the module search path (presumably so that the
# project package, e.g. slurm_utils, can be imported from this notebook).
# The guard keeps the cell idempotent: re-running it does not pile up
# duplicate '../' entries in sys.path.
if '../' not in sys.path:
    sys.path.append('../')

## Example Notebook

This notebook shows you how to use slurm_utils. It shows the following steps:
- create config
- initialize experiment
- test locally
- install remote setup
- run remotely on SLURM
- look at results


Because of sensitive output, I'm not going to run the test below. If anything is unclear, please message me.

In [None]:
import logging
from slurm_utils.logging import init_logging

# Set up logging through the project's init_logging helper at INFO level.
init_logging(level=logging.INFO)

## Setup Configuration

In [None]:
# User name on the remote machine; interpolated into the server-side script paths below.
remote_user_name = "test_user"

# Experiment configuration; handed to SLURMExperiment as `experiment_config`.
config = {
    "experiment_name": "exp_1",
    "run_settings": {
        # we might have multiple starting points, e.g. for JAX / PyTorch
        "train_file": "empty_train.py",
        "objective": "accuracy",
        # currently grid and bayesian are available
        "hyperparam_algorithm": "grid",
        "hyperparameter_params": {},
    },
    "data": {
        # Plain string literals: these paths contain no placeholders, so no f-prefix is needed.
        "local_data_dir": "/not/used/or/needed/as/we/dont/compute/anything",
        "remote_data_dir": "/not/used/or/needed/as/we/dont/compute/anything",
    },
    "server_settings": {
        # see examples for the following two scripts in examples/
        "gateway_script": f"/storage/{remote_user_name}/server_setup/init_gateway.sh",
        "slurm_script": f"/storage/{remote_user_name}/server_setup/init_slurm.sh",
        # here we can define the resources per run:
        "num_cpu": 55,
        "mem": 32,  # in GB
        "num_gpu": 0,
    },
    # See https://parameter-sherpa.readthedocs.io/en/latest/gettingstarted/guide.html for an intro
    # to sherpa parameters
    "parameters": [
        {
            "name": "train_parameter",
            "type": "choice",
            "range": list(range(10)),
        }
    ],
}

In [None]:
from slurm_utils.experiment import SLURMExperiment

# The project root is the parent of the current working directory
# (main folder, one level above the notebooks/ directory).
current_dir = os.getcwd()
proj_dir = os.path.dirname(current_dir)

# Build the experiment handle from the config dict, then connect.
exp = SLURMExperiment(
    # name of the remote hostname; should be specified in ~/.ssh/config
    hostname="vda-dgx",
    # project name; should be the same as the repo name
    proj_name="slurm_utils_test_repo",
    # local project repository
    local_proj_dir=proj_dir,
    # python dict or json file
    experiment_config=config,
)
exp.connect()

## Test locally

Now we use the following two lines until we are happy with our training code and verify it runs locally.

In [None]:
# Prepare the local experiment; clean_existing=True deletes everything from the last run.
exp.prepare_local_experiment(clean_existing=True) # delete everything from last run
# Run the experiment on this machine to verify the training code works locally.
exp.run_locally()

## Prepare remote repository

This involves the following steps automatically:
- create remote python virtual env
- upload current project code to our PyPi
- install on the remote server via pip
- create experiment directories and upload config files

In [None]:
# Performs the remote preparation steps listed above (virtual env, code upload,
# pip install, experiment directories and config files).
# NOTE(review): clean_existing=True presumably removes the state of a previous
# remote run, mirroring the local variant — confirm.
exp.prepare_remote_experiment(clean_existing=True)

In [None]:
# check which jobs are currently running
# NOTE(review): connect() was already called when `exp` was created; it is called
# again here, presumably to make sure the connection is still open — confirm.
exp.connect()
exp.squeue()

In [None]:
# NOTE(review): presumably reports the resources currently available on the
# remote cluster — confirm against slurm_utils.
exp.available_resources()

## Run SLURM job

- If "upload_code" is set to true, the possibly updated code is uploaded again
- One experiment should only run one remote job at a time. If you want to submit another one anyway, add cancel_running_job=True. Note that this stops the currently running job.
- After the job is submitted, the result of an "squeue" query is returned

In [None]:
# upload_code=True: the possibly updated code is uploaded again before submitting.
# cancel_running_job=True: a currently running job of this experiment is stopped.
# After submission, the result of an "squeue" query is returned.
exp.run_remote(upload_code=True, cancel_running_job=True)

## Observe the current training run

In [None]:
# NOTE(review): presumably returns status information about the submitted
# SLURM job — confirm against slurm_utils.
exp.get_job_info()

In [None]:
# NOTE(review): presumably prints the output the running job has produced so
# far — confirm against slurm_utils.
exp.print_current_output()

## Results

When the training is complete, we can look at all runs.

In [None]:
# Look at all runs once the training is complete.
# NOTE(review): the exact return type is not visible here — confirm.
exp.get_results()