In [1]:
from recursive_clustering.experiment.tested_models import models_dict
from pathlib import Path
from itertools import product

In [2]:
# Folder (relative to the current working directory) that receives the
# generated sbatch scripts; created on demand.
script_dir = Path('scripts')
script_dir.mkdir(parents=True, exist_ok=True)

# Alphabetically ordered nicknames of every registered model except 'WBMS'.
models_nicknames = sorted(models_dict)
models_nicknames.remove('WBMS')
models_nicknames

['AffinityPropagation',
 'AverageAgglomerativeClustering',
 'Clique',
 'CompleteAgglomerativeClustering',
 'DBSCAN',
 'HDBSCAN',
 'IRFLLRR',
 'KMeans',
 'KMeansProj',
 'MeanShift',
 'OPTICS',
 'Proclus',
 'RecursiveClustering',
 'RecursiveClusteringHDBSCAN',
 'RecursiveClusteringHDBSCAN_full',
 'RecursiveClusteringSCSRGF',
 'RecursiveClusteringSCSRGF_full',
 'RecursiveClustering_full',
 'SingleAgglomerativeClustering',
 'SpectralClustering',
 'SpectralSubspaceRandomization',
 'WardAgglomerativeClustering']

In [3]:
# Sanity check: how many model nicknames remain after 'WBMS' was removed.
len(models_nicknames)

22

In [4]:
# # sensible parameters for n_classes=5
# models_params = {
#     'AffinityPropagation': {"damping": 0.9},
#     'AverageAgglomerativeClustering': {'n_clusters': 5},
#     'Clique': {},
#     'CompleteAgglomerativeClustering': {'n_clusters': 5},
#     'DBSCAN': {},
#     'HDBSCAN': {},
#     'IRFLLRR': {'sc_n_clusters': 5},
#     'KMeans': {'n_clusters': 5},
#     'KMeansProj': {},
#     'MeanShift': {},
#     'OPTICS': {},
#     'Proclus': {'n_clusters': 5},
#     'RecursiveClustering': {},
#     'SingleAgglomerativeClustering': {'n_clusters': 5},
#     'SpectralClustering': {'n_clusters': 5},
#     'SpectralSubspaceRandomization': {'sc_n_clusters': 5},
#     'WardAgglomerativeClustering': {'n_clusters': 5},
# }

In [5]:
# HPO sampler used for each model nickname, declared per sampler family.
_tpe_sampled_models = {
    'AffinityPropagation',
    'Clique',
    'DBSCAN',
    'IRFLLRR',
    'Proclus',
    'RecursiveClustering',
    'RecursiveClustering_full',
    'SpectralSubspaceRandomization',
}
_grid_sampled_models = {
    'AverageAgglomerativeClustering',
    'CompleteAgglomerativeClustering',
    'HDBSCAN',
    'KMeans',
    'KMeansProj',
    'MeanShift',
    'OPTICS',
    'SingleAgglomerativeClustering',
    'SpectralClustering',
    'WardAgglomerativeClustering',
}

# Mapping nickname -> sampler name, keyed in alphabetical order.
models_samplers = {
    nickname: ('tpe' if nickname in _tpe_sampled_models else 'grid')
    for nickname in sorted(_tpe_sampled_models | _grid_sampled_models)
}

In [7]:
# Generate one Slurm array-job script per model for the "gaussian_random"
# experiment. Each array task runs one combination of the array arguments
# defined below; the script text is written to `script_dir`.

# Models to generate scripts for (overrides the list built earlier in the notebook).
models_nicknames = ['RecursiveClustering', 'KMeans']

# --- experiment grid --------------------------------------------------------
# String convention shared with the bash template below:
#   ','  separates alternatives (each alternative becomes its own array task)
#   '-'  joins values that are passed together to a single argument
n_samples = "1000"
n_informative_features = "1000"
# approx. fraction of random features: 0, 0.1, 0.3, 0.5, 0.7, 0.9
n_random_features = "0,100,500,1000,2300,9000"
# The two values below are only interpolated into commented-out template lines.
n_features = ""
pct_random_features = ""
# pct_random_features = "0.3"
n_centers = "5"
distances = "50"
seeds_unified = "0-1-2-3-4"
# seeds_unified = "3"

# --- hyper-parameter optimisation -------------------------------------------
direction = 'maximize'
hpo_metric = 'silhouette'
n_trials = '30'
pruner = 'none'
timeout_hpo = 7200
timeout_trial = 600
suffix = f'{hpo_metric}'

# Value of `#SBATCH -c` in the generated script.
c = 40

# --- dask -------------------------------------------------------------------
# previous configuration, kept for reference:
# dask_cluster_type = "local"
# n_workers = 20
# n_cores_per_worker = 4
# n_processes_per_worker = 1
# n_threads_per_worker = 4
# n_cores_per_task = 4
# n_processes_per_task = 1
# n_threads_per_task = 4
# dask_memory= '8GB'
dask_cluster_type = 'local'
n_workers = '10'
n_cores_per_worker = '4'
n_processes_per_worker = '1'
n_threads_per_worker = '8'
n_cores_per_task = '4'
n_processes_per_task = '1'
n_threads_per_task = '8'
dask_memory= '12GB'

# --- job-array bounds -------------------------------------------------------
# The bash template expands the cartesian product of the comma-separated
# alternatives of each array argument ('-' does NOT create new combinations).
# Count the combinations the same way instead of hard-coding the number, so
# the array bounds follow the grid defined above.
array_args_values = (
    n_samples,
    n_random_features,
    n_informative_features,
    n_centers,
    distances,
    seeds_unified,
    hpo_metric,
)
n_combinations = 1
for arg_values in array_args_values:
    n_alternatives = sum(1 for item in arg_values.split(',') if item.strip())
    if n_alternatives:  # empty arguments are skipped by the template as well
        n_combinations *= n_alternatives

array_start = 0
array_end = int(n_combinations - 1)
array_step = 1
# At most 10 array tasks run at once; this is also the size of each submitted batch.
array_simultaneous = min(10, int(n_combinations))
array_batch = array_simultaneous
# Index range of the first batch (used in the #SBATCH --array directive).
array_fist_start = 0
array_fist_end = int(array_simultaneous - 1)

for model in models_nicknames:
    # Raises KeyError for models that have no entry in `models_samplers`.
    sampler = models_samplers[model]
    if 'RecursiveClustering' in model:
        # we will use the threads of the main process to accelerate
        n_jobs = 1
    else:
        n_jobs = 2
    file = script_dir / f'gaussian_random_{model}_{suffix}.sbatch'
    # The Slurm job name is the script file name without its '.sbatch' extension.
    job_name = file.stem
    # NOTE(review): the generated script resubmits itself from the absolute
    # `script_path` below, which differs from `script_dir` used here -- confirm
    # the file is copied to that location before submission.
    # In the template, doubled braces are literal bash braces; single braces
    # are filled in from the Python variables above.
    file_content=f"""#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH -c {c}
#SBATCH --exclude=clust[6-12]
#SBATCH --output=/home/users/belucci/recursive_clustering/results/sbatch_outputs/%x-%A_%a.out
#SBATCH --error=/home/users/belucci/recursive_clustering/results/sbatch_errors/%x-%A_%a.err
#SBATCH --time=1-00:00:00
#SBATCH --array={array_fist_start}-{array_fist_end}:{array_step}%{array_simultaneous}

# export OMP_NUM_THREADS=4
# export MKL_NUM_THREADS=4
# export OPENBLAS_NUM_THREADS=4

# we are limited at 300 submitted jobs, so we will submit small array jobs and the last array will resubmit the next array job
array_start={array_start}
array_end={array_end}
array_step={array_step}
array_simultaneous={array_simultaneous}
array_batch={array_batch}
script_path="/home/users/belucci/recursive_clustering/run_scripts/gaussian_random/gaussian_random/${{SLURM_JOB_NAME}}.sbatch"


# 1 model, 64 datasets, 10 fold -> 640 models, but slurm is configured to accept at maximum 1001 (array from 0-1000)
# then copy another template or modify this one accordingly

experiment_root_dir="/home/users/belucci/recursive_clustering/results/gaussian_random"
experiment_work_root_dir="/tmp/${{SLURM_JOB_NAME}}"
# experiment_save_root_dir=""

# For job step/array
srun_output="${{experiment_root_dir}}/sbatch_output/srun-%x-%A_%a.out"
srun_error="${{experiment_root_dir}}/sbatch_error/srun-%x-%A_%a.err"
# dbs_dir="${{experiment_work_root_dir}}/dbs"
# mkdir -p $dbs_dir

# then copy another template or modify this one accordingly
environment_name="cohirf"
experiment_python_location="/home/users/belucci/recursive_clustering/recursive_clustering/experiment/hpo_gaussian_clustering_experiment.py"

# Create a dictionary with argument names and values
declare -A args_dict=(
# base
["models_nickname"]="{model}"
["seeds_models"]=""
["experiment_name"]="${{SLURM_JOB_NAME}}"
["n_jobs"]="{n_jobs}"
["models_params"]=''
["fits_params"]=""
["error_score"]=""
["timeout_fit"]=""
["timeout_combination"]=""
["log_dir"]="${{experiment_root_dir}}/logs"
#["log_file_name"]=""
["work_root_dir"]="${{experiment_work_root_dir}}/work"
["save_root_dir"]=""
["mlflow_tracking_uri"]="http://clust9.ceremade.dauphine.lan:5002/"
["dask_cluster_type"]="{dask_cluster_type}"
["n_workers"]="{n_workers}"
["n_cores_per_worker"]="{n_cores_per_worker}"
["n_processes_per_worker"]="{n_processes_per_worker}"
["n_threads_per_worker"]="{n_threads_per_worker}"
["n_cores_per_task"]="{n_cores_per_task}"
["n_processes_per_task"]="{n_processes_per_task}"
["n_threads_per_task"]="{n_threads_per_task}"
["dask_memory"]="{dask_memory}"
["dask_job_extra_directives"]=""
["dask_address"]=""
["n_gpus_per_worker"]=""
["n_gpus_per_task"]=""
# hpo
["hpo_framework"]=""
["n_trials"]="{n_trials}"
["timeout_hpo"]="{timeout_hpo}"
["timeout_trial"]="{timeout_trial}"
["max_concurrent_trials"]=""
["sampler"]="{sampler}"
["pruner"]="{pruner}"
["direction"]="{direction}"
#["hpo_metric"]="{hpo_metric}"

# gaussian clustering
)

declare -A bool_args_dict=(
# base
["create_validation_set"]=0
["do_not_clean_work_dir"]=0
["do_not_log_to_mlflow"]=0
["do_not_check_if_exists"]=0
["do_not_retry_on_oom"]=0
["raise_on_fit_error"]=0
)

declare -A array_args_dict=(
# Note that bash does not allow arrays inside dictionaries, so we will use strings with ',' as separators and '-' when
# we want multiple values for the same argument (basically they will be replaced by ' ')
# base
["n_samples"]="{n_samples}"
# ["n_features"]="{n_features}"
# ["pct_random_features"]="{pct_random_features}"
["n_random_features"]="{n_random_features}"
["n_informative_features"]="{n_informative_features}"
["n_centers"]="{n_centers}"
["distances"]="{distances}"
["seeds_unified"]="{seeds_unified}"
["hpo_metric"]="{hpo_metric}"
# in this example we are expecting the combinations Model1+0-1, Model1+2-3, Model1+4, Model2+0-1, Model2+2-3, Model2+4
)

# bash does not necessarily keep the order of the keys in the dictionary, so we will specify the order here
# declare -a array_args_dict_order=("n_samples" "n_random_features" "n_informative_features" "n_centers" "distances" "seeds_unified" "hpo_metric" "n_features" "pct_random_features")
declare -a array_args_dict_order=("n_samples" "n_random_features" "n_informative_features" "n_centers" "distances" "seeds_unified" "hpo_metric")

# Construct the argument string
args_str=""
for key in "${{!args_dict[@]}}"; do
  if [ -n "${{args_dict[$key]}}" ]; then
    args_str="$args_str --$key ${{args_dict[$key]}}"
  fi
done

# Add arguments strings that are boolean
for key in "${{!bool_args_dict[@]}}"; do
  if [ "${{bool_args_dict[$key]}}" -eq 1 ]; then
    args_str="${{args_str}} --${{key}}"
  fi
done

# Construct the cartesian product of the arrays
# the idea is to create a string like {{Model1,Model2}}+{{0-1,2-3,4}} and then evaluate it to get the cartesian product
# using bash's brace expansion
string_for_cartesian_product=""
for key in "${{array_args_dict_order[@]}}"; do
  str_array=${{array_args_dict[$key]}}
  n_elements=$(echo $str_array | tr ',' ' ' | wc -w)
  if [ ${{n_elements}} -eq 0 ]; then
    continue
  elif [ ${{n_elements}} -eq 1 ]; then
    string_for_cartesian_product="${{string_for_cartesian_product}}+${{str_array}}"
  else
    string_for_cartesian_product="${{string_for_cartesian_product}}+{{${{str_array}}}}"
  fi
done

# Remove the first '+' character
string_for_cartesian_product=${{string_for_cartesian_product:1}}

# Evaluate the string to get the cartesian product
cartesian_product=$(eval echo $string_for_cartesian_product)

# Split the string into an array (1 combination per element)
IFS=' ' read -r -a cartesian_product <<< "${{cartesian_product}}"
# cartesian_product is now an array like ["Model1+0-1", "Model1+2-3", "Model1+4", "Model2+0-1", "Model2+2-3", "Model2+4"]

# Activate the conda environment
eval "$(conda shell.bash hook)"
conda activate $environment_name

# alternatively we could also run an array job and execute the combination given by the SLURM_ARRAY_TASK_ID
i_combination=$SLURM_ARRAY_TASK_ID
if [ "${{i_combination}}" -ge "${{#cartesian_product[@]}}" ]; then
  echo "SLURM_ARRAY_TASK_ID is greater than the number of combinations"
  exit 1
fi
string_combination=""
IFS='+' read -r -a combination <<< "${{cartesian_product[$i_combination]}}"
i_arg_name=0
for key in "${{array_args_dict_order[@]}}"; do
  value=${{combination[$i_arg_name]//[-]/ }}  # replace '-' by ' '
  string_combination="${{string_combination}} --${{key}} ${{value}}"
  i_arg_name=$((i_arg_name+1))
done
# mlflow_tracking_uri="sqlite:////${{dbs_dir}}/${{cartesian_product[$i_combination]}}.db"
log_file_name="${{SLURM_JOB_NAME}}-${{cartesian_product[$i_combination]}}"
echo "Running: srun --exclusive -n 1 -c ${{SLURM_CPUS_PER_TASK}} --output=${{srun_output}} --error=${{srun_error}} python ${{experiment_python_location}} ${{args_str}} ${{string_combination}} --log_file_name ${{log_file_name}}"
srun --exclusive -n 1 -c ${{SLURM_CPUS_PER_TASK}} --output=${{srun_output}} --error=${{srun_error}} python ${{experiment_python_location}} ${{args_str}} ${{string_combination}} --log_file_name ${{log_file_name}}
wait

# If we are at the last array task, submit the next array job
if [ "${{SLURM_ARRAY_TASK_ID}}" -eq "${{SLURM_ARRAY_TASK_MAX}}" ]; then
  next_array_start=$(($SLURM_ARRAY_TASK_MIN+array_batch))
  # next_array_end will be the minimum between array_end and next_array_start+array_batch-1
  # (array ranges are inclusive, so a batch of array_batch tasks ends at start+array_batch-1)
  next_array_end=$(( array_end < next_array_start+array_batch-1 ? array_end : next_array_start+array_batch-1 ))
  # if next_array_start <= array_end, then we submit job, otherwise we are done
  if [ "${{next_array_start}}" -le "${{next_array_end}}" ]; then
    sbatch --array=${{next_array_start}}-${{next_array_end}}:${{array_step}}%${{array_simultaneous}} ${{script_path}}
  fi
fi
"""
    file.write_text(file_content)
