In [16]:
import os
import itertools
from dotenv import load_dotenv
import copy
import yaml

import os, sys; sys.path.insert(0, os.path.abspath('../..')) # add project root dir to path
from experiments.sorting.recurrent_model import get_experiment_name
from utils.utils import AttributeDict

# Load variables from a local .env file into the process environment;
# NETID is read from the environment further down to build project_dir.
load_dotenv()

def mkdir(dir):
    """Create directory ``dir`` if nothing exists at that path yet.

    Missing parent directories are created as well, and the call is safe to
    repeat. If something already exists at ``dir`` (directory or not), the
    function does nothing.

    Args:
        dir: Path of the directory to create.
    """
    if not os.path.exists(dir):
        # exist_ok covers the case where the directory is created by someone
        # else between the existence check and this call.
        os.makedirs(dir, exist_ok=True)


In [None]:
# global job parameters

job_directory = "job_scripts"  # generated .job files are written here
out_dir = '.out'  # used in the #SBATCH --output path of every job
time_str = '00-4:00:00'  # passed to #SBATCH --time
partition = 'gpu'
ntasks = 1
nodes = 1
cpu_per_gpu = 8
mem_per_cpu = 64  # written with a "G" suffix to #SBATCH --mem-per-cpu
n_gpus = 1

cluster = 'misha'

# NOTE: gpus_constraints is only defined for 'grace'; the --constraint line in
# the job script is currently commented out, so nothing reads it on 'misha'.
if cluster == 'grace':
    gpus_constraints = '"a100|rtx3090|v100|rtx2080ti"' # for grace
# gpus_constraints = "a40" #'"h100|a100"' # for misha

netid = os.getenv('NETID')
if not netid:
    # Without this check project_dir silently becomes "/home/None/..." and the
    # failure only shows up later as a confusing missing-file error.
    raise RuntimeError(
        "Environment variable NETID is not set; define it in the environment or in a .env file"
    )
project_dir = f"/home/{netid}/project/neural-algorithmic-reasoning/experiments/sorting"

mkdir(job_directory)
mkdir(out_dir)

In [18]:
# Read the base model / train / data configs that every job is derived from.
import yaml
base_config_dir = f'{project_dir}/experiment_configs/base_config_recurrent'


def _read_base_config(filename):
    """Parse ``filename`` inside ``base_config_dir`` and wrap it in an AttributeDict."""
    config_path = os.path.join(base_config_dir, filename)
    with open(config_path) as config_file:
        # NOTE(review): yaml.FullLoader is kept as-is; if these files only hold
        # plain scalars/lists/maps, yaml.safe_load would be the safer choice.
        parsed = yaml.load(config_file, Loader=yaml.FullLoader)
    return AttributeDict(parsed)


base_model_config = _read_base_config('model_config.yaml')
base_train_config = _read_base_config('train_config.yaml')
base_data_config = _read_base_config('data_config.yaml')

# Per-experiment config directories are created under this directory.
config_out_dir = f'{project_dir}/experiment_configs'

In [None]:
n_layers = [2] # [2, 4]
d_model = [256]
dff_expansion = [2]


# TODO: add variation over MLP activation functions, e.g. softmax linear unit, etc.
# search over dff_expansion, or other MLP params

random_train_length = [True, False]
pos_enc_type = ['sinusoidal', 'rotary', 't5', 'alibi']
input_recall = [True, False]
attn_score_fn = ['softmax'] # , 'topk-softmax', 'hard', 'sigmoid']
discrete_intermediate_map = ['solu', 'softmax', 'topk-softmax', 'hard', 'gumbel-softmax'] # None,'sigmoid', 'relu'
progressive_training = [True, False]
incremental_training = [True, False]
weight_tying = [False] # [True, False]

jobs_overwrite_params = []
for L, D, F, posenc, ir, attn_fn, disc_map, progtr, inctr, randlen in itertools.product(
    n_layers, d_model, dff_expansion, pos_enc_type, input_recall, attn_score_fn, discrete_intermediate_map, progressive_training, incremental_training, random_train_length):

    # copy base configs
    job_model_config = copy.deepcopy(base_model_config)
    job_train_config = copy.deepcopy(base_train_config)
    job_data_config = copy.deepcopy(base_data_config)

    ### update model config

    # attn_kwargs params
    attn_kwargs = dict(attn_score_fn=attn_fn)
    if attn_fn == 'topk-softmax':
        attn_kwargs['attn_score_fn_params'] = dict(k=3, straight_through=True)
    elif attn_fn == 'hard':
        attn_kwargs['attn_score_fn_params'] = dict(straight_through=True)

    # discrete_intermediate args
    intermediate_discretization = dict(discrete_intermediate=(disc_map is not None))
    if disc_map is not None:
        intermediate_discretization['discretize_map'] = disc_map
    match disc_map:
        case 'gumbel-softmax':
            intermediate_discretization['discretization_map_params'] = dict(tau=1, hard=False)
        case 'hard':
            intermediate_discretization['discretization_map_params'] = dict(straight_through=True)
        case 'topk-softmax':
            intermediate_discretization['discretization_map_params'] = dict(k=3, straight_through=True)

    job_model_config.update(dict(
        n_layers=L, d_model=D, dff=D*F,
        pos_enc_type=posenc,
        input_recall=ir,
        attn_kwargs=attn_kwargs,
        intermediate_discretization=intermediate_discretization
        ))

    # ----------------------------

    ### Train config
    job_train_config.update(dict(
        progressive_training=progtr,
        incremental_training=inctr
    ))

    # ----------------------------

    ### Data config
    job_data_config.update(dict(
        train_random_sequence_length=randlen,
    ))

    if inctr and not progtr:
        continue

    job_config = dict(model_config=job_model_config, train_config=job_train_config, data_config=job_data_config)
    job_config = AttributeDict(job_config)
    jobs_overwrite_params.append(job_config)

print(f"Generated {len(jobs_overwrite_params)} jobs")

Generated 192 jobs


In [20]:
def create_job_config(config_upate, out_dir):
    """Merge per-job overrides into copies of the base configs and write them as YAML.

    Args:
        config_upate: Mapping that may contain 'model_config', 'train_config'
            and 'data_config' entries; each one is applied with ``update`` on
            top of a deep copy of the corresponding module-level base config.
        out_dir: Directory under which a sub-directory named after the
            experiment is created to hold the three YAML files.

    Returns:
        Tuple ``(model_config, train_config, data_config, experiment_name)``.
    """
    # Deep copies keep the module-level base configs untouched by the merge.
    merged = {}
    for section, base in (
        ('model_config', base_model_config),
        ('train_config', base_train_config),
        ('data_config', base_data_config),
    ):
        section_config = copy.deepcopy(base)
        section_config.update(config_upate.get(section, {}))
        merged[section] = section_config

    model_config = merged['model_config']
    train_config = merged['train_config']
    data_config = merged['data_config']

    # Only the first element returned by get_experiment_name is used; spaces
    # are stripped because the name is used in directory and file names.
    name_parts = get_experiment_name(model_config, data_config, train_config)
    experiment_name = name_parts[0].replace(' ', '')

    mkdir(os.path.join(out_dir, experiment_name))

    for yaml_name, section_config in (
        ('model_config.yaml', model_config),
        ('train_config.yaml', train_config),
        ('data_config.yaml', data_config),
    ):
        target = os.path.join(out_dir, f'{experiment_name}/{yaml_name}')
        with open(target, 'w') as yaml_file:
            yaml.dump(section_config.todict(), yaml_file)

    return model_config, train_config, data_config, experiment_name

In [None]:
def create_job_script(experiment_name):
    """Write the SLURM batch script for one experiment and return its path.

    The script is written to ``{job_directory}/{experiment_name}.job`` and is
    assembled from the module-level job parameters (partition, resources,
    time limit, cluster, project directory).

    Args:
        experiment_name: Name used for the job, its log file, the script file
            and the ``experiment_configs/{experiment_name}`` config directory.

    Returns:
        Path of the job script that was written.

    Raises:
        ValueError: If ``cluster`` is not one of the supported clusters. This
            is checked before the file is opened so that no truncated job
            script is left behind.
    """
    # cluster-specific command that loads the modules needed for conda
    module_commands = {
        'grace': "module restore python_env",
        'misha': "module load miniconda",
    }
    if cluster not in module_commands:
        raise ValueError(f"Cluster {cluster} not supported")

    script_lines = [
        "#!/bin/bash",
        f"#SBATCH --partition={partition}",
        f"#SBATCH --job-name={experiment_name}",
        f"#SBATCH --output={out_dir}/%j-{experiment_name}.out",
        f"#SBATCH --ntasks={ntasks} --nodes={nodes}",
        f"#SBATCH --cpus-per-gpu={cpu_per_gpu}",
        f"#SBATCH --mem-per-cpu={mem_per_cpu}G",
        f"#SBATCH --time={time_str}",
        "#SBATCH --mail-type=ALL",
        f"#SBATCH --gpus={n_gpus}",
        # f"#SBATCH --constraint={gpus_constraints}",
        "",
        "module load StdEnv",
        "export SLURM_EXPORT_ENV=ALL",
        "",
        module_commands[cluster],
        # "conda init",
        "conda activate neural_prog",  # activate conda environment
        "conda info --envs",  # list conda environments in the job log
        "",
        "nvidia-smi -L",  # print gpu information
        "",
        f"cd {project_dir}",  # navigate to project directory
        "",
        # run python script
        f"srun python train_recurrent.py --config_dir experiment_configs/{experiment_name}",
    ]

    filename = f'{job_directory}/{experiment_name}.job'
    with open(filename, 'w') as fh:
        # every line, including the last one, is newline-terminated
        fh.write('\n'.join(script_lines) + '\n')

    return filename


In [22]:
# Write the config files and the SLURM script for every job in the sweep.
job_script_files = []

for job_params in jobs_overwrite_params:
    # create_job_config also returns the merged per-job configs. They must not
    # be bound to base_model_config / base_train_config / base_data_config:
    # doing so would replace the base configs with the last job's settings and
    # corrupt any cell that is run (or re-run) afterwards.
    *_merged_job_configs, experiment_name = create_job_config(job_params, config_out_dir)

    print(f"Experiment Name: {experiment_name}")

    job_script = create_job_script(experiment_name)
    job_script_files.append(job_script)

Experiment Name: L2H4D256_sinusoidal_IRTrue_discinterm-softmax-progressive_incremental-MaxVal64-TrainLen16RandLen
Experiment Name: L2H4D256_sinusoidal_IRTrue_discinterm-softmax-progressive_incremental-MaxVal64-TrainLen16
Experiment Name: L2H4D256_sinusoidal_IRTrue_discinterm-softmax-progressive-MaxVal64-TrainLen16RandLen
Experiment Name: L2H4D256_sinusoidal_IRTrue_discinterm-softmax-progressive-MaxVal64-TrainLen16
Experiment Name: L2H4D256_sinusoidal_IRTrue_discinterm-softmax--MaxVal64-TrainLen16RandLen
Experiment Name: L2H4D256_sinusoidal_IRTrue_discinterm-softmax--MaxVal64-TrainLen16
Experiment Name: L2H4D256_sinusoidal_IRTrue_discinterm-topk-softmax-progressive_incremental-MaxVal64-TrainLen16RandLen
Experiment Name: L2H4D256_sinusoidal_IRTrue_discinterm-topk-softmax-progressive_incremental-MaxVal64-TrainLen16
Experiment Name: L2H4D256_sinusoidal_IRTrue_discinterm-topk-softmax-progressive-MaxVal64-TrainLen16RandLen
Experiment Name: L2H4D256_sinusoidal_IRTrue_discinterm-topk-softmax-p

In [23]:
# Number of times the full set of job scripts is submitted.
n_trials = 1

confirm = input("Do you want to submit the jobs? (y/n): ")

# Accept "y" regardless of case or surrounding whitespace.
if confirm.strip().lower() == 'y':
    for trial in range(n_trials):
        print('Trial:', trial)
        for job_script in job_script_files:
            exit_status = os.system(f'sbatch "{job_script}"')
            # os.system does not raise on failure, so report it explicitly
            # instead of letting a rejected submission go unnoticed.
            if exit_status != 0:
                print(f"sbatch failed for {job_script} (exit status {exit_status})")
        print()
else:
    print("Not submitting jobs")

Trial: 0
Submitted batch job 118141
Submitted batch job 118142
Submitted batch job 118143
Submitted batch job 118144
Submitted batch job 118145
Submitted batch job 118146
Submitted batch job 118147
Submitted batch job 118148
Submitted batch job 118149
Submitted batch job 118150
Submitted batch job 118151
Submitted batch job 118152
Submitted batch job 118153
Submitted batch job 118154
Submitted batch job 118155
Submitted batch job 118156
Submitted batch job 118157
Submitted batch job 118158
Submitted batch job 118159
Submitted batch job 118160
Submitted batch job 118161
Submitted batch job 118162
Submitted batch job 118163
Submitted batch job 118164
Submitted batch job 118165
Submitted batch job 118166
Submitted batch job 118167
Submitted batch job 118168
Submitted batch job 118169
Submitted batch job 118170
Submitted batch job 118171
Submitted batch job 118172
Submitted batch job 118173
Submitted batch job 118174
Submitted batch job 118175
Submitted batch job 118176
Submitted batch job