In [10]:
import os
import itertools
from dotenv import load_dotenv
import copy
import time
import subprocess
import yaml


# Put the directory two levels above the working directory at the front of sys.path so
# that the project-local `experiments` and `utils` packages imported below resolve.
# NOTE(review): '../..' is resolved against the current working directory, so this
# assumes the notebook is run from its own folder - confirm.
import os, sys; sys.path.insert(0, os.path.abspath('../..')) # add project root dir to path
from experiments.sorting.recurrent_model import get_experiment_name
from utils.utils import AttributeDict

# Load environment variables through python-dotenv (presumably from a local .env file
# that defines NETID, which is read with os.getenv further down - verify).
load_dotenv()

def mkdir(dir):
    """Create the directory `dir` if it does not already exist.

    Missing parent directories are created as well, and an existing directory is
    left untouched, so the call is safe to repeat.

    Args:
        dir: Path of the directory to create.

    Raises:
        FileExistsError: If `dir` exists but is not a directory.
    """
    # exist_ok=True replaces the racy "check os.path.exists, then os.mkdir" sequence
    os.makedirs(dir, exist_ok=True)


In [11]:
# global job parameters
# These module-level names are read by create_job_script when it renders the
# #SBATCH header of each job file.

job_directory = "job_scripts"  # generated .job files are written here
out_dir = '.out'               # used for the --output path of each job
time_str = '00-4:00:00'
partition = 'gpu'
ntasks = 1
nodes = 1
cpu_per_gpu = 8
# NOTE(review): emitted as --mem-per-cpu={mem_per_cpu}G together with --cpus-per-gpu=8,
# i.e. 8 x 64G per GPU - confirm that this is the intended memory request.
mem_per_cpu = 64
n_gpus = 1

cluster = 'misha'

# Always bind gpus_constraints so that re-enabling the --constraint line in the job
# script cannot hit a NameError on clusters other than grace.
gpus_constraints = None
if cluster == 'grace':
    gpus_constraints = '"a100|rtx3090|v100|rtx2080ti"' # for grace
# gpus_constraints = "a40" #'"h100|a100"' # for misha

# Fail early with a clear message instead of silently building "/home/None/...".
netid = os.getenv('NETID')
if not netid:
    raise RuntimeError("NETID environment variable is not set; define it in the environment or in the .env file")
project_dir = f"/home/{netid}/project/neural-algorithmic-reasoning/experiments/sorting"

mkdir(job_directory)
mkdir(out_dir)

In [12]:
# Read the base model / train / data configs that every generated job starts from
import yaml
base_config_dir = f'{project_dir}/experiment_configs/base_config_recurrent'


def _read_base_config(file_name):
    """Parse one YAML file from `base_config_dir` and wrap the result in an AttributeDict."""
    config_path = os.path.join(base_config_dir, file_name)
    with open(config_path) as config_file:
        # NOTE(review): FullLoader is kept as-is; these are project-local config files.
        return AttributeDict(yaml.load(config_file, Loader=yaml.FullLoader))


base_model_config = _read_base_config('model_config.yaml')
base_train_config = _read_base_config('train_config.yaml')
base_data_config = _read_base_config('data_config.yaml')

# directory under which the per-experiment config folders are written
config_out_dir = f'{project_dir}/experiment_configs'

In [13]:
wandb_project = 'sorting-recurrent-resstream-exploration'

n_layers = [2] # [2, 4]
d_model = [256]
dff_expansion = [2]

# TODO: add variation over MLP activation functions, e.g. softmax linear unit, etc.
# search over dff_expansion, or other MLP params

random_train_length = [True] # , False
pos_enc_type = ['rotary'] # , 't5', 'alibi', 'sinusoidal'
input_recall = [True] # False
attn_score_fn = ['adaptive-temperature-softmax'] # softmax, 'topk-softmax', 'hard', 'sigmoid']
# discrete_intermediate_map = ['solu', 'softmax', 'topk-softmax', 'hard', 'gumbel-softmax'] # None,'sigmoid', 'relu'
discrete_intermediate_map = [None, 'softmax', 'gumbel-softmax', 'hard', 'sigmoid'] # [None, 'softmax', 'hard', 'sigmoid', 'solu'] # 'sigmoid', 'relu'
progressive_training = [True] # False
incremental_training = [False] # True
weight_tying_emb = [False] # [True, False]
wt_intermediate = [False] # [True, False]

predisc_norm = [False, True]
postdisc_norm = [False, True]

norm_config = [
    dict(norm_method='none', norm_type='rmsnorm'),
    dict(norm_method='pre-norm', norm_type='rmsnorm'),
    dict(norm_method='post-norm', norm_type='rmsnorm'),
    dict(norm_method='pre+post-norm', norm_type='rmsnorm'),
    dict(norm_method='hypersphere-interpolation', norm_type='rmsnorm')
    ]

use_eos_bos = [True]

for L, D, F, posenc, norm, ir, wt_emb, wt_interm, attn_fn, disc_map, prenorm, postnorm, progtr, inctr, randlen, eosbos in itertools.product(
    n_layers, d_model, dff_expansion, pos_enc_type, norm_config, input_recall, weight_tying_emb, wt_intermediate, attn_score_fn, discrete_intermediate_map, predisc_norm, postdisc_norm, progressive_training, incremental_training, random_train_length, use_eos_bos):

    # copy base configs
    job_model_config = copy.deepcopy(base_model_config)
    job_train_config = copy.deepcopy(base_train_config)
    job_data_config = copy.deepcopy(base_data_config)

    ### update model config

    # attn_kwargs params
    attn_kwargs = dict(attn_score_fn=attn_fn)
    if attn_fn == 'topk-softmax':
        attn_kwargs['attn_score_fn_params'] = dict(k=3, straight_through=True)
    elif attn_fn == 'hard':
        attn_kwargs['attn_score_fn_params'] = dict(straight_through=True)

    # discrete_intermediate args
    intermediate_discretization = dict(discrete_intermediate=(disc_map is not None))
    if disc_map is not None:
        intermediate_discretization['discretize_map'] = disc_map
    match disc_map:
        case 'gumbel-softmax':
            intermediate_discretization['discretization_map_params'] = dict(tau=1, hard=False)
        case 'hard':
            intermediate_discretization['discretization_map_params'] = dict(straight_through=True)
        case 'topk-softmax':
            intermediate_discretization['discretization_map_params'] = dict(k=3, straight_through=True)

    if disc_map is None and wt_interm:
        continue

    job_model_config.update(dict(
        n_layers=L, d_model=D, dff=D*F,
        pos_enc_type=posenc,
        norm_config=norm,
        input_recall=ir,
        predisc_norm=prenorm,
        postdisc_norm=postnorm,
        weight_tie_embed_to_token=wt_emb,
        weight_tie_discrete_interm=wt_interm,
        attn_kwargs=attn_kwargs,
        intermediate_discretization=intermediate_discretization
        ))

    # ----------------------------

    ### Train config
    job_train_config.update(dict(
        progressive_training=progtr,
        incremental_training=inctr,
        wandb_config=(job_train_config['wandb_config'] | dict(wandb_project=wandb_project)),
    ))

    # ----------------------------

    ### Data config
    job_data_config.update(dict(
        train_random_sequence_length=randlen,
        include_bos_eos=eosbos
    ))

    if inctr and not progtr:
        continue

    job_config = dict(model_config=job_model_config, train_config=job_train_config, data_config=job_data_config)
    job_config = AttributeDict(job_config)
    jobs_overwrite_params.append(job_config)

print(f"Generated {len(jobs_overwrite_params)} jobs")

Generated 100 jobs


In [14]:
def create_job_config(config_upate, out_dir, uid=None):
    """Merge per-job overrides into the base configs and write them to disk as YAML.

    The module-level `base_model_config`, `base_train_config` and `base_data_config`
    are deep-copied, then updated with the matching section of `config_upate`.
    The three merged configs are dumped to `<out_dir>/<experiment_name>/`.

    Args:
        config_upate: Mapping that may contain 'model_config', 'train_config' and
            'data_config' entries; a missing section leaves the base config as is.
        out_dir: Directory in which the experiment folder is created.
        uid: Optional identifier; when given, the experiment name is prefixed
            with "UID{uid}-".

    Returns:
        Tuple (model_config, train_config, data_config, experiment_name).
    """
    base_by_section = (
        ('model_config', base_model_config),
        ('train_config', base_train_config),
        ('data_config', base_data_config),
    )

    # merge overrides on top of independent copies of the base configs
    merged = {}
    for section, base in base_by_section:
        section_config = copy.deepcopy(base)
        section_config.update(config_upate.get(section, {}))
        merged[section] = section_config

    raw_name, _ = get_experiment_name(merged['model_config'], merged['data_config'], merged['train_config'])
    experiment_name = raw_name.replace(' ', '')
    if uid is not None:
        experiment_name = f"UID{uid}-{experiment_name}"

    mkdir(os.path.join(out_dir, experiment_name))

    # one YAML file per section, written in model / train / data order
    for section, section_config in merged.items():
        with open(os.path.join(out_dir, f'{experiment_name}/{section}.yaml'), 'w') as f:
            yaml.dump(section_config.todict(), f)

    return merged['model_config'], merged['train_config'], merged['data_config'], experiment_name

In [25]:
def create_job_script(experiment_name):
    """Write the Slurm batch script for one experiment and return its path.

    The script is rendered from the module-level job parameters (`partition`,
    `out_dir`, `ntasks`, `nodes`, `cpu_per_gpu`, `mem_per_cpu`, `time_str`, `n_gpus`,
    `cluster`, `project_dir`) and saved as `{job_directory}/{experiment_name}.job`.

    Args:
        experiment_name: Name used for the job name, the output log, the script file
            and the `experiment_configs/{experiment_name}` config directory.

    Returns:
        Path of the written job script.

    Raises:
        ValueError: If `cluster` is neither 'grace' nor 'misha'. This is raised before
            the file is opened, so no partial script is left behind.
    """
    # Resolve the cluster-specific module setup first; doing this inside the `with`
    # block would leave a truncated .job file on disk for an unsupported cluster.
    if cluster == 'grace':
        env_setup = "module restore python_env\n" # load modules i need
    elif cluster == 'misha':
        env_setup = "module load miniconda\n" # load modules i need
    else:
        raise ValueError(f"Cluster {cluster} not supported")

    script_lines = [
        "#!/bin/bash\n",
        f"#SBATCH --partition={partition}\n",
        f"#SBATCH --job-name={experiment_name}\n",
        f"#SBATCH --output={out_dir}/%j-{experiment_name}.out\n",
        f"#SBATCH --ntasks={ntasks} --nodes={nodes}\n",
        f"#SBATCH --cpus-per-gpu={cpu_per_gpu}\n",
        f"#SBATCH --mem-per-cpu={mem_per_cpu}G\n",
        f"#SBATCH --time={time_str}\n",
        "#SBATCH --mail-type=ALL\n",
        f"#SBATCH --gres=gpu:{n_gpus}\n",
        # f"#SBATCH --constraint={gpus_constraints}\n",
        "\n",
        "module load StdEnv\n",
        "export SLURM_EXPORT_ENV=ALL\n",
        "\n",
        env_setup,
        # "conda init\n",
        "conda activate neural_prog\n", # activate conda environment
        "conda info --envs\n", # list conda environments in the job log
        "\n",
        "nvidia-smi -L\n", # print gpu information
        "\n",
        f"cd {project_dir}\n", # navigate to project directory
        "\n",
        # run python script
        f"srun python train_recurrent.py --config_dir experiment_configs/{experiment_name}\n",
    ]

    filename = f'{job_directory}/{experiment_name}.job'
    with open(filename, 'w') as fh:
        fh.writelines(script_lines)

    return filename


In [26]:
# Write the config files and the Slurm script for every job of the sweep
job_script_files = []

for uid, job_params in enumerate(jobs_overwrite_params):
    # Bind the returned configs to throwaway names. create_job_config reads the
    # module-level base_*_config objects, so rebinding those here would replace the
    # base configs with the previous job's merged configs.
    _model_cfg, _train_cfg, _data_cfg, experiment_name = create_job_config(job_params, config_out_dir, uid=uid)

    print(f"Experiment Name: {experiment_name}")

    job_script = create_job_script(experiment_name)
    job_script_files.append(job_script)

Experiment Name: UID0-HASH-1166851971076522499
Experiment Name: UID1-HASH--791047997625647921
Experiment Name: UID2-HASH--8947547962214129698
Experiment Name: UID3-HASH-1939744671199455593
Experiment Name: UID4-HASH--1096642177693627903
Experiment Name: UID5-HASH-6848552955685120694
Experiment Name: UID6-HASH--7741991760838337767
Experiment Name: UID7-HASH--1511231792858688456
Experiment Name: UID8-HASH--8824826762611496564
Experiment Name: UID9-HASH--5617723092104166830
Experiment Name: UID10-HASH--7800834381702155406
Experiment Name: UID11-HASH-1887772761741376285
Experiment Name: UID12-HASH-3346903417947276123
Experiment Name: UID13-HASH-29069865286714795
Experiment Name: UID14-HASH--7426031391859039041
Experiment Name: UID15-HASH--5322354303423183792
Experiment Name: UID16-HASH--1348160817266416299
Experiment Name: UID17-HASH-6601401082330057736
Experiment Name: UID18-HASH-2363302450390050102
Experiment Name: UID19-HASH-8711659510561927199
Experiment Name: UID20-HASH-11668519710765

In [17]:
# Submit every generated job script with sbatch, after an interactive confirmation.
wait_time = 0.5 # number of seconds to wait between job submissions
n_trials = 1    # how many times the full list of job scripts is submitted

confirm = input("Do you want to submit the jobs? (y/n): ")

# CompletedProcess objects of all sbatch calls; stdout/stderr are kept as bytes
responses = []

# Normalise the answer so that "Y" or "y " is not silently treated as a refusal
if confirm.strip().lower() == 'y':
    # `trial` rather than `ir`, which is the input_recall loop variable of the sweep cell
    for trial in range(n_trials):
        print('Trial:', trial)
        for job_script in job_script_files:
            response = subprocess.run(['sbatch', job_script], capture_output=True)
            print(f"response: {response.stdout.decode('utf-8').strip()}, return_code={response.returncode}, job_script={job_script}")
            responses.append(response)
            time.sleep(wait_time)
        print()
else:
    print("Not submitting jobs")

Not submitting jobs


In [18]:
# Report every sbatch call that did not succeed: a submission counts as successful
# only if stdout starts with 'Submitted batch job' and the return code is 0.
for response in responses:
    stdout_text = response.stdout.decode('utf-8')
    submitted_ok = stdout_text.startswith('Submitted batch job') and response.returncode == 0
    if submitted_ok:
        continue
    print(f"Failed to submit job: {stdout_text}")
    print(f"stderr: {response.stderr.decode('utf-8')}")
    print(f"Full response: {response}")
    print()