In [1]:
import os
import itertools
from dotenv import load_dotenv
import copy
import yaml

import os, sys; sys.path.insert(0, os.path.abspath('../..')) # add project root dir to path
from experiments.sorting.model import get_experiment_name
from models.utils import AttributeDict

load_dotenv()

def mkdir(dir):
    """Create the directory `dir` if it does not already exist.

    Missing parent directories are created as well, and an existing
    directory is left untouched, so the call is safe to repeat.

    Args:
        dir: path of the directory to create.

    Raises:
        FileExistsError: if `dir` exists but is not a directory.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # `if not exists: os.mkdir(...)` and also handles nested paths.
    os.makedirs(dir, exist_ok=True)


In [2]:
# global job parameters

# directories (relative to the notebook's working directory) that receive the
# generated job scripts and the Slurm output logs
job_directory = "job_scripts"
out_dir = '.out'

# resource request written into the #SBATCH header of every job script
time_str = '00-4:00:00'  # value of `#SBATCH --time`
partition = 'gpu'
ntasks = 1
nodes = 1
cpu_per_gpu = 8
mem_per_cpu = 64  # written as `--mem-per-cpu={mem_per_cpu}G`
# NOTE(review): with cpu_per_gpu=8 this looks like a 512G request per job -- confirm intended
n_gpus = 1

# gpus_constraints = '"a100|rtx3090|v100|rtx2080ti"' # for grace
# gpus_constraints = "a40" #'"h100|a100"' # for misha

# NETID is read from the environment (load_dotenv() is called earlier in the notebook).
netid = os.getenv('NETID')
if not netid:
    # Without this check the project path would silently become "/home/None/...".
    raise RuntimeError("NETID is not set; define it in the environment or in a .env file")
project_dir = f"/home/{netid}/project/neural-algorithmic-reasoning/experiments/sorting"

mkdir(job_directory)
mkdir(out_dir)

In [3]:
# load base model, train, and data config
base_config_dir = f'{project_dir}/experiment_configs/base_config'


def _load_base_config(filename):
    """Parse one YAML file from `base_config_dir` and wrap the result in an AttributeDict."""
    with open(os.path.join(base_config_dir, filename)) as config_file:
        # NOTE(review): FullLoader is fine for project-owned files; do not point
        # this at YAML from an untrusted source.
        return AttributeDict(yaml.load(config_file, Loader=yaml.FullLoader))


base_model_config = _load_base_config('model_config.yaml')
base_train_config = _load_base_config('train_config.yaml')
base_data_config = _load_base_config('data_config.yaml')

# per-experiment config directories are written under this directory
config_out_dir = f'{project_dir}/experiment_configs'

In [4]:
# hyperparameter grid: one job is generated per element of the Cartesian product
n_layers = [2, 4]
d_model = [256]
F = [2]  # multiplier used below as dff = d_model * F
pos_enc_type = ['sinusoidal', 'rotary', 't5', 'alibi']
input_recall = [True, False]
# attn_score_fn = ['softmax', 'topk-softmax', 'hard', 'sigmoid']
attn_score_fn = ['topk-softmax', 'hard']

# extra parameters attached to specific attention score functions;
# score functions not listed here get no `attn_score_fn_params` entry
attn_score_fn_params = {
    'topk-softmax': dict(k=3, straight_through=True),
    'hard': dict(straight_through=True),
}

jobs_overwrite_params = []
hyperparam_grid = itertools.product(n_layers, d_model, F, pos_enc_type, input_recall, attn_score_fn)
# Loop variables deliberately do not reuse the names of the lists above
# (previously `F` was rebound from the list to an int by the loop).
for num_layers, model_dim, ff_mult, posenc, use_input_recall, attn_fn in hyperparam_grid:
    # copy base configs
    job_model_config = copy.deepcopy(base_model_config)
    job_train_config = copy.deepcopy(base_train_config)
    job_data_config = copy.deepcopy(base_data_config)

    # update model config
    attn_kwargs = dict(attn_score_fn=attn_fn)
    if attn_fn in attn_score_fn_params:
        # copy so that jobs never share one params dict
        attn_kwargs['attn_score_fn_params'] = dict(attn_score_fn_params[attn_fn])

    job_model_config.update(dict(
        n_layers=num_layers,
        d_model=model_dim,
        dff=model_dim * ff_mult,
        pos_enc_type=posenc,
        input_recall=use_input_recall,
        attn_kwargs=attn_kwargs,
    ))

    job_config = dict(model_config=job_model_config, train_config=job_train_config, data_config=job_data_config)
    job_config = AttributeDict(job_config)
    jobs_overwrite_params.append(job_config)

print(f"Generated {len(jobs_overwrite_params)} jobs")

Generated 32 jobs


In [5]:
def create_job_config(config_upate, out_dir):
    """Merge per-job overrides into the base configs and write them to disk.

    Deep copies of the module-level `base_model_config`, `base_train_config`
    and `base_data_config` are updated with the 'model_config', 'train_config'
    and 'data_config' entries of `config_upate`; a missing entry leaves the
    corresponding base copy unchanged. The experiment name returned by
    `get_experiment_name` (with spaces removed) names a subdirectory of
    `out_dir`, into which the three merged configs are dumped as
    `model_config.yaml`, `train_config.yaml` and `data_config.yaml`.

    Args:
        config_upate: mapping with optional 'model_config', 'train_config'
            and 'data_config' entries holding the overrides.
        out_dir: directory under which the experiment subdirectory is created.

    Returns:
        Tuple `(model_config, train_config, data_config, experiment_name)`.
    """
    base_configs = (
        ('model_config', base_model_config),
        ('train_config', base_train_config),
        ('data_config', base_data_config),
    )

    # start every config from a private copy of its base, then apply overrides
    merged = {}
    for section, base in base_configs:
        section_config = copy.deepcopy(base)
        section_config.update(config_upate.get(section, {}))
        merged[section] = section_config

    raw_name, _ = get_experiment_name(merged['model_config'], merged['data_config'], merged['train_config'])
    experiment_name = raw_name.replace(' ', '')

    mkdir(os.path.join(out_dir, experiment_name))

    # one YAML file per config section, named after the section
    for section, section_config in merged.items():
        with open(os.path.join(out_dir, f'{experiment_name}/{section}.yaml'), 'w') as yaml_file:
            yaml.dump(section_config.todict(), yaml_file)

    return merged['model_config'], merged['train_config'], merged['data_config'], experiment_name

In [6]:
def create_job_script(experiment_name):
    """Write the Slurm batch script for one experiment and return its path.

    The script is saved as `{job_directory}/{experiment_name}.job`. Its
    #SBATCH header is filled from the module-level job parameters
    (`partition`, `out_dir`, `ntasks`, `nodes`, `cpu_per_gpu`, `mem_per_cpu`,
    `time_str`, `n_gpus`); the body loads the environment, changes into
    `project_dir` and runs `train.py` on `experiment_configs/{experiment_name}`.

    Args:
        experiment_name: used for the script file name, the Slurm job name,
            the log file name and the config directory passed to `train.py`.

    Returns:
        Path of the written job script.
    """
    filename = f'{job_directory}/{experiment_name}.job'

    script_lines = [
        "#!/bin/bash",
        # resource request
        f"#SBATCH --partition={partition}",
        f"#SBATCH --job-name={experiment_name}",
        f"#SBATCH --output={out_dir}/%j-{experiment_name}.out",
        f"#SBATCH --ntasks={ntasks} --nodes={nodes}",
        f"#SBATCH --cpus-per-gpu={cpu_per_gpu}",
        f"#SBATCH --mem-per-cpu={mem_per_cpu}G",
        f"#SBATCH --time={time_str}",
        "#SBATCH --mail-type=ALL",
        f"#SBATCH --gpus={n_gpus}",
        # f"#SBATCH --constraint={gpus_constraints}",
        "",
        "module load StdEnv",
        "export SLURM_EXPORT_ENV=ALL",
        "",
        # "module restore python_env",
        "module load miniconda",  # load the modules needed
        # "conda init",
        "conda activate neural_prog",  # activate conda environment
        "conda info --envs",  # list conda environments in the job log
        "",
        "nvidia-smi -L",  # print gpu information
        "",
        f"cd {project_dir}",  # navigate to project directory
        "",
        # run python script
        f"srun python train.py --config_dir experiment_configs/{experiment_name}",
    ]

    with open(filename, 'w') as script_file:
        script_file.write('\n'.join(script_lines) + '\n')

    return filename

In [7]:
# Write the config directory and the Slurm script for every generated job.
job_script_files = []

for job_params in jobs_overwrite_params:
    # Keep the merged per-job configs in their own variable. Unpacking them into
    # base_model_config / base_train_config / base_data_config would overwrite
    # the module-level base configs that create_job_config reads, so every job
    # (and any re-run of earlier cells) would start from the previous job's
    # merged config instead of the base config loaded from disk.
    *job_configs, experiment_name = create_job_config(job_params, config_out_dir)

    print(f"Experiment Name: {experiment_name}")

    job_script = create_job_script(experiment_name)
    job_script_files.append(job_script)

Experiment Name: MaxVal64-TrainLen16-L2H4D256_sinusoidal_IRTrue_topk-softmax-ST
Experiment Name: MaxVal64-TrainLen16-L2H4D256_sinusoidal_IRTrue_hard-ST
Experiment Name: MaxVal64-TrainLen16-L2H4D256_sinusoidal_IRFalse_topk-softmax-ST
Experiment Name: MaxVal64-TrainLen16-L2H4D256_sinusoidal_IRFalse_hard-ST
Experiment Name: MaxVal64-TrainLen16-L2H4D256_rotary_IRTrue_topk-softmax-ST
Experiment Name: MaxVal64-TrainLen16-L2H4D256_rotary_IRTrue_hard-ST
Experiment Name: MaxVal64-TrainLen16-L2H4D256_rotary_IRFalse_topk-softmax-ST
Experiment Name: MaxVal64-TrainLen16-L2H4D256_rotary_IRFalse_hard-ST
Experiment Name: MaxVal64-TrainLen16-L2H4D256_t5_IRTrue_topk-softmax-ST
Experiment Name: MaxVal64-TrainLen16-L2H4D256_t5_IRTrue_hard-ST
Experiment Name: MaxVal64-TrainLen16-L2H4D256_t5_IRFalse_topk-softmax-ST
Experiment Name: MaxVal64-TrainLen16-L2H4D256_t5_IRFalse_hard-ST
Experiment Name: MaxVal64-TrainLen16-L2H4D256_alibi_IRTrue_topk-softmax-ST
Experiment Name: MaxVal64-TrainLen16-L2H4D256_alibi_IRT

In [8]:
# Submit every generated job script `n_trials` times after interactive confirmation.
n_trials = 4

confirm = input("Do you want to submit the jobs? (y/n): ")

# accept "y" regardless of case or surrounding whitespace
if confirm.strip().lower() == 'y':
    for trial in range(n_trials):
        print('Trial:', trial)
        for job_script in job_script_files:
            exit_status = os.system(f'sbatch "{job_script}"')
            if exit_status != 0:
                # surface failed submissions instead of silently ignoring them
                print(f'sbatch failed for {job_script} (exit status {exit_status})')
        print()
else:
    print("Not submitting jobs")

Trial: 0
Submitted batch job 82075
Submitted batch job 82076
Submitted batch job 82077
Submitted batch job 82078
Submitted batch job 82079
Submitted batch job 82080
Submitted batch job 82081
Submitted batch job 82082
Submitted batch job 82083
Submitted batch job 82084
Submitted batch job 82085
Submitted batch job 82086
Submitted batch job 82087
Submitted batch job 82088
Submitted batch job 82089
Submitted batch job 82090
Submitted batch job 82091
Submitted batch job 82092
Submitted batch job 82093
Submitted batch job 82094
Submitted batch job 82095
Submitted batch job 82096
Submitted batch job 82097
Submitted batch job 82098
Submitted batch job 82099
Submitted batch job 82100
Submitted batch job 82101
Submitted batch job 82102
Submitted batch job 82103
Submitted batch job 82104
Submitted batch job 82105
Submitted batch job 82106

Trial: 1
Submitted batch job 82107
Submitted batch job 82108
Submitted batch job 82109
Submitted batch job 82110
Submitted batch job 82111
Submitted batch job