In [1]:
import os
import itertools
from dotenv import load_dotenv
import copy
import time
import subprocess
import yaml


import os, sys; sys.path.insert(0, os.path.abspath('../..')) # add project root dir to path
from experiments.language_modeling.model import get_experiment_name
from utils.utils import AttributeDict

load_dotenv()

def mkdir(dir):
    """Create directory `dir` if it does not already exist.

    Missing parent directories are created as well, and an already-existing
    directory is not an error. This avoids the race between checking for
    existence and creating the directory.

    Args:
        dir: path of the directory to create.

    Raises:
        FileExistsError: if `dir` exists but is not a directory.
    """
    os.makedirs(dir, exist_ok=True)


In [2]:
# global job parameters

# directory (relative to this notebook) where the generated .job files are written
job_directory = "job_scripts"
# directory used in the `#SBATCH --output` path of every generated job script
out_dir = '.out'

# wall-clock limit written to `#SBATCH --time`
time_str = '00-24:00:00'
max_time = '00:23:55:00' # 5 minutes less than the time_str; this is the format PL uses

# Slurm resource request; each value is interpolated into an #SBATCH line
partition = 'gpu'
ntasks = 1
nodes = 1
cpu_per_gpu = 8
mem_per_cpu = 8  # written with a "G" suffix in the job script
n_gpus = 1

cluster = 'misha'

# Always bind gpus_constraints so that re-enabling the `--constraint` line in the
# job script cannot hit a NameError on clusters other than grace.
if cluster == 'grace':
    gpus_constraints = '"a100|rtx3090|v100|rtx2080ti"' # for grace
else:
    gpus_constraints = None
# gpus_constraints = "a40" #'"h100|a100"' # for misha

# NETID is read from the environment (populated by load_dotenv above).
netid = os.getenv('NETID')
if not netid:
    # Without this check the path below silently becomes "/home/None/...".
    raise RuntimeError("NETID environment variable is not set; add it to the environment or the .env file")
project_dir = f"/home/{netid}/project/neural-algorithmic-reasoning/experiments/language_modeling"

mkdir(job_directory)
mkdir(out_dir)

In [3]:
# Read the base model / train / data configs that every job is derived from.
import yaml
base_config_dir = f'{project_dir}/experiment_configs/base_config'


def _load_base_config(yaml_name):
    """Parse one YAML file from `base_config_dir` and wrap it in an AttributeDict."""
    with open(os.path.join(base_config_dir, yaml_name)) as config_file:
        return AttributeDict(yaml.load(config_file, Loader=yaml.FullLoader))


base_model_config = _load_base_config('model_config.yaml')
base_train_config = _load_base_config('train_config.yaml')
base_data_config = _load_base_config('data_config.yaml')

# per-experiment config directories are created under this path
config_out_dir = f'{project_dir}/experiment_configs'

In [4]:
# Sweep definition: each key maps to the list of values to try. The job list is the
# Cartesian product of these lists, so key order and value order determine job order.
model_configs = {
    'n_layers': [6],
    'd_model': [256],
    'dff_expansion': [2],
    'pos_enc_type': ['rotary', 'none'],  # , 't5', 'alibi', 'sinusoidal'
    'attn_score_fn': ['softmax'],  # softmax, 'adaptive-temperature-softmax', 'topk-softmax', 'hard', 'sigmoid']
    'norm_config': [
        {'norm_method': 'none', 'norm_type': 'rmsnorm'},
        {'norm_method': 'pre-norm', 'norm_type': 'rmsnorm'},
        {'norm_method': 'post-norm', 'norm_type': 'rmsnorm'},
        {'norm_method': 'pre+post-norm', 'norm_type': 'rmsnorm'},
        {'norm_method': 'hypersphere-interpolation', 'lerp_weight_constraint': 'none'},
        {'norm_method': 'hypersphere-spherical-interpolation', 'single_weight': True},
        {'norm_method': 'adaptive-hypersphere-interpolation', 'single_weight': True},
    ],
}

# W&B project name merged into every job's train config
wandb_project = 'language-modeling'

# sequence length written into every job's data config
sequence_length = 512

In [5]:
jobs_overwrite_params = []

# One list of (key, value) pairs per swept key; their product enumerates every combination.
sweep_axes = [[(name, choice) for choice in choices] for name, choices in model_configs.items()]

for model_config_update in itertools.product(*sweep_axes):
    # start every job from untouched copies of the base configs
    job_model_config = copy.deepcopy(base_model_config)
    job_train_config = copy.deepcopy(base_train_config)
    job_data_config = copy.deepcopy(base_data_config)

    # model config: apply this combination's values
    for name, choice in model_config_update:
        job_model_config[name] = choice

    # train config: set the W&B project and the training time limit
    job_train_config['wandb_config'] = job_train_config['wandb_config'] | {'wandb_project': wandb_project}
    job_train_config['max_time'] = max_time

    # data config: fixed sequence length for the whole sweep
    job_data_config['sequence_length'] = sequence_length

    jobs_overwrite_params.append(
        AttributeDict(
            dict(
                model_config=job_model_config,
                train_config=job_train_config,
                data_config=job_data_config,
            )
        )
    )

print('number of jobs', len(jobs_overwrite_params))


number of jobs 14


In [6]:
def create_job_config(config_upate, out_dir, uid=None):
    """Write the three config YAML files for one job and return the merged configs.

    Copies of the module-level base model/train/data configs are updated with the
    matching sections of `config_upate`, an experiment name is derived from them,
    and the configs are dumped into `<out_dir>/<experiment_name>/`.

    Args:
        config_upate: mapping that may contain 'model_config', 'train_config' and
            'data_config' entries; each is merged over the corresponding base config.
        out_dir: directory under which the experiment's config directory is created.
        uid: optional identifier; when given, the experiment name is prefixed
            with "UID<uid>-".

    Returns:
        Tuple of (model_config, train_config, data_config, experiment_name).
    """
    model_config = copy.deepcopy(base_model_config)
    train_config = copy.deepcopy(base_train_config)
    data_config = copy.deepcopy(base_data_config)

    # merge each section of the overrides over its base copy
    for section, target in (
        ('model_config', model_config),
        ('train_config', train_config),
        ('data_config', data_config),
    ):
        target.update(config_upate.get(section, {}))

    raw_name, _ = get_experiment_name(model_config, data_config, train_config)
    experiment_name = raw_name.replace(' ', '')
    if uid is not None:
        experiment_name = f"UID{uid}-{experiment_name}"

    mkdir(os.path.join(out_dir, experiment_name))

    # dump order: model, train, data
    for yaml_name, config in (
        ('model_config.yaml', model_config),
        ('train_config.yaml', train_config),
        ('data_config.yaml', data_config),
    ):
        with open(os.path.join(out_dir, f'{experiment_name}/{yaml_name}'), 'w') as f:
            yaml.dump(config.todict(), f)

    return model_config, train_config, data_config, experiment_name

In [7]:
def create_job_script(experiment_name):
    """Write the Slurm batch script for one experiment and return its path.

    The script is written to `<job_directory>/<experiment_name>.job` and is built
    from the module-level job parameters (partition, ntasks, nodes, cpu_per_gpu,
    mem_per_cpu, time_str, n_gpus, out_dir, cluster, project_dir).

    Args:
        experiment_name: name used for the job name, the output log file, the
            script file name, and the `experiment_configs/<experiment_name>`
            directory passed to train.py.

    Returns:
        Path of the written job script.

    Raises:
        ValueError: if `cluster` is not one of the supported clusters. This is
            checked before the file is opened so no partial script is left behind.
    """
    # Resolve the cluster-specific module line first, so an unsupported cluster
    # fails before any file is created.
    if cluster == 'grace':
        module_setup = "module restore python_env\n"  # load modules i need
    elif cluster == 'misha':
        module_setup = "module load miniconda\n"  # load modules i need
    else:
        raise ValueError(f"Cluster {cluster} not supported")

    script_lines = [
        "#!/bin/bash\n",
        f"#SBATCH --partition={partition}\n",
        f"#SBATCH --job-name={experiment_name}\n",
        f"#SBATCH --output={out_dir}/%j-{experiment_name}.out\n",
        f"#SBATCH --ntasks={ntasks} --nodes={nodes}\n",
        f"#SBATCH --cpus-per-gpu={cpu_per_gpu}\n",
        f"#SBATCH --mem-per-cpu={mem_per_cpu}G\n",
        f"#SBATCH --time={time_str}\n",
        "#SBATCH --mail-type=ALL\n",
        f"#SBATCH --gpus={n_gpus}\n",
        # f"#SBATCH --constraint={gpus_constraints}\n",
        '\n',
        'module load StdEnv\n',
        'export SLURM_EXPORT_ENV=ALL\n',
        '\n',
        module_setup,
        # "conda init\n",
        "conda activate neural_prog\n",  # activate conda environment
        "conda info --envs\n",  # print the conda environments to the job log
        '\n',
        "nvidia-smi -L\n",  # print gpu information
        '\n',
        f"cd {project_dir}\n",  # navigate to project directory
        '\n',
        # run python script
        f"srun python train.py --config_dir experiment_configs/{experiment_name}\n",
    ]

    filename = f'{job_directory}/{experiment_name}.job'
    with open(filename, 'w') as fh:
        fh.writelines(script_lines)

    return filename


In [8]:
job_script_files = []

for uid, job_params in enumerate(jobs_overwrite_params):
    # Only the experiment name is needed here. The returned per-job configs must NOT
    # be bound to base_model_config / base_train_config / base_data_config:
    # create_job_config deep-copies those globals as its starting point, so
    # overwriting them would make each job inherit the previous job's settings and
    # would leave the base configs corrupted for any cell that is re-run afterwards.
    *_, experiment_name = create_job_config(job_params, config_out_dir, uid=uid)

    print(f"Experiment Name: {experiment_name}")

    job_script = create_job_script(experiment_name)
    job_script_files.append(job_script)

Experiment Name: UID0-L6H8D256-rotary-none-WTTrue
Experiment Name: UID1-L6H8D256-rotary-pre-norm-WTTrue
Experiment Name: UID2-L6H8D256-rotary-post-norm-WTTrue
Experiment Name: UID3-L6H8D256-rotary-pre+post-norm-WTTrue
Experiment Name: UID4-L6H8D256-rotary-hypersphere-interpolation-WTTrue
Experiment Name: UID5-L6H8D256-rotary-hypersphere-spherical-interpolation-WTTrue
Experiment Name: UID6-L6H8D256-rotary-adaptive-hypersphere-interpolation-WTTrue
Experiment Name: UID7-L6H8D256-none-none-WTTrue
Experiment Name: UID8-L6H8D256-none-pre-norm-WTTrue
Experiment Name: UID9-L6H8D256-none-post-norm-WTTrue
Experiment Name: UID10-L6H8D256-none-pre+post-norm-WTTrue
Experiment Name: UID11-L6H8D256-none-hypersphere-interpolation-WTTrue
Experiment Name: UID12-L6H8D256-none-hypersphere-spherical-interpolation-WTTrue
Experiment Name: UID13-L6H8D256-none-adaptive-hypersphere-interpolation-WTTrue


In [9]:
wait_time = 0.5 # number of seconds to wait between job submissions
n_trials = 1

confirm = input("Do you want to submit the jobs? (y/n): ")

# one CompletedProcess per sbatch call, inspected by the next cell
responses = []

if confirm != 'y':
    print("Not submitting jobs")
else:
    for ir in range(n_trials):
        print('Trial:', ir)
        for job_script in job_script_files:
            response = subprocess.run(['sbatch', job_script], capture_output=True)
            sbatch_stdout = response.stdout.decode('utf-8').strip()
            print(f"response: {sbatch_stdout}, return_code={response.returncode}, job_script={job_script}")
            responses.append(response)
            # pause between consecutive sbatch calls
            time.sleep(wait_time)
        print()

Trial: 0
response: Submitted batch job 137401, return_code=0, job_script=job_scripts/UID0-L6H8D256-rotary-none-WTTrue.job
response: Submitted batch job 137402, return_code=0, job_script=job_scripts/UID1-L6H8D256-rotary-pre-norm-WTTrue.job
response: Submitted batch job 137403, return_code=0, job_script=job_scripts/UID2-L6H8D256-rotary-post-norm-WTTrue.job
response: Submitted batch job 137404, return_code=0, job_script=job_scripts/UID3-L6H8D256-rotary-pre+post-norm-WTTrue.job
response: Submitted batch job 137405, return_code=0, job_script=job_scripts/UID4-L6H8D256-rotary-hypersphere-interpolation-WTTrue.job
response: Submitted batch job 137406, return_code=0, job_script=job_scripts/UID5-L6H8D256-rotary-hypersphere-spherical-interpolation-WTTrue.job
response: Submitted batch job 137407, return_code=0, job_script=job_scripts/UID6-L6H8D256-rotary-adaptive-hypersphere-interpolation-WTTrue.job
response: Submitted batch job 137408, return_code=0, job_script=job_scripts/UID7-L6H8D256-none-none-

In [10]:
# Report every sbatch call that did not succeed: success means a zero return code
# and stdout beginning with "Submitted batch job".
for response in responses:
    stdout_text = response.stdout.decode('utf-8')
    submitted = stdout_text.startswith('Submitted batch job') and response.returncode == 0
    if submitted:
        continue
    print(f"Failed to submit job: {stdout_text}")
    print(f"stderr: {response.stderr.decode('utf-8')}")
    print(f"Full response: {response}")
    print()