In [15]:
import itertools
import os
import subprocess

In [16]:
def mkdir(dir):
    """Create directory `dir` if it does not already exist.

    Missing parent directories are created as well, and calling this on an
    existing directory is a no-op.

    Args:
        dir: path of the directory to create.

    Raises:
        FileExistsError: if `dir` exists but is not a directory.
    """
    # exist_ok=True removes the check-then-create race of
    # `if not os.path.exists(dir): os.mkdir(dir)`, and makedirs (unlike mkdir)
    # does not fail when an intermediate directory is missing.
    os.makedirs(dir, exist_ok=True)

In [17]:
# ---- settings shared by every generated Slurm job script ----

# -- locations --
# generated .job files go in job_directory; Slurm stdout logs go in out_dir
job_directory = "vision_cifar10"
# job_directory = "vision_imagenet"
out_dir = f"{job_directory}/.out"
# directory each job script `cd`s into before running train.py
# project_dir = "/home/ma2393/project/abstract_transformer/experiments/vision_cifar10"
project_dir = "/gpfs/gibbs/project/lafferty/ma2393/abstract_transformer/experiments/vision_cifar10"

# -- scheduling --
partition = "gpu,scavenge_gpu"  # alternative: 'gpu'
time_str = "00-08:00:00"  # written to `#SBATCH --time`

# -- resources requested per job --
nodes, ntasks = 1, 1
n_gpus = 1
cpu_per_gpu = 8
mem_per_cpu = 4  # written to the job script as `{mem_per_cpu}G`
gpus_constraints = '"a100|rtx3090|v100|rtx5000|rtx2080ti"'  # all gpus are pretty good now
# gpus_constraints = None  # '"h100|a100"'

# make sure both output locations exist before any job file is written
mkdir(job_directory)
mkdir(out_dir)

In [18]:
wandb_project = 'dat-cifar'


def _relational_job(n_relations, symmetric_rels, n_kv_heads=None):
    """Build one cifar10 relational-attention configuration (sa=6, ra=6).

    `n_kv_heads` is only added to the configuration when it is given, so
    configurations without it leave that key absent.
    """
    job = dict(
        dataset='cifar10',
        autoaugment=True,
        sa=6,
        ra=6,
        n_relations=n_relations,
        ra_type='relational_attention',
        symbol_type='position_relative',
        symmetric_rels=symmetric_rels,
    )
    if n_kv_heads is not None:
        job['n_kv_heads'] = n_kv_heads
    return job


jobs_params = [
    # baseline: no sa/ra overrides
    dict(dataset='cifar10', autoaugment=True),

    # asymmetric relations, sweeping the number of relations
    *(_relational_job(n_rel, 0, n_kv_heads=3) for n_rel in (6, 12, 24)),

    # symmetric relations: one run without the n_kv_heads override, then the same sweep
    _relational_job(6, 1),
    *(_relational_job(n_rel, 1, n_kv_heads=3) for n_rel in (6, 12, 24)),
    # disabled: symmetric_rels=1 with n_relations in (12, 24) and no n_kv_heads override
]

# disabled sweep: for every config that has 'ra', also add copies with
# symbol_type = 'positional_symbols' and symbol_type = 'symbolic_attention'

# repeat every cifar10 configuration on cifar100 (copies, so the originals are untouched)
jobs_params.extend([{**job, 'dataset': 'cifar100'} for job in jobs_params])

# every run uses mean pooling
for job in jobs_params:
    job['pool'] = 'mean'

In [19]:
# sanity check: number of job configurations (each cifar10 entry plus its cifar100 copy)
len(jobs_params)

16

In [20]:
# global config parameters (applied to every job)
n_epochs = 100  # forwarded to train.py as `--max-epochs`
# currently disabled settings, kept for reference:
# max_steps = -1
# log_to_wandb = 1
# patch_size = 16

In [21]:
# create jobs: write one Slurm batch script per entry of jobs_params

# (params key, name prefix) pairs that are appended to the job name, in order,
# for every key present in a parameter set
_JOB_NAME_PARTS = (
    ('sa', 'sa'),
    ('ra', 'ra'),
    ('n_relations', 'nr'),
    ('symmetric_rels', 'symrel'),
    ('symbol_type', ''),
    ('n_kv_heads', 'n_kv_heads'),
)

# params keys forwarded to train.py as `--<key> <value>`, in command-line order
_TRAIN_VALUE_FLAGS = (
    'sa', 'ra', 'n_relations', 'ra_type', 'symbol_type',
    'symmetric_rels', 'n_symbols', 'n_kv_heads', 'pool',
)


def _job_name(params):
    """Return the Slurm job name (also the .job file stem) for one parameter set."""
    name = f"vision-{params['dataset']}"
    # only advertise autoaugment in the name when the run actually enables it
    if params.get('autoaugment', False):
        name += "-autoaugment"
    for key, prefix in _JOB_NAME_PARTS:
        if key in params:
            name += f"-{prefix}{params[key]}"
    return name


def _train_command(params):
    """Return the single-line `python train.py ...` command for one parameter set."""
    args = ["python train.py", f"--dataset {params['dataset']}"]
    if params.get('autoaugment', False):
        args.append("--autoaugment")
    args.extend(f"--{key} {params[key]}" for key in _TRAIN_VALUE_FLAGS if key in params)
    # NOTE(review): `--max-epochs` is hyphenated unlike the other flags —
    # confirm it matches train.py's argument parser
    args.append(f"--max-epochs {n_epochs}")
    args.append(f"--wandb_project {wandb_project}")
    return " ".join(args)


def _job_script(params, job_name):
    """Return the full text of the Slurm batch script for one parameter set."""
    lines = [
        "#!/bin/bash",
        f"#SBATCH --partition={partition}",
        f"#SBATCH --job-name={job_name}",
        f"#SBATCH --output={out_dir}/%j-{job_name}.out",
        f"#SBATCH --ntasks={ntasks} --nodes={nodes}",
        f"#SBATCH --cpus-per-gpu={cpu_per_gpu}",
        f"#SBATCH --mem-per-cpu={mem_per_cpu}G",
        f"#SBATCH --time={time_str}",
        "#SBATCH --mail-type=ALL",
        f"#SBATCH --gpus={n_gpus}",
        # disabled options, kept for reference:
        # "#SBATCH --reservation=h100",
        # f"#SBATCH -C {gpus_constraints}",
        "",
        "module load StdEnv",
        "export SLURM_EXPORT_ENV=ALL",
        "",
        "module restore python_env",  # load modules i need
        # "module load miniconda",
        # "conda init",
        "conda activate abstract_transformer",  # activate conda environment
        "conda info --envs",  # record the conda environments in the job log
        "",
        "nvidia-smi -L",  # print gpu information
        "",
        f"cd {project_dir}",  # navigate to project directory
        _train_command(params),  # run python script
    ]
    # end the file with a newline so the last command is a complete line
    return "\n".join(lines) + "\n"


created_jobs = []
for params in jobs_params:
    job_name = _job_name(params)
    job_file = os.path.join(job_directory, f"{job_name}.job")

    # The name only encodes some of the parameters; two parameter sets that map
    # to the same name would overwrite each other's script and the survivor
    # would be submitted twice.
    if job_file in created_jobs:
        raise ValueError(
            f"job name collision: {job_file!r} is generated by more than one "
            f"entry of jobs_params (latest: {params!r})"
        )

    with open(job_file, 'w') as fh:
        fh.write(_job_script(params, job_name))

    created_jobs.append(job_file)

In [22]:
# paths of the generated .job files, in the order they will be submitted
created_jobs

['vision_cifar10/vision-cifar10-autoaugment.job',
 'vision_cifar10/vision-cifar10-autoaugment-sa6-ra6-nr6-symrel0-position_relative-n_kv_heads3.job',
 'vision_cifar10/vision-cifar10-autoaugment-sa6-ra6-nr12-symrel0-position_relative-n_kv_heads3.job',
 'vision_cifar10/vision-cifar10-autoaugment-sa6-ra6-nr24-symrel0-position_relative-n_kv_heads3.job',
 'vision_cifar10/vision-cifar10-autoaugment-sa6-ra6-nr6-symrel1-position_relative.job',
 'vision_cifar10/vision-cifar10-autoaugment-sa6-ra6-nr6-symrel1-position_relative-n_kv_heads3.job',
 'vision_cifar10/vision-cifar10-autoaugment-sa6-ra6-nr12-symrel1-position_relative-n_kv_heads3.job',
 'vision_cifar10/vision-cifar10-autoaugment-sa6-ra6-nr24-symrel1-position_relative-n_kv_heads3.job',
 'vision_cifar10/vision-cifar100-autoaugment.job',
 'vision_cifar10/vision-cifar100-autoaugment-sa6-ra6-nr6-symrel0-position_relative-n_kv_heads3.job',
 'vision_cifar10/vision-cifar100-autoaugment-sa6-ra6-nr12-symrel0-position_relative-n_kv_heads3.job',
 'vi

In [23]:
# number of times each generated job script is submitted
n_trials = 5

# Submitting is guarded by an explicit confirmation because it queues
# n_trials * len(created_jobs) jobs on the cluster.
confirm = input("CONTINUE TO RUN ALL JOBS? (enter 'Y' or 'y')")
if confirm in ("Y", "y"):
    failed_submissions = []
    for trial in range(n_trials):
        for job in created_jobs:
            # Argument list instead of a shell string: the job path is passed to
            # sbatch verbatim, and the exit status is checked instead of ignored.
            # Raises FileNotFoundError if sbatch is not on PATH.
            completed = subprocess.run(['sbatch', job])
            if completed.returncode != 0:
                failed_submissions.append((trial, job, completed.returncode))

    if failed_submissions:
        n_submissions = n_trials * len(created_jobs)
        print(f"{len(failed_submissions)} of {n_submissions} submissions failed:")
        for failed_trial, failed_job, returncode in failed_submissions:
            print(f"  trial {failed_trial}: {failed_job} (sbatch exit code {returncode})")
else:
    print("did not run jobs since you did not confirm.")

Submitted batch job 1399879
Submitted batch job 1399880
Submitted batch job 1399881
Submitted batch job 1399882
Submitted batch job 1399883
Submitted batch job 1399884
Submitted batch job 1399885
Submitted batch job 1399886
Submitted batch job 1399887
Submitted batch job 1399888
Submitted batch job 1399889
Submitted batch job 1399890
Submitted batch job 1399891
Submitted batch job 1399892
Submitted batch job 1399893
Submitted batch job 1399894
Submitted batch job 1399895
Submitted batch job 1399896
Submitted batch job 1399897
Submitted batch job 1399898
Submitted batch job 1399899
Submitted batch job 1399900
Submitted batch job 1399901
Submitted batch job 1399902
Submitted batch job 1399903
Submitted batch job 1399904
Submitted batch job 1399905
Submitted batch job 1399906
Submitted batch job 1399907
Submitted batch job 1399908
Submitted batch job 1399909
Submitted batch job 1399910
Submitted batch job 1399911
Submitted batch job 1399912
Submitted batch job 1399913
Submitted batch job 