In [1]:
import itertools
import os
import subprocess

In [2]:
def mkdir(dir):
    """Create the directory `dir` if it does not already exist.

    Missing parent directories are created as well, and calling this on an
    existing directory is a no-op. Using ``exist_ok=True`` avoids the
    check-then-create race of testing ``os.path.exists`` first.

    Parameters
    ----------
    dir : str or path-like
        Path of the directory to create.

    Raises
    ------
    FileExistsError
        If `dir` exists but is not a directory.
    """
    os.makedirs(dir, exist_ok=True)

In [3]:
# global job parameters

# --- locations -------------------------------------------------------------
# Generated .job scripts are written to `job_directory`; Slurm log files go to `out_dir`.
job_directory = "vision_cifar10"
# job_directory = "vision_imagenet"
out_dir = f'{job_directory}/.out'
# Directory the job script changes into before launching main.py.
project_dir = "/home/ma2393/project/abstract_transformer/experiments/vision_cifar10"

# --- Slurm resource request --------------------------------------------------
partition = 'gpu'
time_str = '00-08:00:00'  # passed verbatim to `#SBATCH --time`
nodes = 1
ntasks = 1
n_gpus = 1
cpu_per_gpu = 8
mem_per_cpu = 16  # written to the script as `{mem_per_cpu}G`

# Optional GPU constraint string; currently disabled (None).
# gpus_constraints = '"a100|rtx3090|v100|rtx2080ti"' # all gpus are pretty good now
gpus_constraints = None # '"h100|a100"' # all gpus are pretty good now

# Make sure both output locations exist before any job file is written.
for _required_dir in (job_directory, out_dir):
    mkdir(_required_dir)

In [4]:
wandb_project = 'dat-cifar10'


def _relational_job(dataset, n_relations, symbol_type):
    """Return the parameter dict for one relational-attention run.

    Every relational run in this sweep shares autoaugment=True, sa=6, ra=6,
    ra_type='relational_attention' and symmetric_rels=1; only the dataset,
    the number of relations and the symbol type vary.
    """
    return dict(
        dataset=dataset,
        autoaugment=True,
        sa=6,
        ra=6,
        n_relations=n_relations,
        ra_type='relational_attention',
        symbol_type=symbol_type,
        symmetric_rels=1,
    )


# One dict per job. The first entry for each dataset sets only
# `dataset`/`autoaugment`; the rest are relational-attention variants.
jobs_params = [
    # CIFAR-10
    {'dataset': 'cifar10', 'autoaugment': True},
    _relational_job('cifar10', 6, 'position_relative'),
    _relational_job('cifar10', 6, 'positional_symbols'),
    _relational_job('cifar10', 6, 'symbolic_attention'),
    _relational_job('cifar10', 32, 'symbolic_attention'),

    # CIFAR-100
    {'dataset': 'cifar100', 'autoaugment': True},
    _relational_job('cifar100', 6, 'position_relative'),
    _relational_job('cifar100', 16, 'position_relative'),
    _relational_job('cifar100', 6, 'positional_symbols'),
    _relational_job('cifar100', 6, 'symbolic_attention'),
    _relational_job('cifar100', 32, 'symbolic_attention'),
]

In [5]:
jobs_params

[{'dataset': 'cifar10', 'autoaugment': True},
 {'dataset': 'cifar10',
  'autoaugment': True,
  'sa': 6,
  'ra': 6,
  'n_relations': 6,
  'ra_type': 'relational_attention',
  'symbol_type': 'position_relative',
  'symmetric_rels': 1},
 {'dataset': 'cifar10',
  'autoaugment': True,
  'sa': 6,
  'ra': 6,
  'n_relations': 6,
  'ra_type': 'relational_attention',
  'symbol_type': 'positional_symbols',
  'symmetric_rels': 1},
 {'dataset': 'cifar10',
  'autoaugment': True,
  'sa': 6,
  'ra': 6,
  'n_relations': 6,
  'ra_type': 'relational_attention',
  'symbol_type': 'symbolic_attention',
  'symmetric_rels': 1},
 {'dataset': 'cifar10',
  'autoaugment': True,
  'sa': 6,
  'ra': 6,
  'n_relations': 32,
  'ra_type': 'relational_attention',
  'symbol_type': 'symbolic_attention',
  'symmetric_rels': 1},
 {'dataset': 'cifar100', 'autoaugment': True},
 {'dataset': 'cifar100',
  'autoaugment': True,
  'sa': 6,
  'ra': 6,
  'n_relations': 6,
  'ra_type': 'relational_attention',
  'symbol_type': 'positi

In [6]:
len(jobs_params)

11

In [7]:
# global config parameters
# Number of training epochs; forwarded to main.py as `--max-epochs` in every generated job script.
n_epochs = 100
# Disabled options, kept for reference (not passed to main.py):
# max_steps = -1
# log_to_wandb = 1
# patch_size = 16

In [8]:
# create jobs

def _job_name(params):
    """Build the job name (also used as the .job file stem) for one parameter set.

    The name encodes the dataset, whether autoaugment is enabled and, when
    present, the `sa`, `ra`, `n_relations` and `symbol_type` settings.
    """
    parts = [f"vision-{params['dataset']}"]
    # Only advertise autoaugment in the name when the flag is actually on.
    if params.get('autoaugment', False):
        parts.append("autoaugment")
    for key, prefix in (("sa", "sa"), ("ra", "ra"), ("n_relations", "nr"), ("symbol_type", "")):
        if key in params:
            parts.append(f"{prefix}{params[key]}")
    return "-".join(parts)


def _training_command(params):
    """Return the `python main.py ...` command line for one parameter set.

    Reads the notebook-level `n_epochs` and `wandb_project` settings.
    """
    args = ["python", "main.py", "--dataset", str(params['dataset'])]
    if params.get('autoaugment', False):
        args.append("--autoaugment")
    # Optional flags are emitted only when the key is present, in a fixed order.
    for key in ("sa", "ra", "n_relations", "ra_type", "symbol_type", "symmetric_rels", "n_symbols"):
        if key in params:
            args += [f"--{key}", str(params[key])]
    # NOTE(review): `--max-epochs` is hyphenated unlike the other flags -- confirm against main.py.
    args += ["--max-epochs", str(n_epochs), "--wandb_project", str(wandb_project)]
    return " ".join(args)


def _job_script(job_name, params):
    """Return the full text of the Slurm batch script for one job.

    Reads the notebook-level Slurm settings (`partition`, `out_dir`, `ntasks`,
    `nodes`, `cpu_per_gpu`, `mem_per_cpu`, `time_str`, `n_gpus`,
    `gpus_constraints`) and `project_dir`.
    """
    lines = [
        "#!/bin/bash",
        f"#SBATCH --partition={partition}",
        f"#SBATCH --job-name={job_name}",
        f"#SBATCH --output={out_dir}/%j-{job_name}.out",
        f"#SBATCH --ntasks={ntasks} --nodes={nodes}",
        f"#SBATCH --cpus-per-gpu={cpu_per_gpu}",
        f"#SBATCH --mem-per-cpu={mem_per_cpu}G",
        f"#SBATCH --time={time_str}",
        "#SBATCH --mail-type=ALL",
        f"#SBATCH --gpus={n_gpus}",
        # "#SBATCH --reservation=h100",  # NOTE: h100 reservation, currently disabled
    ]
    # Emit the GPU constraint only when one is configured (None/empty = no constraint).
    if gpus_constraints:
        lines.append(f"#SBATCH -C {gpus_constraints}")
    lines += [
        "",
        "module load StdEnv",
        "export SLURM_EXPORT_ENV=ALL",
        "",
        "module load miniconda",                # load required modules
        "conda activate abstract_transformer",  # activate conda environment
        "conda info --envs",                    # log the available/active environments
        "",
        "nvidia-smi -L",                        # print gpu information
        "",
        f"cd {project_dir}",                    # navigate to project directory
        _training_command(params),              # run python script
    ]
    # Terminate the last line too, so the script ends with a newline.
    return "\n".join(lines) + "\n"


# Resolve every job name up front: two parameter sets that map to the same
# name would otherwise silently overwrite each other's .job file.
job_names = [_job_name(params) for params in jobs_params]
duplicate_names = sorted({name for name in job_names if job_names.count(name) > 1})
if duplicate_names:
    raise ValueError(
        f"multiple entries in jobs_params produce the same job name: {duplicate_names}"
    )

created_jobs = []
for job_name, params in zip(job_names, jobs_params):
    job_file = os.path.join(job_directory, f"{job_name}.job")
    with open(job_file, 'w') as fh:
        fh.write(_job_script(job_name, params))
    created_jobs.append(job_file)

In [9]:
created_jobs

['vision_cifar10/vision-cifar10-autoaugment.job',
 'vision_cifar10/vision-cifar10-autoaugment-sa6-ra6-nr6-position_relative.job',
 'vision_cifar10/vision-cifar10-autoaugment-sa6-ra6-nr6-positional_symbols.job',
 'vision_cifar10/vision-cifar10-autoaugment-sa6-ra6-nr6-symbolic_attention.job',
 'vision_cifar10/vision-cifar10-autoaugment-sa6-ra6-nr32-symbolic_attention.job',
 'vision_cifar10/vision-cifar100-autoaugment.job',
 'vision_cifar10/vision-cifar100-autoaugment-sa6-ra6-nr6-position_relative.job',
 'vision_cifar10/vision-cifar100-autoaugment-sa6-ra6-nr16-position_relative.job',
 'vision_cifar10/vision-cifar100-autoaugment-sa6-ra6-nr6-positional_symbols.job',
 'vision_cifar10/vision-cifar100-autoaugment-sa6-ra6-nr6-symbolic_attention.job',
 'vision_cifar10/vision-cifar100-autoaugment-sa6-ra6-nr32-symbolic_attention.job']

In [10]:
# Ask for explicit confirmation, then submit every generated job file with sbatch.
confirm = input("CONTINUE TO RUN ALL JOBS? (enter 'Y' or 'y')")
if confirm.strip() in ("Y", "y"):
    for job in created_jobs:
        try:
            # Argument list instead of a shell string: the job path is passed to
            # sbatch verbatim, with no shell parsing of its characters.
            result = subprocess.run(
                ['sbatch', job],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
            )
        except OSError as err:
            # sbatch could not be started at all (e.g. not on PATH); further
            # submissions would fail the same way, so stop here.
            print(f"could not launch sbatch for {job}: {err}")
            break
        # Echo sbatch's own output (e.g. "Submitted batch job <id>") into the cell.
        print(result.stdout, end='')
        if result.returncode != 0:
            # Report failed submissions instead of ignoring the exit status.
            print(f"sbatch failed for {job} (exit code {result.returncode})")
else:
    print("did not run jobs since you did not confirm.")

Submitted batch job 32123
Submitted batch job 32124
Submitted batch job 32125
Submitted batch job 32126
Submitted batch job 32127
Submitted batch job 32128
Submitted batch job 32129
Submitted batch job 32130
Submitted batch job 32131
Submitted batch job 32132
Submitted batch job 32133
