In [1]:
import itertools
import os
import subprocess

In [2]:
def mkdir(dir):
    """Create the directory `dir` if it does not already exist.

    Missing parent directories are created as well, and calling this on a
    directory that already exists is a no-op.

    Args:
        dir: path of the directory to create.

    Raises:
        FileExistsError: if `dir` exists but is not a directory.
        OSError: for any other failure to create the directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which was
    # racy (the directory could appear between the check and the mkdir call).
    os.makedirs(dir, exist_ok=True)

In [3]:
# global job parameters
# Settings shared by every SLURM batch script generated by this notebook.

# --- output locations ---
job_directory = "vision_cifar10"    # generated .job scripts are written here
out_dir = f'{job_directory}/.out'   # target of the scripts' `#SBATCH --output`

# --- resource request (one `#SBATCH` directive each) ---
partition = 'gpu'
time_str = '00-4:00:00'
nodes = 1
ntasks = 1
cpu_per_task = 8
mem_per_cpu = 2                     # written with a "G" suffix in the job script
n_gpus = 1
# gpus_constraints = '"a100|rtx3090|v100|rtx2080ti"' # all gpus are pretty good now

# --- directory each job `cd`s into before launching the training script ---
project_dir = "/home/ma2393/scratch/abstract_transformer/experiments/vision"

# make sure both output locations exist (parent first, then the nested one)
for _path in (job_directory, out_dir):
    mkdir(_path)

In [4]:
# model params
# Options swept for every architecture that has at least one `rca` head.
symbol_types = ['pos_relative', 'sym_attn', 'pos_sym_retrieval']
disentangled_rcas = [0, 1]
patch_size = 8

# Base architectures; comment entries in or out to choose what gets swept.
model_params = [
    # {'d_model': 256, 'n_layers': 2, 'sa': 8, 'rca': 0},
    # {'d_model': 256, 'n_layers': 2, 'sa': 6, 'rca': 2},
    # {'d_model': 256, 'n_layers': 2, 'sa': 4, 'rca': 4},
    {'d_model': 256, 'n_layers': 4, 'sa': 8, 'rca': 0},
    {'d_model': 256, 'n_layers': 4, 'sa': 6, 'rca': 2},
    {'d_model': 256, 'n_layers': 4, 'sa': 4, 'rca': 4},
    # {'d_model': 768, 'n_layers': 8, 'sa': 12, 'rca': 0},
    # {'d_model': 768, 'n_layers': 8, 'sa': 8, 'rca': 4},
    # {'d_model': 768, 'n_layers': 8, 'sa': 6, 'rca': 6},
    # {'d_model': 768, 'n_layers': 12, 'sa': 12, 'rca': 0},
    # {'d_model': 768, 'n_layers': 12, 'sa': 8, 'rca': 4},
    # {'d_model': 768, 'n_layers': 12, 'sa': 6, 'rca': 6},
]

# Expand each base architecture into concrete job configurations.
jobs_params = []
for mparams in model_params:
    if mparams['rca'] == 0:
        # No rca heads: the rca-specific options do not apply, so emit a single
        # job carrying placeholder values for them.
        jobs_params.append({**mparams, 'disentangled_rca': -1, 'symbol_type': 'NA'})
        continue

    # Otherwise emit one job per (disentangled_rca, symbol_type) combination,
    # with disentangled_rca varying slowest.
    jobs_params.extend(
        {**mparams, 'disentangled_rca': rca_flag, "symbol_type": sym_kind}
        for rca_flag, sym_kind in itertools.product(disentangled_rcas, symbol_types)
    )

In [5]:
# Display the expanded job configurations for a visual sanity check.
jobs_params

[{'d_model': 256,
  'n_layers': 4,
  'sa': 8,
  'rca': 0,
  'disentangled_rca': -1,
  'symbol_type': 'NA'},
 {'d_model': 256,
  'n_layers': 4,
  'sa': 6,
  'rca': 2,
  'disentangled_rca': 0,
  'symbol_type': 'pos_relative'},
 {'d_model': 256,
  'n_layers': 4,
  'sa': 6,
  'rca': 2,
  'disentangled_rca': 0,
  'symbol_type': 'sym_attn'},
 {'d_model': 256,
  'n_layers': 4,
  'sa': 6,
  'rca': 2,
  'disentangled_rca': 0,
  'symbol_type': 'pos_sym_retrieval'},
 {'d_model': 256,
  'n_layers': 4,
  'sa': 6,
  'rca': 2,
  'disentangled_rca': 1,
  'symbol_type': 'pos_relative'},
 {'d_model': 256,
  'n_layers': 4,
  'sa': 6,
  'rca': 2,
  'disentangled_rca': 1,
  'symbol_type': 'sym_attn'},
 {'d_model': 256,
  'n_layers': 4,
  'sa': 6,
  'rca': 2,
  'disentangled_rca': 1,
  'symbol_type': 'pos_sym_retrieval'},
 {'d_model': 256,
  'n_layers': 4,
  'sa': 4,
  'rca': 4,
  'disentangled_rca': 0,
  'symbol_type': 'pos_relative'},
 {'d_model': 256,
  'n_layers': 4,
  'sa': 4,
  'rca': 4,
  'disentangl

In [6]:
# Number of job configurations, i.e. how many job scripts will be generated.
len(jobs_params)

13

In [7]:
# global config parameters
# Training settings that are identical for every job; each one is forwarded as
# a command-line flag to train_vision_model.py in the generated job scripts.
n_epochs = 50  # forwarded as --n_epochs
# max_steps = -1
log_to_wandb = 1  # forwarded as --log_to_wandb

In [8]:
# create jobs
# Write one SLURM batch script per entry of `jobs_params` into `job_directory`
# and collect the script paths in `created_jobs` for the submission cell.
created_jobs = []
for params in jobs_params:

    # The job name encodes every swept hyperparameter; it is used for the
    # script's file name, the SLURM job name, and the log file name.
    job_name = (f"vision-CIFAR10-d{params['d_model']}-sa{params['sa']}-rca{params['rca']}-L{params['n_layers']}"
        f"-dis_rca{params['disentangled_rca']}-{params['symbol_type']}")

    job_file = os.path.join(job_directory, f"{job_name}.job")

    # run python script: the whole command ends up on a single line of the script
    train_cmd = (
        "python train_vision_model.py "
        f"--d_model {params['d_model']} --sa {params['sa']} --rca {params['rca']} --n_layers {params['n_layers']} "
        f"--disentangled_rca {params['disentangled_rca']} --symbol_type {params['symbol_type']} --patch_size {patch_size} "
        f"--n_epochs {n_epochs} --log_to_wandb {log_to_wandb}"
    )

    # One entry per line of the generated script; "" produces a blank line.
    script_lines = [
        "#!/bin/bash",
        f"#SBATCH --partition={partition}",
        f"#SBATCH --job-name={job_name}",
        f"#SBATCH --output={out_dir}/%j-{job_name}.out",
        f"#SBATCH --ntasks={ntasks} --nodes={nodes}",
        f"#SBATCH --cpus-per-task={cpu_per_task}",
        f"#SBATCH --mem-per-cpu={mem_per_cpu}G",
        f"#SBATCH --time={time_str}",
        "#SBATCH --mail-type=ALL",
        f"#SBATCH --gpus={n_gpus}",
        # f"#SBATCH -C {gpus_constraints}",  # --gpus={n_gpus}
        "",
        "module load StdEnv",
        "export SLURM_EXPORT_ENV=ALL",
        "",
        # "module restore python_env",  # load modules i need
        "module load miniconda",  # load modules i need
        # "conda init",
        "conda activate abstract_transformer",  # activate conda environment
        "conda info --envs",  # record the conda environments in the job log
        "",
        "nvidia-smi -L",  # print gpu information
        "",
        f"cd {project_dir}",  # navigate to project directory
        train_cmd,
    ]

    # writelines() expects an iterable of lines; the original passed bare
    # strings, which only worked because a str iterates character by character.
    with open(job_file, 'w') as fh:
        fh.writelines(f"{line}\n" for line in script_lines)

    created_jobs.append(job_file)

In [9]:
# Display the paths of the job scripts that were just written.
created_jobs

['vision_cifar10/vision-CIFAR10-d256-sa8-rca0-L4-dis_rca-1-NA.job',
 'vision_cifar10/vision-CIFAR10-d256-sa6-rca2-L4-dis_rca0-pos_relative.job',
 'vision_cifar10/vision-CIFAR10-d256-sa6-rca2-L4-dis_rca0-sym_attn.job',
 'vision_cifar10/vision-CIFAR10-d256-sa6-rca2-L4-dis_rca0-pos_sym_retrieval.job',
 'vision_cifar10/vision-CIFAR10-d256-sa6-rca2-L4-dis_rca1-pos_relative.job',
 'vision_cifar10/vision-CIFAR10-d256-sa6-rca2-L4-dis_rca1-sym_attn.job',
 'vision_cifar10/vision-CIFAR10-d256-sa6-rca2-L4-dis_rca1-pos_sym_retrieval.job',
 'vision_cifar10/vision-CIFAR10-d256-sa4-rca4-L4-dis_rca0-pos_relative.job',
 'vision_cifar10/vision-CIFAR10-d256-sa4-rca4-L4-dis_rca0-sym_attn.job',
 'vision_cifar10/vision-CIFAR10-d256-sa4-rca4-L4-dis_rca0-pos_sym_retrieval.job',
 'vision_cifar10/vision-CIFAR10-d256-sa4-rca4-L4-dis_rca1-pos_relative.job',
 'vision_cifar10/vision-CIFAR10-d256-sa4-rca4-L4-dis_rca1-sym_attn.job',
 'vision_cifar10/vision-CIFAR10-d256-sa4-rca4-L4-dis_rca1-pos_sym_retrieval.job']

In [10]:
# Submit every generated job script with `sbatch`, but only after the user
# explicitly confirms, so re-running the notebook cannot queue jobs by accident.
confirm = input("CONTINUE TO RUN ALL JOBS? (enter 'Y' or 'y')")
if confirm.strip() in ("Y", "y"):
    for job in created_jobs:
        try:
            # Argument-list form: no shell is involved, so the script path is
            # handed to sbatch verbatim whatever characters it contains.
            result = subprocess.run(["sbatch", job], capture_output=True, text=True)
        except OSError as err:
            # sbatch itself could not be started (e.g. it is not on PATH);
            # the remaining submissions would fail the same way, so stop here.
            print(f"could not execute sbatch for {job}: {err}")
            break

        # Echo sbatch's own message (e.g. the "Submitted batch job ..." line).
        if result.stdout:
            print(result.stdout, end="")
        # os.system() silently ignored failures; report them explicitly.
        if result.returncode != 0:
            print(f"sbatch failed for {job} (exit code {result.returncode}): {result.stderr.strip()}")
else:
    print("did not run jobs since you did not confirm.")

Submitted batch job 3720
Submitted batch job 3721
Submitted batch job 3722
Submitted batch job 3723
Submitted batch job 3724
Submitted batch job 3725
Submitted batch job 3726
Submitted batch job 3727
Submitted batch job 3728
Submitted batch job 3729
Submitted batch job 3730
Submitted batch job 3731
Submitted batch job 3732
