In [1]:
import os
import itertools
import time

In [2]:
def mkdir(dir):
    if not os.path.exists(dir):
        os.mkdir(dir)

In [3]:
# global job parameters

# job_directory = f"vision_cifar10"
job_directory = f"relational_games"
out_dir = f'{job_directory}/.out'
time_str = '00-02:00:00'
partition = 'gpu'
ntasks = 1
nodes = 1
cpu_per_task = 8
mem_per_cpu = 2
n_gpus = 1
# gpus_constraints = '"a100|rtx3090|v100|rtx2080ti"' # all gpus are pretty good now
project_dir = "/home/ma2393/project/abstract_transformer/experiments/relational_games"

mkdir(job_directory)
mkdir(out_dir)

In [4]:
# model params
model_params = [
    dict(d_model=128, dff=256, n_layers=1, sa=0, rca=4, rca_type='relational_attention', symbol_type='positional_symbols'),
    dict(d_model=128, dff=256, n_layers=1, sa=2, rca=2, rca_type='relational_attention', symbol_type='positional_symbols'),
    dict(d_model=128, dff=256, n_layers=1, sa=0, rca=4, rca_type='rca', symbol_type='positional_symbols'),
    dict(d_model=128, dff=256, n_layers=1, sa=2, rca=2, rca_type='rca', symbol_type='positional_symbols'),
    dict(d_model=128, dff=256, n_layers=1, sa=4, rca=0, rca_type='NA', symbol_type='NA'),
    dict(d_model=128, dff=256, n_layers=2, sa=0, rca=4, rca_type='relational_attention', symbol_type='positional_symbols'),
    dict(d_model=128, dff=256, n_layers=2, sa=2, rca=2, rca_type='relational_attention', symbol_type='positional_symbols'),
    dict(d_model=128, dff=256, n_layers=2, sa=0, rca=4, rca_type='rca', symbol_type='positional_symbols'),
    dict(d_model=128, dff=256, n_layers=2, sa=2, rca=2, rca_type='rca', symbol_type='positional_symbols'),
    dict(d_model=128, dff=256, n_layers=2, sa=4, rca=0, rca_type='NA', symbol_type='NA'),
]

# global config parameters
patch_size = 12
n_epochs = 50
# max_steps = -1
log_to_wandb = 1

# tasks
tasks = ['same', 'occurs', 'xoccurs', '1task_between', '1task_match_patt']

In [5]:
jobs_params = []
for task in tasks:
    for mparams in model_params:
        jobs_params.append({'task': task, 'compile': 1, **mparams})

In [6]:
jobs_params

[{'task': 'same',
  'compile': 1,
  'd_model': 128,
  'dff': 256,
  'n_layers': 1,
  'sa': 0,
  'rca': 4,
  'rca_type': 'relational_attention',
  'symbol_type': 'positional_symbols'},
 {'task': 'same',
  'compile': 1,
  'd_model': 128,
  'dff': 256,
  'n_layers': 1,
  'sa': 2,
  'rca': 2,
  'rca_type': 'relational_attention',
  'symbol_type': 'positional_symbols'},
 {'task': 'same',
  'compile': 1,
  'd_model': 128,
  'dff': 256,
  'n_layers': 1,
  'sa': 0,
  'rca': 4,
  'rca_type': 'rca',
  'symbol_type': 'positional_symbols'},
 {'task': 'same',
  'compile': 1,
  'd_model': 128,
  'dff': 256,
  'n_layers': 1,
  'sa': 2,
  'rca': 2,
  'rca_type': 'rca',
  'symbol_type': 'positional_symbols'},
 {'task': 'same',
  'compile': 1,
  'd_model': 128,
  'dff': 256,
  'n_layers': 1,
  'sa': 4,
  'rca': 0,
  'rca_type': 'NA',
  'symbol_type': 'NA'},
 {'task': 'same',
  'compile': 1,
  'd_model': 128,
  'dff': 256,
  'n_layers': 2,
  'sa': 0,
  'rca': 4,
  'rca_type': 'relational_attention',
  's

In [7]:
len(jobs_params)

50

In [8]:
# create jobs
created_jobs = []
for params in jobs_params:

    job_name = (f"relational_games-{params['task']}-d{params['d_model']}-sa{params['sa']}-rca{params['rca']}-L{params['n_layers']}"
        f"-rca_type_{params['rca_type']}-symbol_type_{params['symbol_type']}")

    job_file = os.path.join(job_directory, f"{job_name}.job")

    with open(job_file, 'w') as fh:
        fh.writelines(f"#!/bin/bash\n")
        fh.writelines(f"#SBATCH --partition={partition}\n")
        fh.writelines(f"#SBATCH --job-name={job_name}\n")
        fh.writelines(f"#SBATCH --output={out_dir}/%j-{job_name}.out\n")
        fh.writelines(f"#SBATCH --ntasks={ntasks} --nodes={nodes}\n")
        fh.writelines(f"#SBATCH --cpus-per-task={cpu_per_task}\n")
        fh.writelines(f"#SBATCH --mem-per-cpu={mem_per_cpu}G\n")
        fh.writelines(f"#SBATCH --time={time_str}\n")
        fh.writelines(f"#SBATCH --mail-type=ALL\n")
        fh.writelines(f"#SBATCH --gpus={n_gpus}\n")
        # fh.writelines(f"#SBATCH --reservation=h100\n") # NOTE: using h100 reservation for noow
        # fh.writelines(f"#SBATCH -C {gpus_constraints}\n")# --gpus={n_gpus}\n")

        fh.writelines('\n')
        fh.writelines('module load StdEnv\n')
        fh.writelines('export SLURM_EXPORT_ENV=ALL\n')
        fh.writelines('\n')

        # fh.writelines(f"module restore python_env\n") # load modules i need
        fh.writelines(f"module load miniconda\n") # load modules i need
        # fh.writelines(f"conda init\n")
        fh.writelines(f"conda activate abstract_transformer\n") # activate conda environment
        fh.writelines(f"conda info --envs\n") # activate conda environment

        fh.writelines('\n')
        fh.writelines(f"nvidia-smi -L\n") # print gpu information
        fh.writelines('\n')

        fh.writelines(f"cd {project_dir}\n") # navigate to project directory
        # run python script
        fh.writelines(f"python train_relational_games_models.py --task {params['task']} ")
        fh.writelines(f"--d_model {params['d_model']} --dff {params['dff']} --sa {params['sa']} --rca {params['rca']} --n_layers {params['n_layers']} ")
        fh.writelines(f"--rca_type {params['rca_type']} --symbol_type {params['symbol_type']} --patch_size {patch_size} ")
        fh.writelines(f"--n_epochs {n_epochs} --log_to_wandb {log_to_wandb} --compile {params['compile']} \n")

    created_jobs.append(job_file)

In [9]:
created_jobs

['relational_games/relational_games-same-d128-sa0-rca4-L1-rca_type_disentangled_v2-symbol_type_pos_sym_retriever.job',
 'relational_games/relational_games-same-d128-sa2-rca2-L1-rca_type_disentangled_v2-symbol_type_pos_sym_retriever.job',
 'relational_games/relational_games-same-d128-sa0-rca4-L1-rca_type_standard-symbol_type_pos_sym_retriever.job',
 'relational_games/relational_games-same-d128-sa2-rca2-L1-rca_type_standard-symbol_type_pos_sym_retriever.job',
 'relational_games/relational_games-same-d128-sa4-rca0-L1-rca_type_NA-symbol_type_NA.job',
 'relational_games/relational_games-same-d128-sa0-rca4-L2-rca_type_disentangled_v2-symbol_type_pos_sym_retriever.job',
 'relational_games/relational_games-same-d128-sa2-rca2-L2-rca_type_disentangled_v2-symbol_type_pos_sym_retriever.job',
 'relational_games/relational_games-same-d128-sa0-rca4-L2-rca_type_standard-symbol_type_pos_sym_retriever.job',
 'relational_games/relational_games-same-d128-sa2-rca2-L2-rca_type_standard-symbol_type_pos_sym_r

In [10]:
n_trials = 5

In [11]:
[j for j in created_jobs if (any((task in j) for task in ['occurs', 'same', 'xoccurs']))]

['relational_games/relational_games-same-d128-sa0-rca4-L1-rca_type_disentangled_v2-symbol_type_pos_sym_retriever.job',
 'relational_games/relational_games-same-d128-sa2-rca2-L1-rca_type_disentangled_v2-symbol_type_pos_sym_retriever.job',
 'relational_games/relational_games-same-d128-sa0-rca4-L1-rca_type_standard-symbol_type_pos_sym_retriever.job',
 'relational_games/relational_games-same-d128-sa2-rca2-L1-rca_type_standard-symbol_type_pos_sym_retriever.job',
 'relational_games/relational_games-same-d128-sa4-rca0-L1-rca_type_NA-symbol_type_NA.job',
 'relational_games/relational_games-same-d128-sa0-rca4-L2-rca_type_disentangled_v2-symbol_type_pos_sym_retriever.job',
 'relational_games/relational_games-same-d128-sa2-rca2-L2-rca_type_disentangled_v2-symbol_type_pos_sym_retriever.job',
 'relational_games/relational_games-same-d128-sa0-rca4-L2-rca_type_standard-symbol_type_pos_sym_retriever.job',
 'relational_games/relational_games-same-d128-sa2-rca2-L2-rca_type_standard-symbol_type_pos_sym_r

In [12]:
confirm = input("CONTINUE TO RUN ALL JOBS? (enter 'Y' or 'y')")
if confirm in ("Y", "y"):
    for trial in range(n_trials):
        for job in created_jobs:
            os.system(f'sbatch {job}')
        time.sleep(5)
else:
    print("did not run jobs since you did not confirm.")

Submitted batch job 7487
Submitted batch job 7488
Submitted batch job 7489
Submitted batch job 7490
Submitted batch job 7491
Submitted batch job 7492
Submitted batch job 7493
Submitted batch job 7494
Submitted batch job 7495
Submitted batch job 7496
Submitted batch job 7497
Submitted batch job 7498
Submitted batch job 7499
Submitted batch job 7500
Submitted batch job 7501
Submitted batch job 7502
Submitted batch job 7503
Submitted batch job 7504
Submitted batch job 7505
Submitted batch job 7506
Submitted batch job 7507
Submitted batch job 7508
Submitted batch job 7509
Submitted batch job 7510
Submitted batch job 7511
Submitted batch job 7512
Submitted batch job 7513
Submitted batch job 7514
Submitted batch job 7515
Submitted batch job 7516
Submitted batch job 7517
Submitted batch job 7518
Submitted batch job 7519
Submitted batch job 7520
Submitted batch job 7521
Submitted batch job 7522
Submitted batch job 7523
Submitted batch job 7524
Submitted batch job 7525
Submitted batch job 7526


sbatch: error: Batch job submission failed: Unexpected message received
sbatch: error: Batch job submission failed: Unable to contact slurm controller (connect failure)
sbatch: error: Batch job submission failed: Unable to contact slurm controller (connect failure)
sbatch: error: Batch job submission failed: Unable to contact slurm controller (connect failure)


Submitted batch job 7687
Submitted batch job 7688
Submitted batch job 7689
Submitted batch job 7690
Submitted batch job 7691
Submitted batch job 7692
Submitted batch job 7693
Submitted batch job 7694
Submitted batch job 7695
Submitted batch job 7696
Submitted batch job 7697
Submitted batch job 7698
Submitted batch job 7699
Submitted batch job 7700
Submitted batch job 7701
Submitted batch job 7702
Submitted batch job 7703
Submitted batch job 7704
Submitted batch job 7705
Submitted batch job 7706
Submitted batch job 7707
Submitted batch job 7708
Submitted batch job 7709
Submitted batch job 7710
Submitted batch job 7711
Submitted batch job 7712
Submitted batch job 7713
Submitted batch job 7714
Submitted batch job 7715
Submitted batch job 7716
Submitted batch job 7717
Submitted batch job 7718
Submitted batch job 7719
Submitted batch job 7720
Submitted batch job 7721
Submitted batch job 7722
Submitted batch job 7723
Submitted batch job 7724
Submitted batch job 7725
Submitted batch job 7726
