In [1]:
import os
import itertools
import time

In [2]:
def mkdir(dir):
    """Create directory `dir` if it does not already exist.

    Missing parent directories are created as well, and calling this on an
    existing directory is a no-op. Using `exist_ok=True` avoids the race
    between checking for existence and creating the directory.

    Args:
        dir: path of the directory to create.

    Raises:
        FileExistsError: if `dir` exists but is not a directory.
    """
    os.makedirs(dir, exist_ok=True)

In [3]:
# ---- global job parameters (shared by every generated SLURM script) ----

# where the generated .job files are written, and where job logs go
job_directory = "relational_games"
out_dir = f"{job_directory}/.out"

# resource requests, interpolated into the #SBATCH directives
time_str = "00-04:00:00"   # --time
partition = "gpu"          # --partition
ntasks = 1                 # --ntasks
nodes = 1                  # --nodes
cpu_per_task = 8           # --cpus-per-task
mem_per_cpu = 2            # --mem-per-cpu, written with a "G" suffix
n_gpus = 1                 # --gpus
# gpus_constraints = '"a100|rtx3090|v100|rtx2080ti"' # all gpus are pretty good now

# directory the job changes into before launching the python script
project_dir = "/home/ma2393/project/abstract_transformer/experiments/relational_games"

# make sure both output locations exist before any job file is written
mkdir(job_directory)
mkdir(out_dir)

In [4]:
# model params: names passed to the evaluation script via --model
models = [
    'corelnet_softmax',
    # 'predinet',
    # 'corelnet',
    # 'abstractor'
]

# global config parameters
patch_size = 12
n_epochs = 50
# max_steps = -1
log_to_wandb = 1

# tasks: names passed to the evaluation script via --task
tasks = ['same', 'occurs', 'xoccurs', '1task_between', '1task_match_patt']

# Space-separated training-set sizes per task. '1task_match_patt' gets its own
# (larger) grid; every other task shares the same smaller grid.
train_sizes = {
    '1task_match_patt': '5_000 7_500 10_000 12_500 15_000 17_500 20_000 22_500 25_000',
}
train_sizes.update(
    dict.fromkeys(
        (name for name in tasks if name != '1task_match_patt'),
        '250 500 750 1_000 1_250 1_500 1_750 2_000 2_250 2_500',
    )
)

In [5]:
# One parameter dict per (task, model) pair, iterating tasks in the outer
# position and models in the inner position.
jobs_params = []
for task, m in itertools.product(tasks, models):
    job_spec = {
        'task': task,
        'compile': 1,
        'train_sizes': train_sizes[task],
        'model': m,
    }
    jobs_params.append(job_spec)

In [6]:
# Display the generated parameter grid (one dict per task/model pair) for a visual check.
jobs_params

[{'task': 'same',
  'compile': 1,
  'train_sizes': '250 500 750 1_000 1_250 1_500 1_750 2_000 2_250 2_500',
  'model': 'corelnet_softmax'},
 {'task': 'occurs',
  'compile': 1,
  'train_sizes': '250 500 750 1_000 1_250 1_500 1_750 2_000 2_250 2_500',
  'model': 'corelnet_softmax'},
 {'task': 'xoccurs',
  'compile': 1,
  'train_sizes': '250 500 750 1_000 1_250 1_500 1_750 2_000 2_250 2_500',
  'model': 'corelnet_softmax'},
 {'task': '1task_between',
  'compile': 1,
  'train_sizes': '250 500 750 1_000 1_250 1_500 1_750 2_000 2_250 2_500',
  'model': 'corelnet_softmax'},
 {'task': '1task_match_patt',
  'compile': 1,
  'train_sizes': '5_000 7_500 10_000 12_500 15_000 17_500 20_000 22_500 25_000',
  'model': 'corelnet_softmax'}]

In [7]:
# Number of parameter sets, i.e. how many job files will be written.
len(jobs_params)

5

In [8]:
# create jobs
# Writes one SLURM batch script per entry of `jobs_params` into `job_directory`
# and records the path of every script in `created_jobs`.
created_jobs = []
for params in jobs_params:

    job_name = f"relational_games-{params['task']}-{params['model']}"
    job_file = os.path.join(job_directory, f"{job_name}.job")

    # The script is assembled as a list of text fragments and written in one
    # call; fragments without a trailing newline continue on the same line.
    script_parts = [
        # --- scheduler directives ---
        "#!/bin/bash\n",
        f"#SBATCH --partition={partition}\n",
        f"#SBATCH --job-name={job_name}\n",
        f"#SBATCH --output={out_dir}/%j-{job_name}.out\n",
        f"#SBATCH --ntasks={ntasks} --nodes={nodes}\n",
        f"#SBATCH --cpus-per-task={cpu_per_task}\n",
        f"#SBATCH --mem-per-cpu={mem_per_cpu}G\n",
        f"#SBATCH --time={time_str}\n",
        "#SBATCH --mail-type=ALL\n",
        f"#SBATCH --gpus={n_gpus}\n",
        # "#SBATCH --reservation=h100\n",  # NOTE: using h100 reservation for now
        # f"#SBATCH -C {gpus_constraints}\n",

        "\n",
        "module load StdEnv\n",
        "export SLURM_EXPORT_ENV=ALL\n",
        "\n",

        # --- environment setup ---
        # "module restore python_env\n",
        "module load miniconda\n",  # load modules i need
        # "conda init\n",
        "conda activate abstract_transformer\n",  # activate conda environment
        "conda info --envs\n",  # list conda environments in the job log

        "\n",
        "nvidia-smi -L\n",  # print gpu information
        "\n",

        f"cd {project_dir}\n",  # navigate to project directory

        # --- run python script (the three fragments below form one command line) ---
        f"python eval_relational_games_learning_curve_baselines.py --task {params['task']} ",
        f"--model {params['model']} ",
        # train_sizes is a space-separated string interpolated unquoted, so the
        # shell passes each size to the script as a separate argument.
        f"--train_sizes {params['train_sizes']} --n_epochs {n_epochs} --log_to_wandb {log_to_wandb} --compile {params['compile']} \n",
    ]

    with open(job_file, 'w') as fh:
        fh.writelines(script_parts)

    created_jobs.append(job_file)

In [9]:
# Display the paths of the job scripts that were written.
created_jobs

['relational_games/relational_games-same-corelnet_softmax.job',
 'relational_games/relational_games-occurs-corelnet_softmax.job',
 'relational_games/relational_games-xoccurs-corelnet_softmax.job',
 'relational_games/relational_games-1task_between-corelnet_softmax.job',
 'relational_games/relational_games-1task_match_patt-corelnet_softmax.job']

In [10]:
# Number of submission rounds: every job script is submitted once per round.
n_trials = 5

In [11]:
# Submit every generated job script `n_trials` times, but only after the user
# explicitly confirms; anything other than 'Y'/'y' skips submission.
confirm = input("CONTINUE TO RUN ALL JOBS? (enter 'Y' or 'y')")
if confirm in ("Y", "y"):
    failed_submissions = []
    for trial in range(n_trials):
        for job in created_jobs:
            # os.system returns the command's status; a non-zero value means
            # the sbatch command did not exit successfully.
            status = os.system(f'sbatch {job}')
            if status != 0:
                failed_submissions.append((trial, job, status))
        # pause between submission rounds
        time.sleep(5)

    # Report failures instead of silently ignoring them.
    if failed_submissions:
        print(f"{len(failed_submissions)} submission(s) failed:")
        for failed_trial, failed_job, failed_status in failed_submissions:
            print(f"  trial {failed_trial}: {failed_job} (status {failed_status})")
else:
    print("did not run jobs since you did not confirm.")

Submitted batch job 28088
Submitted batch job 28089
Submitted batch job 28090
Submitted batch job 28091
Submitted batch job 28092
Submitted batch job 28093
Submitted batch job 28094
Submitted batch job 28095
Submitted batch job 28096
Submitted batch job 28097
Submitted batch job 28098
Submitted batch job 28099
Submitted batch job 28100
Submitted batch job 28101
Submitted batch job 28102
Submitted batch job 28103
Submitted batch job 28104
Submitted batch job 28105
Submitted batch job 28106
Submitted batch job 28107
Submitted batch job 28108
Submitted batch job 28109
Submitted batch job 28110
Submitted batch job 28111
Submitted batch job 28112
