In [8]:
import itertools
import os
import subprocess

from dotenv import load_dotenv

# Load variables from a `.env` file (if one is found) into the process
# environment; the notebook later reads NETID via os.getenv.
load_dotenv()

True

In [9]:
def mkdir(dir):
    """Create the directory `dir` if it does not already exist.

    Missing parent directories are created as well, and calling this on a
    directory that already exists is a no-op, so the cell can be re-run safely.

    Args:
        dir: path of the directory to create.

    Raises:
        FileExistsError: if `dir` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of `os.path.exists` + `os.mkdir`
    os.makedirs(dir, exist_ok=True)

In [10]:

# global job parameters

# where the generated .job scripts and the Slurm log files are written
job_directory = "computation-graph"
out_dir = f'{job_directory}/.out'

# Slurm resource request shared by every job
time_str = '00-48:00:00'  # written to the --time directive
partition = 'gpu'
ntasks = 1
nodes = 1
cpu_per_gpu = 8
mem_per_cpu = 16  # written to the job script as "{mem_per_cpu}G"
n_gpus = 1

# gpus_constraints = '"a100|rtx3090|v100|rtx2080ti"' # for grace
gpus_constraints = "a40" #'"h100|a100"' # for misha

# NETID comes from the environment (e.g. a .env file); without it the project
# path would silently become "/home/None/...", so fail here rather than on the
# cluster after the job has already been queued.
netid = os.getenv('NETID')
if not netid:
    raise RuntimeError(
        "NETID is not set; define it in the environment or in a .env file "
        "before generating job scripts."
    )
project_dir = f"/home/{netid}/project/transformer-computation-graph/experiments"

mkdir(job_directory)
mkdir(out_dir)

In [11]:
# settings applied to every job
wandb_project = 'transformer-computation-graph'
compile = False

weight_decay = 0.05
lr = 0.002

# one entry per job; only the keys present are forwarded to the training script
jobs_params = [
    {'d_model': 384, 'n_layers': 6, 'train_cumulative': True},
    {'d_model': 384, 'n_layers': 8, 'train_cumulative': True},
    {'d_model': 768, 'n_layers': 12, 'n_heads': 12, 'train_cumulative': True},
]

# generate run names: "key=value" pairs joined with underscores, in dict order
for j in jobs_params:
    j['run_name'] = '_'.join(f'{key}={value}' for key, value in j.items())

In [12]:
# create jobs
def _train_command(params):
    """Return the multi-line `python train.py ...` command for one job.

    Only the model options present in `params` are emitted; `lr`,
    `weight_decay`, `compile` and `wandb_project` are read from the
    notebook-level settings. Arguments are joined with backslash
    continuations, and the final line carries no trailing backslash.
    """
    args = [f"--{key} {params[key]}"
            for key in ('d_model', 'n_heads', 'n_layers') if key in params]

    args.append(f"--lr {lr}")
    args.append(f"--weight_decay {weight_decay}")

    if params.get('train_cumulative', False):
        args.append("--train_cumulative")
    if compile:
        args.append("--compile")

    args.append(f"--wandb_project {wandb_project}")

    return " \\\n\t".join(["python train.py", *args]) + "\n"


def _job_script(params):
    """Return the full text of the Slurm batch script for one job."""
    run_name = params['run_name']

    # A per-job 'gpus_constraints' entry wins; otherwise fall back to the
    # notebook-level `gpus_constraints` so the configured value is not
    # silently ignored. An empty/None value omits the directive.
    constraint = params.get('gpus_constraints', gpus_constraints)

    lines = [
        "#!/bin/bash",
        f"#SBATCH --partition={partition}",
        f"#SBATCH --job-name={run_name}",
        f"#SBATCH --output={out_dir}/%j-{run_name}.out",
        f"#SBATCH --ntasks={ntasks} --nodes={nodes}",
        f"#SBATCH --cpus-per-gpu={cpu_per_gpu}",
        f"#SBATCH --mem-per-cpu={mem_per_cpu}G",
        f"#SBATCH --time={time_str}",
        "#SBATCH --mail-type=ALL",
        f"#SBATCH --gpus={n_gpus}",
    ]
    if constraint:
        lines.append(f"#SBATCH --constraint={constraint}")

    lines += [
        "",
        "module load StdEnv",
        "export SLURM_EXPORT_ENV=ALL",
        "",
        "module load miniconda",             # load modules i need
        "conda activate computation_graph",  # activate conda environment
        "conda info --envs",                 # print available/active conda envs to the log
        "",
        "nvidia-smi -L",                     # print gpu information
        "",
        f"cd {project_dir}",                 # navigate to project directory
        "",
        # add .. to python path (plain string: ${PYTHONPATH} is expanded by bash)
        'export PYTHONPATH="${PYTHONPATH}:.."',
        "",
    ]

    # run python script
    return "\n".join(lines) + "\n" + _train_command(params)


created_jobs = []
for params in jobs_params:

    job_file = os.path.join(job_directory, f"{params['run_name']}.job")

    with open(job_file, 'w') as fh:
        fh.write(_job_script(params))

    created_jobs.append(job_file)

In [13]:
# Display the paths of the generated job scripts for a visual check.
created_jobs

['computation-graph/d_model=384_n_layers=6_train_cumulative=True.job',
 'computation-graph/d_model=384_n_layers=8_train_cumulative=True.job',
 'computation-graph/d_model=768_n_layers=12_n_heads=12_train_cumulative=True.job']

In [14]:
# Ask for explicit confirmation before queueing anything on the cluster.
confirm = input("CONTINUE TO RUN ALL JOBS?")
if confirm.strip().lower() == 'y':
    for job in created_jobs:
        try:
            # Argument list instead of a shell string: job file names are built
            # from parameter values and must not be interpreted by a shell.
            result = subprocess.run(['sbatch', job], capture_output=True, text=True)
        except FileNotFoundError:
            print('sbatch NOT FOUND ON PATH; JOBS NOT SUBMITTED')
            break

        # echo sbatch's own message (e.g. the assigned job id)
        print(result.stdout, end='')
        if result.returncode != 0:
            print(f'FAILED TO SUBMIT {job} (exit code {result.returncode}): '
                  f'{result.stderr.strip()}')
else:
    print('JOBS NOT SUBMITTED')

Submitted batch job 34821
Submitted batch job 34822
Submitted batch job 34823
