In [1]:
import os
import itertools
from dotenv import load_dotenv
import copy
import time
import subprocess
import yaml


import os, sys; sys.path.insert(0, os.path.abspath('../..')) # add project root dir to path
from fineweb.model import get_experiment_name
from utils.utils import AttributeDict

load_dotenv()

def mkdir(dir):
    """Create the directory ``dir`` if it does not already exist.

    Missing parent directories are created as well, and calling this on an
    existing directory is a no-op.

    Args:
        dir: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir`` exists but is not a directory.
    """
    # `exist_ok=True` replaces the racy "check, then create" pattern, and
    # `makedirs` (unlike `os.mkdir`) also handles nested paths.
    os.makedirs(dir, exist_ok=True)


In [2]:
# global job parameters

# Directories (relative to the notebook's working directory) that receive the
# generated job scripts and the SLURM output logs.
job_directory = "job_scripts"
out_dir = '.out'

time_str = '00-24:00:00' # value written to the `#SBATCH --time` directive
max_time = '00:23:55:00' # 5 minutes less than the time_str; this is the format PL uses

partition = 'gpu'
ntasks = 1
nodes = 1
cpu_per_gpu = 8
mem_per_cpu = 8 # written to the job script with a "G" suffix
n_gpus = 1

cluster = 'misha'

if cluster == 'grace':
    gpus_constraints = '"a100|rtx3090|v100|rtx2080ti"' # for grace
# gpus_constraints = "a40" #'"h100|a100"' # for misha

# NETID is read from the environment (load_dotenv() was called in the imports cell).
# Fail fast when it is missing: otherwise project_dir silently becomes
# "/home/None/..." and the error only surfaces later as a confusing missing-file error.
netid = os.getenv('NETID')
if not netid:
    raise RuntimeError("NETID is not set; define it in the environment or in a .env file")
project_dir = f"/home/{netid}/project/adaptive-hyperspherical-res-stream/fineweb"

mkdir(job_directory)
mkdir(out_dir)

In [3]:
# load base model, train, and data config
import yaml  # also imported in the first cell; kept so this cell runs unchanged
configs_dir = f'{project_dir}/configs'
base_config_dir = f'{configs_dir}/base_config'


def _load_base_config(filename):
    """Read ``filename`` from ``base_config_dir`` and wrap the parsed YAML in an AttributeDict."""
    with open(os.path.join(base_config_dir, filename)) as f:
        return AttributeDict(yaml.load(f, Loader=yaml.FullLoader))


base_model_config = _load_base_config('model_config.yaml')
base_train_config = _load_base_config('train_config.yaml')
base_data_config = _load_base_config('data_config.yaml')


In [4]:
# Sweep definition. D, L and H are used below as d_model, n_layers and n_heads
# of the single architecture in `model_arch` (H is also reused as `n_spheres`).
D, L, H = 256, 6, 4

# Copied into every job's model config as `manual_norm_weights` by the job-building cell.
manual_norm_weights = False

# Each key maps to a list of options; the job-building cell takes the Cartesian
# product over these lists. The resulting order determines the UID prefix
# assigned to each job when the job files are generated.
model_configs = dict(
    model_arch = [
        dict(n_layers=L, d_model=D, n_heads=H),
        ],

    model_type = ['nGPT'], # 'llama'

    # Each entry is merged key-by-key into the model config for nGPT jobs.
    # A `None` entry is skipped for nGPT by the job-building cell.
    # Commented-out entries are alternatives that are currently disabled.
    residual_module_args = [
        # None,
        dict(residual_module='ResidualAdaptiveSphericalLERP', residual_module_kwargs=dict(interpolation_weight_activation='linear')),

        # dict(residual_module='ResidualSphericalLERP'),

        # dict(residual_module='ResidualSphericalSLERP', residual_module_kwargs=dict(single_weight=True)),
        # dict(residual_module='ResidualSphericalSLERP', residual_module_kwargs=dict(single_weight=True, n_spheres=H)),
        # dict(residual_module='ResidualSphericalSLERP', residual_module_kwargs=dict(single_weight=False, n_spheres=H)),
        # dict(residual_module='ResidualSphericalSLERP', residual_module_kwargs=dict(single_weight=False)),

        # adaptive SLERP, 'linear' interpolation-weight activation, single_weight on/off
        dict(residual_module='ResidualAdaptiveSphericalSLERP',
                residual_module_kwargs=dict(single_weight=True, slerp_weight_map='NormLinear', interpolation_weight_activation='linear')),
        dict(residual_module='ResidualAdaptiveSphericalSLERP',
            residual_module_kwargs=dict(single_weight=False, slerp_weight_map='NormLinear', interpolation_weight_activation='linear')),

        # adaptive SLERP, 'sigmoid' interpolation-weight activation, single_weight on/off
        dict(residual_module='ResidualAdaptiveSphericalSLERP',
            residual_module_kwargs=dict(single_weight=True, slerp_weight_map='NormLinear', interpolation_weight_activation='sigmoid')),
        dict(residual_module='ResidualAdaptiveSphericalSLERP',
            residual_module_kwargs=dict(single_weight=False, slerp_weight_map='NormLinear', interpolation_weight_activation='sigmoid')),

        # same as the 'sigmoid' pair above, but with n_spheres=H
        dict(residual_module='ResidualAdaptiveSphericalSLERP',
            residual_module_kwargs=dict(single_weight=True, n_spheres=H, slerp_weight_map='NormLinear', interpolation_weight_activation='sigmoid')),
        dict(residual_module='ResidualAdaptiveSphericalSLERP',
            residual_module_kwargs=dict(single_weight=False, n_spheres=H, slerp_weight_map='NormLinear', interpolation_weight_activation='sigmoid')),
        ],
    )

# Merged into each job's train_config['wandb_config'] as `wandb_project`.
wandb_project = 'language-modeling-nGPT'

# Written to each job's data config as `sequence_length`.
sequence_length = 256

In [5]:
# Expand `model_configs` into one full (model, train, data) config per job.
jobs_overwrite_params = []

# Cartesian product over the option lists; each element is a tuple of (key, value) pairs.
model_config_product = itertools.product(*[[(k, v) for v in vs] for k, vs in model_configs.items()])

for model_config_update in model_config_product:
    model_config_update = dict(model_config_update)

    # copy base configs
    job_model_config = copy.deepcopy(base_model_config)
    job_train_config = copy.deepcopy(base_train_config)
    job_data_config = copy.deepcopy(base_data_config)

    # parse model_config_update
    # `model_arch` and `residual_module_args` are grouping keys: they are always
    # removed here and their contents merged in below, so neither key is ever
    # written into the model config itself (previously `residual_module_args: None`
    # leaked into the config of llama jobs).
    model_arch = model_config_update.pop('model_arch')
    residual_module_args = model_config_update.pop('residual_module_args')
    model_type = model_config_update['model_type']

    # skip invalid combinations: llama takes no residual module, nGPT requires one
    if model_type == 'llama' and residual_module_args is not None:
        continue
    if model_type == 'nGPT' and residual_module_args is None:
        continue

    model_config_update = {**model_config_update, **model_arch}
    if residual_module_args is not None:
        model_config_update = {**model_config_update, **residual_module_args}

    model_config_update['manual_norm_weights'] = manual_norm_weights

    # update model config
    for k, v in model_config_update.items():
        job_model_config[k] = v

    # parse train_config
    job_train_config['wandb_config'] = job_train_config['wandb_config'] | dict(wandb_project=wandb_project)

    job_train_config['max_time'] = max_time

    # update data config
    job_data_config['sequence_length'] = sequence_length

    job_config = dict(model_config=job_model_config, train_config=job_train_config, data_config=job_data_config)
    job_config = AttributeDict(job_config)
    jobs_overwrite_params.append(job_config)

print('number of jobs', len(jobs_overwrite_params))



number of jobs 7


In [6]:
def create_job_config(config_upate, out_dir, uid=None):
    """Write one job's model/train/data configs to disk and return them.

    Deep copies of the module-level base configs are updated with the
    'model_config', 'train_config' and 'data_config' entries of ``config_upate``
    (a missing entry leaves that base config unchanged). The experiment name comes
    from ``get_experiment_name`` with spaces removed, prefixed with ``UID{uid}-``
    when ``uid`` is given. The three configs are dumped as YAML into
    ``<out_dir>/<experiment_name>/``.

    Args:
        config_upate: Mapping with optional 'model_config', 'train_config' and
            'data_config' entries holding the overrides for this job.
        out_dir: Directory under which the experiment's config directory is created.
        uid: Optional identifier prepended to the experiment name.

    Returns:
        Tuple ``(model_config, train_config, data_config, experiment_name)``.
    """
    # Work on copies so the module-level base configs are never mutated here.
    model_config = copy.deepcopy(base_model_config)
    train_config = copy.deepcopy(base_train_config)
    data_config = copy.deepcopy(base_data_config)

    model_config.update(config_upate.get('model_config', {}))
    train_config.update(config_upate.get('train_config', {}))
    data_config.update(config_upate.get('data_config', {}))

    raw_name, _unused = get_experiment_name(model_config, data_config, train_config)
    experiment_name = raw_name.replace(' ', '')
    if uid is not None:
        experiment_name = f"UID{uid}-{experiment_name}"

    mkdir(os.path.join(out_dir, experiment_name))

    # One YAML file per config, written in model, train, data order.
    config_files = (
        ('model_config.yaml', model_config),
        ('train_config.yaml', train_config),
        ('data_config.yaml', data_config),
    )
    for config_filename, config in config_files:
        with open(os.path.join(out_dir, f'{experiment_name}/{config_filename}'), 'w') as f:
            yaml.dump(config.todict(), f)

    return model_config, train_config, data_config, experiment_name

In [7]:
def create_job_script(experiment_name):
    """Write the SLURM batch script for one experiment and return its path.

    The script is written to ``<job_directory>/<experiment_name>.job``. It sets the
    ``#SBATCH`` directives from the module-level job parameters, loads the
    cluster-specific modules, activates the conda environment, changes into
    ``project_dir`` and runs ``train.py`` with the experiment's config directory.

    Args:
        experiment_name: Name of the experiment; used for the script filename, the
            job name, the output log name and the config directory.

    Returns:
        Path of the written job script.

    Raises:
        ValueError: If the module-level ``cluster`` is not 'grace' or 'misha'.
    """
    # Resolve the cluster-specific line before touching the filesystem, so an
    # unsupported cluster does not leave a truncated job script behind.
    if cluster == 'grace':
        module_setup = "module restore python_env\n" # load modules i need
    elif cluster == 'misha':
        module_setup = "module load miniconda\n" # load modules i need
    else:
        raise ValueError(f"Cluster {cluster} not supported")

    script_lines = [
        "#!/bin/bash\n",
        f"#SBATCH --partition={partition}\n",
        f"#SBATCH --job-name={experiment_name}\n",
        f"#SBATCH --output={out_dir}/%j-{experiment_name}.out\n",
        f"#SBATCH --ntasks={ntasks} --nodes={nodes}\n",
        f"#SBATCH --cpus-per-gpu={cpu_per_gpu}\n",
        f"#SBATCH --mem-per-cpu={mem_per_cpu}G\n",
        f"#SBATCH --time={time_str}\n",
        "#SBATCH --mail-type=ALL\n",
        f"#SBATCH --gpus={n_gpus}\n",
        # f"#SBATCH --constraint={gpus_constraints}\n",

        '\n',
        'module load StdEnv\n',
        'export SLURM_EXPORT_ENV=ALL\n',
        '\n',

        module_setup,

        # "conda init\n",
        "conda activate neural_prog\n", # activate conda environment
        "conda info --envs\n", # list conda environments in the job log

        '\n',
        "nvidia-smi -L\n", # print gpu information
        '\n',

        f"cd {project_dir}\n", # navigate to project directory
        '\n',

        # run python script
        f"srun python train.py --config_dir {configs_dir}/{experiment_name}\n",
    ]

    filename = f'{job_directory}/{experiment_name}.job'
    with open(filename, 'w') as fh:
        fh.writelines(script_lines)

    return filename


In [None]:
# Write the config files and the SLURM script for every job.
job_script_files = []

for uid, job_params in enumerate(jobs_overwrite_params):
    # Bind the returned configs to job-local names. create_job_config reads the
    # module-level base_* configs, so rebinding those names here (as was done
    # before) made every later job, and any re-run of earlier cells, start from
    # the previous job's config instead of the real base config.
    written_model_config, written_train_config, written_data_config, experiment_name = create_job_config(
        job_params, configs_dir, uid=uid)

    print(f"Experiment Name: {experiment_name}")

    job_script = create_job_script(experiment_name)
    job_script_files.append(job_script)

Experiment Name: UID0-nGPT-L6H4D256-ResidualAdaptiveSphericalLERP-IWAct-linear-MNW-False
Experiment Name: UID1-nGPT-L6H4D256-ResidualAdaptiveSphericalSLERP-SW-True-SWM-NormLinear-IWAct-linear-MNW-False
Experiment Name: UID2-nGPT-L6H4D256-ResidualAdaptiveSphericalSLERP-SW-False-SWM-NormLinear-IWAct-linear-MNW-False
Experiment Name: UID3-nGPT-L6H4D256-ResidualAdaptiveSphericalSLERP-SW-True-SWM-NormLinear-IWAct-sigmoid-MNW-False
Experiment Name: UID4-nGPT-L6H4D256-ResidualAdaptiveSphericalSLERP-SW-False-SWM-NormLinear-IWAct-sigmoid-MNW-False
Experiment Name: UID5-nGPT-L6H4D256-ResidualAdaptiveSphericalSLERP-SW-True-SWM-NormLinear-IWAct-sigmoid-MNW-False
Experiment Name: UID6-nGPT-L6H4D256-ResidualAdaptiveSphericalSLERP-SW-False-SWM-NormLinear-IWAct-sigmoid-MNW-False


In [9]:
wait_time = 0.5 # number of seconds to wait between job submissions
n_trials = 1 # how many times the full list of job scripts is submitted

confirm = input("Do you want to submit the jobs? (y/n): ")

# CompletedProcess objects of every sbatch call, inspected by the next cell.
responses = []

if confirm != 'y':
    print("Not submitting jobs")
else:
    for trial in range(n_trials):
        print('Trial:', trial)
        for job_script in job_script_files:
            response = subprocess.run(['sbatch', job_script], capture_output=True)
            sbatch_stdout = response.stdout.decode('utf-8').strip()
            print(f"response: {sbatch_stdout}, return_code={response.returncode}, job_script={job_script}")
            responses.append(response)
            # pause between consecutive submissions
            time.sleep(wait_time)
        print()

Trial: 0
response: Submitted batch job 138768, return_code=0, job_script=job_scripts/UID0-nGPT-L6H4D256-ResidualAdaptiveSphericalLERP-IWAct-linear-MNW-False.job
response: Submitted batch job 138769, return_code=0, job_script=job_scripts/UID1-nGPT-L6H4D256-ResidualAdaptiveSphericalSLERP-SW-True-SWM-NormLinear-IWAct-linear-MNW-False.job
response: Submitted batch job 138770, return_code=0, job_script=job_scripts/UID2-nGPT-L6H4D256-ResidualAdaptiveSphericalSLERP-SW-False-SWM-NormLinear-IWAct-linear-MNW-False.job
response: Submitted batch job 138771, return_code=0, job_script=job_scripts/UID3-nGPT-L6H4D256-ResidualAdaptiveSphericalSLERP-SW-True-SWM-NormLinear-IWAct-sigmoid-MNW-False.job
response: Submitted batch job 138772, return_code=0, job_script=job_scripts/UID4-nGPT-L6H4D256-ResidualAdaptiveSphericalSLERP-SW-False-SWM-NormLinear-IWAct-sigmoid-MNW-False.job
response: Submitted batch job 138773, return_code=0, job_script=job_scripts/UID5-nGPT-L6H4D256-ResidualAdaptiveSphericalSLERP-SW-Tr

In [10]:
# check if any jobs failed to submit
for response in responses:
    stdout_text = response.stdout.decode('utf-8')
    # A submission counts as successful only if sbatch reported a job id and exited with 0.
    submitted_ok = stdout_text.startswith('Submitted batch job') and response.returncode == 0
    if submitted_ok:
        continue

    print(f"Failed to submit job: {stdout_text}")
    print(f"stderr: {response.stderr.decode('utf-8')}")
    print(f"Full response: {response}")
    print()