# Job Scheduler

In [3]:
import os, math

In [5]:
base_dir = '/home/as14229/NYU_HPC/LLM-Personality-Codebase/jobs/'

# Make base Log dir
logs_base_dir = os.path.join(base_dir, 'logs')
os.makedirs(logs_base_dir, exist_ok=True)

# Collect the trailing number of every existing entry ('batch_3' -> 3).
# Entries whose last '_'-separated part is not an integer (stray files such
# as '.DS_Store') are skipped instead of aborting the cell with ValueError.
existing_batches = []
for entry in os.listdir(logs_base_dir):
    try:
        existing_batches.append(int(entry.split('_')[-1]))
    except ValueError:
        continue

# Make logs dir for next batch: start from the highest batch number seen so
# far, and never go below batch 1.
batch_number = max([1, *existing_batches])
logs_dir = os.path.join(logs_base_dir, f'batch_{batch_number}')
os.makedirs(logs_dir, exist_ok=True)

# If that batch directory already contains files, log into the next batch
# directory instead so earlier logs are not mixed with the new ones.
if os.listdir(logs_dir):
    logs_dir = os.path.join(logs_base_dir, f'batch_{batch_number + 1}')
    os.makedirs(logs_dir, exist_ok=True)

# Make scripts dir (holds the generated schedule/cancel shell scripts)
scripts_dir = os.path.join(base_dir, 'scripts')
os.makedirs(scripts_dir, exist_ok=True)

# Make jobs dir (holds the generated job<N>.sbatch files)
jobs_dir = os.path.join(base_dir, 'sbatch')
os.makedirs(jobs_dir, exist_ok=True)

In [6]:
# Preamble written at the top of every generated sbatch file: the bash
# interpreter line followed by the Slurm resource requests (1 node, 1 task
# per node, 16 CPUs per task, 128GB memory, 1h15m time limit).
# The padding spaces before each newline are part of the emitted text.
sbatch_header = "".join([
    "#!/bin/bash   \n",
    "#SBATCH --nodes=1               \n",
    "#SBATCH --ntasks-per-node=1     \n",
    "#SBATCH --cpus-per-task=16      \n",
    "#SBATCH --mem=128GB             \n",
    "#SBATCH --time=01:15:00         \n",
])

# Per-job directive prefixes. They are deliberately left unterminated: the
# job number (plus '.log' for the output path) and the newline are appended
# when each sbatch file is written.
job_name_directive = "#SBATCH --job-name=Job"
output_file_directive = f"#SBATCH --output={logs_dir}/job"

# Command Header: shell lines placed between the #SBATCH directives and the
# main command. They source ~/.bashrc, activate the 'LLM' conda environment
# and change into the project root. The text starts with one blank line and
# ends with a blank line.
command_header = (
    "\n"
    "source ~/.bashrc\n"
    "conda activate LLM\n"
    "cd /home/as14229/NYU_HPC/LLM-Personality-Codebase/\n"
    "\n"
)

# Main Command: common prefix of every job's command line. The per-job
# arguments are appended to it later; note the trailing space.
command = "python3 run.py --config 'OPT-30B' --ans index "

In [9]:
total_samples = 989
samples_per_job = 100

orders = ["original", "reverse", "order-I", "order-II", "order-III"]

# Build one command line per (order, sample window) pair. Windows are
# [start, end) slices of `samples_per_job` samples passed as -f/-t.
# `end` is clamped to `total_samples` for every window, including the first,
# so a total smaller than one window no longer produces an out-of-range -t.
jobs = []

for order in orders:
    for start in range(0, total_samples, samples_per_job):
        end = min(start + samples_per_job, total_samples)
        jobs.append(f'{command} -f {start} -t {end} --order {order}')

In [39]:
# Get the Next Job number: one past the highest N among the 'job<N>.sbatch'
# files already present, or 1 when there are none.
# The scan targets jobs_dir (the directory the files are written to) rather
# than a cwd-relative 'sbatch', and ignores entries that do not match the
# naming scheme. A stray file therefore cannot reset numbering to 1 and
# cause existing job files to be overwritten.
existing_job_numbers = []
for name in os.listdir(jobs_dir):
    if name.startswith('job') and name.endswith('.sbatch'):
        number = name[len('job'):-len('.sbatch')]
        if number.isdecimal():
            existing_job_numbers.append(int(number))

job_start_number = max(existing_job_numbers, default=0) + 1

# To renumber from scratch, override here: job_start_number = 1

# Make sbatch files: one file per entry in `jobs`, numbered consecutively
# starting at job_start_number.
for job_number, job_command in enumerate(jobs, job_start_number):
    sbatch_path = os.path.join(jobs_dir, f'job{job_number}.sbatch')
    with open(sbatch_path, 'w') as file:
        file.write(sbatch_header)
        # Job name and log file both carry the job number.
        file.write(f'{job_name_directive}{job_number}\n')
        file.write(f'{output_file_directive}{job_number}.log\n')
        file.write(command_header)
        file.write(job_command)

In [33]:
# Numbers N of the 'job<N>.sbatch' files currently in jobs_dir. The directory
# is listed once, via jobs_dir rather than a cwd-relative 'sbatch', and
# entries that do not follow the naming scheme are ignored.
job_numbers = []
for name in os.listdir(jobs_dir):
    if name.startswith('job') and name.endswith('.sbatch'):
        number = name[len('job'):-len('.sbatch')]
        if number.isdecimal():
            job_numbers.append(int(number))
job_numbers.sort()

# Lowest and highest job number; raises IndexError when no job file exists.
from_, to = job_numbers[0], job_numbers[-1]

# Write a shell script that submits every job in [from_, to] with sbatch.
schedule_file = os.path.join(scripts_dir, 'schedule_jobs.sh')
with open(schedule_file, 'w') as file:
    file.write('#!/bin/bash\n\n')
    for k in range(from_, to + 1):
        file.write(f'sbatch {jobs_dir}/job{k}.sbatch\n')
# rwx for the owner, read-only for the group, nothing for others.
os.chmod(schedule_file, 0o740)

In [34]:
# Numbers N of the 'job<N>.sbatch' files currently in jobs_dir. The directory
# is listed once, via jobs_dir rather than a cwd-relative 'sbatch', and
# entries that do not follow the naming scheme are ignored.
job_numbers = []
for name in os.listdir(jobs_dir):
    if name.startswith('job') and name.endswith('.sbatch'):
        number = name[len('job'):-len('.sbatch')]
        if number.isdecimal():
            job_numbers.append(int(number))
job_numbers.sort()

# Lowest and highest job number; raises IndexError when no job file exists.
# To cancel a fixed range instead, override here, e.g. from_ = 1; to = 100
from_, to = job_numbers[0], job_numbers[-1]

# Write a shell script with one scancel line per job in [from_, to]. Each
# line asks sacct for the job ids recorded under the name 'Job<k>' and
# passes them to scancel.
cancel_file = os.path.join(scripts_dir, 'cancel_jobs.sh')
base_command = "scancel $(sacct -n -X --format jobid --name"
with open(cancel_file, 'w') as file:
    file.write('#!/bin/bash\n\n')
    for k in range(from_, to + 1):
        file.write(f'{base_command} Job{k})\n')
# rwx for the owner, read-only for the group, nothing for others.
os.chmod(cancel_file, 0o740)

In [1]:
# Scratch list (the integers 1 through 4) used by the slicing check below.
a = list(range(1, 5))

In [2]:
# Scratch check of slice semantics: take elements from index 2 up to, but
# not including, the last element.
a[2:-1]

[3]