# Job Scheduler

In [1]:
import os

### Setup Directories

In [2]:
# Root of the job-management tree; every other directory below is derived from it.
base_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/'

# Make base Log dir
logs_base_dir = os.path.join(base_dir, 'logs')
os.makedirs(logs_base_dir, exist_ok=True)

# Collect the integer suffix of every existing `<name>_<N>` entry. Entries
# whose suffix is not an integer (stray files, editor backups, ...) are
# skipped so a single unexpected file cannot abort the cell with ValueError.
existing_batch_numbers = []
for entry in os.listdir(logs_base_dir):
    try:
        existing_batch_numbers.append(int(entry.split('_')[-1]))
    except ValueError:
        continue

# Make logs dir for next batch: start from the highest existing batch
# (or batch_1 when there is none yet).
batch_number = max(existing_batch_numbers + [1])
logs_dir = os.path.join(logs_base_dir, f'batch_{batch_number}')
os.makedirs(logs_dir, exist_ok=True)

# An empty batch dir is reused; one that already holds logs is left untouched
# and the following index is used instead. `batch_number` itself keeps the
# highest pre-existing index.
if os.listdir(logs_dir):
    logs_dir = os.path.join(logs_base_dir, f'batch_{batch_number + 1}')
    os.makedirs(logs_dir, exist_ok=True)

# Make scripts dir (holds the generated schedule/cancel shell scripts)
scripts_dir = os.path.join(base_dir, 'scripts')
os.makedirs(scripts_dir, exist_ok=True)

# Make jobs dir (holds the generated .sbatch files)
jobs_dir = os.path.join(base_dir, 'sbatch')
os.makedirs(jobs_dir, exist_ok=True)


### Setup Headers

In [3]:
# Fixed part of every sbatch file: shebang plus the resource directives that
# are identical for all jobs (one node, one task per node, one GPU).
sbatch_header = "#!/bin/bash\n\
\n\
#SBATCH --nodes=1               \n\
#SBATCH --ntasks-per-node=1     \n\
#SBATCH --gres=gpu:1            \n"

# Tunable per-job resources.
#   time      : wall time per dataset, looked up by dataset name when the
#               job list is built. The active table is labelled for 30
#               epochs, matching the `-e 30` in `command` below.
#   memory    : value passed verbatim to `--mem`, unit suffix included.
#   partition : comma-separated partition list passed to `--partition`.
job_params = dict(
    # time = { # for 50 epochs
    #     'AESDD':'00:30:00','CaFE':'00:40:00',
    #     'EmoDB':'00:30:00','EMOVO':'00:30:00',
    #     'IEMOCAP':'08:30:00','RAVDESS':'00:40:00',
    #     'ShEMO':'05:00:00'},
    time = { # for 30 epochs
        'AESDD':'00:20:00','CaFE':'00:30:00',
        'EmoDB':'00:20:00','EMOVO':'00:20:00',
        'IEMOCAP':'05:30:00','RAVDESS':'00:30:00',
        'ShEMO':'02:45:00'},                # Time per job
    memory = '100GB',                       # RAM required, including the unit suffix
    partition = 'a100_1,a100_2,rtx8000')    # GPUs you want, to list all available run - partition list - sinfo -s

sbatch_header += f'#SBATCH --partition={job_params["partition"]}\n'
sbatch_header += '#SBATCH --cpus-per-task=4\n'
# `memory` already carries its unit; appending another "GB" here would emit
# the invalid directive `--mem=100GBGB`.
sbatch_header += f'#SBATCH --mem={job_params["memory"]}\n'

# Directive prefixes; the job number (and ".log" for the output file) is
# appended when each sbatch file is written.
job_name_directive = "#SBATCH --job-name=Job"
output_file_directive = "#SBATCH --output=" + logs_dir + '/job'

# Command Header: shell setup executed before the training command.
command_header = "\n\
source ~/.bashrc\n\
conda activate MSERS\n\
cd /home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/\n\n"

# Main Command: per-job flags (dataset, model, run, job name) are appended later.
command = "python main.py -ll debug -em gpu_memory -nw 3 -cm CM_PROBING_LINEAR -e 30 "

### Get all Full Commands and Walltimes

In [4]:
# Experiment grid. The commented lists show every value the original author
# enumerated; the active lists select the subset to schedule.
# ['Run1','Run2','Run3','Run4','Run5']
runs = ['Run1','Run2']
# ['AESDD','CaFE','EmoDB','EMOVO','IEMOCAP','RAVDESS','ShEMO']
datasets = ['AESDD','IEMOCAP']
# ['GE2E','WAV2VEC2_BASE','WAV2VEC2_LARGE','WAV2VEC2_LARGE_XLSR','WAV2VEC2_LARGE_XLSR300M','HUBERT_BASE','HUBERT_LARGE','WAV2VEC2_ASR_LARGE_960H','HUBERT_ASR_LARGE']
models = ['WAV2VEC2_BASE','HUBERT_ASR_LARGE']

# Every (run, dataset, model) combination, with run varying slowest and
# model fastest.
combinations = [
    (run_name, dataset_name, model_name)
    for run_name in runs
    for dataset_name in datasets
    for model_name in models
]

# `times[n]` is the wall-time directive for `jobs[n]`; the two lists are
# index-aligned because both are derived from `combinations`.
times = [
    f'#SBATCH --time={job_params["time"][dataset_name]}\n'
    for _, dataset_name, _ in combinations
]
jobs = [
    f'{command} -d {dataset_name} -fm {model_name} -r {run_name}'
    for run_name, dataset_name, model_name in combinations
]

### Make SBATCH Files

In [5]:
# Get the next job number: one past the highest existing `job<N>.sbatch` in
# `jobs_dir`, or 1 when there is none. Files that do not match that pattern
# are ignored, so a stray file can no longer reset numbering to 1 and cause
# existing job files to be overwritten.
sbatch_suffix = '.sbatch'
existing_job_numbers = [
    int(name[3:-len(sbatch_suffix)])
    for name in os.listdir(jobs_dir)
    if name.startswith('job')
    and name.endswith(sbatch_suffix)
    and name[3:-len(sbatch_suffix)].isdecimal()
]
job_start_number = max(existing_job_numbers, default=0) + 1

# Number of consecutive jobs per GPU
jobs_per_gpu = 1

# Make sbatch files: `i` is the job number used in file/job/log names,
# `j` is the index of the first command in this file's group.
for i, j in enumerate(range(0, len(jobs), jobs_per_gpu), job_start_number):
    # Slicing tolerates a short final group when len(jobs) is not a multiple
    # of jobs_per_gpu. The job-name flag is added to a copy so `jobs` is not
    # mutated and re-running this cell does not append `-jn` twice.
    group_commands = [f'{job} -jn Job{i}' for job in jobs[j:j + jobs_per_gpu]]
    with open(os.path.join(jobs_dir, 'job' + str(i) + '.sbatch'), 'w') as file:
        # NOTE: only the first command's wall time is used for the whole
        # group; raise the time table if jobs_per_gpu > 1.
        file.write(sbatch_header + times[j])
        file.write(job_name_directive + str(i) + '\n')
        file.write(output_file_directive + str(i) + '.log\n')
        file.write(command_header)
        # One command per line so grouped commands run one after another.
        file.write('\n'.join(group_commands) + '\n')

---

### Make Schedule File

In [6]:
# Job numbers of the `job<N>.sbatch` files currently in `jobs_dir`, ascending.
# The directory is read through `jobs_dir` (not the cwd-relative 'sbatch'),
# so the cell works regardless of the kernel's working directory. Names that
# do not match the pattern are ignored.
sbatch_suffix = '.sbatch'
job_numbers = sorted({
    int(name[3:-len(sbatch_suffix)])
    for name in os.listdir(jobs_dir)
    if name.startswith('job')
    and name.endswith(sbatch_suffix)
    and name[3:-len(sbatch_suffix)].isdecimal()
})

# Write one `sbatch` submission line per existing job file. Gaps in the
# numbering are skipped rather than producing lines for missing files, and an
# empty directory yields a script containing only the shebang.
schedule_file = os.path.join(scripts_dir, 'schedule_jobs.sh')
with open(schedule_file, 'w') as file:
    file.write('#!/bin/bash\n\n')
    for k in job_numbers:
        file.write('sbatch ' + jobs_dir + '/job' + str(k) + '.sbatch\n')
# rwx for the owner, read-only for the group, nothing for others.
os.chmod(schedule_file, 0o740)

### Make Cancel File

In [7]:
# Job numbers of the `job<N>.sbatch` files currently in `jobs_dir`, ascending.
# The directory is read through `jobs_dir` (not the cwd-relative 'sbatch'),
# so the cell works regardless of the kernel's working directory. Names that
# do not match the pattern are ignored.
sbatch_suffix = '.sbatch'
job_numbers = sorted({
    int(name[3:-len(sbatch_suffix)])
    for name in os.listdir(jobs_dir)
    if name.startswith('job')
    and name.endswith(sbatch_suffix)
    and name[3:-len(sbatch_suffix)].isdecimal()
})

# Write one cancel line per job: `sacct` looks up the SLURM job id(s) by the
# job name "Job<N>" and `scancel` cancels them. An empty directory yields a
# script containing only the shebang.
cancel_file = os.path.join(scripts_dir, 'cancel_jobs.sh')
base_command = "scancel $(sacct -n -X --format jobid --name"
with open(cancel_file, 'w') as file:
    file.write('#!/bin/bash\n\n')
    for k in job_numbers:
        file.write(base_command + ' Job' + str(k) + ')\n')
# rwx for the owner, read-only for the group, nothing for others.
os.chmod(cancel_file, 0o740)

---

### Launch Jobs

In [None]:
# Submit every generated job by running the schedule script with bash.
# The script path is rebuilt from `base_dir`; to run this cell without the
# setup cell, uncomment the next line first.
# base_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/'
scripts_dir = os.path.join(base_dir, 'scripts')
schedule_file = os.path.join(base_dir, 'scripts', 'schedule_jobs.sh')
# Optional: wipe the current batch's logs before launching.
# os.system(f'rm -rf {logs_dir}/*')
launch_command = f'bash {schedule_file}'
os.system(launch_command)

### Cancel Jobs

In [None]:
# Cancel every generated job by running the cancel script with bash.
# The script path is rebuilt from `base_dir`; to run this cell without the
# setup cell, uncomment the next line first.
# base_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/'
scripts_dir = os.path.join(base_dir, 'scripts')
cancel_file = os.path.join(base_dir, 'scripts', 'cancel_jobs.sh')
cancel_command = f'bash {cancel_file}'
os.system(cancel_command)

---

### Get Running Jobs

In [None]:
# List the current user's jobs that are in the running state.
running_jobs_command = 'squeue -u $USER -t running'
os.system(running_jobs_command)

### Get Pending Jobs

In [None]:
# List the current user's jobs that are still waiting in the queue.
pending_jobs_command = 'squeue -u $USER -t pending'
os.system(pending_jobs_command)

---