# Job Scheduler

In [2]:
import os

### Setup Directories

In [101]:
# Root folder that holds the generated sbatch files, the logs and the helper scripts.
base_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/'

jobs_dir = os.path.join(base_dir,'sbatch')
logs_base_dir = os.path.join(base_dir,'logs')
scripts_dir = os.path.join(base_dir,'scripts')
os.makedirs(logs_base_dir,exist_ok=True)

# Numeric suffixes of the existing log folders (e.g. "batch_3" -> 3).
# Entries without a numeric suffix (hidden files, checkpoint folders, ...) are
# skipped instead of crashing the int() conversion.
existing_batches = [
    int(entry.rsplit('_',1)[-1])
    for entry in os.listdir(logs_base_dir)
    if entry.rsplit('_',1)[-1].isdecimal()
]

# Reuse the highest-numbered batch folder while it is still empty (starting at
# batch_1); once it contains logs, move on to the next number.
batch_number = max(existing_batches+[1])
logs_dir = os.path.join(logs_base_dir,f'batch_{batch_number}')
os.makedirs(logs_dir,exist_ok=True)
if os.listdir(logs_dir):
    # Keep batch_number in sync with the folder that is actually used.
    batch_number += 1
    logs_dir = os.path.join(logs_base_dir,f'batch_{batch_number}')

for directory in (jobs_dir, logs_dir, scripts_dir):
    os.makedirs(directory,exist_ok=True)

### Setup Main Command

In [102]:
# Resource request shared by every generated sbatch file: one node, one task, one GPU.
sbatch_header = (
    "#!/bin/bash\n"
    "\n"
    "#SBATCH --nodes=1               \n"
    "#SBATCH --ntasks-per-node=1     \n"
    "#SBATCH --gres=gpu:1            \n"
)

# The list of partitions can be obtained with: sinfo -s

# Directive prefixes; the job number (plus ".log" for the output file) is
# appended when the individual sbatch files are written.
job_name_directive = "#SBATCH --job-name=Job"
output_file_directive = f"#SBATCH --output={logs_dir}/job"

# Shell preamble executed before the Python command in every job.
command_header = (
    "\nmodule purge\n"
    "source ~/.bashrc\n"
    "conda activate NLP_Nightly\n"
    "cd /home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/\n\n"
)

# Main command; per-job arguments are appended to it later.
command = "python main.py -ll debug -em gpu_memory -nw 2 -cm DENSE "

In [103]:
# RAM required per job, in GB.
memory_required_per_task = 128

# Additional scheduler options; every key/value pair is turned into one
# "#SBATCH --key=value" line of the header.
job_params = {
    'time': "4:00:00",                      # time limit per job
    'partition': 'a100_1,a100_2,rtx8000',   # value passed to --partition
}

sbatch_header += ''.join(
    f'#SBATCH --{option}={setting}\n' for option, setting in job_params.items()
)

### Get all Jobs

In [104]:
# Experiment grid. The comment above each list shows the values that can be used.
# ['Run1','Run2','Run3','Run4','Run5']
runs = ['Run2']
# ['AESDD','CaFE','EmoDB','EMOVO','IEMOCAP','RAVDESS','ShEMO']
datasets = ['RAVDESS']
# ['GE2E','WAV2VEC2_BASE','WAV2VEC2_LARGE','WAV2VEC2_LARGE_XLSR','WAV2VEC2_LARGE_XLSR300M','HUBERT_BASE','HUBERT_LARGE','WAVLM_BASE','WAVLM_LARGE']
models = ['WAV2VEC2_BASE']

# NOTE: sbatch_header is extended in place, so running this cell twice appends
# these two directives twice.
sbatch_header += '#SBATCH --cpus-per-task=4\n'
sbatch_header += f'#SBATCH --mem={memory_required_per_task}GB\n'

# One command line per (run, dataset, model) combination, runs varying slowest.
jobs = [
    f'{command} -d {dataset} -fm {model} -r {run}'
    for run in runs
    for dataset in datasets
    for model in models
]

# Numbers of the sbatch files that already exist ("job12.sbatch" -> 12).
# jobs_dir is listed (not the relative path 'sbatch') so the result does not
# depend on the current working directory; files that do not follow the
# job<N>.sbatch pattern are ignored.
existing_job_numbers = []
for file_name in os.listdir(jobs_dir):
    stem, extension = os.path.splitext(file_name)
    if extension == '.sbatch' and stem.startswith('job') and stem[3:].isdecimal():
        existing_job_numbers.append(int(stem[3:]))

# Next free job number; numbering starts at 1 when no sbatch file exists yet.
print(max(existing_job_numbers, default=0) + 1)

157


### Make SBATCH Files

In [105]:
# Numbers of the sbatch files that already exist ("job12.sbatch" -> 12).
# jobs_dir is listed (not the relative path 'sbatch') so the result does not
# depend on the current working directory; files that do not follow the
# job<N>.sbatch pattern are ignored.
existing_job_numbers = []
for file_name in os.listdir(jobs_dir):
    stem, extension = os.path.splitext(file_name)
    if extension == '.sbatch' and stem.startswith('job') and stem[3:].isdecimal():
        existing_job_numbers.append(int(stem[3:]))

# First number used for the new files; starts at 1 when the folder is empty.
job_start_number = max(existing_job_numbers, default=0) + 1

jobs_per_gpu = 1                # Number of jobs per GPU

# Make sbatch files: one file per group of `jobs_per_gpu` consecutive commands.
for job_number, first_index in enumerate(range(0, len(jobs), jobs_per_gpu), job_start_number):
    with open(os.path.join(jobs_dir, f'job{job_number}.sbatch'), 'w') as file:
        file.write(sbatch_header)
        file.write(f'{job_name_directive}{job_number}\n')
        file.write(f'{output_file_directive}{job_number}.log\n')
        file.write(command_header)
        # Slicing keeps the last group in bounds when len(jobs) is not a
        # multiple of jobs_per_gpu.
        for job_command in jobs[first_index:first_index + jobs_per_gpu]:
            # The command is built locally instead of being appended to the
            # entry in `jobs`, so re-running this cell does not stack repeated
            # -jn flags. Each command ends with a newline so several commands
            # in one file stay on separate lines.
            file.write(f'{job_command} -jn Job{job_number}\n')

### Make Schedule File

In [76]:
# schedule_file = os.path.join(scripts_dir,'schedule_jobs.sh')
# with open(schedule_file,'w') as file:
#     file.write('#!/bin/bash\n\n')
#     for k in range(job_start_number,len(jobs)+job_start_number):
#         file.write('sbatch '+jobs_dir+'/job'+str(k)+'.sbatch\n')
# os.chmod(schedule_file, 0o740)

In [112]:
# Numbers of the sbatch files that exist right now ("job12.sbatch" -> 12).
# jobs_dir is listed (not the relative path 'sbatch') so the result does not
# depend on the current working directory; files that do not follow the
# job<N>.sbatch pattern are ignored.
scheduled_job_numbers = set()
for file_name in os.listdir(jobs_dir):
    stem, extension = os.path.splitext(file_name)
    if extension == '.sbatch' and stem.startswith('job') and stem[3:].isdecimal():
        scheduled_job_numbers.add(int(stem[3:]))

# Write a shell script that submits every existing sbatch file in ascending
# job-number order. Only numbers that really exist are emitted, so gaps in the
# numbering do not produce `sbatch` lines for missing files, and an empty
# folder yields a script with no submissions instead of an IndexError.
schedule_file = os.path.join(scripts_dir,'schedule_jobs.sh')
with open(schedule_file,'w') as file:
    file.write('#!/bin/bash\n\n')
    for job_number in sorted(scheduled_job_numbers):
        file.write(f'sbatch {jobs_dir}/job{job_number}.sbatch\n')
# rwx for the owner, read-only for the group, nothing for others.
os.chmod(schedule_file, 0o740)

### Make Cancel File

In [77]:
# cancel_file = os.path.join(scripts_dir,'cancel_jobs.sh')
# base_command = "scancel $(sacct -n -X --format jobid --name"
# with open(cancel_file,'w') as file:
#     file.write('#!/bin/bash\n\n')
#     for k in range(job_start_number,len(jobs)+job_start_number):
#         file.write(base_command+' Job'+str(k)+')\n')
# os.chmod(cancel_file, 0o740)

In [113]:
# Numbers of the sbatch files that exist right now ("job12.sbatch" -> 12).
# jobs_dir is listed (not the relative path 'sbatch') so the result does not
# depend on the current working directory; files that do not follow the
# job<N>.sbatch pattern are ignored.
cancellable_job_numbers = set()
for file_name in os.listdir(jobs_dir):
    stem, extension = os.path.splitext(file_name)
    if extension == '.sbatch' and stem.startswith('job') and stem[3:].isdecimal():
        cancellable_job_numbers.add(int(stem[3:]))

# Write a shell script with one line per existing sbatch file. Each line asks
# sacct for the job id(s) registered under the name Job<N> and passes them to
# scancel. Only numbers that really exist are emitted, and an empty folder
# yields a script with no cancellations instead of an IndexError.
cancel_file = os.path.join(scripts_dir,'cancel_jobs.sh')
base_command = "scancel $(sacct -n -X --format jobid --name"
with open(cancel_file,'w') as file:
    file.write('#!/bin/bash\n\n')
    for job_number in sorted(cancellable_job_numbers):
        file.write(f'{base_command} Job{job_number})\n')
# rwx for the owner, read-only for the group, nothing for others.
os.chmod(cancel_file, 0o740)

### Launch Jobs

In [5]:
# The paths are rebuilt inside this cell, so it does not read the variables
# created by the setup cell.
base_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/'
scripts_dir = os.path.join(base_dir, 'scripts')
schedule_file = os.path.join(scripts_dir, 'schedule_jobs.sh')

# Disabled cleanup step (would delete everything inside logs_dir):
# os.system(f'rm -rf {logs_dir}/*')

# Run the generated submission script; the exit status is shown as the cell output.
os.system('bash ' + schedule_file)

Submitted batch job 30659173
Submitted batch job 30659174
Submitted batch job 30659175
Submitted batch job 30659176
Submitted batch job 30659177
Submitted batch job 30659178
Submitted batch job 30659179
Submitted batch job 30659180
Submitted batch job 30659181
Submitted batch job 30659182
Submitted batch job 30659183
Submitted batch job 30659184
Submitted batch job 30659185
Submitted batch job 30659186
Submitted batch job 30659187
Submitted batch job 30659188
Submitted batch job 30659189
Submitted batch job 30659190
Submitted batch job 30659191
Submitted batch job 30659192
Submitted batch job 30659193
Submitted batch job 30659194
Submitted batch job 30659195
Submitted batch job 30659196
Submitted batch job 30659197
Submitted batch job 30659198
Submitted batch job 30659199
Submitted batch job 30659200
Submitted batch job 30659201
Submitted batch job 30659202
Submitted batch job 30659203
Submitted batch job 30659204
Submitted batch job 30659205
Submitted batch job 30659206
Submitted batc

0

### Cancel Jobs

In [4]:
# The paths are rebuilt inside this cell, so it does not read the variables
# created by the setup cell.
base_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/'
scripts_dir = os.path.join(base_dir, 'scripts')
cancel_file = os.path.join(scripts_dir, 'cancel_jobs.sh')

# Run the generated cancellation script; the exit status is shown as the cell output.
os.system('bash ' + cancel_file)

0

### Get Running Jobs

In [115]:
# Ask squeue for the current user's jobs in the "running" state.
os.system(' '.join(['squeue', '-u', '$USER', '-t', 'running']))

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
          30648359        cl   log1_0  as14229  R    4:16:59      1 cl001
          30657827   rtx8000     bash  as14229  R      47:55      1 gr048


0

### Get Pending Jobs

In [55]:
# Ask squeue for the current user's jobs in the "pending" state.
os.system(' '.join(['squeue', '-u', '$USER', '-t', 'pending']))

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
          30568428 rtx8000,v    Job35  as14229 PD       0:00      1 (Priority)
          30568427 rtx8000,v    Job34  as14229 PD       0:00      1 (Priority)
          30568426 rtx8000,v    Job33  as14229 PD       0:00      1 (Priority)
          30568425 rtx8000,v    Job32  as14229 PD       0:00      1 (Priority)
          30568424 rtx8000,v    Job31  as14229 PD       0:00      1 (Priority)
          30568386 rtx8000,v    Job30  as14229 PD       0:00      1 (Priority)
          30568385 rtx8000,v    Job29  as14229 PD       0:00      1 (Priority)
          30568384 rtx8000,v    Job28  as14229 PD       0:00      1 (Priority)
          30568383 rtx8000,v    Job27  as14229 PD       0:00      1 (Priority)
          30568382 rtx8000,v    Job26  as14229 PD       0:00      1 (Priority)
          30568381 rtx8000,v    Job25  as14229 PD       0:00      1 (Priority)
          30568380 rtx8000,v    Job24  as14229

0

In [29]:

# Manually cancel Job13 through Job18 by name. For each job, sacct prints the
# job id(s) recorded under that job name (-n: no header, -X: allocations only)
# and the shell substitutes them into the scancel call.
!scancel $(sacct -n -X --format jobid --name Job13)
!scancel $(sacct -n -X --format jobid --name Job14)
!scancel $(sacct -n -X --format jobid --name Job15)
!scancel $(sacct -n -X --format jobid --name Job16)
!scancel $(sacct -n -X --format jobid --name Job17)
!scancel $(sacct -n -X --format jobid --name Job18)