# Job Scheduler

In [39]:
import os

### Setup Directories

In [40]:
# Root folder for everything this notebook generates. The path is hard-coded;
# edit it here when running from a different checkout.
base_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/'
jobs_dir = os.path.join(base_dir, 'sbatch')       # generated .sbatch files
logs_base_dir = os.path.join(base_dir, 'logs')    # one `batch_<N>` sub-folder per submission
scripts_dir = os.path.join(base_dir, 'scripts')   # schedule / cancel helper scripts
os.makedirs(logs_base_dir, exist_ok=True)

# Highest existing batch index (falls back to 1 when there is none).
# Only entries named `batch_<integer>` are considered, so stray entries such as
# `.DS_Store` or `.ipynb_checkpoints` are ignored instead of making `int()`
# raise ValueError.
batch_prefix = 'batch_'
batch_number = max(
    [
        int(entry[len(batch_prefix):])
        for entry in os.listdir(logs_base_dir)
        if entry.startswith(batch_prefix) and entry[len(batch_prefix):].isdecimal()
    ]
    + [1]
)
logs_dir = os.path.join(logs_base_dir, f'batch_{batch_number}')
os.makedirs(logs_dir, exist_ok=True)

# If the latest batch folder already contains files, log into the next
# batch folder instead. `batch_number` itself is left at the highest
# pre-existing index.
if os.listdir(logs_dir):
    logs_dir = os.path.join(logs_base_dir, f'batch_{batch_number+1}')

os.makedirs(jobs_dir, exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)
os.makedirs(scripts_dir, exist_ok=True)

### Setup Main Command

In [41]:
# Resource request shared by every generated job: one node, one task, one GPU.
# The padding spaces before each newline are part of the emitted text.
sbatch_header = (
    "#!/bin/bash\n"
    "\n"
    "#SBATCH --nodes=1               \n"
    "#SBATCH --ntasks-per-node=1     \n"
    "#SBATCH --gres=gpu:1            \n"
)

# Directive prefixes; the text that completes them is appended where the
# sbatch files are written.
job_name_directive = "#SBATCH --job-name=Job"
output_file_directive = f"#SBATCH --output={logs_dir}/job"

# Shell preamble placed before the python command in each job script:
# reset modules, activate the conda environment, move to the project root.
command_header = (
    "\nmodule purge\n"
    "source ~/.bashrc\n"
    "conda activate NLP_Nightly\n"
    "cd /home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/\n\n"
)

# Main command. Further flags are appended per job; note the trailing space.
command = "python main.py -ll debug -em gpu_memory -nw 2 -cm DENSE -e 50 "

# RAM required in GB
memory_required_per_task = 100

job_params = {
    # Time limit per job, keyed by dataset name.
    'time': {
        'AESDD': '00:35:00',
        'CaFE': '00:35:00',
        'EmoDB': '00:35:00',
        'EMOVO': '00:35:00',
        'IEMOCAP': '06:30:00',
        'RAVDESS': '00:35:00',
        'ShEMO': '04:00:00',
    },
    # Comma-separated partition list (available partitions: `sinfo -s`).
    'partition': 'a100_1,a100_2,rtx8000,v100',
}

# Only the partition is added to the shared header here; the time limit is
# kept per dataset in job_params['time'].
sbatch_header += '#SBATCH --partition=' + job_params['partition'] + '\n'

### Get all Jobs

In [42]:
# Sweep definition. Values that can be listed:
#   runs:     'Run1','Run2','Run3','Run4','Run5'
#   datasets: 'AESDD','CaFE','EmoDB','EMOVO','IEMOCAP','RAVDESS','ShEMO'
#   models:   'GE2E','WAV2VEC2_BASE','WAV2VEC2_LARGE','WAV2VEC2_LARGE_XLSR',
#             'WAV2VEC2_LARGE_XLSR300M','HUBERT_BASE','HUBERT_LARGE'
runs = ['Run1','Run2','Run3','Run4','Run5']
datasets = ['AESDD','CaFE','EmoDB','EMOVO','IEMOCAP','RAVDESS','ShEMO']
models = ['WAV2VEC2_LARGE_XLSR']

# CPU / memory request for each job. Any copies of these two directives that
# are already in the header are removed first, so re-running this cell
# replaces them instead of appending duplicates.
resource_prefixes = ('#SBATCH --cpus-per-task=', '#SBATCH --mem=')
sbatch_header = ''.join(
    line
    for line in sbatch_header.splitlines(keepends=True)
    if not line.startswith(resource_prefixes)
)
sbatch_header += '#SBATCH --cpus-per-task=4\n'
sbatch_header += f'#SBATCH --mem={memory_required_per_task}GB\n'

# One job per (run, dataset, model) combination. `times` is kept parallel to
# `jobs`: times[n] is the --time directive belonging to jobs[n].
jobs = []
times = []
for run in runs:
    for dataset in datasets:
        time_directive = f'#SBATCH --time={job_params["time"][dataset]}\n'
        for model in models:
            times.append(time_directive)
            jobs.append(f'{command} -d {dataset} -fm {model} -r {run}')
# print(sorted([int(m.split('.')[0][3:]) for m in os.listdir('sbatch')])[-1]+1)

### Make SBATCH Files

In [43]:
# Index given to the first generated sbatch file / job name.
job_start_number = 1

# Number of jobs per GPU (i.e. commands written into one sbatch file)
jobs_per_gpu = 1

# Make sbatch files: `i` is the job number used in the file name, job name and
# log name; `j` is the index of the first command of the group in `jobs`.
for i, j in enumerate(range(0, len(jobs), jobs_per_gpu), job_start_number):
    sbatch_path = os.path.join(jobs_dir, 'job' + str(i) + '.sbatch')
    with open(sbatch_path, 'w') as file:
        # NOTE(review): the time limit is taken from the first command of the
        # group only; confirm this is sufficient before raising jobs_per_gpu.
        file.write(sbatch_header + times[j])
        file.write(job_name_directive + str(i) + '\n')
        file.write(output_file_directive + str(i) + '.log\n')
        file.write(command_header)
        # Slicing (rather than indexing j..j+jobs_per_gpu) stays in range when
        # the last group is shorter than jobs_per_gpu.
        for job_command in jobs[j:j + jobs_per_gpu]:
            # The `-jn` flag is added only to the written text, not to `jobs`,
            # so re-running this cell does not stack repeated `-jn` flags.
            # Each command ends with a newline so several commands in one
            # file remain separate shell lines.
            file.write(f'{job_command} -jn Job{i}\n')

### Make Schedule File

In [33]:
# schedule_file = os.path.join(scripts_dir,'schedule_jobs.sh')
# with open(schedule_file,'w') as file:
#     file.write('#!/bin/bash\n\n')
#     for k in range(job_start_number,len(jobs)+job_start_number):
#         file.write('sbatch '+jobs_dir+'/job'+str(k)+'.sbatch\n')
# os.chmod(schedule_file, 0o740)

In [44]:
# Collect the numbers of the job<N>.sbatch files that actually exist in
# `jobs_dir`. Anything else in that folder (sub-directories, other files) is
# ignored rather than passed to int().
job_numbers = []
for entry in os.listdir(jobs_dir):
    stem, ext = os.path.splitext(entry)
    if ext == '.sbatch' and stem.startswith('job') and stem[3:].isdecimal():
        job_numbers.append(int(stem[3:]))
job_numbers.sort()
if not job_numbers:
    raise FileNotFoundError(f'No job<N>.sbatch files found in {jobs_dir}')

# Lowest / highest job number present.
from_, to = job_numbers[0], job_numbers[-1]

# Write an executable shell script with one `sbatch` line per existing job
# file; gaps in the numbering produce no line.
schedule_file = os.path.join(scripts_dir, 'schedule_jobs.sh')
with open(schedule_file, 'w') as file:
    file.write('#!/bin/bash\n\n')
    for k in job_numbers:
        file.write('sbatch ' + jobs_dir + '/job' + str(k) + '.sbatch\n')
os.chmod(schedule_file, 0o740)  # owner rwx, group r

### Make Cancel File

In [None]:
# cancel_file = os.path.join(scripts_dir,'cancel_jobs.sh')
# base_command = "scancel $(sacct -n -X --format jobid --name"
# with open(cancel_file,'w') as file:
#     file.write('#!/bin/bash\n\n')
#     for k in range(job_start_number,len(jobs)+job_start_number):
#         file.write(base_command+' Job'+str(k)+')\n')
# os.chmod(cancel_file, 0o740)

In [45]:
# Collect the numbers of the job<N>.sbatch files that actually exist in
# `jobs_dir`. Anything else in that folder (sub-directories, other files) is
# ignored rather than passed to int().
job_numbers = []
for entry in os.listdir(jobs_dir):
    stem, ext = os.path.splitext(entry)
    if ext == '.sbatch' and stem.startswith('job') and stem[3:].isdecimal():
        job_numbers.append(int(stem[3:]))
job_numbers.sort()
if not job_numbers:
    raise FileNotFoundError(f'No job<N>.sbatch files found in {jobs_dir}')

# Lowest / highest job number present.
from_, to = job_numbers[0], job_numbers[-1]

# Write an executable shell script with one line per existing job file; each
# line passes the ids that `sacct` reports for the name Job<N> to `scancel`.
cancel_file = os.path.join(scripts_dir, 'cancel_jobs.sh')
base_command = "scancel $(sacct -n -X --format jobid --name"
with open(cancel_file, 'w') as file:
    file.write('#!/bin/bash\n\n')
    for k in job_numbers:
        file.write(base_command + ' Job' + str(k) + ')\n')
os.chmod(cancel_file, 0o740)  # owner rwx, group r

---

### Launch Jobs

In [46]:
# Paths are re-derived in this cell rather than taken from the setup cell.
base_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/'
scripts_dir = os.path.join(base_dir, 'scripts')
schedule_file = os.path.join(scripts_dir, 'schedule_jobs.sh')

# Optional (disabled): wipe the current log folder before submitting.
# os.system(f'rm -rf {logs_dir}/*')

# Run the generated schedule script; the value shown is the shell exit status.
launch_command = 'bash ' + schedule_file
os.system(launch_command)

Submitted batch job 30703551
Submitted batch job 30703552
Submitted batch job 30703553
Submitted batch job 30703554
Submitted batch job 30703555
Submitted batch job 30703556
Submitted batch job 30703557
Submitted batch job 30703558
Submitted batch job 30703559
Submitted batch job 30703560
Submitted batch job 30703561
Submitted batch job 30703562
Submitted batch job 30703563
Submitted batch job 30703564
Submitted batch job 30703565
Submitted batch job 30703566
Submitted batch job 30703567
Submitted batch job 30703568
Submitted batch job 30703569
Submitted batch job 30703570
Submitted batch job 30703571
Submitted batch job 30703572
Submitted batch job 30703573
Submitted batch job 30703574
Submitted batch job 30703575
Submitted batch job 30703576
Submitted batch job 30703577
Submitted batch job 30703578
Submitted batch job 30703579
Submitted batch job 30703580
Submitted batch job 30703581
Submitted batch job 30703582
Submitted batch job 30703583
Submitted batch job 30703584
Submitted batc

0

### Cancel Jobs

In [4]:
# Paths are re-derived in this cell rather than taken from the setup cell.
base_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/'
scripts_dir = os.path.join(base_dir, 'scripts')
cancel_file = os.path.join(scripts_dir, 'cancel_jobs.sh')

# Run the generated cancel script; the value shown is the shell exit status.
cancel_command = 'bash ' + cancel_file
os.system(cancel_command)

0

---

### Get Running Jobs

In [47]:
# List this user's jobs in the running state; the value shown is the shell exit status.
running_jobs_query = 'squeue -u $USER -t running'
os.system(running_jobs_query)

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)


0

### Get Pending Jobs

In [48]:
# List this user's jobs in the pending state; the value shown is the shell exit status.
pending_jobs_query = 'squeue -u $USER -t pending'
os.system(pending_jobs_query)

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
          30703585 a100_1,a1    Job35  as14229 PD       0:00      1 (Priority)
          30703584 a100_1,a1    Job34  as14229 PD       0:00      1 (Priority)
          30703583 a100_1,a1    Job33  as14229 PD       0:00      1 (Priority)
          30703582 a100_1,a1    Job32  as14229 PD       0:00      1 (Priority)
          30703581 a100_1,a1    Job31  as14229 PD       0:00      1 (Priority)
          30703580 a100_1,a1    Job30  as14229 PD       0:00      1 (Priority)
          30703579 a100_1,a1    Job29  as14229 PD       0:00      1 (Priority)
          30703578 a100_1,a1    Job28  as14229 PD       0:00      1 (Priority)
          30703577 a100_1,a1    Job27  as14229 PD       0:00      1 (Priority)
          30703576 a100_1,a1    Job26  as14229 PD       0:00      1 (Priority)
          30703575 a100_1,a1    Job25  as14229 PD       0:00      1 (Priority)
          30703574 a100_1,a1    Job24  as14229

0

---

In [6]:
# Full paths of the sbatch files to post-process. NOTE: from here on `jobs`
# holds file paths, not the command strings built earlier in the notebook.
path = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/sbatch'
# Keep only regular *.sbatch files: sub-directories of `path` (such as a
# `cancelled` folder) cannot be opened as text and are left out. Sorted so the
# processing order is the same on every run.
jobs = sorted(
    os.path.join(path, f)
    for f in os.listdir(path)
    if f.endswith('.sbatch') and os.path.isfile(os.path.join(path, f))
)

# for job in jobs:


In [9]:
# Rewrite the partition directive of every sbatch file in `jobs`.
# The line is located by its prefix instead of a fixed index: in the header
# built by this notebook, index 6 is the `--cpus-per-task` line, so overwriting
# data[6] would drop that directive and leave two partition lines.
partition_prefix = '#SBATCH --partition='
partition_directive = partition_prefix + 'a100_1,a100_2,rtx8000,v100\n'

for job in jobs:
    if not os.path.isfile(job):
        # Skip directories or entries that have disappeared since listing.
        continue
    with open(job, 'r') as file:
        # read a list of lines into data
        data = file.readlines()
    updated = [
        partition_directive if line.startswith(partition_prefix) else line
        for line in data
    ]
    if updated == data:
        # No partition directive present, or it already has the wanted value:
        # leave the file untouched.
        continue
    with open(job, 'w') as file:
        file.writelines(updated)

In [13]:
# Move every sbatch file whose last line mentions 'WAVLM' into `mv_loc`.
mv_loc = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/sbatch/cancelled'
# Make sure the destination is a directory. A shell `mv` to a missing path
# would rename each file to `cancelled` instead, each overwriting the last.
os.makedirs(mv_loc, exist_ok=True)

for job in jobs:
    if not os.path.isfile(job):
        # Skip directories (e.g. `mv_loc` itself) and already-moved files.
        continue
    with open(job, 'r') as file:
        # read a list of lines into data
        data = file.readlines()
    # `data and ...` guards against empty files, where data[-1] would raise.
    if data and 'WAVLM' in data[-1]:
        # Pure-Python move (no shell, so no quoting issues); like `mv`, it
        # overwrites an existing file with the same name.
        os.replace(job, os.path.join(mv_loc, os.path.basename(job)))

# python main.py -ll debug -em gpu_memory -nw 2 -cm DENSE  -d IEMOCAP -m WAV2VEC2_BASE -r Run1 -jn Job1

In [None]:
# Cancel the Slurm jobs named Job<N> for every job<N>.sbatch file present.
sbatch_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/sbatch'

# Job numbers of the existing sbatch files; other entries in the folder
# (sub-directories, unrelated files) are ignored rather than passed to int().
jb = []
for m in os.listdir(sbatch_dir):
    stem, ext = os.path.splitext(m)
    if ext == '.sbatch' and stem.startswith('job') and stem[3:].isdecimal():
        jb.append(int(stem[3:]))
jb.sort()

for j in jb:
    # `sacct` prints the ids recorded under the job name; `scancel` receives them.
    os.system(f'scancel $(sacct -n -X --format jobid --name Job{j})')

In [None]:
# Submit every job<N>.sbatch file present, in ascending job-number order.
sbatch_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/sbatch'

# Job numbers of the existing sbatch files; other entries in the folder
# (sub-directories, unrelated files) are ignored rather than passed to int().
jb = []
for m in os.listdir(sbatch_dir):
    stem, ext = os.path.splitext(m)
    if ext == '.sbatch' and stem.startswith('job') and stem[3:].isdecimal():
        jb.append(int(stem[3:]))
jb.sort()

for j in jb:
    os.system(f'sbatch /home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/sbatch/job{j}.sbatch')