# Job Scheduler

In [3]:
import os

### Setup Directories

In [4]:
# Root of all job artefacts: generated sbatch files, per-batch logs and helper scripts.
base_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/'
jobs_dir = os.path.join(base_dir, 'sbatch')
logs_base_dir = os.path.join(base_dir, 'logs')
scripts_dir = os.path.join(base_dir, 'scripts')
os.makedirs(logs_base_dir, exist_ok=True)

# Highest existing batch index, or 1 when there is none yet.
# Entries whose name does not end in "_<digits>" (e.g. ".ipynb_checkpoints" or
# stray files) are skipped instead of crashing int().
batch_suffixes = [d.rsplit('_', 1)[-1] for d in os.listdir(logs_base_dir)]
batch_number = max([int(s) for s in batch_suffixes if s.isdecimal()] + [1])
logs_dir = os.path.join(logs_base_dir, f'batch_{batch_number}')
os.makedirs(logs_dir, exist_ok=True)

# Reuse the latest batch directory only while it is still empty; once it holds
# logs, switch to the next index so earlier logs are not mixed with new ones.
if os.listdir(logs_dir):
    logs_dir = os.path.join(logs_base_dir, f'batch_{batch_number+1}')

os.makedirs(jobs_dir, exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)
os.makedirs(scripts_dir, exist_ok=True)

### Setup Main Command

In [12]:
# Fixed preamble shared by every generated sbatch file: shebang, a blank line,
# then the node / task / GPU requests. Further #SBATCH directives are appended
# to this string later in the notebook.
sbatch_header = "".join((
    "#!/bin/bash\n",
    "\n",
    "#SBATCH --nodes=1               \n",
    "#SBATCH --ntasks-per-node=1     \n",
    "#SBATCH --gres=gpu:1            \n",
))

# To list the available partitions, run: sinfo -s

# Directive stems: the job number is appended when each sbatch file is written
# (plus a ".log" suffix for the output file).
job_name_directive = "#SBATCH --job-name=Job"
output_file_directive = f"#SBATCH --output={logs_dir}/job"

# Shell preamble executed inside every job before the actual command:
# load the shell profile, activate the conda environment, enter the repo.
command_header = (
    "\n"
    "source ~/.bashrc\n"
    "conda activate MSERS\n"
    "cd /home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/\n"
    "\n"
)

# Main command; dataset, model, run and job name flags are appended per job.
command = "python main.py -ll debug -em gpu_memory -nw 3 -cm PROBING -e 50 "

# RAM requested per job, in GB.
memory_required_per_task = 100

# Per-dataset wall-clock limits (HH:MM:SS) and the partitions a job may run on.
# An earlier revision of this cell used shorter limits
# (e.g. IEMOCAP 04:00:00, ShEMO 02:45:00).
job_params = {
    'time': {
        'AESDD': '00:30:00',
        'CaFE': '00:40:00',
        'EmoDB': '00:30:00',
        'EMOVO': '00:30:00',
        'IEMOCAP': '07:00:00',
        'RAVDESS': '00:40:00',
        'ShEMO': '05:00:00',
    },
    'partition': 'a100_1,a100_2,rtx8000',
}

# Only the partition goes into the shared header here; the time limit differs
# per dataset and is written per job.
sbatch_header += '#SBATCH --partition=' + job_params['partition'] + '\n'

### Get all Jobs

In [13]:
# Experiment grid: one job is generated per (run, dataset, model) combination.
runs = ['Run1','Run2','Run3','Run4','Run5']
datasets = ['AESDD','CaFE','EmoDB','EMOVO','IEMOCAP','RAVDESS','ShEMO']
# Model names listed in the original notes for this cell:
# ['GE2E','WAV2VEC2_BASE','WAV2VEC2_LARGE','WAV2VEC2_LARGE_XLSR','WAV2VEC2_LARGE_XLSR300M',
#  'HUBERT_BASE','HUBERT_LARGE','WAV2VEC2_ASR_LARGE_960H','HUBERT_ASR_LARGE']
models = ['WAV2VEC2_ASR_LARGE_960H', 'HUBERT_ASR_LARGE']

# Append the CPU / memory requests to the shared header. Any copies left by a
# previous execution of this cell are removed first, so re-running the cell
# does not stack duplicate directives in every generated sbatch file.
resource_prefixes = ('#SBATCH --cpus-per-task=', '#SBATCH --mem=')
sbatch_header = ''.join(
    line for line in sbatch_header.splitlines(keepends=True)
    if not line.startswith(resource_prefixes)
)
sbatch_header += '#SBATCH --cpus-per-task=4\n'
sbatch_header += f'#SBATCH --mem={memory_required_per_task}GB\n'

# `jobs` and `times` are parallel lists: times[n] is the --time directive that
# belongs to the command in jobs[n].
jobs = []
times = []
for run in runs:
    for dataset in datasets:
        for model in models:
            times.append(f'#SBATCH --time={job_params["time"][dataset]}\n')
            jobs.append(f'{command} -d {dataset} -fm {model} -r {run}')

### Make SBATCH Files

In [14]:
# Continue numbering after the highest existing job<N>.sbatch in jobs_dir (the
# directory the files are written to), so earlier files are never overwritten.
# Starts at 1 when the directory holds no job files yet; unrelated entries are ignored.
existing_job_numbers = [
    int(name[len('job'):-len('.sbatch')])
    for name in os.listdir(jobs_dir)
    if name.startswith('job') and name.endswith('.sbatch')
    and name[len('job'):-len('.sbatch')].isdecimal()
]
job_start_number = max(existing_job_numbers, default=0) + 1

# Number of jobs per GPU (commands packed into one sbatch file)
jobs_per_gpu = 1

# Make sbatch files: i is the job number, j the index of the file's first command.
for i, j in enumerate(range(0, len(jobs), jobs_per_gpu), job_start_number):
    # Slicing (rather than indexing j..j+jobs_per_gpu-1) lets the last file hold
    # fewer commands when len(jobs) is not a multiple of jobs_per_gpu.
    # The job name is added to a copy so re-running this cell does not keep
    # appending "-jn JobN" to the entries of `jobs`.
    commands = [f'{job} -jn Job{i}' for job in jobs[j:j + jobs_per_gpu]]
    with open(os.path.join(jobs_dir, f'job{i}.sbatch'), 'w') as file:
        # NOTE(review): only the first command's time limit is used; with
        # jobs_per_gpu > 1 the limit probably needs to cover all commands.
        file.write(sbatch_header + times[j])
        file.write(f'{job_name_directive}{i}\n')
        file.write(f'{output_file_directive}{i}.log\n')
        file.write(command_header)
        # One command per line; for a single command the output is unchanged.
        file.write('\n'.join(commands))

### Make Schedule File

In [15]:
# Numbers of the sbatch files that actually exist in jobs_dir (job<N>.sbatch -> N).
# Listing jobs_dir (not the cwd-relative 'sbatch') keeps this independent of the
# kernel's working directory; unrelated entries are ignored.
job_numbers = sorted(
    int(name[len('job'):-len('.sbatch')])
    for name in os.listdir(jobs_dir)
    if name.startswith('job') and name.endswith('.sbatch')
    and name[len('job'):-len('.sbatch')].isdecimal()
)
# Bounds kept under their original names; an empty directory yields an empty range.
from_, to = (job_numbers[0], job_numbers[-1]) if job_numbers else (0, -1)

# Write a shell script with one `sbatch` submission per existing job file and
# make it executable for the owner (readable for the group).
schedule_file = os.path.join(scripts_dir, 'schedule_jobs.sh')
with open(schedule_file, 'w') as file:
    file.write('#!/bin/bash\n\n')
    # Iterate over the real job numbers so gaps in the numbering do not
    # produce submissions for files that do not exist.
    for k in job_numbers:
        file.write('sbatch '+jobs_dir+'/job'+str(k)+'.sbatch\n')
os.chmod(schedule_file, 0o740)

### Make Cancel File

In [16]:
# Numbers of the sbatch files that actually exist in jobs_dir (job<N>.sbatch -> N).
# Listing jobs_dir (not the cwd-relative 'sbatch') keeps this independent of the
# kernel's working directory; unrelated entries are ignored.
job_numbers = sorted(
    int(name[len('job'):-len('.sbatch')])
    for name in os.listdir(jobs_dir)
    if name.startswith('job') and name.endswith('.sbatch')
    and name[len('job'):-len('.sbatch')].isdecimal()
)
# Bounds kept under their original names; an empty directory yields an empty range.
from_, to = (job_numbers[0], job_numbers[-1]) if job_numbers else (0, -1)

# Write a shell script that cancels each job by name: sacct resolves the SLURM
# job id(s) recorded under the name "Job<N>" and scancel receives them.
cancel_file = os.path.join(scripts_dir, 'cancel_jobs.sh')
base_command = "scancel $(sacct -n -X --format jobid --name"
with open(cancel_file, 'w') as file:
    file.write('#!/bin/bash\n\n')
    # Only job numbers that have an sbatch file get a cancel line.
    for k in job_numbers:
        file.write(base_command+' Job'+str(k)+')\n')
os.chmod(cancel_file, 0o740)

---

### Launch Jobs

In [17]:
# Paths are rebuilt here rather than reused from the setup cell.
base_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/'
scripts_dir = os.path.join(base_dir, 'scripts')
schedule_file = os.path.join(base_dir, 'scripts', 'schedule_jobs.sh')
# Optional clean-up of old logs before submitting (disabled):
# os.system(f'rm -rf {logs_dir}/*')
# Run the generated submission script; the cell's value is its exit status.
os.system('bash ' + schedule_file)

Submitted batch job 31168322
Submitted batch job 31168323
Submitted batch job 31168324
Submitted batch job 31168325
Submitted batch job 31168326
Submitted batch job 31168327
Submitted batch job 31168328
Submitted batch job 31168329
Submitted batch job 31168330
Submitted batch job 31168331
Submitted batch job 31168332
Submitted batch job 31168333
Submitted batch job 31168334
Submitted batch job 31168335
Submitted batch job 31168336
Submitted batch job 31168337
Submitted batch job 31168338
Submitted batch job 31168339
Submitted batch job 31168340
Submitted batch job 31168341
Submitted batch job 31168342
Submitted batch job 31168343
Submitted batch job 31168344
Submitted batch job 31168345
Submitted batch job 31168346
Submitted batch job 31168347
Submitted batch job 31168348
Submitted batch job 31168349
Submitted batch job 31168350
Submitted batch job 31168351
Submitted batch job 31168352
Submitted batch job 31168353
Submitted batch job 31168354
Submitted batch job 31168355
Submitted batc

0

### Cancel Jobs

In [4]:
# Paths are rebuilt here rather than reused from the setup cell.
base_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/'
scripts_dir = os.path.join(base_dir, 'scripts')
cancel_file = os.path.join(base_dir, 'scripts', 'cancel_jobs.sh')
# Run the generated cancel script; the cell's value is its exit status.
os.system('bash ' + cancel_file)

0

---

### Get Running Jobs

In [41]:
# Ask squeue for the current user's jobs in the "running" state. The table is
# printed by the shell; the trailing 0 in the cell output is os.system's return value.
os.system('squeue -u $USER -t running')

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)


0

### Get Pending Jobs

In [40]:
# Ask squeue for the current user's jobs in the "pending" state. The table is
# printed by the shell; the trailing 0 in the cell output is os.system's return value.
os.system('squeue -u $USER -t pending')

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
          30721544 a100_1,a1   Job210  as14229 PD       0:00      1 (Priority)
          30721543 a100_1,a1   Job209  as14229 PD       0:00      1 (Priority)
          30721542 a100_1,a1   Job208  as14229 PD       0:00      1 (Priority)
          30721541 a100_1,a1   Job207  as14229 PD       0:00      1 (Priority)
          30721540 a100_1,a1   Job206  as14229 PD       0:00      1 (Priority)
          30721539 a100_1,a1   Job205  as14229 PD       0:00      1 (Priority)
          30721538 a100_1,a1   Job204  as14229 PD       0:00      1 (Priority)
          30721537 a100_1,a1   Job203  as14229 PD       0:00      1 (Priority)
          30721536 a100_1,a1   Job202  as14229 PD       0:00      1 (Priority)
          30721535 a100_1,a1   Job201  as14229 PD       0:00      1 (Priority)
          30721534 a100_1,a1   Job200  as14229 PD       0:00      1 (Priority)
          30721533 a100_1,a1   Job199  as14229

0

---

### Utils

In [6]:
# Full paths of the generated sbatch files.
# NOTE(review): this rebinds `jobs`, which earlier in the notebook holds the
# list of command strings.
path = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/sbatch'
jobs = [
    os.path.join(path, f)
    for f in os.listdir(path)
    # Keep regular *.sbatch files only: sub-directories (such as a "cancelled"
    # folder inside this directory) and stray files cannot be opened and
    # patched like job files by the cells that iterate over `jobs`.
    if f.endswith('.sbatch') and os.path.isfile(os.path.join(path, f))
]

# for job in jobs:


In [9]:
# Rewrite the partition directive of every sbatch file in `jobs`.
# The directive is located by its prefix instead of a fixed line index: with the
# header built in this notebook, index 6 is the --cpus-per-task line (partition is
# at index 5), so overwriting data[6] would drop the CPU request and leave the old
# partition line in place.
partition_prefix = '#SBATCH --partition='
new_partition_line = '#SBATCH --partition=a100_1,a100_2,rtx8000,v100\n'
for job in jobs:
    with open(job, 'r') as file:
        # read a list of lines into data
        data = file.readlines()
    partition_rows = [idx for idx, line in enumerate(data) if line.startswith(partition_prefix)]
    if not partition_rows:
        # Nothing to patch; leave the file untouched rather than clobbering another line.
        print(f'No partition directive found in {job}; file left unchanged')
        continue
    for idx in partition_rows:
        data[idx] = new_partition_line
    with open(job, 'w') as file:
        file.writelines(data)

In [13]:
# Move every sbatch file whose last line (the command) mentions WAVLM into mv_loc.
# NOTE(review): other cells read from .../jobs/cancelled (not .../jobs/sbatch/cancelled);
# confirm which location is intended.
mv_loc = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/sbatch/cancelled'
# Without an existing target directory, `mv file mv_loc` would rename each file
# to "cancelled", every move overwriting the previous one.
os.makedirs(mv_loc, exist_ok=True)
for job in jobs:
    # Skip anything that is not a regular file (e.g. mv_loc itself when `jobs`
    # was listed from its parent directory).
    if not os.path.isfile(job):
        continue
    with open(job, 'r') as file:
        # read a list of lines into data
        data = file.readlines()
    # `data and ...` guards against empty files, which have no last line.
    if data and 'WAVLM' in data[-1]:
        os.replace(job, os.path.join(mv_loc, os.path.basename(job)))

# python main.py -ll debug -em gpu_memory -nw 2 -cm DENSE  -d IEMOCAP -m WAV2VEC2_BASE -r Run1 -jn Job1

In [7]:
# Job numbers of the sbatch files parked in the "cancelled" folder (job<N>.sbatch -> N).
cancelled_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/cancelled'
jb = sorted(int(name.partition('.')[0][3:]) for name in os.listdir(cancelled_dir))
# Cancel each of them by name: sacct resolves the job id(s) for "Job<N>".
for j in jb:
    os.system('scancel $(sacct -n -X --format jobid --name Job' + str(j) + ')')

In [26]:
# Job numbers of the sbatch files parked in the "cancelled" folder (job<N>.sbatch -> N).
cancelled_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/cancelled'
jb = sorted(int(name.partition('.')[0][3:]) for name in os.listdir(cancelled_dir))
# Submit each of those files again, in ascending job-number order.
for j in jb:
    os.system('sbatch ' + cancelled_dir + '/job' + str(j) + '.sbatch')

Submitted batch job 30919018
Submitted batch job 30919019
Submitted batch job 30919020
Submitted batch job 30919021
Submitted batch job 30919022
Submitted batch job 30919023
Submitted batch job 30919024
Submitted batch job 30919025
Submitted batch job 30919026
Submitted batch job 30919027
Submitted batch job 30919028
Submitted batch job 30919029
Submitted batch job 30919030
Submitted batch job 30919031
Submitted batch job 30919032
Submitted batch job 30919033
Submitted batch job 30919034
Submitted batch job 30919035
Submitted batch job 30919036
Submitted batch job 30919037
Submitted batch job 30919038
Submitted batch job 30919039
Submitted batch job 30919040
Submitted batch job 30919041
Submitted batch job 30919042
Submitted batch job 30919043
Submitted batch job 30919044
Submitted batch job 30919045
Submitted batch job 30919046
Submitted batch job 30919047
Submitted batch job 30919048
Submitted batch job 30919049
Submitted batch job 30919050
Submitted batch job 30919051
Submitted batc

---

In [24]:
# One-off repair of job files in the "cancelled" folder whose command line
# contains "PROBING_DEN50".
mv_loc = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/cancelled'
job_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/sbatch'
for job in os.listdir(mv_loc):
    jobfile = os.path.join(mv_loc, job)
    with open(jobfile, 'r') as file:
        data = file.readlines()
    last_line = data[-1]
    if 'PROBING_DEN50' not in last_line:
        continue
    # Fixed offsets — assumes the command starts with
    # "python main.py -ll debug -em gpu_memory -nw 3 -cm " (TODO confirm for all files):
    #   [61:63] "50" of "PROBING_DEN50"       -> replaced by "SE"
    #   [63:67] " -e "                        -> kept
    #   [67:70] epoch value + following space -> replaced by "50"
    data[-1] = ''.join((last_line[:61], 'SE', last_line[63:67], '50', last_line[70:]))
    print(data[-1])
    with open(jobfile, 'w') as file:
        file.writelines(data)

python main.py -ll debug -em gpu_memory -nw 3 -cm PROBING_DENSE -e 50 -d RAVDESS -fm WAV2VEC2_LARGE_XLSR -r Run2 -jn Job188
python main.py -ll debug -em gpu_memory -nw 3 -cm PROBING_DENSE -e 50 -d CaFE -fm WAV2VEC2_LARGE_XLSR -r Run3 -jn Job191
python main.py -ll debug -em gpu_memory -nw 3 -cm PROBING_DENSE -e 50 -d IEMOCAP -fm WAV2VEC2_LARGE_XLSR -r Run3 -jn Job194
python main.py -ll debug -em gpu_memory -nw 3 -cm PROBING_DENSE -e 50 -d RAVDESS -fm WAV2VEC2_LARGE_XLSR -r Run4 -jn Job202
python main.py -ll debug -em gpu_memory -nw 3 -cm PROBING_DENSE -e 50 -d RAVDESS -fm WAV2VEC2_LARGE_XLSR -r Run1 -jn Job181
python main.py -ll debug -em gpu_memory -nw 3 -cm PROBING_DENSE -e 50 -d EMOVO -fm WAV2VEC2_LARGE_XLSR -r Run5 -jn Job207
python main.py -ll debug -em gpu_memory -nw 3 -cm PROBING_DENSE -e 50 -d CaFE -fm WAV2VEC2_LARGE_XLSR -r Run2 -jn Job184
python main.py -ll debug -em gpu_memory -nw 3 -cm PROBING_DENSE -e 50 -d ShEMO -fm WAV2VEC2_LARGE_XLSR -r Run1 -jn Job182
python main.py -ll

In [23]:
# Scratch check: a sample command line containing "PROBING_DEN50", used to find
# the slice offsets of the epoch value.
a = 'python main.py -ll debug -em gpu_memory -nw 3 -cm PROBING_DEN50 -e 30  -d CaFE -fm WAV2VEC2_LARGE_XLSR -r Run3 -jn Job191'
# Characters 67..69 of this sample are the epoch value plus the following space.
a[67:70]

'30 '

In [13]:
# Scratch check of the epoch patch on a sample "PROBING" command line:
# characters 61..62 hold the epoch value and are replaced by "50".
a = 'python main.py -ll debug -em gpu_memory -nw 3 -cm PROBING -e 30  -d EmoDB -fm WAV2VEC2_LARGE_XLSR -r Run2 -jn Job220'
a = ''.join((a[:61], '50', a[63:]))
print(a)

python main.py -ll debug -em gpu_memory -nw 3 -cm PROBING -e 50  -d EmoDB -fm WAV2VEC2_LARGE_XLSR -r Run2 -jn Job220
