# Job Scheduler

In [42]:
import os

### Setup Directories

In [44]:
# Root folder that holds everything this notebook generates
# (sbatch files, per-batch log folders and helper shell scripts).
base_dir = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/jobs/'

jobs_dir = os.path.join(base_dir,'sbatch')
logs_base_dir = os.path.join(base_dir,'logs')
scripts_dir = os.path.join(base_dir,'scripts')

# The logs root must exist before it can be listed; on a fresh checkout it
# does not, and os.listdir would raise FileNotFoundError.
os.makedirs(logs_base_dir,exist_ok=True)

# Collect the numeric suffix of every existing entry (e.g. 'batch_7' -> 7).
# Entries whose last '_'-separated token is not an integer (stray files,
# editor droppings, ...) are skipped instead of crashing the cell.
existing_batch_numbers = []
for entry in os.listdir(logs_base_dir):
    try:
        existing_batch_numbers.append(int(entry.split('_')[-1]))
    except ValueError:
        continue

# default=0 makes the very first run produce 'batch_1' instead of raising
# ValueError on an empty sequence.
batch_number = max(existing_batch_numbers, default=0)

# Every execution of this cell allocates (and creates) the next batch folder,
# so re-running the cell advances the batch number again.
logs_dir = os.path.join(logs_base_dir,f'batch_{batch_number+1}')

os.makedirs(jobs_dir,exist_ok=True)
os.makedirs(logs_dir,exist_ok=True)
os.makedirs(scripts_dir,exist_ok=True)

### Delete Old Jobs

In [45]:
# if os.path.exists(jobs_dir): 
#     os.system(f'rm -rf {jobs_dir}/*')
#     os.system(f'rm -rf {logs_dir}/*')

### Setup Main Command

In [46]:
# Building blocks for the generated .sbatch files: a fixed resource header,
# two per-job directive prefixes, an environment preamble and the base command.
time_per_job = "3:00:00"        # Value substituted into the --time directive
memory_required = 128           # RAM required in GB (substituted into --mem)

# Header shared by every sbatch file. The trailing backslashes continue the
# string literal across source lines without adding characters; only the
# explicit \n sequences (and the spaces before them) end up in the output.
sbatch_header = f"#!/bin/bash\n\
\n\
#SBATCH --nodes=1               \n\
#SBATCH --ntasks-per-node=1     \n\
#SBATCH --cpus-per-task=4       \n\
#SBATCH --time={time_per_job}          \n\
#SBATCH --mem={memory_required}GB              \n\
#SBATCH --gres=gpu:1       \n"

# Prefixes only: the sbatch-writing cell appends the job index (and '.log'
# for the output path) to complete each directive.
job_name_directive =  "#SBATCH --job-name=Job"
output_file_directive = "#SBATCH --output="+logs_dir+'/job'

# Shell preamble written after the directives: reset modules, activate the
# conda environment and change into the project directory.
command_header = "\nmodule purge\n\
source ~/.bashrc\n\
conda activate NLP_Nightly\n\
cd /home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/\n\n"

# Main command; the per-job -d/-m/-r flags are appended when the job list is built
command = "python main.py -ll debug -em disk"

### Get all Jobs

In [47]:
# Experiment grid: one job is generated for every (run, dataset, model) triple.
runs = ['Run1', 'Run2', 'Run3', 'Run4', 'Run5']

# Other datasets seen in this notebook: 'CaFE','EmoDB','EMOVO','IEMOCAP','RAVDESS','ShEMO'
datasets = ['AESDD']

models = [
    'WAV2VEC2_BASE',
    'WAV2VEC2_LARGE',
    'WAV2VEC2_BASE_XLSR',
    'WAV2VEC2_LARGE_XLSR',
    'HUBERT_BASE',
    'HUBERT_LARGE',
    'GE2E',
]

# Each entry is one complete shell line (newline included), ordered with the
# run varying slowest and the model varying fastest.
jobs = [
    f'{command} -d {dataset} -m {model} -r {run}\n'
    for run in runs
    for dataset in datasets
    for model in models
]

### Make SBATCH Files

In [48]:
jobs_per_gpu = 1                # Number of jobs per GPU

# Write one sbatch file per chunk of `jobs_per_gpu` consecutive jobs.
# `i` (1-based index of the last file written) is read by the schedule and
# cancel cells below, so it is initialised to 0 to stay defined when `jobs`
# is empty.
i = 0
for i,j in enumerate(range(0,len(jobs),jobs_per_gpu),1):
    with open(os.path.join(jobs_dir,'job'+str(i)+'.sbatch'),'w') as file:
        file.write(sbatch_header)
        file.write(job_name_directive+str(i)+'\n')
        file.write(output_file_directive+str(i)+'.log\n')
        file.write(command_header)
        # Slicing clamps at the end of the list, so the final chunk may be
        # shorter when len(jobs) is not a multiple of jobs_per_gpu
        # (indexing jobs[k] directly raised IndexError in that case).
        file.writelines(jobs[j:j+jobs_per_gpu])


### Make Schedule File

In [49]:
# Generate a shell script that submits every sbatch file written above.
# `i` is the index of the last sbatch file, left over from the previous cell.
schedule_file = os.path.join(scripts_dir,'schedule_jobs.sh')
submit_lines = [f'sbatch {jobs_dir}/job{n}.sbatch\n' for n in range(1, i + 1)]

with open(schedule_file, 'w') as script:
    script.write('#!/bin/bash\n\n')
    script.writelines(submit_lines)

# rwx for the owner, read-only for the group, nothing for others
os.chmod(schedule_file, 0o740)

### Make Cancel File

In [50]:
# Generate a shell script with one scancel line per job name (Job1..Job<i>);
# each line looks the job ids up by name through sacct.
cancel_file = os.path.join(scripts_dir,'cancel_jobs.sh')
base_command = "scancel $(sacct -n -X --format jobid --name"
cancel_lines = [f'{base_command} Job{n})\n' for n in range(1, i + 1)]

with open(cancel_file, 'w') as script:
    script.write('#!/bin/bash\n\n')
    script.writelines(cancel_lines)

# rwx for the owner, read-only for the group, nothing for others
os.chmod(cancel_file, 0o740)

### Launch Jobs

In [51]:
# Run the generated schedule script; the cell output ends with the exit
# status returned by os.system.
launch_command = 'bash ' + schedule_file
os.system(launch_command)

Submitted batch job 30567799
Submitted batch job 30567800
Submitted batch job 30567801
Submitted batch job 30567802
Submitted batch job 30567803


0

### Cancel Jobs

In [24]:
# Run the generated cancel script; the cell output ends with the exit
# status returned by os.system.
cancel_command = 'bash ' + cancel_file
os.system(cancel_command)

0

### Get Running Jobs

In [None]:
# List this user's jobs that are currently in the running state.
running_query = 'squeue -u $USER -t running'
os.system(running_query)

### Get Pending Jobs

In [33]:
# List this user's jobs that are still waiting in the queue.
pending_query = 'squeue -u $USER -t pending'
os.system(pending_query)

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
          30565575 rtx8000,v    Job49  as14229 PD       0:00      1 (Priority)
          30565574 rtx8000,v    Job48  as14229 PD       0:00      1 (Priority)
          30565573 rtx8000,v    Job47  as14229 PD       0:00      1 (Priority)
          30565572 rtx8000,v    Job46  as14229 PD       0:00      1 (Priority)
          30565571 rtx8000,v    Job45  as14229 PD       0:00      1 (Priority)
          30565570 rtx8000,v    Job44  as14229 PD       0:00      1 (Priority)
          30565569 rtx8000,v    Job43  as14229 PD       0:00      1 (Priority)
          30565568 rtx8000,v    Job42  as14229 PD       0:00      1 (Priority)
          30565567 rtx8000,v    Job41  as14229 PD       0:00      1 (Priority)
          30565566 rtx8000,v    Job40  as14229 PD       0:00      1 (Priority)
          30565565 rtx8000,v    Job39  as14229 PD       0:00      1 (Priority)
          30565564 rtx8000,v    Job38  as14229

0

In [34]:
import torch

# Scratch cell: load one saved training history for manual inspection.
# NOTE(review): torch.load unpickles the file, so only point it at trusted files.
history_path = '/home/as14229/NYU_HPC/Multilingual-Speech-Emotion-Recognition-System/history/v1.1/CaFE/GE2E/Run1/history.pt'
hs = torch.load(history_path)
# NOTE(review): prints only an empty line; presumably a leftover placeholder — confirm.
print()


