/
submit_resumed_train.sh
60 lines (53 loc) · 1.69 KB
/
submit_resumed_train.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/bin/sh
### LSF batch script: resumes a training run from an existing checkpoint.
### Lines starting with "#BSUB" are scheduler directives; lines starting
### with "##BSUB" are disabled directives (remove one "#" to enable them).
### General options
### -- specify queue --
#BSUB -q gpuv100
### -- set the job name --
#BSUB -J rtrain
### -- ask for number of cores (default: 1) --
#BSUB -n 8
### -- select the resources: 1 GPU in exclusive process mode --
#BSUB -gpu "num=1:mode=exclusive_process"
### -- set walltime limit: hh:mm -- maximum 24 hours for GPU queues right now
#BSUB -W 24:00
### -- specify system resources --
### span[hosts=1]: place all requested cores on a single host
#BSUB -R "span[hosts=1]"
### rusage[mem=32GB]: memory reservation; whether this is counted per core
### or per job depends on the cluster's LSF configuration -- TODO confirm
#BSUB -R "rusage[mem=32GB]"
### select[gpu32gb]: presumably restricts the job to hosts whose GPUs have
### 32 GB of memory -- TODO confirm against the cluster documentation
#BSUB -R "select[gpu32gb]"
### -- set the email address --
# Uncomment the following line and fill in your e-mail address
# if you want to receive e-mail notifications at a non-default address.
##BSUB -u your_email_address
### -- send notification at start --
##BSUB -B
### -- send notification at completion --
##BSUB -N
### -- specify the output and error files; %J is the job id --
### -- -o and -e append to the file, -oo and -eo overwrite it --
### NOTE(review): the batch_output/ directory presumably has to exist
### before the job is submitted -- confirm.
#BSUB -o batch_output/train_%J.out
#BSUB -e batch_output/train_%J.err
# -- end of LSF options --
# Set up the job environment. init.sh is not visible from this script; per
# the original comment it is expected to load the CUDA module.
# The POSIX "." builtin with an explicit ./ path is used because `source` is
# a bashism that is not guaranteed under #!/bin/sh, and a bare file name
# would be looked up via PATH rather than in the working directory.
. ./init.sh

# Write the GPU status and the exact code revision (last commit plus any
# uncommitted changes) to the job output for later reference.
nvidia-smi
git log -1 --no-color
git --no-pager diff -U1

# Id of the run to resume (presumably the LSF job id of the earlier training
# job); its checkpoints are read from CKPT_DIR.
RESUME_JOB_ID=9093176
CKPT_DIR=/work1/patmjen/HALOS/logs/net_train_${RESUME_JOB_ID}/ckpts

# Resolve the checkpoint to resume from: the most recently modified entry in
# CKPT_DIR. A glob is used instead of parsing `ls`, which would produce a
# newline-joined, invalid path as soon as the directory holds more than one
# entry and would silently yield the bare directory when it is empty.
CKPT_PATH=
for ckpt_candidate in "${CKPT_DIR}"/*; do
  # An unmatched glob stays literal; skip anything that does not exist.
  [ -e "${ckpt_candidate}" ] || continue
  if [ -z "${CKPT_PATH}" ] || [ "${ckpt_candidate}" -nt "${CKPT_PATH}" ]; then
    CKPT_PATH=${ckpt_candidate}
  fi
done

# Abort early rather than starting a long GPU job without a checkpoint.
if [ -z "${CKPT_PATH}" ]; then
  echo "error: no checkpoint found in ${CKPT_DIR}" >&2
  exit 1
fi
# Launch the resumed training run.
# The options before `vnet` are passed first; `vnet` and the options after it
# presumably select and configure the model sub-command of run_training.py
# (confirm against run_training.py).
# LSB_JOBID is the id of the current job, so each resumed run logs under its
# own experiment name; CKPT_PATH is the checkpoint resolved earlier in this
# script.
# NOTE(review): --min_lr equals --lr, so a learning-rate reduction by
# --lr_reduce_factor presumably has no room to take effect -- confirm this is
# intended.
# The final argument deliberately has no trailing backslash, so that nothing
# appended after this command is swallowed into its argument list.
python run_training.py \
  --experiment_name="net_train_resumed_${LSB_JOBID}" \
  --logger_save_dir=/work1/patmjen/HALOS/logs/ \
  --max_epochs=10000 \
  --progress_bar_refresh_rate=0 \
  --checkpoint_path="${CKPT_PATH}" \
  vnet \
  --lr=1e-3 \
  --num_loader_workers=0 \
  --data_dir=/work1/patmjen/HALOS/data/big_sparse/ \
  --samples_per_volume=256 \
  --batch_size=12 \
  --crop_size=96 \
  --normalization="b" \
  --min_lr=1e-3 \
  --lr_reduce_factor=0.8