# training_job_cpu.yaml
---
# MPIJob custom resource (API group batch.test.bdap.com/v1) describing a
# Horovod run on the farawaya/horovod-torch-cpu image. It declares one
# launcher pod template and one worker pod template.
#
# NOTE(review): nesting below assumes both templates follow the Kubernetes
# PodTemplateSpec schema (restartPolicy on the pod spec, readinessProbe on the
# container) — confirm against the MPIJob CRD.
apiVersion: batch.test.bdap.com/v1
kind: MPIJob
metadata:
  name: simple-train-cpu
  namespace: sw-mpi-operator
spec:
  # NOTE(review): the launcher below starts horovodrun with `-np 2` while
  # numWorkers is 5 — confirm the difference is intentional.
  numWorkers: 5

  launcherTemplate:
    spec:
      containers:
        - args:
            # One shell command line handed to `/bin/sh -c`; the folded
            # scalar joins these lines with single spaces. It creates
            # MPI-Operator/sample-python-train relative to the container's
            # working directory, enters it, and runs horovodrun from there
            # using the hostfile named by $OMPI_MCA_orte_default_hostfile.
            # NOTE(review): main.py is not created by this command;
            # presumably it is executed on the workers, which clone the
            # repository into the same relative path — confirm.
            - >-
              mkdir MPI-Operator &&
              cd MPI-Operator &&
              mkdir sample-python-train &&
              cd sample-python-train &&
              horovodrun -np 2 --hostfile $OMPI_MCA_orte_default_hostfile python main.py
          command:
            - /bin/sh
            - -c
          image: farawaya/horovod-torch-cpu
          name: horovod-master
      restartPolicy: Never

  workerTemplate:
    spec:
      containers:
        - args:
            # One shell command line handed to `/bin/sh -c`: clone the
            # repository, install the sample's Python requirements, create
            # /ready.txt, then sleep forever so the container stays up.
            # Because the steps are chained with `&&`, /ready.txt is only
            # created if the clone and pip install both succeed.
            - >-
              git clone https://github.com/FFFFFaraway/MPI-Operator.git &&
              cd MPI-Operator &&
              cd sample-python-train &&
              pip install -r requirements.txt &&
              touch /ready.txt &&
              sleep infinity
          command:
            - /bin/sh
            - -c
          image: farawaya/horovod-torch-cpu
          name: horovod-worker
          # The worker reports ready once `cat /ready.txt` succeeds, i.e.
          # after the setup command above has created the marker file.
          readinessProbe:
            exec:
              command:
                - cat
                - /ready.txt
            initialDelaySeconds: 30
            periodSeconds: 5