# Distributed Training with TensorFlow and PyTorch

# Launch Distributed TensorFlow Training Job

In [1]:
!pygmentize ./distributed-training/distributed-tensorflow-job.yaml

[94mapiVersion[39;49;00m: [33m"[39;49;00m[33mkubeflow.org/v1[39;49;00m[33m"[39;49;00m
[94mkind[39;49;00m: [33m"[39;49;00m[33mTFJob[39;49;00m[33m"[39;49;00m
[94mmetadata[39;49;00m:
  [94mname[39;49;00m: [33m"[39;49;00m[33mdistributed-tensorflow-job[39;49;00m[33m"[39;49;00m
[94mspec[39;49;00m:
  [94mtfReplicaSpecs[39;49;00m:
    [94mPS[39;49;00m:
      [94mreplicas[39;49;00m: 1
      [94mrestartPolicy[39;49;00m: Never
      [94mtemplate[39;49;00m:
        [94mmetadata[39;49;00m:
          [94mannotations[39;49;00m:
            [94msidecar.istio.io/inject[39;49;00m: [33m"[39;49;00m[33mfalse[39;49;00m[33m"[39;49;00m
        [94mspec[39;49;00m:
          [94mcontainers[39;49;00m:
            - [94mname[39;49;00m: tensorflow
              [94mimage[39;49;00m: gcr.io/kubeflow-ci/tf-dist-mnist-test:1.0
    [94mWorker[39;49;00m:
      [94mreplicas[39;49;00m: 2
      [94mrestartPolicy[39;49;00m: Never
      [94m

In [2]:
!kubectl create -f distributed-training/distributed-tensorflow-job.yaml

tfjob.kubeflow.org/distributed-tensorflow-job created


# View All TensorFlow Jobs

In [3]:
!kubectl get tfjob

NAME                         STATE     AGE
distributed-tensorflow-job   Created   1s


# Check TensorFlow Job Status

In [4]:
!kubectl describe tfjob distributed-tensorflow-job

Name:         distributed-tensorflow-job
Namespace:    anonymous
Labels:       <none>
Annotations:  <none>
API Version:  kubeflow.org/v1
Kind:         TFJob
Metadata:
  Creation Timestamp:  2020-09-26T22:57:08Z
  Generation:          1
  Resource Version:    37587
  Self Link:           /apis/kubeflow.org/v1/namespaces/anonymous/tfjobs/distributed-tensorflow-job
  UID:                 2405305d-0f57-42ee-923d-febcd05747fa
Spec:
  Tf Replica Specs:
    PS:
      Replicas:        1
      Restart Policy:  Never
      Template:
        Metadata:
          Annotations:
            sidecar.istio.io/inject:  false
        Spec:
          Containers:
            Image:  gcr.io/kubeflow-ci/tf-dist-mnist-test:1.0
            Name:   tensorflow
    Worker:
      Replicas:        2
      Restart Policy:  Never
      Template:
        Metadata:
          Annotations:
            sidecar.istio.io/inject:  false
        Spec:
          Containers:
            Image:  

# Check Distributed TensorFlow Job Logs
_Note:  If you see an error in this cell, just wait a bit and re-run to see the logs._

In [5]:
!kubectl get pod | grep distributed-tensorflow-job

distributed-tensorflow-job-ps-0       0/1     ContainerCreating   0          1s
distributed-tensorflow-job-worker-0   0/1     ContainerCreating   0          2s
distributed-tensorflow-job-worker-1   0/1     ContainerCreating   0          2s


In [6]:
!kubectl logs distributed-tensorflow-job-worker-0

Error from server (BadRequest): container "tensorflow" in pod "distributed-tensorflow-job-worker-0" is waiting to start: ContainerCreating


# Launch Distributed PyTorch Job

In [7]:
!pygmentize ./distributed-training/distributed-pytorch-job.yaml

[94mapiVersion[39;49;00m: [33m"[39;49;00m[33mkubeflow.org/v1[39;49;00m[33m"[39;49;00m
[94mkind[39;49;00m: [33m"[39;49;00m[33mPyTorchJob[39;49;00m[33m"[39;49;00m
[94mmetadata[39;49;00m:
  [94mname[39;49;00m: [33m"[39;49;00m[33mdistributed-pytorch-job[39;49;00m[33m"[39;49;00m
[94mspec[39;49;00m:
  [94mpytorchReplicaSpecs[39;49;00m:
    [94mMaster[39;49;00m:
      [94mreplicas[39;49;00m: 1
      [94mrestartPolicy[39;49;00m: OnFailure
      [94mtemplate[39;49;00m:
        [94mmetadata[39;49;00m:
          [94mannotations[39;49;00m:
            [94msidecar.istio.io/inject[39;49;00m: [33m"[39;49;00m[33mfalse[39;49;00m[33m"[39;49;00m
        [94mspec[39;49;00m:
          [94mcontainers[39;49;00m:
            - [94mname[39;49;00m: pytorch
              [94mimage[39;49;00m: gcr.io/kubeflow-ci/pytorch-dist-mnist_test:1.0
              [94margs[39;49;00m: [[33m"[39;49;00m[33m--backend[39;49;00m[33m"[39;49;00m, [33

# Launch Distributed PyTorch Training Job

In [8]:
!kubectl apply -f ./distributed-training/distributed-pytorch-job.yaml

pytorchjob.kubeflow.org/distributed-pytorch-job created


In [9]:
!kubectl describe pytorchjob distributed-pytorch-job

Name:         distributed-pytorch-job
Namespace:    anonymous
Labels:       <none>
Annotations:  kubectl.kubernetes.io/last-applied-configuration:
                {"apiVersion":"kubeflow.org/v1","kind":"PyTorchJob","metadata":{"annotations":{},"name":"distributed-pytorch-job","namespace":"anonymous"}...
API Version:  kubeflow.org/v1
Kind:         PyTorchJob
Metadata:
  Creation Timestamp:  2020-09-26T22:57:13Z
  Generation:          1
  Resource Version:    37643
  Self Link:           /apis/kubeflow.org/v1/namespaces/anonymous/pytorchjobs/distributed-pytorch-job
  UID:                 f40f1bfd-a7e9-4514-94dd-2708a14b495b
Spec:
  Pytorch Replica Specs:
    Master:
      Replicas:        1
      Restart Policy:  OnFailure
      Template:
        Metadata:
          Annotations:
            sidecar.istio.io/inject:  false
        Spec:
          Containers:
            Args:
              --backend
              gloo
            Image:  gcr.io/kubeflow-ci/pytor

# Check Distributed PyTorch Training Logs
## _Note:  If you see an error below, just wait a bit and re-run.  You will eventually see the pod status change to `Running` or `Completed`._

In [10]:
!kubectl get pod | grep distributed-pytorch-job

distributed-pytorch-job-master-0      0/1     ContainerCreating   0          1s
distributed-pytorch-job-worker-0      0/1     Init:0/1            0          2s
distributed-pytorch-job-worker-1      0/1     Init:0/1            0          2s


# If You See an Error Below, Wait a Few Seconds and Re-Run It 

In [12]:
!kubectl logs distributed-pytorch-job-master-0

Using distributed PyTorch with gloo backend
