# Kubeflow TFJob walkthrough

## MNIST training using TFJob
[source](https://github.com/kubeflow/tf-operator/blob/master/sdk/python/examples/kubeflow-tfjob-sdk.ipynb)

## Install all Kubeflow packages

In [None]:
!pip install kubeflow-fairing
!pip install kubeflow-tfjob
!pip install kubeflow-pytorchjob

## Import installed packages

In [None]:
from kubeflow import fairing

## Import Kubernetes specs and TFJob specs

In [7]:
from kubernetes.client import V1PodTemplateSpec
from kubernetes.client import V1ObjectMeta
from kubernetes.client import V1PodSpec
from kubernetes.client import V1Container

from kubeflow.tfjob import constants
from kubeflow.tfjob import utils
from kubeflow.tfjob import V1ReplicaSpec
from kubeflow.tfjob import V1TFJob
from kubeflow.tfjob import V1TFJobSpec
from kubeflow.tfjob import TFJobClient
from kubeflow.tfjob import V1TFJobList

In [2]:
# Namespace used for every TFJob call in this notebook.
namespace = "anonymous"

# Command line executed inside the training container: the MNIST script
# followed by its flags.
mnist_command = [
    "python",
    "/var/tf_mnist/mnist_with_summaries.py",
    "--log_dir=/train/logs",
    "--learning_rate=0.01",
    "--batch_size=150",
]

# One container definition, shared by the Worker, Chief and PS replica specs.
container = V1Container(
    name="tensorflow",
    image="gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0",
    command=mnist_command,
)

def _replica_spec(training_container, replicas, restart_policy="Never"):
    """Build the V1ReplicaSpec for one TFJob replica type.

    Args:
        training_container: V1Container placed in the pod template; it is the
            only container in the pod.
        replicas: Number of replicas requested for this replica type.
        restart_policy: Value passed through as the replica spec's
            ``restart_policy``; defaults to "Never".

    Returns:
        A V1ReplicaSpec whose pod template contains only ``training_container``.
    """
    pod_template = V1PodTemplateSpec(
        spec=V1PodSpec(containers=[training_container])
    )
    return V1ReplicaSpec(
        replicas=replicas,
        restart_policy=restart_policy,
        template=pod_template,
    )


# The three replica types differ only in their replica count.
worker = _replica_spec(container, replicas=2)
chief = _replica_spec(container, replicas=1)
ps = _replica_spec(container, replicas=1)

# Name and namespace under which the job is registered.
job_metadata = V1ObjectMeta(name="mnist", namespace=namespace)

# Replica specs keyed by replica type.
replica_specs = {
    "Worker": worker,
    "Chief": chief,
    "PS": ps,
}

# Note: clean_pod_policy is the string "None", not Python's None.
# NOTE(review): presumably this keeps the pods after the job ends so their
# logs stay readable — confirm against the TFJob spec.
job_spec = V1TFJobSpec(
    clean_pod_policy="None",
    tf_replica_specs=replica_specs,
)

# The complete TFJob custom resource handed to the client.
tfjob = V1TFJob(
    api_version="kubeflow.org/v1",
    kind="TFJob",
    metadata=job_metadata,
    spec=job_spec,
)

## Create TFJob

In [3]:
# Create the client used by all remaining cells.
# NOTE(review): constructed with no arguments, so it presumably picks up the
# default cluster configuration — confirm for your environment.
tfjob_client = TFJobClient()
# Submit the TFJob to the target namespace; the call is the last expression,
# so the created resource it returns is displayed as the cell output.
tfjob_client.create(tfjob, namespace=namespace)

{'apiVersion': 'kubeflow.org/v1',
 'kind': 'TFJob',
 'metadata': {'creationTimestamp': '2020-02-10T23:10:03Z',
  'generation': 1,
  'name': 'mnist',
  'namespace': 'anonymous',
  'resourceVersion': '39195',
  'selfLink': '/apis/kubeflow.org/v1/namespaces/anonymous/tfjobs/mnist',
  'uid': '7838f531-4c5a-11ea-b08f-0242ac110002'},
 'spec': {'cleanPodPolicy': 'None',
  'tfReplicaSpecs': {'Chief': {'replicas': 1,
    'restartPolicy': 'Never',
    'template': {'spec': {'containers': [{'command': ['python',
         '/var/tf_mnist/mnist_with_summaries.py',
         '--log_dir=/train/logs',
         '--learning_rate=0.01',
         '--batch_size=150'],
        'image': 'gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0',
        'name': 'tensorflow'}]}}},
   'PS': {'replicas': 1,
    'restartPolicy': 'Never',
    'template': {'spec': {'containers': [{'command': ['python',
         '/var/tf_mnist/mnist_with_summaries.py',
         '--log_dir=/train/logs',
         '--learning_rate=0.01',
         

## Get Job and wait for it to finish

In [4]:
# Wait on the 'mnist' job in the target namespace. With watch=True the recorded
# output shows NAME/STATE/TIME rows ending in "Succeeded".
# NOTE(review): presumably blocks until the job reaches a finished state — confirm.
tfjob_client.wait_for_job('mnist', namespace=namespace, watch=True)

NAME                           STATE                TIME                          
mnist                          Running              2020-02-10T23:10:05Z          
mnist                          Running              2020-02-10T23:10:05Z          
mnist                          Succeeded            2020-02-10T23:11:37Z          


## TFJob Logs

In [5]:
# Print pod logs for the 'mnist' job; the recorded output begins with the
# logs of pod mnist-chief-0.
tfjob_client.get_logs('mnist', namespace=namespace)

The logs of Pod mnist-chief-0:
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use urllib or similar directly.
Instructions for updating:
Please use tf.data to implement this functionality.
Instructions for updating:
Please use tf.data to implement this functionality.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
2020-02-10 23:10:08.373954: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Extracting /tmp/tensorflow/mnist/input_data/train-images-idx3-ubyte.gz
Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Extracting /tmp/tensorflow/mnist/input_data/train-labels-idx1-ubyte.gz
Successfully 

## Delete TFJob

In [6]:
# Delete the 'mnist' TFJob from the target namespace; the returned status
# object is displayed as the cell output.
tfjob_client.delete('mnist', namespace=namespace)

{'kind': 'Status',
 'apiVersion': 'v1',
 'metadata': {},
 'status': 'Success',
 'details': {'name': 'mnist',
  'group': 'kubeflow.org',
  'kind': 'tfjobs',
  'uid': '7838f531-4c5a-11ea-b08f-0242ac110002'}}

## List all TFJobs in the `anonymous` namespace