In [5]:
import tensorflow as tf
import os
import argparse
from tensorflow.python.keras.callbacks import Callback



class MyFashionMnist(object):
    def train(self):
    
        parser = argparse.ArgumentParser()
        parser.add_argument('--learning_rate', required=False, type=float, default=0.001)
        parser.add_argument('--dropout_rate', required=False, type=float, default=0.3)
        parser.add_argument('--opt', required=False, type=int, default=1)    
        parser.add_argument('--checkpoint_dir', required=False, default='/reuslt/training_checkpoints')
        parser.add_argument('--saved_model_dir', required=False, default='/result/saved_model/001')        
        parser.add_argument('--tensorboard_log', required=False, default='/result/log')     
        args = parser.parse_args()    

        (x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
        x_train, x_test = x_train / 255.0, x_test / 255.0

        model = tf.keras.models.Sequential([
          tf.keras.layers.Flatten(input_shape=(28, 28)),
          tf.keras.layers.Dense(128, activation='relu'),
          tf.keras.layers.Dropout(args.dropout_rate),
          tf.keras.layers.Dense(10, activation='softmax')
        ])

        model.summary()

        sgd = tf.keras.optimizers.SGD(lr=args.learning_rate)
        adam = tf.keras.optimizers.Adam(lr=args.learning_rate)

        optimizers= [sgd, adam]
        model.compile(optimizer=optimizers[args.opt],
                      loss='sparse_categorical_crossentropy',
                      metrics=['acc'])

        # 체크포인트를 저장할 체크포인트 디렉터리를 지정
        checkpoint_dir = args.checkpoint_dir
        # 체크포인트 파일의 이름
        checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")        

        model.fit(x_train, y_train,
                  verbose=0,
                  validation_data=(x_test, y_test),
                  epochs=5,
                  callbacks=[KatibMetricLog(),
                            tf.keras.callbacks.TensorBoard(log_dir=args.tensorboard_log),
                            tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
                                   save_weights_only=True)
                            ])
        path = args.saved_model_dir        
        model.save(path, save_format='tf')
        print("######## model successfully saved ###########")

class KatibMetricLog(Callback):
    def on_batch_end(self, batch, logs={}):
        print("batch=" + str(batch),
              "accuracy=" + str(logs.get('acc')),
              "loss=" + str(logs.get('loss')))
    def on_epoch_begin(self, epoch, logs={}):
        print("epoch " + str(epoch) + ":")
    
    def on_epoch_end(self, epoch, logs={}):
        print("Validation-accuracy=" + str(logs.get('val_acc')),
              "Validation-loss=" + str(logs.get('val_loss')))
        return

if __name__ == '__main__':
    if os.getenv('FAIRING_RUNTIME', None) is None:
        from kubeflow import fairing
        from kubeflow.fairing.kubernetes import utils as k8s_utils

        DOCKER_REGISTRY = 'jaewoo201'
        fairing.config.set_builder(
            'append',
            base_image='jaewoo201/kubeflow-jupyter-lab:tf2.0-cpu',
            registry=DOCKER_REGISTRY, 
            push=True)
        # cpu 2, memory 5GiB
        fairing.config.set_deployer('job',
                                    namespace='kf-namespace',
                                    pod_spec_mutators=[
                                        k8s_utils.mounting_pvc(pvc_name="azurefile", 
                                                              pvc_mount_path="/result"),
                                        k8s_utils.get_resource_mutator(cpu=1,
                                                                       memory=3)]
         
                                   )
        fairing.config.run()
    else:
        remote_train = MyFashionMnist()
        remote_train.train()

[W 200811 05:54:29 utils:51] The function mounting_pvc has been deprecated,                     please use `volume_mounts`
[I 200811 05:54:29 config:134] Using preprocessor: <kubeflow.fairing.preprocessors.converted_notebook.ConvertNotebookPreprocessor object at 0x7f8db024e588>
[I 200811 05:54:29 config:136] Using builder: <kubeflow.fairing.builders.append.append.AppendBuilder object at 0x7f8d5051ea58>
[I 200811 05:54:29 config:138] Using deployer: <kubeflow.fairing.deployers.job.job.Job object at 0x7f8d5051e208>
[W 200811 05:54:29 append:50] Building image using Append builder...
[I 200811 05:54:29 base:107] Creating docker context: /tmp/fairing_context_epi9cx1m
[I 200811 05:54:29 converted_notebook:127] Converting fashion-mnist-save_model.ipynb to fashion-mnist-save_model.py
[I 200811 05:54:29 docker_creds_:234] Loading Docker credentials for repository 'jaewoo201/kubeflow-jupyter-lab:tf2.0-cpu'
[W 200811 05:54:31 append:54] Image successfully built in 1.4967336129629984s.
[W 200811 

2020-08-11 05:54:42.946295: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2020-08-11 05:54:42.946334: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
2020-08-11 05:54:46.281681: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shar

[W 200811 05:55:27 job:173] Cleaning up job fairing-job-ntzjm...
