In [1]:
import tensorflow as tf
from tensorflow.python.keras.applications import InceptionV3
from tensorflow.keras.layers import Dense, Input, Activation, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import optimizers
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.python.keras.callbacks import Callback
from tensorflow.python.keras.utils import multi_gpu_model
import os
import argparse


class Caltech101(object):
    """Train a 101-class image classifier on a frozen InceptionV3 backbone.

    The backbone is loaded with ImageNet weights and every one of its layers
    is frozen; only the dense head added on top is trained. Images are read
    from ``/result/caltech101`` with an 80/20 training/validation split.
    """

    def run(self):
        """Parse hyperparameters from the command line, build the model and train it.

        Command-line arguments (all optional):
            --learning_rate  float, Adam learning rate (default 0.001)
            --dropout_rate   float, dropout rate used in the dense head (default 0.2)
            --batch_size     int, generator batch size (default 16)
            --epoch          int, number of training epochs (default 10)
            --act            str, activation used in the dense head (default 'relu')

        Progress is reported through the ``KatibMetricLog`` callback, which
        prints the metrics to stdout. Nothing is returned.
        """
        tf.compat.v1.disable_eager_execution()

        # Accept the hyperparameters as command-line arguments.
        parser = argparse.ArgumentParser()
        parser.add_argument('--learning_rate', required=False, type=float, default=0.001)
        parser.add_argument('--dropout_rate', required=False, type=float, default=0.2)
        parser.add_argument('--batch_size', required=False, type=int, default=16)
        parser.add_argument('--epoch', required=False, type=int, default=10)
        # relu, sigmoid, softmax, tanh
        parser.add_argument('--act', required=False, type=str, default='relu')
        args = parser.parse_args()

        # Single source of truth for the image size and the dataset location.
        input_image_size = (200, 200)
        data_dir = '/result/caltech101'
        batch_size = args.batch_size

        # Named `image_input` so the builtin `input` is not shadowed.
        image_input = Input(shape=input_image_size + (3,))
        base_model = InceptionV3(input_tensor=image_input, include_top=False,
                                 weights='imagenet', pooling='max')

        # Freeze the pretrained backbone; only the head below is trainable.
        for layer in base_model.layers:
            layer.trainable = False

        # Classification head: two Dense -> Dropout -> BatchNorm -> Activation
        # stages followed by a 101-way softmax.
        x = base_model.output
        x = Dense(1024, name='fully')(x)
        x = Dropout(args.dropout_rate)(x)
        x = BatchNormalization()(x)
        x = Activation(args.act)(x)
        x = Dense(512)(x)
        x = Dropout(args.dropout_rate)(x)
        x = BatchNormalization()(x)
        x = Activation(args.act)(x)
        x = Dense(101, activation='softmax', name='softmax')(x)
        model = Model(base_model.input, x)

        model.summary()

        # One generator object provides both subsets via `validation_split`.
        train_datagen = ImageDataGenerator(rescale=1. / 255, validation_split=0.2)

        train_generator = train_datagen.flow_from_directory(
            data_dir,
            target_size=input_image_size,
            batch_size=batch_size,
            class_mode='categorical',
            subset='training')

        validation_generator = train_datagen.flow_from_directory(
            data_dir,
            target_size=input_image_size,
            batch_size=batch_size,
            class_mode='categorical',
            subset='validation')

        # Replicate the model across 2 GPUs (the GPU count is fixed here).
        model = multi_gpu_model(model, gpus=2)
        model.compile(
            # `learning_rate` is the documented argument; `lr` is a deprecated alias.
            optimizer=tf.keras.optimizers.Adam(learning_rate=args.learning_rate),
            loss='categorical_crossentropy',
            metrics=['acc'])

        # NOTE(review): patience (20) is larger than the default --epoch (10),
        # so with default arguments early stopping cannot trigger.
        early_stopping = EarlyStopping(patience=20, mode='auto', monitor='val_acc')
        model.fit_generator(train_generator,
                            verbose=0,
                            steps_per_epoch=train_generator.samples // batch_size,
                            validation_data=validation_generator,
                            epochs=args.epoch,
                            callbacks=[early_stopping, KatibMetricLog()])
        
class KatibMetricLog(Callback):
    """Keras callback that prints training metrics to stdout as ``name=value`` pairs.

    NOTE(review): the output format is presumably what the Katib metrics
    collector parses -- confirm before changing any printed text.
    """

    def on_batch_end(self, batch, logs=None):
        """Print the batch index with the running accuracy and loss.

        Args:
            batch: index of the batch that just finished.
            logs: metric dict supplied by Keras; ``None`` is treated as empty.
        """
        # Default is None rather than a shared mutable dict.
        logs = logs or {}
        print("batch=" + str(batch),
              "accuracy=" + str(logs.get('acc')),
              "loss=" + str(logs.get('loss')))

    def on_epoch_begin(self, epoch, logs=None):
        """Print a header line for the epoch that is about to start.

        Args:
            epoch: index of the epoch.
            logs: unused; accepted to match the callback signature.
        """
        print("epoch " + str(epoch) + ":")

    def on_epoch_end(self, epoch, logs=None):
        """Print the validation accuracy and loss for the finished epoch.

        Args:
            epoch: index of the epoch that just finished (unused).
            logs: metric dict supplied by Keras; ``None`` is treated as empty.
        """
        logs = logs or {}
        print("Validation-accuracy=" + str(logs.get('val_acc')),
              "Validation-loss=" + str(logs.get('val_loss')))
    
if __name__ == '__main__':
    # Two modes, selected by the FAIRING_RUNTIME environment variable:
    #   set   -> run the training directly
    #            (presumably the case inside the fairing-built container);
    #   unset -> build an image with fairing and submit it as a Kubernetes job.
    if os.environ.get('FAIRING_RUNTIME') is not None:
        train = Caltech101()
        train.run()
    else:
        # Fairing is imported only on the submitting side.
        from kubeflow import fairing
        from kubeflow.fairing.kubernetes import utils as k8s_utils

        DOCKER_REGISTRY = 'kubeflow-registry.default.svc.cluster.local:30000'

        # Build with the 'append' builder on top of the TF 2.0 GPU base image
        # and push the result to the registry above.
        fairing.config.set_builder(
            'append',
            image_name='caltech-katib-job',
            base_image='brightfly/tf-fairing:2.0-gpu',
            registry=DOCKER_REGISTRY,
            push=True)

        # Deploy as a job in the 'dudaji' namespace with the 'caltech101' PVC
        # mounted at /result, which is where the training code reads its data.
        pvc_mutator = k8s_utils.mounting_pvc(pvc_name="caltech101",
                                             pvc_mount_path="/result")
        fairing.config.set_deployer('job',
                                    namespace='dudaji',
                                    pod_spec_mutators=[pvc_mutator])

        # When not running under IPython (plain python3), register this file
        # explicitly as the input of the 'python' preprocessor.
        import IPython
        ipy = IPython.get_ipython()
        if ipy is None:
            fairing.config.set_preprocessor('python', input_files=[__file__])

        fairing.config.run()
        


[I 200311 08:15:56 config:123] Using preprocessor: <kubeflow.fairing.preprocessors.converted_notebook.ConvertNotebookPreprocessor object at 0x7f9140ff88d0>
[I 200311 08:15:56 config:125] Using builder: <kubeflow.fairing.builders.append.append.AppendBuilder object at 0x7f90efe3c5c0>
[I 200311 08:15:56 config:127] Using deployer: <kubeflow.fairing.builders.append.append.AppendBuilder object at 0x7f90efe3c5c0>
[W 200311 08:15:56 append:50] Building image using Append builder...
[I 200311 08:15:56 base:105] Creating docker context: /tmp/fairing_context_pa15vx5y
[I 200311 08:15:56 converted_notebook:127] Converting caltech101_for_katib.ipynb to caltech101_for_katib.py
[I 200311 08:15:56 docker_creds_:234] Loading Docker credentials for repository 'brightfly/tf-fairing:2.0-gpu'
[W 200311 08:15:58 append:54] Image successfully built in 2.4560069650033256s.
[W 200311 08:15:58 append:94] Pushing image kubeflow-registry.default.svc.cluster.local:30000/caltech-katib-job:5D84029B...
[I 200311 08:1

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
2020-03-11 08:16:03.373317: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-03-11 08:16:03.500912: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-03-11 08:16:03.501681: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties:
name: Tesla T4 major: 7 minor: 5 memoryClockRate(GHz): 1.59
pciBusID: 0000:00:04.0
2020-03-11 08:16:03.501849: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-03-11 08:16:03.502428: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 1 with properties:
name: Tesla T4 majo

KeyboardInterrupt: 

In [2]:
!kubectl get job

NAME                COMPLETIONS   DURATION   AGE
fairing-job-hw4dz   0/1           84s        84s


In [3]:
!kubectl delete job fairing-job-hw4dz   

job.batch "fairing-job-hw4dz" deleted
