In [None]:
def trainOp(model_relative_path: str = 'model', model_name: str = 'resnet_graphdef', epochs: int = 50, networks: int = 3):
    ### trainOp serves as the main entry point (like the main function in most programming languages).
    ### It takes several parameters that configure the path of the saved/generated model files.
    
    ### model_relative_path: By default a persistent volume is mounted at `/home/jovyan` (called the application root here)
    ### to keep your state (i.e. generated files) in place.
    ### By default, we want to keep the generated model in `model`, relative to the application root.
    ### However, in a background job or during hyperparameter tuning, this application root may CHANGE.
    ### So, in order to preserve the model, we use `model_relative_path` to stay independent of the varying application root.
    
    ### model_name: a folder with this name is created to hold all generated model files.
    ### In a real scenario, the structure of the model files depends on the inference infrastructure.
    ### In our integration, we require the model to be structured as below:
    ### model/
    ###        $model_name/
    ###                    config.pbtxt
    ###                    labels.txt
    ###                    $version/
    ###                             model.savedmodel/
    ###                                              saved_model.pb
    
    import json
    import sys
    import os
    import pathlib


    # Application root; every working directory below is resolved against it.
    home = '/home/jovyan'

    prep_data_dir = os.path.join(home, "dataprep")
    output_model_dir = os.path.join(home, model_relative_path)
    temp_data_dir = os.path.join(home, ".temp")
    # Version folder of the exported model; falls back to '1' when the variable is unset.
    model_version = os.getenv('TINTIN_SESSION_TEMPLATE_MODEL_VERSION', '1')

    print("outputdir:", prep_data_dir)
    # Make sure every working directory exists before anything reads or writes it.
    for required_dir in (prep_data_dir, output_model_dir, temp_data_dir):
        pathlib.Path(required_dir).mkdir(parents=True, exist_ok=True)
    
    
    # transform_data is also used by online serving to transform data during preprocessing,
    # which is why it is registered next to the model through save_function_to_model.
    from tintin.online_serving import save_function_to_model, transform_data_file_name

    @save_function_to_model(model_path=output_model_dir, file_name=transform_data_file_name)
    def transform_data(input_data):
        # Cast to float32, divide by 255, then shift by -0.5.
        scaled = input_data.astype('float32') / 255
        scaled -= 0.5
        return scaled
    
    
    def load_data():
        """Load CIFAR-10 through Keras and store the four arrays as .npy files in prep_data_dir."""
        import numpy as np
        from tensorflow.keras.datasets import cifar10

        (x_train, y_train), (x_test, y_test) = cifar10.load_data()

        # Saved in the same order as before: train images/labels, then test images/labels.
        arrays_by_file = (
            ('x_train.npy', x_train),
            ('y_train.npy', y_train),
            ('x_test.npy', x_test),
            ('y_test.npy', y_test),
        )
        for file_name, array in arrays_by_file:
            np.save(os.path.join(prep_data_dir, file_name), array)

        
    def train(epochs=50, networks=3):
        
        import os
        import shutil
        import argparse
        import numpy as np

        from tensorflow.python.saved_model import builder as saved_model_builder
        from tensorflow.python.saved_model.signature_def_utils import predict_signature_def
        from tensorflow.python.saved_model import tag_constants
        from tensorflow.python.saved_model import signature_constants

        from tensorflow.keras.regularizers import l2
        from tensorflow.keras.models import Model
        from tensorflow.keras.optimizers import Adam
        from tensorflow.keras.models import load_model
        from tensorflow.keras.models import save_model
        from tensorflow.keras.layers import Dense, Conv2D
        from tensorflow.keras.layers import BatchNormalization, Activation
        from tensorflow.keras.layers import AveragePooling2D, Input, Flatten
        from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
        from tensorflow.keras.callbacks import ReduceLROnPlateau
        from tensorflow.keras.preprocessing.image import ImageDataGenerator
        from tensorflow.keras import utils
        from tensorflow.keras import layers
        from tensorflow.keras import callbacks as keras_callbacks
        #import tensorflow.python.compiler.tensorrt as trt

        CONT_TRTIS_RESOURCE_DIR = 'trtis_resource'

        # Copy TRTIS resource (containing config.pbtxt, labels.txt, ...) from container to mounted volume
        model_dir = os.path.join(output_model_dir, model_name)
        if model_version == '1': # if it is default version, we always clear it to keep the space clean
            model_verison_dir = os.path.join(output_model_dir, model_name, model_version)
            if os.path.isdir(model_verison_dir):
                shutil.rmtree(model_verison_dir)
        configdotpbtxt = """name: "resnet_graphdef"
platform: "tensorflow_savedmodel"
max_batch_size: 128


input [
  {
    name: "input_1"
    data_type: TYPE_FP32
    format: FORMAT_NHWC
    dims: [ 32, 32, 3 ]
  }
]

output [
  {
    name: "dense"
    data_type: TYPE_FP32
    dims: [ 10 ]
    label_filename: "labels.txt"
  }
]

version_policy: { all { }}
"""
        labeldottxt = """airplane
automobile
bird
cat
deer
dog
frog
horse
ship
truck"""

        
        pathlib.Path(model_dir).mkdir(parents=True, exist_ok=True)
        with open(os.path.join(model_dir, 'config.pbtxt'), 'w') as f:
            f.write(configdotpbtxt)
        with open(os.path.join(model_dir, 'labels.txt'), 'w') as f:
            f.write(labeldottxt)

        # Training parameters
        batch_size = 128  # the original paper trained all networks with batch_size=128
        epochs = int(epochs)
        data_augmentation = True
        num_classes = 10

        # `networks` is the model parameter n from which the depth is derived.
        n = int(networks)
        # Model version: 1 = ResNet v1 (original paper), 2 = ResNet v2 (improved ResNet)
        version = 2

        # Depth computed from n: 6n+2 for v1, 9n+2 for v2
        layers_per_unit = 6 if version == 1 else 9
        depth = n * layers_per_unit + 2

        # Model name, depth and version
        model_type = 'ResNet%dv%d' % (depth, version)

        # Load the CIFAR10 arrays that load_data() stored in prep_data_dir.
        x_train, y_train, x_test, y_test = [
            np.load(os.path.join(prep_data_dir, file_name))
            for file_name in ("x_train.npy", "y_train.npy", "x_test.npy", "y_test.npy")
        ]

        # Input image dimensions (everything after the sample axis).
        input_shape = x_train.shape[1:]

        # Normalize both splits with the same function that online serving uses.
        x_train = transform_data(x_train)
        x_test = transform_data(x_test)

        print('x_train shape:', x_train.shape)
        print(x_train.shape[0], 'train samples')
        print(x_test.shape[0], 'test samples')
        print('y_train shape:', y_train.shape)

        # Convert class vectors to binary class matrices.
        y_train = utils.to_categorical(y_train, num_classes)
        y_test = utils.to_categorical(y_test, num_classes)

        def lr_schedule(epoch):
            """Return the learning rate to use for the given epoch.

            The base rate of 1e-3 is scaled down once the epoch index is past
            80, 120, 160 and 180. The chosen rate is printed on every call.

            # Arguments
                epoch (int): The number of epochs

            # Returns
                lr (float32): learning rate
            """
            base_lr = 1e-3
            # (epoch threshold, scale factor), highest threshold first: the first match wins.
            decay_steps = ((180, 0.5e-3), (160, 1e-3), (120, 1e-2), (80, 1e-1))

            lr = base_lr
            for threshold, factor in decay_steps:
                if epoch > threshold:
                    lr = base_lr * factor
                    break
            print('Learning rate: ', lr)
            return lr

        def resnet_layer(inputs,
                         num_filters=16,
                         kernel_size=3,
                         strides=1,
                         activation='relu',
                         batch_normalization=True,
                         conv_first=True):
            """Build one Conv2D / BatchNormalization / Activation stack.

            # Arguments
                inputs (tensor): input tensor from input image or previous layer
                num_filters (int): Conv2D number of filters
                kernel_size (int): Conv2D square kernel dimensions
                strides (int): Conv2D square stride dimensions
                activation (string): activation name, or None to skip the activation
                batch_normalization (bool): whether to include batch normalization
                conv_first (bool): conv-bn-activation (True) or
                    bn-activation-conv (False)

            # Returns
                x (tensor): tensor as input to the next layer
            """
            # The convolution is created up front, whichever side of BN/activation it ends up on.
            conv = Conv2D(num_filters,
                          kernel_size=kernel_size,
                          strides=strides,
                          padding='same',
                          kernel_initializer='he_normal',
                          kernel_regularizer=l2(1e-4))

            def normalize_and_activate(tensor):
                # Optional BN followed by the optional activation.
                if batch_normalization:
                    tensor = BatchNormalization()(tensor)
                if activation is not None:
                    tensor = Activation(activation)(tensor)
                return tensor

            if conv_first:
                return normalize_and_activate(conv(inputs))
            return conv(normalize_and_activate(inputs))

        def resnet_v1(input_shape, depth, num_classes=10):
            """ResNet Version 1 Model builder [a]

            Stacks of 2 x (3 x 3) Conv2D-BN-ReLU
            Last ReLU is after the shortcut connection.
            At the beginning of each stage, the feature map size is halved (downsampled)
            by a convolutional layer with strides=2, while the number of filters is
            doubled. Within each stage, the layers have the same number of filters and
            the same feature map sizes.
            Features maps sizes:
            stage 0: 32x32, 16
            stage 1: 16x16, 32
            stage 2:  8x8,  64
            The Number of parameters is approx the same as Table 6 of [a]:
            ResNet20 0.27M
            ResNet32 0.46M
            ResNet44 0.66M
            ResNet56 0.85M
            ResNet110 1.7M

            # Arguments
                input_shape (tensor): shape of input image tensor
                depth (int): number of core convolutional layers
                num_classes (int): number of classes (CIFAR10 has 10)

            # Returns
                model (Model): Keras model instance

            # Raises
                ValueError: if depth is not of the form 6n+2
            """
            if (depth - 2) % 6 != 0:
                raise ValueError('depth should be 6n+2 (eg 20, 32, 44 in [a])')

            # Start model definition.
            num_filters = 16
            num_res_blocks = (depth - 2) // 6

            inputs = Input(shape=input_shape)
            x = resnet_layer(inputs=inputs)

            # Instantiate the stack of residual units
            for stack in range(3):
                for res_block in range(num_res_blocks):
                    # Only the first block of the second and third stack downsamples.
                    downsample = stack > 0 and res_block == 0
                    strides = 2 if downsample else 1

                    y = resnet_layer(inputs=x,
                                     num_filters=num_filters,
                                     strides=strides)
                    y = resnet_layer(inputs=y,
                                     num_filters=num_filters,
                                     activation=None)
                    if downsample:
                        # linear projection residual shortcut connection to match
                        # changed dims
                        x = resnet_layer(inputs=x,
                                         num_filters=num_filters,
                                         kernel_size=1,
                                         strides=strides,
                                         activation=None,
                                         batch_normalization=False)
                    x = Activation('relu')(layers.add([x, y]))
                num_filters *= 2

            # Add classifier on top.
            # v1 does not use BN after last shortcut connection-ReLU
            pooled = AveragePooling2D(pool_size=8)(x)
            flattened = Flatten()(pooled)
            outputs = Dense(num_classes,
                            activation='softmax',
                            kernel_initializer='he_normal')(flattened)

            # Instantiate model.
            return Model(inputs=inputs, outputs=outputs)

        
        class LossAndErrorPrintingCallback(keras_callbacks.Callback):
            """Print the epoch index and the train/validation accuracy and loss after each epoch."""

            # (output template, key in `logs`) in the order the lines are printed.
            _REPORT_LINES = (
                ("Training-Accuracy={:7.6f}", "accuracy"),
                ("Training-Loss={:7.6f}", "loss"),
                ("Validation-Accuracy={:7.6f}", "val_accuracy"),
                ("Validation-Loss={:7.6f}", "val_loss"),
            )

            def on_epoch_end(self, epoch, logs=None):
                print("epoch={}".format(epoch))
                for template, key in self._REPORT_LINES:
                    print(template.format(logs[key]))
        
        def resnet_v2(input_shape, depth, num_classes=10):
            """ResNet Version 2 Model builder [b]

            Stacks of (1 x 1)-(3 x 3)-(1 x 1) BN-ReLU-Conv2D or also known as
            bottleneck layer
            First shortcut connection per layer is 1 x 1 Conv2D.
            Second and onwards shortcut connection is identity.
            At the beginning of each stage, the feature map size is halved (downsampled)
            by a convolutional layer with strides=2, while the number of filter maps is
            doubled. Within each stage, the layers have the same number filters and the
            same filter map sizes.
            Features maps sizes:
            conv1  : 32x32,  16
            stage 0: 32x32,  64
            stage 1: 16x16, 128
            stage 2:  8x8,  256

            # Arguments
                input_shape (tensor): shape of input image tensor
                depth (int): number of core convolutional layers
                num_classes (int): number of classes (CIFAR10 has 10)

            # Returns
                model (Model): Keras model instance

            # Raises
                ValueError: if depth is not of the form 9n+2
            """
            if (depth - 2) % 9 != 0:
                raise ValueError('depth should be 9n+2 (eg 56 or 110 in [b])')

            # Start model definition.
            num_filters_in = 16
            num_res_blocks = (depth - 2) // 9

            inputs = Input(shape=input_shape)
            # v2 performs Conv2D with BN-ReLU on input before splitting into 2 paths
            x = resnet_layer(inputs=inputs,
                             num_filters=num_filters_in,
                             conv_first=True)

            # Instantiate the stack of residual units
            for stage in range(3):
                first_stage = stage == 0
                for res_block in range(num_res_blocks):
                    first_block = res_block == 0

                    # The first stage widens the filters x4, later stages x2.
                    num_filters_out = num_filters_in * (4 if first_stage else 2)
                    # The very first unit skips BN/activation ahead of its first conv.
                    skip_pre_activation = first_stage and first_block
                    activation = None if skip_pre_activation else 'relu'
                    batch_normalization = not skip_pre_activation
                    # The first unit of every later stage downsamples.
                    strides = 2 if (first_block and not first_stage) else 1

                    # bottleneck residual unit
                    y = resnet_layer(inputs=x,
                                     num_filters=num_filters_in,
                                     kernel_size=1,
                                     strides=strides,
                                     activation=activation,
                                     batch_normalization=batch_normalization,
                                     conv_first=False)
                    y = resnet_layer(inputs=y,
                                     num_filters=num_filters_in,
                                     conv_first=False)
                    y = resnet_layer(inputs=y,
                                     num_filters=num_filters_out,
                                     kernel_size=1,
                                     conv_first=False)
                    if first_block:
                        # linear projection residual shortcut connection to match
                        # changed dims
                        x = resnet_layer(inputs=x,
                                         num_filters=num_filters_out,
                                         kernel_size=1,
                                         strides=strides,
                                         activation=None,
                                         batch_normalization=False)
                    x = layers.add([x, y])

                num_filters_in = num_filters_out

            # Add classifier on top.
            # v2 has BN-ReLU before Pooling
            x = BatchNormalization()(x)
            x = Activation('relu')(x)
            pooled = AveragePooling2D(pool_size=8)(x)
            flattened = Flatten()(pooled)
            outputs = Dense(num_classes,
                            activation='softmax',
                            kernel_initializer='he_normal')(flattened)

            # Instantiate model.
            return Model(inputs=inputs, outputs=outputs)

        # Build the network that matches the selected ResNet version.
        if version == 2:
            model = resnet_v2(input_shape=input_shape, depth=depth)
        else:
            model = resnet_v1(input_shape=input_shape, depth=depth)

        # `learning_rate` is the supported argument name; `lr` is only a deprecated alias.
        # The initial rate is the schedule's value for epoch 0.
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(learning_rate=lr_schedule(0)),
                      metrics=['accuracy'])
        model.summary()
        print(model_type)

        # Prepare the directory and file name for the model checkpoints.
        save_dir = os.path.join(home, 'saved_models')
        os.makedirs(save_dir, exist_ok=True)
        filepath = os.path.join(save_dir, 'cifar10_%s_model.%03d.h5' % (model_type, epochs))

        # Callbacks for model saving, per-epoch reporting and learning rate adjustment.
        callbacks = [
            # Keep only the checkpoint with the best validation accuracy.
            ModelCheckpoint(filepath=filepath,
                            monitor='val_accuracy',
                            verbose=1,
                            save_best_only=True),
            LossAndErrorPrintingCallback(),
            ReduceLROnPlateau(factor=np.sqrt(0.1),
                              cooldown=0,
                              patience=5,
                              min_lr=0.5e-6),
            LearningRateScheduler(lr_schedule),
        ]

        # Run training, with or without data augmentation.
        if not data_augmentation:
            print('Not using data augmentation.')
            model.fit(x_train, y_train,
                      batch_size=batch_size,
                      epochs=epochs,
                      validation_data=(x_test, y_test),
                      shuffle=True,
                      verbose=2,
                      callbacks=callbacks)
        else:
            print('Using real-time data augmentation.')
            # This will do preprocessing and realtime data augmentation:
            datagen = ImageDataGenerator(
                # set input mean to 0 over the dataset
                featurewise_center=False,
                # set each sample mean to 0
                samplewise_center=False,
                # divide inputs by std of dataset
                featurewise_std_normalization=False,
                # divide each input by its std
                samplewise_std_normalization=False,
                # apply ZCA whitening
                zca_whitening=False,
                # epsilon for ZCA whitening
                zca_epsilon=1e-06,
                # randomly rotate images in the range (deg 0 to 180)
                rotation_range=0,
                # randomly shift images horizontally
                width_shift_range=0.1,
                # randomly shift images vertically
                height_shift_range=0.1,
                # set range for random shear
                shear_range=0.,
                # set range for random zoom
                zoom_range=0.,
                # set range for random channel shifts
                channel_shift_range=0.,
                # set mode for filling points outside the input boundaries
                fill_mode='nearest',
                # value used for fill_mode = "constant"
                cval=0.,
                # randomly flip images
                horizontal_flip=True,
                # randomly flip images
                vertical_flip=False,
                # set rescaling factor (applied before any other transformation)
                rescale=None,
                # set function that will be applied on each input
                preprocessing_function=None,
                # image data format, either "channels_first" or "channels_last"
                data_format=None,
                # fraction of images reserved for validation (strictly between 0 and 1)
                validation_split=0.0)

            # Compute quantities required for featurewise normalization
            # (std, mean, and principal components if ZCA whitening is applied).
            datagen.fit(x_train)

            # steps_per_epoch has to be a whole number of batches; rounding up keeps the
            # last, partially filled batch so that one epoch is one pass over the data.
            steps_per_epoch = int(np.ceil(len(x_train) / batch_size))

            # Fit the model on the batches generated by datagen.flow().
            # Model.fit accepts generators directly and replaces the deprecated
            # fit_generator. `workers` and `use_multiprocessing` are left at their
            # defaults, which are the values that were passed explicitly before.
            model.fit(datagen.flow(x_train, y_train, batch_size=batch_size),
                      steps_per_epoch=steps_per_epoch,
                      validation_data=(x_test, y_test),
                      epochs=epochs,
                      verbose=2,
                      callbacks=callbacks)

        # Score the trained model on the test split (the result is currently not used).
        scores = model.evaluate(x_test, y_test, verbose=0)

        def export_h5_to_pb(path_to_h5, export_path):
            """Reload the Keras .h5 file and write it to export_path with save_model."""
            keras_model = load_model(path_to_h5)
            save_model(model=keras_model,
                       filepath=export_path,
                       overwrite=True,
                       include_optimizer=True,
                       save_format=None,
                       signatures=None,
                       options=None,
                       save_traces=True)

        # Start from an empty scratch folder for the intermediate Keras file.
        tmp_model_path = os.path.join(temp_data_dir, "tmp")
        if os.path.isdir(tmp_model_path):
            shutil.rmtree(tmp_model_path)
        os.mkdir(tmp_model_path)

        # Save the Keras model.
        keras_model_path = os.path.join(tmp_model_path, 'keras_model.h5')
        model.save(keras_model_path)

        # Convert the Keras model into the versioned `model.savedmodel` folder.
        tf_model_path = os.path.join(model_dir, model_version, 'model.savedmodel')
        export_h5_to_pb(keras_model_path, tf_model_path)

        # Remove the scratch folder again.
        shutil.rmtree(tmp_model_path)
    
    # Fetch and stage the dataset in a forked child process; the parent waits for
    # the child to finish before training starts.
    child_pid = os.fork()
    if child_pid == 0:
        # Child process (fork returns 0 here).
        exit_code = 1
        try:
            # Discard everything the data loading writes to stdout.
            sys.stdout = open(os.devnull, 'w')
            load_data()
            exit_code = 0
        except BaseException:
            # Report the failure on stderr; the child must never fall through
            # into the parent's code path below.
            import traceback
            traceback.print_exc()
        finally:
            sys.stderr.flush()
            # os._exit ends the child immediately. sys.exit only raises SystemExit,
            # which an interactive session (e.g. a notebook kernel) may intercept,
            # leaving a second copy of the process running.
            os._exit(exit_code)
    else:
        # Parent process (fork returns the child's pid here).
        _, wait_status = os.waitpid(child_pid, 0)
        if wait_status != 0:
            # Surface the failure but keep going: training can still succeed when
            # the .npy files from an earlier run are present.
            print("warning: data loading child exited with status {}".format(wait_status),
                  file=sys.stderr)

    train(epochs, networks)
    print("done")

In [None]:
### If you want to debug the trainOp function above,
### uncomment the line below,
### and remember to COMMENT it out again before you `BUILD` this pipeline through the UI.

# trainOp('model', 'resnet_graphdef', 1, 3)

In [None]:
# Pinned packages for the notebook environment, one requirement per line.
pinned_requirements = (
    "kfp==0.5.1",
    "h5py<3.0.0",
    "keras==2.3.1",
    "tintin-sdk>=0.0.4",
)
with open("requirements.txt", "w") as f:
    f.writelines(requirement + "\n" for requirement in pinned_requirements)

!pip install -r requirements.txt --user --upgrade

In [None]:
import kfp
import kfp.dsl as dsl
import kfp.components as comp
import kfp.compiler as compiler

In [None]:
import os

# Session settings read from the environment. Names without a fallback are None
# when the corresponding variable is not set.
pvcname = os.getenv('TINTIN_SESSION_TEMPLATE_PVC_NAME')
generated_pipeline_zip_filename = os.getenv('TINTIN_SESSION_TEMPLATE_GENERATED_PIPELINE_ZIP_FILENAME')
gpu_type_list_text = os.getenv('TINTIN_SESSION_TEMPLATE_GPU_TYPE_LIST')
# Base image of the training container and the mount point of the volume.
default_image = os.getenv('TINTIN_SESSION_TEMPLATE_DEFAULT_IMAGE', 'footprintai/nvidia-tensorflow:19.12-tf1-py3')
mountPath = os.getenv('TINTIN_SESSION_TEMPLATE_MOUNT_PATH', '/home/jovyan')



In [None]:
# Turn trainOp into a pipeline component that runs on `default_image`, with the
# listed packages installed into the container.
trainComp = comp.func_to_container_op(trainOp, 
                                      base_image=default_image,
                                      packages_to_install=["keras==2.3.1", "h5py<3.0.0", "tintin-sdk>=0.0.4"])

import kfp.dsl as dsl


@dsl.pipeline(
   name='Projectname pipeline',
   description='simple pipeline.'
)
def templated_pipeline_func(
    epochs=50,
    networks=3,
):
    # Single-step pipeline: run the training component with the session's model settings.

    ### model relative path can NOT be a nested path (e.g. a/b/c/d); it should be the first folder (e.g. model)
    model_relative_path = os.getenv('TINTIN_SESSION_TEMPLATE_MODEL_RELATIVE_PATH', 'model')
    model_name = os.getenv('TINTIN_SESSION_TEMPLATE_MODEL_NAME', 'resnet_graphdef')
    ### if you want to customize $model_name, replace `my_customized_model_name` and uncomment below
    ### model_name = os.environ.get('TINTIN_SESSION_TEMPLATE_MODEL_NAME', 'my_customized_model_name')

    train_task = trainComp(model_relative_path, model_name, epochs, networks)

    # Default cpu and memory resources for train_task; these values are changed at
    # runtime to reflect your settings in the UI.
    for resource, amount in (('cpu', '1'), ('memory', '4Gi')):
        train_task = train_task.add_resource_request(resource, amount)
        train_task = train_task.add_resource_limit(resource, amount)

    # Annotate the pod so the workflow itself records `model_relative_path` and `model_name`.
    pod_annotations = (
        ('tintin.footprint-ai.com/session-model-relative-path', model_relative_path),
        ('tintin.footprint-ai.com/session-model-name', model_name),
    )
    for annotation_key, annotation_value in pod_annotations:
        train_task = train_task.add_pod_annotation(annotation_key, annotation_value)
# Compile the pipeline into the package file named by generated_pipeline_zip_filename.
# NOTE(review): that name is None when TINTIN_SESSION_TEMPLATE_GENERATED_PIPELINE_ZIP_FILENAME
# is unset — confirm the session always provides it.
compiler.Compiler().compile(templated_pipeline_func, generated_pipeline_zip_filename)