In [None]:
def trainOp(model_relative_path: str = 'model', model_name: str = 'simple_cifar_pytorch', epochs: int = 50):
    ### trainOp serves as the main entry point of the training component (like the main function
    ### in most programming languages). It must stay self-contained (all imports and helpers live
    ### inside it) because it is converted into a container op from its source code.

    ### model_relative_path: a persistent volume is mounted at `/home/jovyan` by default (we call it
    ### the application root here) to keep your state (i.e. generated files) in place.
    ### By default the generated model is kept in `model`, relative to the application root.
    ### However, in a background job or during hyperparameter tuning the application root may CHANGE,
    ### so `model_relative_path` is used to locate the model independently of the actual root.

    ### model_name: name of the folder that holds all generated model files.
    ### In a real scenario the structure of the model files depends on the inference infrastructure.
    ### This function produces the following layout:
    ### model/
    ###        $model_name/
    ###                    config.pbtxt
    ###                    labels.txt
    ###                    $version/
    ###                             model.pt

    ### epochs: number of passes over the training set.

    import os
    import pathlib

    home = '/home/jovyan'

    prep_data_dir = os.path.join(home, "dataprep")
    output_model_dir = os.path.join(home, model_relative_path)
    temp_data_dir = os.path.join(home, ".temp")
    model_version = os.environ.get('TINTIN_SESSION_TEMPLATE_MODEL_VERSION', '1')

    print("outputdir:", prep_data_dir)
    for directory in (prep_data_dir, output_model_dir, temp_data_dir):
        pathlib.Path(directory).mkdir(parents=True, exist_ok=True)

    # This transform_data will be used in online serving to transform data in preprocessing
    from tintin.online_serving import save_function_to_model
    from tintin.online_serving import transform_data_file_name

    @save_function_to_model(model_path=output_model_dir, file_name=transform_data_file_name)
    def transform_data(input_data):
        # Scale raw pixel values from [0, 255] to [-0.5, 0.5].
        input_data = input_data.astype('float32') / 255
        input_data -= 0.5
        return input_data

    def load_data():
        # Download CIFAR-10 into prep_data_dir and return (trainloader, testloader).
        import torch
        import torchvision
        import torchvision.transforms as transforms

        # mimic transform_data defined above but use pytorch
        transform = transforms.Compose([transforms.ToTensor(), # from [0,255] -> [0, 1]
            transforms.Normalize((0.5, 0.5, 0.5), (1, 1, 1))])

        batch_size = 64

        trainset = torchvision.datasets.CIFAR10(root=prep_data_dir, train=True,
                                                download=True, transform=transform)
        trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                                  shuffle=True, num_workers=1)

        # we use testset as validation, so no additional split here
        testset = torchvision.datasets.CIFAR10(root=prep_data_dir, train=False,
                                               download=True, transform=transform)
        testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                                 shuffle=False, num_workers=1)

        return (trainloader, testloader)

    def export_triton_config_to_modeldir(model_dir:str):
        # Write config.pbtxt and labels.txt into model_dir.
        # The `name` field is derived from the model directory name so that the two always agree,
        # even when a customized model_name is used.
        triton_model_name = os.path.basename(os.path.normpath(model_dir))
        configdotpbtxt = 'name: "' + triton_model_name + '"\n' + """platform: "pytorch_libtorch"
max_batch_size: 128


input [
  {
    name: "input__0"
    data_type: TYPE_FP32
    format: FORMAT_NCHW
    dims: [ 3, 32, 32 ]
  }
]

output [
  {
    name: "output__0"
    data_type: TYPE_FP32
    dims: [ 10 ]
    label_filename: "labels.txt"
  }
]

version_policy: { all { }}
"""
        labeldottxt = """airplane
automobile
bird
cat
deer
dog
frog
horse
ship
truck"""
        pathlib.Path(model_dir).mkdir(parents=True, exist_ok=True)
        with open(os.path.join(model_dir, 'config.pbtxt'), 'w') as f:
            f.write(configdotpbtxt)
        with open(os.path.join(model_dir, 'labels.txt'), 'w') as f:
            f.write(labeldottxt)

    def train(epochs=50):
        # Train the network, print per-epoch metrics and save the traced model as model.pt.
        import shutil

        import torch
        import torch.nn as nn
        import torch.nn.functional as F
        import torch.optim as optim

        (trainloader, testloader) = load_data()

        # Write the serving resources (config.pbtxt, labels.txt) next to the model versions
        model_dir = os.path.join(output_model_dir, model_name)
        export_triton_config_to_modeldir(model_dir)

        model_version_dir = os.path.join(model_dir, model_version)
        if model_version == '1': # if it is default version, we always clear it to keep the space clean
            if os.path.isdir(model_version_dir):
                shutil.rmtree(model_version_dir)
        pathlib.Path(model_version_dir).mkdir(parents=True, exist_ok=True)

        class Net(nn.Module):
            def __init__(self):
                super().__init__()
                self.conv1 = nn.Conv2d(3, 6, 5)
                self.pool = nn.MaxPool2d(2, 2)
                self.conv2 = nn.Conv2d(6, 16, 5)
                self.fc1 = nn.Linear(16 * 5 * 5, 120)
                self.fc2 = nn.Linear(120, 84)
                self.fc3 = nn.Linear(84, 10)

            def forward(self, x):
                x = self.pool(F.relu(self.conv1(x)))
                x = self.pool(F.relu(self.conv2(x)))
                x = torch.flatten(x, 1) # flatten all dimensions except batch
                x = F.relu(self.fc1(x))
                x = F.relu(self.fc2(x))
                x = self.fc3(x)
                return x

        # Use the GPU when one is available, otherwise stay on the CPU
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        net = Net().to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

        # Metrics are averaged per sample, so normalize by dataset size rather than batch count
        num_train_samples = len(trainloader.dataset)
        num_valid_samples = len(testloader.dataset)

        for epoch in range(epochs):  # loop over the dataset multiple times
            net.train()
            train_loss = 0.0
            train_correct = 0
            for inputs, labels in trainloader:
                inputs, labels = inputs.to(device), labels.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                preds = net(inputs)
                loss = criterion(preds, labels)
                loss.backward()
                optimizer.step()

                # criterion returns the batch mean; weight it by batch size and accumulate
                train_loss += loss.item() * inputs.size(0)
                train_correct += (preds.argmax(dim=1) == labels).sum().item()

            net.eval()
            valid_loss = 0.0
            valid_correct = 0
            with torch.no_grad():  # no gradients are needed for validation
                for inputs, labels in testloader:
                    inputs, labels = inputs.to(device), labels.to(device)
                    preds = net(inputs)
                    loss = criterion(preds, labels)
                    valid_loss += loss.item() * inputs.size(0)
                    valid_correct += (preds.argmax(dim=1) == labels).sum().item()

            # print stats for each epoch
            print("epoch={}".format(epoch))
            print("Training-Accuracy={:7.6f}".format(train_correct / num_train_samples))
            print("Training-Loss={:7.6f}".format(train_loss / num_train_samples))
            print("Validation-Accuracy={:7.6f}".format(valid_correct / num_valid_samples))
            print("Validation-Loss={:7.6f}".format(valid_loss / num_valid_samples))

        # convert pytorch model to a TorchScript model by tracing it with a sample batch
        net.eval()
        sample = torch.rand(64, 3, 32, 32).to(device)
        scripted_model = torch.jit.trace(net, sample)
        torch_model_path = os.path.join(model_version_dir, 'model.pt')
        scripted_model.save(torch_model_path)

    train(epochs)
    print("done")

In [None]:
### if you want to debug the above trainOp function,
### uncomment the line below,
### and remember to COMMENT it out again before you `BUILD` this pipeline through the UI

# trainOp('model', 'simple_cifar_pytorch', 1)

In [None]:
# Write the pinned dependencies of this notebook to requirements.txt, one per line.
with open("requirements.txt", "w") as req_file:
    req_file.writelines([
        "kfp==0.5.1\n",
        "h5py<3.0.0\n",
        "tintin-sdk>=0.0.4\n",
        "ipywidgets==7.6.3\n",
        "torch==1.6.0\n",
        "torchvision==0.7.0\n",
        "tqdm >=4.62.1\n",
    ])

# IPython shell escape: install/upgrade the packages listed in requirements.txt for the current user.
!pip install -r requirements.txt --user --upgrade

In [None]:
import kfp
import kfp.dsl as dsl
import kfp.components as comp
import kfp.compiler as compiler

In [None]:
import os

# Session settings are read from TINTIN_SESSION_TEMPLATE_* environment variables.
# Variables without a fallback evaluate to None when they are not set.
pvcname = os.getenv('TINTIN_SESSION_TEMPLATE_PVC_NAME')
generated_pipeline_zip_filename = os.getenv(
    'TINTIN_SESSION_TEMPLATE_GENERATED_PIPELINE_ZIP_FILENAME'
)
gpu_type_list_text = os.getenv('TINTIN_SESSION_TEMPLATE_GPU_TYPE_LIST')

# These two fall back to built-in defaults when the variable is absent.
default_image = os.getenv(
    'TINTIN_SESSION_TEMPLATE_DEFAULT_IMAGE',
    'footprintai/nvidia-tensorflow:19.12-tf1-py3',
)
mountPath = os.getenv('TINTIN_SESSION_TEMPLATE_MOUNT_PATH', '/home/jovyan')



In [None]:
# Wrap trainOp as a container component that runs on `default_image`;
# the listed packages are passed as `packages_to_install`.
trainComp = comp.func_to_container_op(
    trainOp,
    base_image=default_image,
    packages_to_install=[
        "h5py<3.0.0",
        "tintin-sdk>=0.0.4",
        "ipywidgets==7.6.3",
        "torch==1.6.0",
        "torchvision==0.7.0",
        "tqdm >=4.62.1",
    ],
)

import kfp.dsl as dsl
@dsl.pipeline(
    name='Projectname pipeline',
    description='simple pipeline.',
)
def templated_pipeline_func(epochs=50):
    """Define a one-step pipeline that runs the training component.

    `epochs` is the pipeline parameter forwarded to the training step. The model
    location and name are read from environment variables when this function runs.
    """
    ### model relative path can NOT be a nested path (e.g. a/b/c/d); it should be a single top-level folder (e.g. model)
    relative_path = os.environ.get('TINTIN_SESSION_TEMPLATE_MODEL_RELATIVE_PATH', 'model')
    ### if you want to customize the model name, replace the default value below
    ### (e.g. with `my_customized_model_name`)
    name_of_model = os.environ.get('TINTIN_SESSION_TEMPLATE_MODEL_NAME', 'simple_cifar_pytorch')

    # Default cpu/memory resources; these values will be changed during runtime
    # to reflect your settings in the UI.
    # The annotations record `model_relative_path` and `model_name` on the workflow itself.
    (
        trainComp(relative_path, name_of_model, epochs)
        .add_resource_request('cpu', '1')
        .add_resource_limit('cpu', '1')
        .add_resource_request('memory', '4Gi')
        .add_resource_limit('memory', '4Gi')
        .add_pod_annotation('tintin.footprint-ai.com/session-model-relative-path', relative_path)
        .add_pod_annotation('tintin.footprint-ai.com/session-model-name', name_of_model)
    )
# Compile the pipeline definition into the package file named by the session environment.
compiler.Compiler().compile(
    pipeline_func=templated_pipeline_func,
    package_path=generated_pipeline_zip_filename,
)