In [1]:
!pip install --user --upgrade pip


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
Requirement already up-to-date: pip in /home/jovyan/.local/lib/python3.6/site-packages (20.2.4)


In [2]:
!pip install --user kfp

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [3]:
import kfp
import kfp.dsl as dsl
import kfp.gcp as gcp
from kfp import components
import json
from string import Template
from jinja2 import Template as JinjaTemplate

In [4]:

def convert_result(result) -> dict:
    """Turn a JSON list of hyperparameter records into a name -> value mapping.

    Args:
        result: JSON text encoding a list of objects, each carrying a
            ``name`` and a ``value`` key.

    Returns:
        A dict mapping every ``name`` to its ``value``. If a name occurs more
        than once, the last record wins.
    """
    # Kept as a function-local import: this function is handed to
    # components.func_to_container_op elsewhere in the notebook, so it should
    # not depend on notebook-level imports.
    import json

    hyperparameters = json.loads(result)
    res = {param['name']: param['value'] for param in hyperparameters}
    # Echo the mapping to stdout so it shows up in the step's output.
    print(res)
    return res

In [None]:
# --- GCP project and BigQuery source (defaults for the pipeline parameters) ---
PROJECT_ID = "hamoye-296618"
DATASET_ID = "used_cars"
SOURCE_TABLE = 'used_cars.vehicles'

# --- Object paths of the exported data splits, relative to the pipeline's gcs_root ---
TRAINING_FILE_PATH = 'datasets/training/data.csv'
VALIDATION_FILE_PATH = 'datasets/validation/data.csv'
TESTING_FILE_PATH = 'datasets/testing/data.csv'

# --- Hyperparameter tuning: Katib experiment namespace and trial image name ---
KATIB_NAMESPACE = "kubeflow-olamideakinkunmi11"
TRAINING_IMAGE = "sklearn-usedcars-image:latest"

# --- Serving-related settings ---
# RUNTIME_VERSION, PYTHON_VERSION and REPLACE_EXISTING_VERSION are pipeline
# parameter defaults; MODEL_ID, VERSION_ID and EVALUATION_METRIC_THRESHOLD are
# not referenced by any active code in this notebook.
RUNTIME_VERSION = "2.3"
PYTHON_VERSION = "3.7"
MODEL_ID = 'usedcars_price_regressor'
VERSION_ID = 'v01'
REPLACE_EXISTING_VERSION = 'True'
EVALUATION_METRIC_THRESHOLD = '0.7'

In [None]:
def generate_sampling_query(source_table_name, num_lots, lots):
    """Render a BigQuery query selecting the rows whose hash falls in ``lots``.

    Each row is serialized with TO_JSON_STRING, hashed with FARM_FINGERPRINT
    and reduced modulo ``num_lots``; only rows whose remainder is listed in
    ``lots`` are selected.

    Args:
        source_table_name: Table reference rendered between backticks in the
            FROM clause.
        num_lots: Modulus applied to the row hash.
        lots: Iterable of lot numbers to keep (list, tuple, range, ...).

    Returns:
        The rendered query text.
    """
    sampling_query_template = """
        SELECT *
        FROM
            `{{ source_table }}` AS vehicle
        WHERE
        MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(vehicle))), {{ num_lots }}) IN ({{ lots }})
        """
    # Build the IN-list explicitly instead of slicing the brackets off
    # str(lots): slicing only works for list reprs and leaves a trailing comma
    # for one-element tuples. repr() per element keeps the output identical to
    # the previous behaviour for list inputs.
    lots_csv = ', '.join(repr(lot) for lot in lots)

    return JinjaTemplate(sampling_query_template).render(
        source_table=source_table_name,
        num_lots=num_lots,
        lots=lots_csv,
    )

In [None]:
@dsl.pipeline(
    name="Used-Cars",
    description="A pipeline to train and serve the used cars price prediction model"
)
def used_cars_pipeline(
    name="used-cars-{{workflow.uid}}",
    katib_namespace=KATIB_NAMESPACE,
    goal=0.9,
    source_table_name=SOURCE_TABLE,
    training_image="gcr.io/{}/{}".format(PROJECT_ID, TRAINING_IMAGE),
    training_namespace="kubeflow",
    gcs_root="gs://used-cars",
    parallel_trial=3,
    max_trial=2,
    project_id=PROJECT_ID,
    dataset_id=DATASET_ID,
    dataset_location="US",
    python_version=PYTHON_VERSION,
    runtime_version=RUNTIME_VERSION,
    replace_existing_version=REPLACE_EXISTING_VERSION
    ):
    """Build the used-cars pipeline: three data splits, a Katib experiment, result conversion.

    Steps created, in order:
      1. Three BigQuery query ops (training, validation, testing), each with
         an output path under ``gcs_root``.
      2. A Katib launcher op loaded from ./train/hptuning.yml whose trial
         template runs train.py in ``training_image`` against the training and
         validation outputs.
      3. A container op wrapping ``convert_result`` that consumes the Katib
         op's output.

    Args:
        name: Katib experiment name.
        katib_namespace: Namespace passed to the Katib launcher.
        goal: Target value of the "score" objective metric (maximized).
        source_table_name: Table sampled by the split queries.
        training_image: Container image used by each Katib trial.
        training_namespace: Accepted but not used by any active step.
        gcs_root: Prefix under which the split files are written.
        parallel_trial: Parallel trial count passed to the Katib launcher.
        max_trial: Maximum trial count passed to the Katib launcher.
        project_id: Project passed to the BigQuery ops.
        dataset_id: Dataset passed to the BigQuery ops.
        dataset_location: Dataset location passed to the BigQuery ops.
        python_version: Accepted but not used by any active step.
        runtime_version: Accepted but not used by any active step.
        replace_existing_version: Accepted but not used by any active step.
    """
    bigquery_op = kfp.components.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/d2f5cc92a46012b9927209e2aaccab70961582dc/components/gcp/bigquery/query/component.yaml'
    )

    def _make_split_op(lots, relative_path):
        """Create a BigQuery op for the given lots with its output path under gcs_root."""
        return bigquery_op(
            query=generate_sampling_query(
                source_table_name=source_table_name,
                num_lots=10,
                lots=lots,
            ),
            project_id=project_id,
            dataset_id=dataset_id,
            table_id='',
            output_gcs_path=f'{gcs_root}/{relative_path}',
            dataset_location=dataset_location,
        )

    # Rows are hashed into 10 lots: 1-5 go to training, 8 to validation and
    # 9 to testing. Lots 0, 6 and 7 are not used by any split.
    # The ops are created in this fixed order (training, validation, testing).
    create_training_split = _make_split_op([1, 2, 3, 4, 5], TRAINING_FILE_PATH)
    create_validation_split = _make_split_op([8], VALIDATION_FILE_PATH)
    # Not consumed by a later step, but creating the op adds it to the pipeline.
    create_testing_split = _make_split_op([9], TESTING_FILE_PATH)

    # ---------------------------- Katib settings ----------------------------
    objective_config = {
            "type": "maximize",
            "goal": goal,
            "objectiveMetricName": "score",
        }

    algorithm_config = {"algorithmName": "random"}

    metrics_collector_spec = {
        "collector": {
            "kind": "StdOut"
        }
    }

    # Search space; the names double as the command-line flags appended to
    # the trial command by the template below.
    parameters = [
        {"name": "--n-estimators", "parameterType": "int", "feasibleSpace": {"min": "200", "max":"1000"}},
        {"name": "--min-samples-split", "parameterType": "int", "feasibleSpace": {"min": "2", "max":"10"}},
        {"name": "--min-samples-leaf", "parameterType": "int", "feasibleSpace": {"min": "1", "max":"4"}},
        {"name": "--max-features", "parameterType": "categorical", "feasibleSpace": {"list": ["auto", "sqrt"]}},
        {"name": "--max-depth", "parameterType": "int", "feasibleSpace": {"min": "10", "max":"100"}}
    ]

    # Kubernetes Job run for every trial. The {{.Trial}}, {{.NameSpace}} and
    # {{.HyperParameters}} markers are Go-template placeholders left verbatim
    # in the serialized template.
    # NOTE(review): every Go-template directive in "command" is its own argv
    # entry; confirm the rendered Job command is what train.py expects.
    raw_template = {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": {
            "name": "{{.Trial}}",
            "namespace": "{{.NameSpace}}"
        },
        "spec": {
            "template": {
                "spec": {
                    "restartPolicy": "Never",
                    "containers": [
                        {
                            "name": "{{.Trial}}",
                            "image": str(training_image),
                            "imagePullPolicy": "Always",
                            "command": [
                                "python",
                                "train.py",
                                "--training-file-path={}".format(create_training_split.outputs['output_gcs_path']),
                                "--validation-file-path={}".format(create_validation_split.outputs['output_gcs_path']),
                                "--hypertune=True",
                                "{{- with .HyperParameters}}",
                                "{{- range .}}",
                                "{{.Name}}={{.Value}}",
                                "{{- end}}",
                                "{{- end}}"
                            ]
                        }
                    ]
                }
            }
        }
    }

    trial_template = {
        "goTemplate": {
            "rawTemplate": json.dumps(raw_template)
        }
    }
    # ------------------------------------------------------------------------

    katib_launcher_op = components.load_component_from_file('./train/hptuning.yml')

    # The structured inputs are passed as str() of the Python objects, exactly
    # as before. NOTE(review): presumably the launcher component parses this
    # representation - confirm against ./train/hptuning.yml.
    katib_op = katib_launcher_op(
        experiment_name=name,
        experiment_namespace=katib_namespace,
        parallel_trial_count=parallel_trial,
        max_trial_count=max_trial,
        objective=str(objective_config),
        algorithm=str(algorithm_config),
        trial_template=str(trial_template),
        parameters=str(parameters),
        metrics_collector=str(metrics_collector_spec),
        delete_finished_experiment=False
    )

    katib_op.name = "hyperparameter tuning-katib"

    # Wrap convert_result as a container op and feed it the Katib op's output.
    convert_op = components.func_to_container_op(convert_result)
    convert = convert_op(katib_op.output)

    # TODO: the training and deployment steps are not wired in yet. They were
    # previously sketched here as commented-out code based on
    # ./train/traincomponent.yml and the GCP ml_engine deploy component.

In [None]:
# Submit used_cars_pipeline through the KFP client; the empty arguments dict
# supplies no overrides for the pipeline's parameters.
pipeline = kfp.Client().create_run_from_pipeline_func(
    pipeline_func=used_cars_pipeline,
    arguments={},
)