# Kubeflow pipeline

Setup environment

In [1]:
pip freeze | grep kfp || pip install kfp

kfp==1.4.0
kfp-pipeline-spec==0.1.6
kfp-server-api==1.3.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
from os import path

import kfp
import kfp.compiler as compiler
import kfp.components as comp
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.notebook

## Kubeflow cluster parameters


To deploy a Kubeflow Pipelines cluster in your GCP project, use AI Platform Pipelines:

- Go to AI Platform Pipelines in the GCP Console.
- Create a new instance
- Hit "Configure"
- Check the box "Allow access to the following Cloud APIs"
- Hit "Create Cluster"
- Hit "Deploy"
- When the cluster is ready, go back to the AI Platform Pipelines page and click the "SETTINGS" entry for your cluster. This opens a pop-up with code snippets showing how to access the cluster programmatically.

Copy the "host" entry from that pop-up and assign it to the `HOST` variable below.

In [3]:
# Host of the Kubeflow Pipelines instance. Replace the "<kfp-host>" placeholder with the
# "host" value copied from the cluster's SETTINGS pop-up before running the cells below.
HOST = "<kfp-host>"

In [4]:
# Client object for the Kubeflow Pipelines instance at HOST; the later cells use it to
# create the experiment and submit the run.
client = kfp.Client(host=HOST)

In [5]:
# Create the 'exoplanets' experiment and keep the returned object: its `id` is passed to
# `client.run_pipeline` when the run is submitted.
exp = client.create_experiment(name='exoplanets')
# Last expression of the cell: display the experiments known to the server.
client.list_experiments()

{'experiments': [{'created_at': datetime.datetime(2021, 2, 22, 17, 29, 16, tzinfo=tzlocal()),
                  'description': 'All runs created without specifying an '
                                 'experiment will be grouped here.',
                  'id': 'd8124aae-43fc-43ef-8433-fbf86e6edec6',
                  'name': 'Default',
                  'resource_references': None,
                  'storage_state': 'STORAGESTATE_AVAILABLE'},
                 {'created_at': datetime.datetime(2021, 2, 22, 17, 29, 47, tzinfo=tzlocal()),
                  'description': None,
                  'id': 'b73a2cc7-f578-4204-bd3c-7e8578b87581',
                  'name': 'exoplanets',
                  'resource_references': None,
                  'storage_state': 'STORAGESTATE_AVAILABLE'}],
 'next_page_token': None,
 'total_size': 2}

## Build the images and push them to the GCP Container Registry

In [6]:
# Build the exoplanets preprocess container image, then push it.
# `&&` means the push script runs only if the build script succeeds.
!components/preprocess/scripts/build.sh && components/preprocess/scripts/push.sh

Sending build context to Docker daemon  15.87kB
Step 1/5 : FROM google/cloud-sdk:latest
 ---> 5ead854c70c0
Step 2/5 : RUN apt-get update &&     apt-get install --yes python3-pip
 ---> Using cache
 ---> 92907819164e
Step 3/5 : COPY . /code
 ---> Using cache
 ---> 293b2d770048
Step 4/5 : WORKDIR /code
 ---> Using cache
 ---> 360cd4f20892
Step 5/5 : RUN pip install -U numpy xgboost scikit-learn google-cloud-storage
 ---> Using cache
 ---> f00e133af241
Successfully built f00e133af241
Successfully tagged gcr.io/ds-dev-playground/exoplanets_kubeflow-preprocess:latest
The push refers to repository [gcr.io/ds-dev-playground/exoplanets_kubeflow-preprocess]

[1B5b2c0bbe: Preparing 
[1B41e91459: Preparing 
[1B9c37eb3e: Preparing 
[1Ba6cf62ae: Preparing 
[1Bffaafef1: Preparing 
[1Ba809bf25: Preparing 
[1B9da373bf: Preparing 
[1Ba0e91bef: Preparing 
[1B4fd7b3af: Preparing 
[1Bbfe4d6dc: Layer already exists [10A[2K[5A[2K[3A[2K[2A[2K[1A[2Klatest: digest: sha256:fbf8391a3369c6265c9

In [7]:
# Build the exoplanets trainer container image, then push it.
# `&&` means the push script runs only if the build script succeeds.
!components/trainer/scripts/build.sh && components/trainer/scripts/push.sh

Sending build context to Docker daemon  23.04kB
Step 1/5 : FROM google/cloud-sdk:latest
 ---> 5ead854c70c0
Step 2/5 : RUN apt-get update &&     apt-get install --yes python3-pip
 ---> Using cache
 ---> 92907819164e
Step 3/5 : COPY . /code
 ---> 5d990f1eab4d
Step 4/5 : WORKDIR /code
 ---> Running in 3d1bd7387046
Removing intermediate container 3d1bd7387046
 ---> 086e727a057f
Step 5/5 : RUN pip install -U numpy xgboost scikit-learn google-cloud-storage
 ---> Running in ee96bc10556e
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
[0mCollecting numpy
  Downloading numpy-1.20.1-cp37-cp37m-manylinux2010_x86_64.whl (15.3 MB)
Collecting xgboost
  Downloading xgboost-1.3.3-py3-none-manylinux2010_x86_64.whl (157.5 MB)
Collecting scikit-learn
  Downloading scikit_learn-0.24.1-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
Collecting google-cloud-storage
  Downloading google_cloud_storage-1.36.0-py2.py3-none-any.whl (97 kB)
Collecting google-cloud-co

In [8]:
# Build the exoplanets prediction container image, then push it.
# `&&` means the push script runs only if the build script succeeds.
!components/prediction/scripts/build.sh && components/prediction/scripts/push.sh

Sending build context to Docker daemon  20.48kB
Step 1/5 : FROM google/cloud-sdk:latest
 ---> 5ead854c70c0
Step 2/5 : RUN apt-get update &&     apt-get install --yes python3-pip
 ---> Using cache
 ---> 92907819164e
Step 3/5 : COPY . /code
 ---> f58a9d06b142
Step 4/5 : WORKDIR /code
 ---> Running in 31fd584294e1
Removing intermediate container 31fd584294e1
 ---> d850559004a1
Step 5/5 : RUN pip install -U numpy xgboost scikit-learn google-cloud-storage
 ---> Running in ca17e175063d
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
[0mCollecting numpy
  Downloading numpy-1.20.1-cp37-cp37m-manylinux2010_x86_64.whl (15.3 MB)
Collecting xgboost
  Downloading xgboost-1.3.3-py3-none-manylinux2010_x86_64.whl (157.5 MB)
Collecting scikit-learn
  Downloading scikit_learn-0.24.1-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
Collecting google-cloud-storage
  Downloading google_cloud_storage-1.36.0-py2.py3-none-any.whl (97 kB)
Collecting google-cloud-co

## Create a Kubeflow pipeline


In [9]:
# Component specification files, one per pipeline step (paths relative to the notebook).
PREDICTION_YAML = 'components/prediction/prediction.yaml'
TRAINER_YAML = 'components/trainer/trainer.yaml'
PREPROCESS_YAML = 'components/preprocess/preprocess.yaml'

# File name of the compiled pipeline package written by the compiler cell.
PIPELINE_TAR = 'exoplanets.tar.gz'

In [16]:
@dsl.pipeline(
    name='ExoPlanets_kubeflow',
    description='ExoPlanets - Kubeflow end-to-end pipeline')
def pipeline(gcs_bucket_name='<bucket where data and model will be exported>'):
    """Define the ExoPlanets pipeline: preprocess, then train, then predict.

    Each step is loaded from its component YAML file and is given the same
    bucket name as its ``input_bucket`` argument.

    Args:
        gcs_bucket_name: Bucket name forwarded to every step as ``input_bucket``.
            The default is a placeholder; supply a real value when submitting a run.
    """
    preprocess_op = comp.load_component_from_file(PREPROCESS_YAML)
    preprocess = preprocess_op(
        input_bucket=gcs_bucket_name,
    )

    trainer_op = comp.load_component_from_file(TRAINER_YAML)
    trainer = trainer_op(
        input_bucket=gcs_bucket_name,
    )

    prediction_op = comp.load_component_from_file(PREDICTION_YAML)
    prediction = prediction_op(
        input_bucket=gcs_bucket_name,
    )

    # No step consumes another step's output here, so the execution order
    # (preprocess -> trainer -> prediction) is declared explicitly.
    trainer.after(preprocess)
    prediction.after(preprocess, trainer)

In [17]:
# Compile the `pipeline` function into the package file named by PIPELINE_TAR.
compiler.Compiler().compile(pipeline, PIPELINE_TAR)

In [18]:
# Sanity check: list the compiled package file to confirm it was written.
ls $PIPELINE_TAR

exoplanets.tar.gz


## Run the Kubeflow pipeline

In [19]:
# Values for the pipeline's parameters, keyed by the pipeline function's argument name.
run_params = {'gcs_bucket_name': "exoplanets_kubeflow"}

# Submit the compiled package as a run named 'exoplanets_complete' under the
# experiment created earlier in the notebook.
run = client.run_pipeline(
    experiment_id=exp.id,
    job_name='exoplanets_complete',
    pipeline_package_path=PIPELINE_TAR,
    params=run_params,
)