In [1]:
import kfp

# Connect to the Kubeflow Pipelines API.
#
# The no-argument form, `kfp.Client()`, is the alternative when no explicit
# endpoint has to be given.
# When working from GCP notebooks (or local notebooks), pass the endpoint
# explicitly instead:
#   kfp.Client(host='https://xxxxx.notebooks.googleusercontent.com/')
client = kfp.Client(
    host='https://34c40cdd21e49f0a-dot-us-central1.notebooks.googleusercontent.com',
)

In [2]:
from kfp.components import func_to_container_op, load_component_from_file, load_component_from_text, InputPath, OutputPath
from pathlib import Path

In [3]:
# File the component definition is written to and subsequently loaded from.
component_file_name = 'google.cloud.bigquery.query.component.yaml'

# Component spec for "Query BigQuery": takes a `Query` string and produces a
# `Results` CSV. The container's shell script activates the service account
# when GOOGLE_APPLICATION_CREDENTIALS is set, then runs `bq query` (standard
# SQL, CSV format) and redirects its output to the results path.
_bq_query_component_spec = '''\
name: Query BigQuery
inputs:
- name: Query
  type: String
outputs:
- name: Results
  type: CSV
implementation:
  container:
    image: google/cloud-sdk:latest
    command:
    - sh
    - -e
    - -c
    - |
        if [ -n "$GOOGLE_APPLICATION_CREDENTIALS" ]; then
            gcloud auth activate-service-account --key-file="$GOOGLE_APPLICATION_CREDENTIALS"
        fi
        query="$0"
        results_path=$1

        mkdir -p "$(dirname "$results_path")"
        echo 'y' | bq init
        bq query --nouse_legacy_sql --format csv -q "$query" > "$results_path"
    - {inputValue: Query}
    - {outputPath: Results}
'''

# Persist the spec to disk, then build the op factory from that file.
Path(component_file_name).write_text(_bq_query_component_spec)
bq_query_op = load_component_from_file(component_file_name)

In [4]:
def sklearn_svm_csr_train(
    training_data_path: InputPath('CSV'),
    model_path: OutputPath('SKLearnSvmSvrModel'),
    transformer_path: OutputPath('SKLearnTransformer'),
    target_column_name: str,
):
    """Trains an sklearn SVR model on the numeric columns of a CSV file.

    The CSV is reduced to its numeric columns (missing values replaced by 0),
    split 50/50 into training and testing rows, standardized with a
    ``StandardScaler`` fitted on the training rows only, and used to fit an
    ``svm.SVR`` regressor. The mean squared error on the held-out rows is
    printed, and both the model and the scaler are pickled.

    Args:
        training_data_path: Path of the input CSV file.
        model_path: Path the pickled ``SVR`` model is written to.
        transformer_path: Path the pickled ``StandardScaler`` is written to.
        target_column_name: Name of the column to predict. It must be a
            numeric column of the CSV; otherwise it is dropped by the numeric
            filter and the column lookup raises ``KeyError``.
    """
    # Imports stay inside the function: it is packaged with
    # func_to_container_op and has to run on its own inside a container.
    import pickle

    import pandas
    from sklearn import metrics, preprocessing, svm
    from sklearn.model_selection import train_test_split

    data = pandas.read_csv(training_data_path)
    # Only numeric columns are usable as SVR features; NaNs become 0.
    cleaned_data = data.select_dtypes("number").fillna(0)

    # NOTE: the split is not seeded, so the reported error varies between runs.
    training_data, testing_data = train_test_split(cleaned_data, test_size=0.5)

    training_features = training_data.drop(target_column_name, axis=1).values
    training_labels = training_data[target_column_name].values
    testing_features = testing_data.drop(target_column_name, axis=1).values
    testing_labels = testing_data[target_column_name].values

    # Fit the scaler on the training rows only, so that no statistics from the
    # held-out rows leak into training.
    transformer = preprocessing.StandardScaler()
    transformer.fit(training_features)

    scaled_training_features = transformer.transform(training_features)
    # Scale the *testing* features. Scaling the training features here again
    # would evaluate the model on its own training rows against unrelated
    # testing labels.
    scaled_testing_features = transformer.transform(testing_features)

    model = svm.SVR().fit(scaled_training_features, training_labels)

    predictions = model.predict(scaled_testing_features)

    mean_squared_error = metrics.mean_squared_error(testing_labels, predictions)
    print("mean_squared_error=" + str(mean_squared_error))

    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
    with open(transformer_path, 'wb') as f:
        pickle.dump(transformer, f)


# Package the training function as a container component and also save its
# definition to a YAML file.
sklearn_svm_csr_train_op = func_to_container_op(
    sklearn_svm_csr_train,
    # 'scikit-learn' is the actual PyPI distribution that provides the
    # `sklearn` module; the 'sklearn' PyPI name is a deprecated placeholder
    # whose installation fails.
    packages_to_install=['scikit-learn', 'pandas'],
    output_component_file='sklearn.svm.csr.train.component.yaml',
)

In [5]:
def bq_sklearn_pipeline(query: str):
    # Pipeline: run `query` through the BigQuery component, then feed the
    # resulting CSV into the SVR training component with 'tips' as the
    # column to predict.
    query_task = bq_query_op(query=query)
    sklearn_svm_csr_train_op(
        training_data=query_task.outputs['results'],
        target_column_name='tips',
    )

In [6]:
# Fraction of rows to sample from the Chicago Taxi dataset in BigQuery.
# The full taxi dataset holds more than 120M records. To save time and
# resources, the default for this example is much smaller.
# Feel free to crank it up and process the full dataset!
_query_sample_rate = 0.001  # Generate a 0.1% random sample.

# Upper bound of FARM_FINGERPRINT in BigQuery (i.e. the max value of a signed
# int64), used to normalize the fingerprint before comparing it to the rate.
_max_int64 = '0x7FFFFFFFFFFFFFFF'

# The query that extracts the examples from BigQuery. The Chicago Taxi dataset
# used for this example is a public dataset available on Google AI Platform.
# https://console.cloud.google.com/marketplace/details/city-of-chicago-public-data/chicago-taxi-trips
# Rows are kept when their normalized fingerprint falls below the sample rate.
_query = f"""
         SELECT
           pickup_community_area,
           fare,
           EXTRACT(MONTH FROM trip_start_timestamp) AS trip_start_month,
           EXTRACT(HOUR FROM trip_start_timestamp) AS trip_start_hour,
           EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS trip_start_day,
           UNIX_SECONDS(trip_start_timestamp) AS trip_start_timestamp,
           pickup_latitude,
           pickup_longitude,
           dropoff_latitude,
           dropoff_longitude,
           trip_miles,
           pickup_census_tract,
           dropoff_census_tract,
           payment_type,
           company,
           trip_seconds,
           dropoff_community_area,
           tips
         FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
         WHERE (ABS(FARM_FINGERPRINT(unique_key)) / {_max_int64})
           < {_query_sample_rate}"""

In [7]:
from kfp.gcp import use_gcp_secret
# Register the transformer returned by use_gcp_secret() on the pipeline
# configuration so it is applied to the pipeline's ops.
# NOTE(review): presumably this provides the GCP credentials that the BigQuery
# component reads via GOOGLE_APPLICATION_CREDENTIALS - confirm.
pipeline_conf = kfp.dsl.PipelineConf()
pipeline_conf.add_op_transformer(use_gcp_secret())

# Submit a run of the pipeline with the sampled taxi query as its argument.
client.create_run_from_pipeline_func(
    bq_sklearn_pipeline,
    arguments={'query': _query},
    pipeline_conf=pipeline_conf,
)

RunPipelineResult(run_id=f76c79c8-21b9-46ff-8c1e-6e0bf74f4463)