In [None]:
from kfp import dsl
from kfp.dsl import (
    component,
)

from kfp import compiler
from google.cloud import aiplatform

In [8]:
# Container image passed as `base_image` to the pipeline components defined in this notebook.
# NOTE(review): `:latest` is a mutable tag, so two runs of the same pipeline may execute
# different image contents — consider pinning a version tag or digest for reproducible runs.
BASE_IMAGE = "europe-west3-docker.pkg.dev/bda-gameon-demo/vertex/base_f1_container:latest"

In [9]:
@component(
    base_image=BASE_IMAGE
)
def load_and_preprocess(
    race_id: int,
):
    """Fetch lap data for one race, aggregate it, and save the result to BigQuery.

    Args:
        race_id: Identifier of the race; it is appended to the laps API URL
            to select which race's data is downloaded.

    Raises:
        requests.HTTPError: If the laps API responds with a 4xx/5xx status.
        requests.Timeout: If the API cannot be reached or does not answer
            within the configured timeout.
    """
    # Dependencies are imported inside the body because the component function
    # is executed on its own inside the base image, not in the notebook kernel.
    import requests
    from vertex_utils import (
        prepare_df_from_json,
        prepare_aggregations,
        save_historic_to_big_query
    )

    API_BASE_URL = "https://big-data-project-api-248863766350.europe-west3.run.app/laps"
    # (connect, read) timeouts in seconds. `requests` has no default timeout,
    # so without this a stalled connection would hang the step indefinitely.
    REQUEST_TIMEOUT = (10, 300)

    print(f"Gathering data for race: {race_id}...")
    api_url = f"{API_BASE_URL}/{race_id}"
    response = requests.get(api_url, timeout=REQUEST_TIMEOUT)
    # Fail here with a clear HTTP error instead of feeding an error payload
    # into the preprocessing helpers.
    response.raise_for_status()
    data = response.json()

    df = prepare_df_from_json(data)

    print("Preparing aggregations...")
    aggregations = prepare_aggregations(df)

    # These two columns are dropped before the data is stored.
    # NOTE(review): assumes `aggregations` is a pandas-style DataFrame containing both columns — confirm.
    aggregations = aggregations.drop(["race_id", "milliseconds"], axis=1)

    print("Saving aggregations to BigQuery...")
    save_historic_to_big_query(aggregations)

In [14]:
@component(
    base_image=BASE_IMAGE
)
def train_model():
    """Create or replace the BigQuery ML model `f1.xgboost_model`.

    Submits a CREATE OR REPLACE MODEL statement (boosted-tree regressor with
    `final_position` as the label column) that selects its training rows from
    `f1.historic_data`, then waits on the query job's result.
    """
    from google.cloud import bigquery

    bq_client = bigquery.Client(project="bda-gameon-demo")

    training_sql = """    
        CREATE OR REPLACE MODEL f1.xgboost_model
        OPTIONS(model_type='BOOSTED_TREE_REGRESSOR', 
                input_label_cols=['final_position']) AS
        SELECT 
        lap,
        driver,
        position,
        mean_position_up_to_lap,
        std_position_up_to_lap,
        min_position_up_to_lap,
        last_5_laps_mean_position,
        final_position
        FROM f1.historic_data;
    """

    training_job = bq_client.query(training_sql)
    # Wait on the job so the component only returns once the statement has run.
    training_job.result()

In [15]:
@dsl.pipeline(
    name="batch_processing",
    description="Pipeline responsible for batch processing and model training",
)
def batch_processing_pipeline(race_id: int = 1):
    """Two-step pipeline: load and preprocess one race, then train the model.

    The race to process is chosen through the `race_id` pipeline parameter,
    which is forwarded unchanged to the preprocessing step.
    """
    preprocess_task = load_and_preprocess(race_id=race_id)
    preprocess_task.set_display_name("Load and Preprocess Data")

    # The training step takes no inputs from preprocessing, so the ordering
    # between the two steps is declared explicitly with `.after(...)`.
    training_task = train_model()
    training_task.after(preprocess_task)
    training_task.set_display_name('Model training')

In [16]:
# Compile the pipeline definition to a local package file, then submit that
# same file to Vertex AI as a pipeline job.
PIPELINE_PACKAGE_PATH = "batch_processing_pipeline.json"

pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=batch_processing_pipeline,
    package_path=PIPELINE_PACKAGE_PATH,
)

aiplatform.init(project="bda-gameon-demo", location="europe-west3")

# Overrides the pipeline's default `race_id` for this run.
job_parameters = {"race_id": 5}

pipeline_job = aiplatform.PipelineJob(
    display_name="batch_processing_job",
    template_path=PIPELINE_PACKAGE_PATH,
    parameter_values=job_parameters,
)

# Run synchronously so the cell does not finish before the job does.
pipeline_job.run(sync=True)

Creating PipelineJob
PipelineJob created. Resource name: projects/248863766350/locations/europe-west3/pipelineJobs/batch-processing-20241122203821
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/248863766350/locations/europe-west3/pipelineJobs/batch-processing-20241122203821')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/europe-west3/pipelines/runs/batch-processing-20241122203821?project=248863766350
PipelineJob projects/248863766350/locations/europe-west3/pipelineJobs/batch-processing-20241122203821 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/248863766350/locations/europe-west3/pipelineJobs/batch-processing-20241122203821 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/248863766350/locations/europe-west3/pipelineJobs/batch-processing-20241122203821 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/248863766350/locations/europe-west3/pi