In [17]:
from kfp import dsl
from kfp.dsl import (
    component, 
    Output,
    Input,
    Model
)

from kfp import compiler
from google.cloud import aiplatform

In [18]:
# Container image passed as `base_image` to every @component defined below.
# NOTE(review): the `latest` tag is mutable, so reruns may pick up a different
# image — consider pinning a version tag or digest.
BASE_IMAGE = "europe-west3-docker.pkg.dev/bda-gameon-demo/vertex/base_f1_container:latest"

In [19]:
@component(
    base_image=BASE_IMAGE
)
def load_and_preprocess(
    current_year: int,
    race_id: int,
):
    """Fetch lap data for one race, aggregate it, and save the result to BigQuery.

    Args:
        current_year: Forwarded to ``enrich_with_drivers``.
        race_id: Appended to the laps API URL to select the race to download.

    Raises:
        requests.HTTPError: If the laps API responds with a 4xx/5xx status.
        requests.Timeout: If the laps API does not respond within the timeout.
    """
    # Imports are function-local: a KFP lightweight component must be
    # self-contained, so module-level imports are not visible at run time.
    import requests
    from vertex_utils import (
        prepare_df_from_json,
        prepare_aggregations,
        enrich_with_drivers,
        enrich_with_races,
        save_historic_to_big_query
    )

    API_BASE_URL = "https://big-data-project-api-248863766350.europe-west3.run.app/laps"
    # Without a timeout, requests waits forever on an unresponsive server and
    # the pipeline step would hang instead of failing.
    REQUEST_TIMEOUT_SECONDS = 60

    api_url = f"{API_BASE_URL}/{race_id}"

    print(f"Gathering data for race: {race_id}...")
    response = requests.get(api_url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail the step on an HTTP error instead of feeding an error payload
    # into the preprocessing helpers.
    response.raise_for_status()
    data = response.json()

    df = prepare_df_from_json(data)

    print("Preparing aggregations...")
    aggregations = prepare_aggregations(df)

    print("Adding information from drivers table...")
    aggregations = enrich_with_drivers(aggregations, current_year)

    print("Adding information from races table...")
    aggregations = enrich_with_races(aggregations)

    # Remove identifier/raw columns before the aggregations are stored.
    aggregations = aggregations.drop(
        columns=["driverId", "raceId", "race_id", "milliseconds"]
    )

    print("Saving aggregations to BigQuery...")
    save_historic_to_big_query(aggregations)

In [35]:
@component(
    base_image=BASE_IMAGE
)
def train_model(trained_model: Output[Model]):
    """Train an XGBoost regressor on the historic data and save it as a model artifact.

    Args:
        trained_model: Output artifact. Its ``path`` is treated as a directory
            into which ``model.joblib`` is written.
    """
    from xgboost import XGBRegressor
    import joblib
    import os

    from vertex_utils import load_historic

    final_df = load_historic()

    # 'final_position' is the regression target; all remaining columns are features.
    X = final_df.drop(columns=['final_position'])
    y = final_df['final_position']

    xgb_model = XGBRegressor(
        objective='reg:squarederror',
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        random_state=42
    )

    xgb_model.fit(X, y)

    model_dir = trained_model.path
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(model_dir, exist_ok=True)

    model_path = os.path.join(model_dir, "model.joblib")
    joblib.dump(xgb_model, model_path)
    print(f"Model saved to {model_path}")

In [36]:
@component(base_image=BASE_IMAGE)
def deploy_model(trained_model: Input[Model]):
    """Register the trained model artifact with Vertex AI.

    Only ``aiplatform.Model.upload`` is executed; the endpoint creation and
    deployment calls at the bottom are commented out, so despite its name this
    step does not deploy anything.

    Args:
        trained_model: Input artifact whose ``uri`` is passed as ``artifact_uri``.
    """
    from google.cloud import aiplatform

    gcp_project = "bda-gameon-demo"
    gcp_region = "europe-west3"
    # Only referenced by the disabled deployment snippet below.
    endpoint_name = "f1_model_endpoint"
    serving_image = "us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-5:latest"

    aiplatform.init(project=gcp_project, location=gcp_region)

    registered_model = aiplatform.Model.upload(
        display_name="f1_model",
        artifact_uri=trained_model.uri,
        serving_container_image_uri=serving_image,
    )
    print(f"Model registered with resource name: {registered_model.resource_name}")

    # Disabled: create an endpoint and deploy the registered model to it.
    # endpoint = aiplatform.Endpoint.create(display_name=endpoint_name)
    # registered_model.deploy(
    #     endpoint=endpoint,
    #     deployed_model_display_name="f1_model_deployment",
    #     machine_type="n1-standard-2",
    # )
    # print(f"Model deployed to endpoint: {endpoint.resource_name}")

In [37]:
@dsl.pipeline(
    name="batch_processing",
    description="Pipeline responsible for batch processing and model training",
)
def batch_processing_pipeline(current_year: int = 2024, race_id: int = 1):
    """Run preprocessing, model training and model registration in sequence.

    Args:
        current_year: Forwarded to the ``load_and_preprocess`` component.
        race_id: Forwarded to the ``load_and_preprocess`` component.
    """
    preprocess_task = load_and_preprocess(current_year=current_year, race_id=race_id)
    preprocess_task.set_display_name("Load and Preprocess Data")

    # Training takes no input from preprocessing, so the ordering has to be
    # declared explicitly.
    training_task = train_model()
    training_task.after(preprocess_task)
    training_task.set_display_name("Model training")

    deploy_model(trained_model=training_task.outputs["trained_model"])

In [38]:
# Compile the pipeline function into a local JSON spec file.
pipeline_spec_path = "batch_processing_pipeline.json"
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=batch_processing_pipeline,
    package_path=pipeline_spec_path,
)

aiplatform.init(project="bda-gameon-demo", location="europe-west3")

# Values that override the pipeline's default parameters for this run.
run_parameters = {
    "current_year": 2024,
    "race_id": 5,
}

pipeline_job = aiplatform.PipelineJob(
    display_name="batch_processing_job",
    template_path=pipeline_spec_path,
    parameter_values=run_parameters,
)

# sync=True keeps the cell blocked until the pipeline run has finished.
pipeline_job.run(sync=True)

Creating PipelineJob
PipelineJob created. Resource name: projects/248863766350/locations/europe-west3/pipelineJobs/batch-processing-20241121185501
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/248863766350/locations/europe-west3/pipelineJobs/batch-processing-20241121185501')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/europe-west3/pipelines/runs/batch-processing-20241121185501?project=248863766350
PipelineJob projects/248863766350/locations/europe-west3/pipelineJobs/batch-processing-20241121185501 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/248863766350/locations/europe-west3/pipelineJobs/batch-processing-20241121185501 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob run completed. Resource name: projects/248863766350/locations/europe-west3/pipelineJobs/batch-processing-20241121185501
