In [17]:
from kfp import dsl
from kfp.dsl import (
    component, 
    Output,
    Input,
    Model
)

from kfp import compiler
from google.cloud import aiplatform

In [None]:
# Container image that the pipeline components below run in (passed as
# `base_image` to each @component decorator).
# NOTE(review): the `:latest` tag is mutable, so two pipeline runs are not
# guaranteed to use the same image build — consider pinning a version or digest.
# Presumably this image ships `vertex_utils` and the BigQuery client; confirm.
BASE_IMAGE = "europe-west3-docker.pkg.dev/bda-gameon-demo/vertex/base_football_container:latest"

In [None]:
@component(
    base_image=BASE_IMAGE
)
def load_and_preprocess(
    gamesweek: int,
    competitionId: int,
):
    """Fetch one gameweek of match and event data, aggregate it and store it in BigQuery.

    The matches for the requested gameweek/competition are downloaded from the
    project API, the per-match events are fetched with ``fetch_events_data``,
    both are merged and enriched by the ``vertex_utils`` helpers, and the
    resulting aggregations are handed to ``save_historic_to_big_query``.

    Args:
        gamesweek: Gameweek number, sent to the API as the ``gameweek`` query parameter.
        competitionId: Competition identifier, sent to the API as ``competitionId``.

    Raises:
        requests.HTTPError: If the matches endpoint answers with a 4xx/5xx status.
        requests.Timeout: If the matches endpoint does not answer in time.
        ValueError: If the response body is not an object with a ``matches`` key.
    """
    # Imports live inside the function because KFP serialises the function
    # body and runs it on its own inside the component container.
    import requests
    from vertex_utils import (
        prepare_df_from_events_api,
        prepare_df_from_matches_api,
        fetch_events_data,
        merge_events_and_matches,
        enrich_with_tags_names,
        save_historic_to_big_query,
        prepare_aggregations,
    )

    API_BASE_URL = "https://big-data-project-api-248863766350.europe-west3.run.app/laps"
    # Without a timeout `requests` waits forever on a stalled connection,
    # which would leave the pipeline step hanging.
    REQUEST_TIMEOUT_SECONDS = 60

    # Fetch matches for the given gameweek and competitionId
    print(f"Fetching match info for competition: {competitionId} and gameweek: {gamesweek}...")
    response_matches = requests.get(
        f"{API_BASE_URL}/matches",
        # Let requests build and encode the query string.
        params={"gameweek": gamesweek, "competitionId": competitionId},
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Fail on HTTP errors instead of trying to parse an error page as match data.
    response_matches.raise_for_status()
    matches_data = response_matches.json()

    # Guard against non-object payloads (e.g. a list or null) as well as a missing key.
    if not isinstance(matches_data, dict) or "matches" not in matches_data:
        raise ValueError("Invalid matches data format received from API.")

    print(f"Gathering match info for competition: {competitionId} and gamesweek: {gamesweek}...")
    match_df = prepare_df_from_matches_api(matches_data)

    print(f"Fetching events data for competition: {competitionId} and gameweek: {gamesweek}...")
    events_data = fetch_events_data(matches_data, API_BASE_URL)

    # Convert events data to a DataFrame
    print(f"Preparing events DataFrame for competition: {competitionId} and gameweek: {gamesweek}...")
    events_df = prepare_df_from_events_api(events_data)

    print("Merging events and match data...")
    df = merge_events_and_matches(events_df, match_df)

    print("Enriching with tags names...")
    df = enrich_with_tags_names(df)

    print("Preparing aggregations...")
    aggregations = prepare_aggregations(df)

    print("Saving aggregations to BigQuery...")
    save_historic_to_big_query(aggregations)

In [None]:
@component(
    base_image=BASE_IMAGE
)
def train_model():
    # Pipeline component: (re)creates the BigQuery ML model
    # `football.lightgbm_model` from every column of the
    # `historic_aggregations` table, using `label` as the target column.
    # The import is local because KFP runs the function body on its own
    # inside the component container.
    from google.cloud import bigquery

    bq_client = bigquery.Client(project="bda-gameon-demo")

    # NOTE(review): the model is named "lightgbm_model" but the model type is
    # BOOSTED_TREE_CLASSIFIER — confirm the name is intentional.
    # NOTE(review): confirm `budget_hours` is an accepted option for
    # BOOSTED_TREE_CLASSIFIER in the BigQuery ML CREATE MODEL reference.
    create_model_sql = """    
        CREATE OR REPLACE MODEL football.lightgbm_model
        OPTIONS(model_type='BOOSTED_TREE_CLASSIFIER',
                input_label_cols=['label'],
                budget_hours=1)
        AS
        SELECT
            *
        FROM
            `bda-gameon-demo.football.historic_aggregations`
    """

    training_job = bq_client.query(create_model_sql)
    # Wait for the job so the step only succeeds once training has finished.
    training_job.result()

In [None]:
@dsl.pipeline(
    name="batch_processing",
    description="Pipeline responsible for batch processing and model training",
)
def batch_processing_pipeline(
    gamesweek: int = 1,
    competitionId: int = 364,
):
    # Two-step pipeline: ingest/aggregate the requested gameweek, then retrain
    # the model. Both pipeline parameters are forwarded unchanged to the
    # load-and-preprocess component.
    preprocess_task = load_and_preprocess(
        gamesweek=gamesweek,
        competitionId=competitionId,
    )
    preprocess_task.set_display_name("Load and Preprocess")

    # No data is passed between the steps, so the ordering has to be declared
    # explicitly with `.after(...)`.
    training_task = train_model()
    training_task.after(preprocess_task)
    training_task.set_display_name("Train Model")

In [None]:
# Compile the pipeline function to a local JSON spec, then submit that spec
# as a Vertex AI pipeline job.
PIPELINE_SPEC_PATH = "batch_processing_pipeline.json"

pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=batch_processing_pipeline,
    package_path=PIPELINE_SPEC_PATH,
)

aiplatform.init(project="bda-gameon-demo", location="europe-west3")

# Values for this run; the keys match the parameters of batch_processing_pipeline.
run_parameters = {
    "gamesweek": 1,
    "competitionId": 364,
}

pipeline_job = aiplatform.PipelineJob(
    display_name="batch_processing_job",
    template_path=PIPELINE_SPEC_PATH,
    parameter_values=run_parameters,
)

# sync=True keeps the cell blocked until the pipeline run completes.
pipeline_job.run(sync=True)