In [1]:
from kfp import dsl
from kfp.dsl import component
from kfp import compiler
from google.cloud import aiplatform
from typing import List

In [2]:
BASE_IMAGE = "europe-west3-docker.pkg.dev/bda-gameon-demo/vertex/base_cricket_container:latest"

In [3]:
@component(
    base_image=BASE_IMAGE,
)
def load_and_preprocess(
    match_ids: List[int],
    write_disp: str = 'APPEND',
):
    from cricket_utils import (
        load_data,
        transform_cricket_data,
        save_historic_to_big_query,
    )

    print("Loading data...")
    df = load_data(match_ids)

    print("Transforming data...")
    df = transform_cricket_data(df)

    print("Saving data to BigQuery...")
    save_historic_to_big_query(df, write_disp)

In [None]:
@component(
    base_image=BASE_IMAGE
)
def train_model():
    from google.cloud import bigquery
    import logging

    query = """    
        CREATE OR REPLACE MODEL cricket.xgboost_model
        OPTIONS(model_type='BOOSTED_TREE_CLASSIFIER', 
                input_label_cols=['winner']) AS
        SELECT 
        inning,
        `over` as over_number,
        cumulative_score,
        cumulative_wickets,
        toss_decision,
        toss_winner,
        season,
        current_team,
        first_inning_total_score,
        first_inning_total_wickets,
        first_inning_run_rate,
        team_1,
        team_2,
        run_rate,
        required_run_rate,
        winner
        FROM cricket.historic_data;
    """
    try:
        logging.info("Initializing BigQuery client...")
        client = bigquery.Client(project="bda-gameon-demo")
        logging.info("Running query:\n%s", query)
        query_job = client.query(query)
        query_job.result()
        logging.info("Query completed successfully.")
    except Exception as e:
        logging.error("Failed to execute query: %s", str(e))
        raise e

In [5]:
@component(
    base_image=BASE_IMAGE
)
def write_metrics_to_bq():
    from google.cloud import bigquery
    import logging

    model_name = "bda-gameon-demo.cricket.xgboost_model"
    table_id = "bda-gameon-demo.cricket.model_metrics"

    try:
        logging.info("Initializing BigQuery client...")
        client = bigquery.Client(project="bda-gameon-demo")

        logging.info("Checking if table exists: %s", table_id)
        dataset_id, table_name = table_id.split('.')[-2:]
        dataset = client.dataset(dataset_id)
        table = dataset.table(table_name)

        try:
            client.get_table(table)
            logging.info("Table exists: %s", table_id)
        except Exception:
            logging.info("Table does not exist. Creating table: %s", table_id)
            schema = [
                bigquery.SchemaField("metric_name", "STRING", mode="REQUIRED"),
                bigquery.SchemaField("metric_value", "FLOAT", mode="REQUIRED"),
            ]
            table = bigquery.Table(table_id, schema=schema)
            client.create_table(table)
            logging.info("Table created successfully: %s", table_id)

        eval_query = f"""
            SELECT *
            FROM ML.EVALUATE(MODEL `{model_name}`)
        """
        logging.info("Retrieving evaluation metrics...")
        eval_results = client.query(eval_query).result()

        rows_to_insert = []
        for row in eval_results:
            rows_to_insert.append({
                "metric_name": row.metric_name,
                "metric_value": row.metric_value
            })

        # Write metrics to BigQuery table
        logging.info("Writing metrics to BigQuery table: %s", table_id)
        errors = client.insert_rows_json(table_id, rows_to_insert)
        if errors:
            logging.error("Errors occurred while inserting rows: %s", errors)
            raise Exception(f"Failed to write metrics: {errors}")
        else:
            logging.info("Metrics successfully written to BigQuery.")
    except Exception as e:
        logging.error("Failed to write metrics to BigQuery: %s", str(e))
        raise e


In [6]:
@dsl.pipeline(name="batch_processing", description="Cricket batch processing pipeline")
def cricket_batch_processing_pipeline(match_ids: List[int] = [i for i in range(1, 30)], write_disp: str = 'APPEND'):
    
    load_preprocess_step = load_and_preprocess(
        match_ids=match_ids, 
        write_disp=write_disp
    ).set_display_name("Load and Preprocess Data")

    train_step = train_model().after(load_preprocess_step).set_display_name('Train Model')

    write_metrics_step = write_metrics_to_bq().after(train_step).set_display_name('Write Model Metrics to BigQuery')

In [7]:
compiler.Compiler().compile(
    pipeline_func=cricket_batch_processing_pipeline,
    package_path="cricket_batch_processing_pipeline.json",
)

In [8]:
aiplatform.init(project="bda-gameon-demo", location="europe-west3")

pipeline_job = aiplatform.PipelineJob(
    display_name="cricket_batch_processing_pipeline",
    template_path="cricket_batch_processing_pipeline.json",
    parameter_values={
        "match_ids": [2363],
        "write_disp": 'APPEND',
    },
)

pipeline_job.run(sync=True)

Creating PipelineJob
PipelineJob created. Resource name: projects/248863766350/locations/europe-west3/pipelineJobs/batch-processing-20250104225145
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/248863766350/locations/europe-west3/pipelineJobs/batch-processing-20250104225145')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/europe-west3/pipelines/runs/batch-processing-20250104225145?project=248863766350
PipelineJob projects/248863766350/locations/europe-west3/pipelineJobs/batch-processing-20250104225145 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/248863766350/locations/europe-west3/pipelineJobs/batch-processing-20250104225145 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/248863766350/locations/europe-west3/pipelineJobs/batch-processing-20250104225145 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/248863766350/locations/europe-west3/pi

RuntimeError: Job failed with:
code: 9
message: " The DAG failed because some tasks failed. The failed tasks are: [write-metrics-to-bq].; Job (project_id = bda-gameon-demo, job_id = 5320134345552297984) is failed due to the above error.; Failed to handle the job: {project_number = 248863766350, job_id = 5320134345552297984}"


In [11]:
from google.cloud import bigquery
PROJECT_ID = "bda-gameon-demo"
MODEL = "cricket.xgboost_model"
row = {
    "inning": 1,
    "over_number": 1,
    "cumulative_score": 0,
    "cumulative_wickets": 0,
    "toss_decision": 1,
    "toss_winner": 1,
    "season": 2021,
    "current_team": 1,
    "first_inning_total_score": 0,
    "first_inning_total_wickets": 0,
    "first_inning_run_rate": 0,
    "team_1": "A",
    "team_2": "B",
    "run_rate": 0,
    "required_run_rate": 0,
}

client = bigquery.Client(project=PROJECT_ID)

query = f"""
SELECT winner
FROM ML.PREDICT(MODEL `{MODEL}`, (
    SELECT 
    {row['inning']} AS inning,
    {row['over_number']} AS over_number,
    {row['cumulative_score']} AS cumulative_score,
    {row['cumulative_wickets']} AS cumulative_wickets,
    {row['toss_decision']} AS toss_decision,
    {row['toss_winner']} AS toss_winner,
    {row['season']} AS season,
    {row['current_team']} AS current_team,
    {row['first_inning_total_score']} AS first_inning_total_score,
    {row['first_inning_total_wickets']} AS first_inning_total_wickets,
    {row['first_inning_run_rate']} AS first_inning_run_rate,
    '{row['team_1']}' AS team_1,
    '{row['team_2']}' AS team_2,
    {row['run_rate']} AS run_rate,
    {row['required_run_rate']} AS required_run_rate
))
"""

query_job = client.query(query)
results = list(query_job.result())
print(f"Prediction: {results[0]['winner']}")

BadRequest: 400 Invalid table-valued function ML.PREDICT
Column over_number with type INT64 cannot be converted to type STRING from training implicitly according to the coercion rule: https://cloud.google.com/bigquery/docs/reference/standard-sql/conversion_rules. at [3:6]; reason: invalidQuery, location: query, message: Invalid table-valued function ML.PREDICT
Column over_number with type INT64 cannot be converted to type STRING from training implicitly according to the coercion rule: https://cloud.google.com/bigquery/docs/reference/standard-sql/conversion_rules. at [3:6]

Location: EU
Job ID: 9ecdbb09-01b9-4d39-82e6-baf53a059940
