## Baseball win percentage prediction  

In [None]:
# Install or upgrade kfp (>2), google-cloud-pipeline-components (>2) and
# google-cloud-aiplatform into the user site-packages, bypassing the pip cache.
! pip3 install --user --no-cache-dir --upgrade "kfp>2" "google-cloud-pipeline-components>2" \
                                        google-cloud-aiplatform

In [None]:
# Restart the kernel so that the packages installed above are picked up.
import os

# The restart is skipped when the IS_TESTING environment variable is set.
if not os.getenv("IS_TESTING"):
    import IPython

    # Shut the running kernel down and ask for an automatic restart.
    IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
# Print the installed versions of kfp, the aiplatform package(s) and
# google_cloud_pipeline_components to confirm the installation succeeded.
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! pip3 freeze | grep aiplatform
! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

### Set up global variables

In [None]:
# The Google Cloud project that this pipeline runs in (placeholder - replace it).
PROJECT_ID = "your project id"
# The region that this pipeline runs in.
REGION = "us-central1"
# A Cloud Storage URI that your pipelines service account can access.
# The artifacts of your pipeline runs are stored within the pipeline root.
PIPELINE_ROOT = "gs://your temp bucket"


# The model_repo is specified in the pipeline definition.
# Threshold values are specified in the pipeline definition.
# The data url is specified in the pipeline definition.

### Import libraries

In [None]:
import kfp
import typing
from typing import Dict
from typing import NamedTuple
from kfp import dsl
from kfp.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        OutputPath, 
                        InputPath)
import google.cloud.aiplatform as aip
from google_cloud_pipeline_components.types import artifact_types

## Create pipeline components
Components include:
* Load data
* Split and scale data
* Train model
* Evaluate model
* Deploy model

In [1]:
# List columns
import os

import pandas as pd

# Access tokens must not be committed to the notebook. If the file needs one,
# export it as GITHUB_RAW_TOKEN before starting the kernel; otherwise the
# plain URL is used.
baseball_csv_url = "https://raw.githubusercontent.com/Anne-Barnasconi/DEProject1/refs/heads/main/Data/baseball_clean.csv"
github_raw_token = os.getenv("GITHUB_RAW_TOKEN")
if github_raw_token:
    baseball_csv_url = f"{baseball_csv_url}?token={github_raw_token}"

df_baseball = pd.read_csv(baseball_csv_url, delimiter=",")
df_baseball

Unnamed: 0,League,On_base_percentage,Slugging_percentage,Batting_average,Opponent_on_base_percentage,Opponent_slugging_percentage,Win_percentage,decade60s,decade70s,decade80s,decade90s,decade2000s,decade2010s
0,1,0.328,0.418,0.259,0.317,0.415,0.500000,0,0,0,0,0,1
1,1,0.320,0.389,0.247,0.306,0.378,0.580247,0,0,0,0,0,1
2,0,0.311,0.417,0.247,0.315,0.403,0.574074,0,0,0,0,0,1
3,0,0.315,0.415,0.260,0.331,0.428,0.425926,0,0,0,0,0,1
4,1,0.302,0.378,0.240,0.335,0.424,0.376543,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
415,1,0.356,0.434,0.271,0.345,0.423,0.530864,0,0,0,1,0,0
416,1,0.338,0.426,0.262,0.355,0.427,0.465839,0,0,0,1,0,0
417,0,0.343,0.411,0.274,0.371,0.448,0.425926,0,0,0,1,0,0
418,0,0.361,0.479,0.293,0.346,0.459,0.586420,0,0,0,1,0,0


## Create pipeline components

### Data ingestion

In [None]:
@dsl.component(
    base_image="python:3.10.7-slim",
    packages_to_install=["pandas", "google-cloud-storage"],
)
def download_data(project_id: str, bucket: str, file_name: str, dataset: Output[Dataset]):
    """Download one object from a Cloud Storage bucket into the dataset artifact.

    Args:
        project_id: Project passed to the Cloud Storage client.
        bucket: Name of the bucket that holds the object.
        file_name: Name of the object (blob) inside the bucket.
        dataset: Output artifact; the object is saved at ``dataset.path + ".csv"``.
    """
    import logging
    import sys

    import pandas as pd
    from google.cloud import storage

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Consumers of this artifact must append the same ".csv" suffix.
    local_csv_path = dataset.path + ".csv"

    # Fetch the object from the bucket and store it next to the artifact path.
    source_blob = storage.Client(project=project_id).bucket(bucket).blob(file_name)
    source_blob.download_to_filename(local_csv_path)
    logging.info('Downloaded Data!')
    

### Train and test split and scaling

In [None]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn==1.3.2"],
    base_image="python:3.10.7-slim"
)
def train_test_split_scale(dataset: Input[Dataset], X_train: Output[Dataset], X_test: Output[Dataset], y_train: Output[Dataset], y_test: Output[Dataset]):
    """Split the data into train/test sets and min-max scale the features.

    The scaler is fitted on the training features only and then applied to the
    test features. The target column ``Win_percentage`` is not scaled.

    Args:
        dataset: Input artifact; the CSV is read from ``dataset.path + ".csv"``.
        X_train: Output artifact for the scaled training features.
        X_test: Output artifact for the scaled test features.
        y_train: Output artifact for the training target.
        y_test: Output artifact for the test target.

    Every output is written as CSV to ``<artifact>.path + ".csv"``.
    """
    import pandas as pd
    import logging
    import sys
    from sklearn.model_selection import train_test_split as tts
    from sklearn.preprocessing import MinMaxScaler

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    target_column = 'Win_percentage'

    # The download step stores the file at "<path>.csv", so read it from there.
    alldata = pd.read_csv(dataset.path + ".csv", index_col=None)
    # NOTE(review): if the CSV carries a saved index column (e.g. "Unnamed: 0")
    # it is kept as a feature here - confirm this is intended.
    features = alldata.drop(labels=[target_column], axis=1)
    target = alldata[target_column]

    # Use local names that do not clobber the output artifacts (X_train, ...);
    # their .path attributes are still needed below.
    features_train, features_test, target_train, target_test = tts(features, target, random_state=0)

    # Fit the scaler on the training data only, then apply it to both sets.
    scaler = MinMaxScaler()
    features_train_scaled = pd.DataFrame(scaler.fit_transform(features_train), columns=features.columns)
    features_test_scaled = pd.DataFrame(scaler.transform(features_test), columns=features.columns)

    # Write each split next to its output artifact path.
    features_train_scaled.to_csv(X_train.path + ".csv", index=False, encoding='utf-8-sig')
    features_test_scaled.to_csv(X_test.path + ".csv", index=False, encoding='utf-8-sig')
    target_train.to_csv(y_train.path + ".csv", index=False, encoding='utf-8-sig')
    target_test.to_csv(y_test.path + ".csv", index=False, encoding='utf-8-sig')
    logging.info('Wrote %d training rows and %d test rows', len(features_train_scaled), len(features_test_scaled))

### Train the KNN Regression model

In [None]:
@dsl.component(
    packages_to_install=['pandas', 'scikit-learn==1.3.2'],
    base_image="python:3.10.7-slim"
)
def train_model(X_train: Input[Dataset], y_train: Input[Dataset], out_model: Output[Model]):
    """Train a KNN regressor with k = 4 neighbours and pickle it.

    Args:
        X_train: Input artifact; features are read from ``X_train.path + ".csv"``.
        y_train: Input artifact; the target is read from ``y_train.path + ".csv"``.
        out_model: Output artifact; the fitted model is pickled to
            ``out_model.path + ".pkl"`` and its ``framework`` metadata is set
            to ``"KNN"``.
    """
    import logging
    import pickle
    import sys

    import pandas as pd
    from sklearn.neighbors import KNeighborsRegressor

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Load the train set into local names so the artifacts stay accessible.
    features_train = pd.read_csv(X_train.path + ".csv")
    target_train = pd.read_csv(y_train.path + ".csv")

    # Log the feature and target columns used for training.
    logging.info(features_train.columns)
    logging.info(target_train.columns)

    # Train the model
    model_knn = KNeighborsRegressor(n_neighbors=4)
    model_knn.fit(features_train, target_train)

    # Save the model
    out_model.metadata["framework"] = "KNN"
    file_name = out_model.path + ".pkl"
    with open(file_name, 'wb') as file:
        pickle.dump(model_knn, file)


### Evaluate the model

In [None]:
@dsl.component(
    packages_to_install=['pandas', 'scikit-learn==1.3.2'],
    base_image="python:3.10.7-slim"
)
def evaluate_model(model_knn: Input[Model], X_test: Input[Dataset], y_test: Input[Dataset], thresholds_dict_str: str, metrics_dict: Output[Metrics]) -> NamedTuple('outputs', deploy=bool):
    """Evaluate the pickled KNN regressor on the test set and decide on deployment.

    Args:
        model_knn: Input artifact; the model is unpickled from
            ``model_knn.path + ".pkl"``.
        X_test: Input artifact; features are read from ``X_test.path + ".csv"``.
        y_test: Input artifact; the target is read from ``y_test.path + ".csv"``.
        thresholds_dict_str: JSON object containing an ``"R-squared"`` key with
            the minimum R^2 required for deployment.
        metrics_dict: Output metrics artifact that receives ``R-squared`` and
            ``RMSE``.

    Returns:
        A named tuple whose ``deploy`` field is True when the test-set R^2 is
        greater than or equal to the threshold.
    """
    import json
    import logging
    import pickle
    import sys
    from typing import NamedTuple

    import pandas as pd
    from sklearn.metrics import mean_squared_error

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Load the test data into local names so the artifacts are not shadowed.
    features_test = pd.read_csv(X_test.path + ".csv")
    target_test = pd.read_csv(y_test.path + ".csv")

    # NOTE: pickle.load can execute arbitrary code. It is used here on the file
    # written by the training step; do not point it at untrusted files.
    with open(model_knn.path + ".pkl", 'rb') as file:
        model = pickle.load(file)

    # Predict from the features (not from the targets) and score the loaded
    # estimator (not the artifact wrapper).
    target_pred = model.predict(features_test)
    r_squared = float(model.score(features_test, target_test))
    # root_mean_squared_error only exists from scikit-learn 1.4 on; with the
    # pinned 1.3.2 the RMSE is the square root of the MSE.
    rmse = float(mean_squared_error(target_test, target_pred) ** 0.5)

    # Record the metrics on the output artifact instead of rebinding the name.
    metrics_dict.log_metric("R-squared", r_squared)
    metrics_dict.log_metric("RMSE", rmse)
    model_knn.metadata["R-squared"] = r_squared
    logging.info('R-squared: %s, RMSE: %s', r_squared, rmse)

    # Compare as floats: int() would truncate a threshold such as 0.7 to 0.
    thresholds_dict = json.loads(thresholds_dict_str)
    approval_value = r_squared >= float(thresholds_dict['R-squared'])

    outputs = NamedTuple('outputs', deploy=bool)
    return outputs(approval_value)

### Deploy the model

In [None]:
@dsl.component(
    base_image="python:3.10.7-slim",
    packages_to_install=["google-cloud-storage"],
)
def upload_model_to_gcs(project_id: str, model_repo: str, model_name:str, model: Input[Model]):
    """Upload the pickled model to a Cloud Storage bucket.

    Args:
        project_id: Project passed to the Cloud Storage client.
        model_repo: Name of the destination bucket.
        model_name: Destination object name without the ``.pkl`` suffix.
        model: Input artifact; the file at ``model.path + '.pkl'`` is uploaded.
    """
    import logging
    import sys

    from google.cloud import storage

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Local pickle produced by the training step.
    local_pickle = model.path + '.pkl'

    # Destination object "<model_name>.pkl" inside the model repository bucket.
    target_blob = storage.Client(project=project_id).bucket(model_repo).blob(model_name + '.pkl')
    target_blob.upload_from_filename(local_pickle)

    print(f"File {local_pickle} uploaded to {model_repo}.")

## Create the pipeline

In [None]:
@dsl.pipeline(
    # Default pipeline root. You can override it when submitting the pipeline.
    pipeline_root=PIPELINE_ROOT,
    # A name for the pipeline. Use to determine the pipeline Context.
    name="pipeline-moneyball",

)
def pipeline(
    url: str = "https://raw.githubusercontent.com/Anne-Barnasconi/DEProject1/refs/heads/main/Data/baseball_clean.csv",
    project: str = PROJECT_ID,
    region: str = REGION,
    thresholds_dict_str: str = '{"R-squared":0.7}',  # Threshold set to 0.7
    model_repo: str = "models_de2024_trs",
    data_bucket: str = "your data bucket",
    data_file_name: str = "baseball_clean.csv",
    ):
    """Download the data, split and scale it, train and evaluate a KNN model, and upload it if approved.

    Args:
        url: Kept for backward compatibility only; no step uses it because the
            download step reads from Cloud Storage. The access token that used
            to be embedded in the default has been removed.
        project: Google Cloud project used by the download and upload steps.
        region: Not used by any step in this pipeline.
        thresholds_dict_str: JSON object with the minimum ``"R-squared"``
            required for deployment.
        model_repo: Bucket that receives the trained model.
        data_bucket: Bucket that holds the input CSV (placeholder - replace it).
        data_file_name: Object name of the input CSV inside ``data_bucket``.
    """

    # Loading the data: the component expects project_id, bucket and file_name.
    data_op = download_data(
        project_id=project,
        bucket=data_bucket,
        file_name=data_file_name)

    # Splitting and scaling the data; the download output is named "dataset".
    split_scale_model_op = train_test_split_scale(
        dataset = data_op.outputs["dataset"])

    # Training the model
    model_train_op = train_model(
        X_train = split_scale_model_op.outputs["X_train"],
        y_train = split_scale_model_op.outputs["y_train"])

    # Evaluating the model; the test target output is named "y_test".
    model_evaluation_op = evaluate_model(
        model_knn = model_train_op.outputs["out_model"],
        X_test = split_scale_model_op.outputs["X_test"],
        y_test = split_scale_model_op.outputs["y_test"],
        thresholds_dict_str = thresholds_dict_str
    )

    # Upload the model only when the evaluation step approves deployment.
    with dsl.If(
        model_evaluation_op.outputs["deploy"]==True,
        name="deploy-model",
    ):

        upload_model_to_gc_op = upload_model_to_gcs(
            project_id=project,
            model_repo=model_repo,
            model_name="moneyball",
            model = model_train_op.outputs['out_model']
        )
    

### Compile and run the pipeline

In [None]:
from kfp import compiler

# Compile the pipeline function into the YAML package that is submitted below.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=pipeline,
    package_path='ml_moneyball.yaml',
)

In [None]:
import google.cloud.aiplatform as aip

# Point the SDK at the configured project and region.
aip.init(project=PROJECT_ID, location=REGION)

# Create a job from the compiled pipeline package, with caching disabled.
job = aip.PipelineJob(
    template_path="ml_moneyball.yaml",
    display_name="moneyball-pipeline",
    location=REGION,
    enable_caching=False,
)

# Submit the job.
job.run()