# End-to-End UAT Test: Wine Quality Predictor

This notebook demonstrates a complete end-to-end machine learning pipeline using Kubeflow, MLflow, and KServe. The pipeline covers the following steps:
1. **Data Ingestion**: Downloading a wine quality dataset from a public URL.
2. **Data Preprocessing**: Cleaning and transforming the dataset into a format suitable for model training.
3. **Model Training**: Training an ElasticNet regression model to predict wine quality, with automatic logging of model artifacts to MLflow.
4. **Model Deployment**: Deploying the trained model as a scalable inference service using KServe.
5. **Model Inference**: Making predictions on new data using the deployed model and verifying the end-to-end functionality.
6. **Cleanup**: Removing the deployed inference service after the test is completed to free up resources.

This UAT demonstrates the integration of Kubeflow Pipelines with MLflow for model management and with KServe for model deployment. It also demonstrates proper resource management by cleaning up the deployed services once the test has finished.

In [None]:
!pip install -r requirements.txt

In [None]:
import kfp
import mlflow
import os
import requests

from kfp.dsl import Input, Model, component
from kfp.dsl import InputPath, OutputPath, pipeline, component
from kserve import KServeClient
from mlflow.tracking import MlflowClient
from tenacity import retry, stop_after_attempt, wait_exponential

In [None]:
# Proxy configuration is all-or-nothing: the three values are taken from the
# environment only when HTTP_PROXY, HTTPS_PROXY and NO_PROXY are all set to a
# non-empty value. Otherwise all three stay None.
if all(os.environ.get(var) for var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY")):
    HTTP_PROXY = os.environ["HTTP_PROXY"]
    HTTPS_PROXY = os.environ["HTTPS_PROXY"]
    # NOTE(review): the value is used as-is; `.kubeflow` is NOT appended here.
    # Pipelines presumably need `.kubeflow` in NO_PROXY, so it has to be
    # present in the environment variable already - TODO confirm.
    NO_PROXY = os.environ["NO_PROXY"]
else:
    HTTP_PROXY = HTTPS_PROXY = NO_PROXY = None


def add_proxy(obj, http_proxy=HTTP_PROXY, https_proxy=HTTPS_PROXY, no_proxy=NO_PROXY):
    """Set the proxy environment variables on a PipelineTask object.

    Each variable is set under both its lower-case and its upper-case name.
    Every ``set_env_variable`` call is made on the object returned by the
    previous call, and the result of the last call is returned.
    """
    env_vars = (
        ("http_proxy", http_proxy),
        ("https_proxy", https_proxy),
        ("HTTP_PROXY", http_proxy),
        ("HTTPS_PROXY", https_proxy),
        ("no_proxy", no_proxy),
        ("NO_PROXY", no_proxy),
    )
    task = obj
    for env_name, env_value in env_vars:
        task = task.set_env_variable(name=env_name, value=env_value)
    return task


def proxy_envs_set() -> bool:
    """Return True only when HTTP_PROXY, HTTPS_PROXY and NO_PROXY are all truthy."""
    return bool(HTTP_PROXY and HTTPS_PROXY and NO_PROXY)

In [None]:
# Fixed names used by this notebook.
# NOTE(review): where each one is consumed is not visible in this part of the
# notebook; the descriptions below follow the constant names - confirm.
MLFLOW_RUN_NAME = "elastic_net_models"  # MLflow run name
MLFLOW_MODEL_NAME = "wine-elasticnet"  # MLflow model name
ISVC_NAME = "wine-regressor3"  # Inference Service name


@component(
    base_image="python:3.11",  # Use Python 3.11 base image
    packages_to_install=["requests==2.32.3", "pandas==2.2.2"],
)
def download_dataset(url: str, dataset_path: OutputPath("Dataset")) -> None:
    """Download a semicolon-separated CSV and save it as a comma-separated CSV.

    Args:
        url: Address the CSV file is fetched from with an HTTP GET request.
        dataset_path: File path the resulting CSV (without the index column)
            is written to.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If connecting or reading exceeds the timeout.
    """
    # Imports are kept inside the function body, as in the other components.
    # NOTE(review): presumably required because the component body is executed
    # on its own inside the container - confirm against the KFP docs.
    from io import StringIO

    import pandas as pd
    import requests

    # requests never times out by default, so an unresponsive server would
    # stall this step forever. Use a (connect, read) timeout in seconds.
    response = requests.get(url, timeout=(10, 120))
    response.raise_for_status()

    # The source file uses ";" as separator and has a header row.
    dataset = pd.read_csv(StringIO(response.text), header=0, sep=";")

    # Save the DataFrame to a CSV file at the specified output path
    dataset.to_csv(dataset_path, index=False)


@component(
    base_image="python:3.11",  # Use Python 3.11 base image
    packages_to_install=["pandas==2.2.2", "pyarrow==15.0.2"],
)
def preprocess_dataset(dataset: InputPath("Dataset"), output_file: OutputPath("Dataset")) -> None:
    """Normalise the column names of a CSV dataset and store it as Parquet.

    Args:
        dataset: Path of the input CSV file (with a header row).
        output_file: Path the Parquet file is written to.
    """
    import pandas as pd

    def normalize(column_name):
        # Lower-case the name and use underscores instead of spaces.
        return column_name.lower().replace(" ", "_")

    raw_frame = pd.read_csv(dataset, header=0)
    cleaned_frame = raw_frame.rename(columns=normalize)

    # Persist the result in Parquet format.
    cleaned_frame.to_parquet(output_file)


@component(
    base_image="python:3.11",  # Use Python 3.11 base image
    packages_to_install=[
        "pandas==2.2.2",
        "scikit-learn==1.5.1",
        "mlflow==2.15.1",
        "pyarrow==15.0.2",
        "boto3==1.34.162",
    ],
)
def train_model(dataset: InputPath("Dataset"), run_name: str, model_name: str) -> str:
    """Train an ElasticNet regressor and log and register it with MLflow.

    Args:
        dataset: Path of the Parquet file to train on. It must contain a
            ``quality`` column, which is used as the prediction target; all
            other columns are used as features.
        run_name: Name given to the MLflow run.
        model_name: Name under which the model is registered in MLflow.

    Returns:
        The artifact URI of the MLflow run with ``/model`` appended.
    """
    import mlflow
    import pandas as pd
    from sklearn.linear_model import ElasticNet
    from sklearn.model_selection import train_test_split

    # Load the preprocessed dataset
    df = pd.read_parquet(dataset)

    # Define the target column for prediction
    target_column = "quality"

    # Only the training part of the split is used below. The split itself is
    # kept so the model is still fitted on the same stratified 75 % subset.
    train_x, _test_x, train_y, _test_y = train_test_split(
        df.drop(columns=[target_column]),
        df[target_column],
        test_size=0.25,
        random_state=42,
        stratify=df[target_column],
    )

    # Enable MLflow auto logging for scikit-learn models
    mlflow.sklearn.autolog()

    # Start an MLflow run and train the model
    with mlflow.start_run(run_name=run_name) as run:
        mlflow.set_tag("author", "kf-testing")
        lr = ElasticNet(alpha=0.5, l1_ratio=0.5, random_state=42)
        lr.fit(train_x, train_y)
        # Log the fitted model under "model" and register it as `model_name`.
        mlflow.sklearn.log_model(lr, "model", registered_model_name=model_name)

        # Build the model artifact URI from the run's artifact location.
        model_uri = f"{run.info.artifact_uri}/model"
        print(model_uri)

    # Return the model artifact URI as a string
    return model_uri

## Delete Inference Service

# Delete MLflow data