# Notebook from Data Scientist with E2E scenario for Wine Quality Predictor

This notebook demonstrates a complete end-to-end machine learning pipeline using Kubeflow, MLflow, and KServe. 

Expected Steps:
1. **Data Ingestion**: Downloading a wine quality dataset from a public URL.
2. **Data Preprocessing**: Cleaning and transforming the dataset into a format suitable for model training.
3. **Model Training**: Training an ElasticNet regression model to predict wine quality, with automatic logging of model artifacts to MLflow.
4. **Model Deployment**: Deploying the trained model as a scalable inference service using KServe.
5. **Model Inference**: Making predictions on new data using the deployed model and verifying the end-to-end functionality.
6. **Cleanup**: Removing the deployed inference service after the test is completed to free up resources.



In [None]:
!pip install -r requirements.txt

In [None]:
import kfp
import mlflow
import os
import requests

from kfp.dsl import Input, Model, component
from kfp.dsl import InputPath, OutputPath, pipeline, component
from kserve import KServeClient
from mlflow.tracking import MlflowClient
from tenacity import retry, stop_after_attempt, wait_exponential

In [None]:
# Public URL of the red-wine quality CSV that the pipeline downloads
url = "https://raw.githubusercontent.com/canonical/kubeflow-examples/main/e2e-wine-kfp-mlflow/winequality-red.csv"

# Name of the KServe Inference Service
# (presumably used when deploying, querying and deleting the service — confirm once those cells are filled in)
ISVC_NAME = "wine-regressor3"
# Presumably the run_name / model_name arguments for train_model — confirm against the pipeline definition
MLFLOW_RUN_NAME = "elastic_net_models"
MLFLOW_MODEL_NAME = "wine-elasticnet"


# TODO: Define the function as a pipeline component. 
# Let base_image be Python 3.11 and figure out the packages to install
@component()
def download_dataset(url: str, dataset_path: OutputPath("Dataset")) -> None:
    """Download a semicolon-separated CSV and store it as a comma-separated CSV.

    Args:
        url: HTTP(S) location of the source CSV file.
        dataset_path: Output path the re-serialized CSV is written to.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If connecting or reading exceeds the timeout.
    """
    from io import StringIO

    import pandas as pd
    import requests

    # requests waits forever by default; bound the (connect, read) time so a
    # stalled connection fails the step instead of hanging the pipeline.
    response = requests.get(url, timeout=(10, 60))
    response.raise_for_status()

    # The source file uses ";" as its delimiter; parse it into a DataFrame
    dataset = pd.read_csv(StringIO(response.text), header=0, sep=";")

    # Save the DataFrame to a CSV file at the specified output path
    dataset.to_csv(dataset_path, index=False)


# TODO: Define the function as a pipeline component. 
# Let base_image be Python 3.11 and set the packages to install to pandas==2.2.2 and pyarrow==15.0.2
@component()
def preprocess_dataset(dataset: InputPath("Dataset"), output_file: OutputPath("Dataset")) -> None:
    """Normalize the column names of a CSV dataset and write it out as Parquet.

    Column names are lower-cased and their spaces replaced with underscores.

    Args:
        dataset: Path of the input CSV file (first row is the header).
        output_file: Path the resulting Parquet file is written to.
    """
    import pandas as pd

    raw = pd.read_csv(dataset, header=0)

    # Standardize every column label: lower case, spaces become underscores
    normalized = raw.rename(columns=lambda label: label.lower().replace(" ", "_"))

    normalized.to_parquet(output_file)


@component(
    base_image="python:3.11",  # Use Python 3.11 base image
    packages_to_install=[
        "pandas==2.2.2",
        "scikit-learn==1.5.1",
        "mlflow==2.15.1",
        "pyarrow==15.0.2",
        "boto3==1.34.162",
    ],
)
def train_model(dataset: InputPath("Dataset"), run_name: str, model_name: str) -> str:
    """Train an ElasticNet regressor on the dataset and log it to MLflow.

    Args:
        dataset: Path of the preprocessed Parquet dataset; it must contain a
            ``quality`` column, which is used as the prediction target.
        run_name: Name given to the MLflow run.
        model_name: Name under which the logged model is registered.

    Returns:
        The model artifact URI, built as ``<run artifact URI>/model``.
    """
    import mlflow
    import pandas as pd
    from sklearn.linear_model import ElasticNet
    from sklearn.model_selection import train_test_split

    # Load the preprocessed dataset
    df = pd.read_parquet(dataset)

    # Define the target column for prediction
    target_column = "quality"

    # Fit on a stratified 75% training portion. The held-out 25% is not used in
    # this component, so it is discarded instead of being bound to unused names.
    train_x, _, train_y, _ = train_test_split(
        df.drop(columns=[target_column]),
        df[target_column],
        test_size=0.25,
        random_state=42,
        stratify=df[target_column],
    )

    # TODO: Enable MLflow auto logging for scikit-learn models


    # Start an MLflow run and train the model
    with mlflow.start_run(run_name=run_name) as run:
        mlflow.set_tag("author", "kf-testing")
        lr = ElasticNet(alpha=0.5, l1_ratio=0.5, random_state=42)
        lr.fit(train_x, train_y)

        # Log the ElasticNet model and register it under model_name
        mlflow.sklearn.log_model(lr, "model", registered_model_name=model_name)

        # Return the model artifact URI as a string
        model_uri = f"{run.info.artifact_uri}/model"
        print(model_uri)
        return model_uri

**TODO:** Write a pipeline component to deploy the model with KServe. The function should return the URL of the deployed Inference Service (ISVC).

In [None]:
@component(
    base_image="python:3.11",  # Use Python 3.11 base image
    packages_to_install=["kserve==0.13.1", "kubernetes==26.1.0", "tenacity==9.0.0"],
)
def deploy_model_with_kserve(model_uri: str, isvc_name: str) -> str:
    """Deploy a model as a KServe Inference Service (exercise skeleton).

    NOTE: not implemented yet. The body only performs its imports and then
    ``pass``, so the function currently returns ``None`` even though it is
    annotated ``-> str``.

    Args:
        model_uri: Presumably the model artifact URI returned by
            ``train_model`` — confirm when wiring up the pipeline.
        isvc_name: Presumably the name of the Inference Service to create —
            confirm when implementing.

    Returns:
        Intended result: the URL of the Inference Service once it is ready.
    """
    from kubernetes.client import V1ObjectMeta
    from kserve import (
        constants,
        KServeClient,
        V1beta1InferenceService,
        V1beta1InferenceServiceSpec,
        V1beta1PredictorSpec,
        V1beta1SKLearnSpec,
    )
    from tenacity import retry, wait_exponential, stop_after_attempt

    # TODO: implement the steps below and remove the `pass` placeholder.
    # 1. Initialize the Inference Service specification
    # 2. Deploy the Inference Service using KServe
    # 3. Implement a logic to make sure the Inference Service is ready
    # 4. Wait until the service is ready and get the service URL.
    # 5. Return it
    pass
    

**TODO:** Create the pipeline itself using the components.



In [None]:
# Fetch environment variables for MLflow tracking and AWS credentials.
# They are expected to be injected into the notebook by MLflow's PodDefault; see
# https://documentation.ubuntu.com/charmed-mlflow/en/latest/tutorial/mlflow-kubeflow/
# NOTE: os.getenv returns None for any variable that is not set.

mlflow_tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
mlflow_s3_endpoint_url = os.getenv("MLFLOW_S3_ENDPOINT_URL")
aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")


@pipeline(name="download-preprocess-train-deploy-pipeline")
def download_preprocess_train_deploy_pipeline(url: str):
    # TODO: implement the four steps below. The function has no body yet, so
    # this cell will not run until at least one statement is added.
    # Step 1: Download the dataset from the URL
    # Step 2: Preprocess the downloaded dataset
    # Step 3: Train the model on the preprocessed dataset
    #         (presumably this step needs the MLflow/AWS values read above — confirm)
    # Step 4: Deploy the trained model with KServe

**TODO:** Initialize a KFP client, compile the pipeline, and then run it. Store the resulting run in a variable named `run`.

In [None]:
# 1. Initialize a KFP client
# This client is used to interact with the Kubeflow Pipelines API.
client = None  # placeholder: replace None with the real KFP client

# This URL points to the dataset that will be downloaded and processed in the pipeline.
# It re-assigns `url` with the same value used in the dataset cell earlier in the notebook.
url = "https://raw.githubusercontent.com/canonical/kubeflow-examples/main/e2e-wine-kfp-mlflow/winequality-red.csv"

# 2. Compile the pipeline

# 3. Run the pipeline with enable_caching set to False and keep the result in a variable named `run`

**TODO:** Initialize the KServe client, and try the inference service on the defined data

In [None]:
# 1. Initialize the KServe client
# This client is used to interact with the KServe Inference Service.
kserve_client = None  # placeholder: replace None with the real KServe client

# 2. Retrieve the Inference Service details
# Fetches the Inference Service by name and extracts the URL for making predictions.
# The code written here must define `inference_service_url`, which the print below reads.
# YOUR CODE HERE
print("Inference URL:", inference_service_url)

# This data matches the expected input format of the deployed model, with each instance being a list of feature values.
# Each instance holds 11 values (presumably in the dataset's feature-column order, without the target — confirm).
input_data = {
    "instances": [
        [7.4, 0.7, 0.0, 1.9, 0.076, 11.0, 34.0, 0.9978, 3.51, 0.56, 9.4],
        [7.8, 0.88, 0.0, 2.6, 0.098, 25.0, 67.0, 0.9968, 3.2, 0.68, 9.8],
    ]
}

# 3. Send a prediction request to the Inference Service
# This sends the input data to the model for prediction via a POST request and prints the response.
# The code written here must define `response`, which the print below reads.
# YOUR CODE HERE
print(response.text)

## Delete Inference Service

In [None]:
# YOUR CODE HERE

## Delete MLflow data

In [None]:
# YOUR CODE HERE