In this notebook, you should implement a first working version of a machine learning model that predicts the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

In [31]:
from typing import List, Tuple
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
import mlflow
import mlflow.sklearn
from mlflow import MlflowClient
from loguru import logger

In [32]:

# Display the tracking URI that MLflow is currently configured to use.
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'file:///Users/nicolasschroeder/Programming/Abalone-Age-Prediction-XHEC-2024-25/notebooks/mlruns'


In [33]:

# Client object for querying MLflow directly (constructed with default settings).
client = MlflowClient()

# Experiments returned by the client's search; kept in `experiments` for inspection.
experiments = client.search_experiments()

In [34]:
# Default list of categorical columns used by the encoding and feature-extraction helpers.
CATEGORICAL_COLS = ["Sex"]

# Relative path of the directory from which abalone.csv is read.
DATA_DIRPATH = "../data"

In [35]:


def filter_outliers(df: pd.DataFrame, min_rings: int = 1, max_rings: int = 20) -> pd.DataFrame:
    """
    Keep only the rows whose "Rings" value lies within
    [min_rings, max_rings], both bounds included.
    """
    rings_in_range = df["Rings"].between(min_rings, max_rings)
    return df.loc[rings_in_range]


def encode_categorical_cols(df: pd.DataFrame, categorical_cols: List[str] = None) -> pd.DataFrame:
    """
    Return a copy of ``df`` whose categorical columns are cast to strings.

    Missing values in the categorical columns are replaced with -1 before the
    cast, so they end up as the string form of -1 instead of NaN. The input
    frame itself is not modified.

    Args:
        df: DataFrame containing the categorical columns.
        categorical_cols: Names of the columns to encode. Defaults to
            CATEGORICAL_COLS when None.

    Returns:
        A new DataFrame with the selected columns converted to strings and
        every other column unchanged.
    """
    if categorical_cols is None:
        categorical_cols = CATEGORICAL_COLS
    # Work on a copy so the caller's frame (possibly a slice of a larger one)
    # is never mutated as a side effect of preprocessing.
    encoded_df = df.copy()
    encoded_df[categorical_cols] = df[categorical_cols].fillna(-1).astype("str")
    return encoded_df


def extract_x_y(
    df: pd.DataFrame,
    categorical_cols: List[str] = None,
    dv: DictVectorizer = None,
    with_target: bool = True,
) -> Tuple[scipy.sparse.csr_matrix, np.ndarray, DictVectorizer]:
    """
    Build the feature matrix (and optionally the target) from a dataframe.

    Args:
        df: DataFrame holding the categorical columns and, when
            ``with_target`` is True, a "Rings" column.
        categorical_cols: Columns turned into features. Defaults to
            CATEGORICAL_COLS when None.
        dv: An already fitted DictVectorizer. When None, a new one is fitted
            on ``df``; this is only allowed when ``with_target`` is True.
        with_target: Whether to read the "Rings" column as the target.

    Returns:
        A tuple ``(x, y, dv)``: the transformed feature matrix, the target
        values (None when ``with_target`` is False) and the vectorizer used.

    Raises:
        ValueError: If ``dv`` is None while ``with_target`` is False.
    """
    if categorical_cols is None:
        categorical_cols = CATEGORICAL_COLS
    dicts = df[categorical_cols].to_dict(orient="records")

    if dv is None:
        if not with_target:
            # Without a target there is nothing to train on, so a vectorizer
            # must be supplied. Fail with a clear message instead of an
            # AttributeError on None.
            raise ValueError(
                "A fitted DictVectorizer must be provided when with_target is False."
            )
        dv = DictVectorizer()
        dv.fit(dicts)

    y = df["Rings"].values if with_target else None
    x = dv.transform(dicts)
    return x, y, dv


def process_data(
    df: pd.DataFrame, dv: DictVectorizer = None, with_target: bool = True
) -> Tuple[scipy.sparse.csr_matrix, np.ndarray, DictVectorizer]:
    """
    Turn a raw dataframe into model inputs.

    Steps:
        1. When ``with_target`` is True, filter rows with ``filter_outliers``.
        2. Encode the categorical columns with ``encode_categorical_cols``.
        3. Build the feature matrix (and target) with ``extract_x_y``.

    Args:
        df: Input dataframe.
        dv: Optional DictVectorizer forwarded to ``extract_x_y``.
        with_target: Whether the target column should be used. Outlier
            filtering is skipped when False.

    Returns:
        The ``(x, y, dv)`` tuple produced by ``extract_x_y``.
    """
    if with_target:
        # Outlier filtering relies on the target column, so it only runs
        # when a target is expected.
        df = filter_outliers(df)
    logger.debug("Encoding categorical columns...")
    encoded_df = encode_categorical_cols(df)
    logger.debug("Extracting X and y...")
    return extract_x_y(encoded_df, dv=dv, with_target=with_target)

In [36]:


def train_model(X: scipy.sparse.csr_matrix, y: np.ndarray) -> LinearRegression:
    """
    Fit a default LinearRegression on the supplied features and targets.

    Args:
        X (scipy.sparse.csr_matrix): Feature matrix.
        y (np.ndarray): Target values, one per row of ``X``.

    Returns:
        LinearRegression: The fitted estimator.
    """
    regressor = LinearRegression()
    # fit() returns the estimator itself, so it can be returned directly.
    return regressor.fit(X, y)


def predict(X: scipy.sparse.csr_matrix, model: LinearRegression) -> np.ndarray:
    """
    Run a fitted model on a feature matrix.

    Args:
        X (scipy.sparse.csr_matrix): Feature matrix to score.
        model (LinearRegression): Fitted model used for prediction.

    Returns:
        np.ndarray: The values returned by ``model.predict(X)``.
    """
    predictions = model.predict(X)
    return predictions


def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Compute the Root Mean Squared Error (RMSE) between targets and predictions.

    Args:
        y_true (np.ndarray): The true target values.
        y_pred (np.ndarray): The predicted target values from the model.

    Returns:
        float: The RMSE, expressed in the same unit as the target.
    """
    # root_mean_squared_error already applies the square root; taking
    # np.sqrt of it again would report the square root of the RMSE.
    return float(root_mean_squared_error(y_true, y_pred))


def train_model_workflow(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
) -> Tuple[float, float, LinearRegression]:
    """
    Train a linear regression on ``train_df`` and score it on both splits.

    Args:
        train_df (pd.DataFrame): Data used to fit the vectorizer and the model.
        test_df (pd.DataFrame): Held-out data used only for evaluation.

    Returns:
        Tuple[float, float, LinearRegression]: The ``evaluate_model`` score on
        the training data, the score on the test data, and the fitted model.
    """
    # No vectorizer is passed for the training split, so process_data fits a
    # new one; that same vectorizer is reused for the test split below.
    train_features, train_target, vectorizer = process_data(df=train_df)
    fitted_model = train_model(train_features, train_target)
    train_rmse = evaluate_model(train_target, predict(train_features, fitted_model))

    test_features, test_target, _ = process_data(df=test_df, dv=vectorizer)
    test_rmse = evaluate_model(test_target, predict(test_features, fitted_model))

    return train_rmse, test_rmse, fitted_model


In [37]:


# Experiment under which the runs of this notebook are grouped
mlflow_experiment_path = "/mlflow/abalone_linear_reg_test"
mlflow.set_experiment(mlflow_experiment_path)

with mlflow.start_run() as run:
    # Keep the run ID: it is needed to build the model URI at the end
    run_id = run.info.run_id

    # Descriptive tags attached to the run
    mlflow.set_tags({"Level": "Development", "Team": "Data Science"})

    # Read the dataset and hold out 30% of the rows for testing
    df = pd.read_csv(f"{DATA_DIRPATH}/abalone.csv")
    train_df, test_df = train_test_split(df, test_size=0.30, random_state=42)

    # Fit the model and score it on both splits
    train_rmse, test_rmse, model = train_model_workflow(train_df=train_df, test_df=test_df)

    # Parameters: row counts of the two splits and the outlier-filtering flag
    mlflow.log_params(
        {
            "train_set_size": len(train_df),
            "test_set_size": len(test_df),
            "filtered_outliers": True,
        }
    )

    # Metrics: scores on the training and test splits
    mlflow.log_metrics({"train_rmse": train_rmse, "test_rmse": test_rmse})

    # Store the fitted model with the run, then register it in the model registry
    mlflow.sklearn.log_model(model, "models")
    mlflow.register_model(f"runs:/{run_id}/models", "linear_reg_test")


2024/10/24 17:30:21 INFO mlflow.tracking.fluent: Experiment with name '/mlflow/abalone_linear_reg_test' does not exist. Creating a new experiment.
[32m2024-10-24 17:30:21.136[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mprocess_data[0m:[36m50[0m - [34m[1mEncoding categorical columns...[0m
[32m2024-10-24 17:30:21.139[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mprocess_data[0m:[36m52[0m - [34m[1mExtracting X and y...[0m
[32m2024-10-24 17:30:21.148[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mprocess_data[0m:[36m50[0m - [34m[1mEncoding categorical columns...[0m
[32m2024-10-24 17:30:21.149[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mprocess_data[0m:[36m52[0m - [34m[1mExtracting X and y...[0m
Registered model 'linear_reg_test' already exists. Creating a new version of this model...
Created version '2' of model 'linear_reg_test'.
