In this notebook, you should implement a first version of a working machine learning model to predict the age of an Abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

# 1. Baseline model - Ridge regression

We used a Ridge regression to estimate the age of an Abalone

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
# Display data
df = pd.read_csv("../data/abalone.csv")
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [3]:
# Create an "Age" column using the "Rings" column values, plus 1.5
df['Age'] = df['Rings'] + 1.5
df.drop(columns=['Rings'], inplace=True)
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Age
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,16.5
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,8.5
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,10.5
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,11.5
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,8.5


In [4]:
# Creating a preprocessor and a pipeline
cont_variables = ['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight','Viscera weight', 'Shell weight']
cat_variables = ['Sex']
preprocessor = ColumnTransformer(
    [
        ('cat', OneHotEncoder(), cat_variables), 
        ('num', StandardScaler(), cont_variables) 
    ]
)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge())
])

In [5]:
# Extracting X and y
X = df.drop(columns=['Age'])
y = df['Age']

In [6]:
# Splitting data between test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Experiments using MLflow

In [7]:
import mlflow

print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'file:///Users/francoislebrun/Documents/X-HEC%20Msc%20DataScience%20for%20Business/HEC-2024/Cours/8-ML%20OPS/test/xhec-mlops-project-student/notebooks/mlruns'


In [8]:
from mlflow import MlflowClient

client = mlflow.MlflowClient()

# Search for experiments with name "a"
experiments = client.search_experiments()
experiments

[<Experiment: artifact_location='file:///Users/francoislebrun/Documents/X-HEC%20Msc%20DataScience%20for%20Business/HEC-2024/Cours/8-ML%20OPS/test/xhec-mlops-project-student/notebooks/mlruns/0', creation_time=1729793692349, experiment_id='0', last_update_time=1729793692349, lifecycle_stage='active', name='Default', tags={}>]

### 2.1 1st experiment 
With and intercept and an &alpha; for the linear regression equal to 1

In [9]:
# Setting an expermient
mlflow.set_experiment("ridge-experiment-1")

with mlflow.start_run() as run:
    run_id = run.info.run_id
    params = {"regressor__alpha": 1.0, "regressor__fit_intercept": True}
    pipeline.set_params(**params)
    mlflow.log_params(params)

    model = pipeline.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mlflow.sklearn.log_model(model, "ridge_reg")

    rmse = root_mean_squared_error(y_test, y_pred)
    mlflow.log_metric("rmse", rmse)
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

2024/10/24 20:14:58 INFO mlflow.tracking.fluent: Experiment with name 'ridge-experiment-1' does not exist. Creating a new experiment.


default artifacts URI: 'file:///Users/francoislebrun/Documents/X-HEC%20Msc%20DataScience%20for%20Business/HEC-2024/Cours/8-ML%20OPS/test/xhec-mlops-project-student/notebooks/mlruns/306669412623490563/ea572321ca32449e99f86f15f6352be0/artifacts'


In [10]:
experiments = client.search_experiments()
experiments

[<Experiment: artifact_location='file:///Users/francoislebrun/Documents/X-HEC%20Msc%20DataScience%20for%20Business/HEC-2024/Cours/8-ML%20OPS/test/xhec-mlops-project-student/notebooks/mlruns/306669412623490563', creation_time=1729793698008, experiment_id='306669412623490563', last_update_time=1729793698008, lifecycle_stage='active', name='ridge-experiment-1', tags={}>,
 <Experiment: artifact_location='file:///Users/francoislebrun/Documents/X-HEC%20Msc%20DataScience%20for%20Business/HEC-2024/Cours/8-ML%20OPS/test/xhec-mlops-project-student/notebooks/mlruns/0', creation_time=1729793692349, experiment_id='0', last_update_time=1729793692349, lifecycle_stage='active', name='Default', tags={}>]

### 2.1 2nd experiment 
With and intercept and an &alpha; for the linear regression equal to 100

In [11]:
# Setting an expermient
mlflow.set_experiment("ridge-experiment-2")

with mlflow.start_run() as run:
    run_id = run.info.run_id
    params = {"regressor__alpha": 100, "regressor__fit_intercept": True}
    pipeline.set_params(**params)
    mlflow.log_params(params)

    model = pipeline.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mlflow.sklearn.log_model(model, "ridge_reg")

    rmse = root_mean_squared_error(y_test, y_pred)
    mlflow.log_metric("rmse", rmse)
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

2024/10/24 20:15:19 INFO mlflow.tracking.fluent: Experiment with name 'ridge-experiment-2' does not exist. Creating a new experiment.


default artifacts URI: 'file:///Users/francoislebrun/Documents/X-HEC%20Msc%20DataScience%20for%20Business/HEC-2024/Cours/8-ML%20OPS/test/xhec-mlops-project-student/notebooks/mlruns/122851235229546978/dcc8635fdb3246c7b2371e5a68164365/artifacts'


### 2.1 3rd experiment 
Without and intercept and an &alpha; for the linear regression equal to 1


In [12]:
# Setting an expermient
mlflow.set_experiment("ridge-experiment-3")

with mlflow.start_run() as run:
    run_id = run.info.run_id
    params = {"regressor__alpha": 1.0, "regressor__fit_intercept": False}
    pipeline.set_params(**params)
    mlflow.log_params(params)

    model = pipeline.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mlflow.sklearn.log_model(model, "ridge_reg")

    rmse = root_mean_squared_error(y_test, y_pred)
    mlflow.log_metric("rmse", rmse)
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

2024/10/24 20:15:22 INFO mlflow.tracking.fluent: Experiment with name 'ridge-experiment-3' does not exist. Creating a new experiment.


default artifacts URI: 'file:///Users/francoislebrun/Documents/X-HEC%20Msc%20DataScience%20for%20Business/HEC-2024/Cours/8-ML%20OPS/test/xhec-mlops-project-student/notebooks/mlruns/774004748106268155/f6494f4ac87f4064993415aa67c059a1/artifacts'


In [13]:
experiments

[<Experiment: artifact_location='file:///Users/francoislebrun/Documents/X-HEC%20Msc%20DataScience%20for%20Business/HEC-2024/Cours/8-ML%20OPS/test/xhec-mlops-project-student/notebooks/mlruns/306669412623490563', creation_time=1729793698008, experiment_id='306669412623490563', last_update_time=1729793698008, lifecycle_stage='active', name='ridge-experiment-1', tags={}>,
 <Experiment: artifact_location='file:///Users/francoislebrun/Documents/X-HEC%20Msc%20DataScience%20for%20Business/HEC-2024/Cours/8-ML%20OPS/test/xhec-mlops-project-student/notebooks/mlruns/0', creation_time=1729793692349, experiment_id='0', last_update_time=1729793692349, lifecycle_stage='active', name='Default', tags={}>]

After comparing the different experiments on MLflow, we decided to keep the third model with parameters alpha=1.0 and fit_intercept=False:
    - the first and third model have the same rmse
    - but as we already one-hot-encoded the "Sex" variable, there is no need to add an intercept as the columns created from the one-hot-encoded variable also serve this role