In this notebook, you should implement a first version of a working machine learning model to predict the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from typing import List

# Read the raw abalone measurements from disk and preview the first rows.
df = pd.read_csv(filepath_or_buffer='../data/abalone.csv')
df.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [2]:
def encode_sex_column(df) -> pd.DataFrame:
    """Return a new frame in which ``Sex`` is replaced by integer dummy columns.

    The ``Sex`` column is one-hot encoded with the first level dropped, so a
    column holding F/I/M yields ``Sex_I`` and ``Sex_M``. The dummy columns are
    appended after the remaining original columns; the input frame itself is
    not modified.
    """
    # Build the 0/1 indicator columns from the categorical column.
    sex_dummies = pd.get_dummies(df['Sex'], prefix='Sex', drop_first=True, dtype=int)

    # Everything except the original categorical column is kept as-is.
    other_columns = df.drop(columns='Sex')

    return pd.concat([other_columns, sex_dummies], axis=1)

# Encode the categorical column and preview the resulting frame.
train_df = df.pipe(encode_sex_column)
train_df.head(n=5)

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,1,0


In [4]:
def split_data(df, target='Rings', test_size=0.2, random_state=41):
    """Split a frame into train/test features and targets.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the target column plus the feature columns.
    target : str, default 'Rings'
        Name of the column to predict. Every other column is used as a feature.
    test_size : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 41
        Forwarded to ``train_test_split`` as ``random_state`` so the split is
        repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    y = df[target]
    X = df.drop(columns=[target])

    # Unpack explicitly so the function keeps returning a 4-tuple.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

# Hold out a test partition from the encoded frame using split_data's defaults.
X_train, X_test, y_train, y_test = split_data(df=train_df)

In [5]:
def train_model(model, X_train=None, y_train=None):
    """Fit ``model`` on the training data and return the same object.

    Parameters
    ----------
    model
        Any object exposing ``fit(X, y)``.
    X_train, y_train : optional
        Training features and targets. When omitted (``None``), the names
        ``X_train`` / ``y_train`` are looked up in this module's (notebook's)
        global namespace at call time.

    Returns
    -------
    The ``model`` that was passed in, after ``fit`` has been called on it.

    Raises
    ------
    KeyError
        If a data argument is omitted and the matching global does not exist.
    """
    # Resolve omitted data lazily. Using the globals as default values would
    # bind whatever objects existed when this cell was executed, so re-running
    # the split cell afterwards would silently leave stale data in here.
    if X_train is None:
        X_train = globals()['X_train']
    if y_train is None:
        y_train = globals()['y_train']

    model.fit(X_train, y_train)
    return model

def evaluate(model, X_test=None, y_test=None):
    """Return the root mean squared error of ``model`` on the test data.

    Parameters
    ----------
    model
        Any object exposing ``predict(X)``.
    X_test, y_test : optional
        Evaluation features and true targets. When omitted (``None``), the
        names ``X_test`` / ``y_test`` are looked up in this module's
        (notebook's) global namespace at call time.

    Returns
    -------
    The square root of ``mean_squared_error(y_test, model.predict(X_test))``.

    Raises
    ------
    KeyError
        If a data argument is omitted and the matching global does not exist.
    """
    # Resolve omitted data lazily; defaults bound at definition time would keep
    # pointing at an old split after the split cell is re-run.
    if X_test is None:
        X_test = globals()['X_test']
    if y_test is None:
        y_test = globals()['y_test']

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return np.sqrt(mse)

In [6]:
def train_and_evaluate_cv(model: object, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5) -> np.ndarray:
    """Cross-validate ``model`` and report the per-fold RMSE.

    Parameters
    ----------
    model
        Estimator handed to ``cross_val_score``.
    X, y
        Features and target used for cross-validation.
    cv_folds : int, default 5
        Forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    np.ndarray
        One RMSE value per fold (an array, not a Python list).

    Side effects
    ------------
    Prints the individual scores, their mean and their standard deviation.
    """
    # The scorer yields negated MSE values, one per fold.
    neg_mse_scores = cross_val_score(
        estimator=model,
        X=X,
        y=y,
        cv=cv_folds,
        scoring='neg_mean_squared_error'
    )

    # Flip the sign back to a positive MSE before taking the square root.
    rmse_scores = np.sqrt(-neg_mse_scores)

    print("\nCross-Validation Results (RMSE):")
    print(f"Individual RMSE Scores: {rmse_scores}")
    print(f"Mean RMSE Score: {np.mean(rmse_scores):.4f}")
    print(f"Standard Deviation of RMSE Score: {np.std(rmse_scores):.4f}")
    print("--------------------------------------------------")

    return rmse_scores

In [7]:
# Full feature matrix and target for cross-validation, then score XGBoost.
y = train_df['Rings']
X = train_df.drop(columns=['Rings'])
train_and_evaluate_cv(XGBRegressor(), X, y)


Cross-Validation Results (RMSE):
Individual RMSE Scores: [3.23376872 1.73813874 2.50856268 2.0231785  2.0594153 ]
Mean RMSE Score: 2.3126
Standard Deviation of RMSE Score: 0.5225
--------------------------------------------------


array([3.23376872, 1.73813874, 2.50856268, 2.0231785 , 2.0594153 ])

In [8]:
# Seed the forest so the reported RMSE is reproducible across kernel restarts
# (same seed value as the train/test split).
train_and_evaluate_cv(RandomForestRegressor(random_state=41), X, y)


Cross-Validation Results (RMSE):
Individual RMSE Scores: [3.08366642 1.66920013 2.35415128 1.8854439  1.96761721]
Mean RMSE Score: 2.1920
Standard Deviation of RMSE Score: 0.4978
--------------------------------------------------


array([3.08366642, 1.66920013, 2.35415128, 1.8854439 , 1.96761721])

In [9]:
# Score the plain linear baseline with the same cross-validation routine.
train_and_evaluate_cv(model=LinearRegression(), X=X, y=y)


Cross-Validation Results (RMSE):
Individual RMSE Scores: [3.12649047 1.72875481 2.41110856 1.94617432 1.99586635]
Mean RMSE Score: 2.2417
Standard Deviation of RMSE Score: 0.4945
--------------------------------------------------


array([3.12649047, 1.72875481, 2.41110856, 1.94617432, 1.99586635])