In this notebook, you should implement a first version of a working machine learning model to predict the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code needed to run the experiments should be pushed.

In [16]:
import mlflow
import mlflow.sklearn
import pandas as pd
from mlflow import MlflowClient
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Train a Lasso regression on the abalone data and track the run with MLflow.
#
# Flow: load CSV -> split features/target -> train/test split -> build a
# preprocessing + Lasso pipeline -> fit, evaluate and log inside one MLflow run.

# --- Configuration -----------------------------------------------------------
# Everything a reader might want to tune, gathered in one place.
file_path = '../data/abalone.csv'
alpha = 0.01        # passed to Lasso(alpha=...)
test_size = 0.2     # passed to train_test_split(test_size=...)
random_state = 50   # seed of the train/test split, for reproducibility

numeric_features = ['Length', 'Diameter', 'Height', 'Whole weight',
                    'Shucked weight', 'Viscera weight', 'Shell weight']
categorical_features = ['Sex']

# --- Data --------------------------------------------------------------------
# Load the dataset
df = pd.read_csv(file_path)

# Separate features and target variable
X = df.drop(columns='Rings')
y = df['Rings']

# Split the data into training and test sets.
# Done before the MLflow run is opened so that a data problem does not leave
# a failed, empty run behind in the tracking store.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

# --- Model -------------------------------------------------------------------
# Scale the numeric features and one-hot encode 'Sex'
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features),
    ]
)

# Create a pipeline with Lasso regression model
model = Lasso(alpha=alpha)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', model),
])

# --- MLflow tracking ---------------------------------------------------------
# Client (not used below; kept available for interactive inspection of runs)
client = MlflowClient()

# Set the experiment
mlflow.set_experiment("abalone-model")

# Enable MLflow autologging for estimator parameters and training metrics.
# Model logging is switched off here because the pipeline is logged explicitly
# at the end of the run; leaving it on would store the same model twice.
mlflow.sklearn.autolog(log_models=False)

# The context manager ends the run on exit, so no explicit mlflow.end_run()
# call is needed inside the block.
with mlflow.start_run() as run:
    # Log the experiment-level parameters that autologging does not know about
    mlflow.log_params({
        "alpha": alpha,
        "test_size": test_size,
        "random_state": random_state,
    })

    # Train the model
    pipeline.fit(X_train, y_train)

    # Predict on the test set
    y_pred = pipeline.predict(X_test)

    # Evaluate the model using Mean Squared Error
    mse = mean_squared_error(y_test, y_pred)
    print(f'Mean Squared Error: {mse:.2f}')

    # Log metrics manually
    mlflow.log_metric("mse", mse)

    # Log the fitted pipeline (preprocessing + regressor) exactly once
    mlflow.sklearn.log_model(pipeline, "model")

Mean Squared Error: 4.90


