In this notebook, you should implement a first version of a working machine learning model to predict the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code needed to run the experiments should be pushed.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
import numpy as np
from typing import List

def get_data(path='../data/abalone.csv'):
    """
    Loads the Abalone dataset from a CSV file.

    Args:
        path (str or path-like): Location of the CSV file. Defaults to the
            notebook-relative path '../data/abalone.csv', so existing
            zero-argument calls behave exactly as before.

    Returns:
        pd.DataFrame: The file contents as parsed by `pd.read_csv`.
    """
    return pd.read_csv(path)

# Load the raw dataset; `df` is rebound by the encoding cell further down.
df = get_data()

In [2]:
def encode_sex_column(df) -> pd.DataFrame:
    """
    Replaces the categorical 'Sex' column with integer one-hot indicator columns.

    Args:
        df (pd.DataFrame): Frame containing a 'Sex' column. It is not modified.

    Returns:
        pd.DataFrame: A new frame with 'Sex' removed and the 'Sex_<level>'
        indicator columns appended on the right.
    """
    # drop_first=True omits the indicator of the first level; that level is
    # implied when all remaining indicators are 0.
    sex_dummies = pd.get_dummies(df['Sex'], prefix='Sex', drop_first=True, dtype=int)

    remaining_columns = df.drop(columns='Sex')
    return pd.concat([remaining_columns, sex_dummies], axis=1)

# NOTE(review): rebinding `df` makes this cell non-idempotent — running it a
# second time raises KeyError because the 'Sex' column has already been dropped.
df = encode_sex_column(df)

In [3]:
def split_data(df, test_size=0.2, random_state=41):
    """
    Separates the 'Rings' target from the features and makes a train/test split.

    Args:
        df (pd.DataFrame): Frame containing the 'Rings' target column; every
            other column is used as a feature.
        test_size (float or int): Forwarded to `train_test_split`. The default
            of 0.2 holds out 20% of the rows for testing, as before.
        random_state (int or None): Seed forwarded to `train_test_split`. The
            default of 41 reproduces the split this notebook has always used.

    Returns:
        tuple: (X_train, X_test, y_train, y_test).
    """
    y = df['Rings']
    X = df.drop(columns=['Rings'])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

# Baseline split of the notebook-level `df`; assuming the cells above ran in
# order, only 'Sex' has been encoded and no feature column has been dropped yet.
X_train, X_test, y_train, y_test = split_data(df)

In [None]:
def preprocessing(df):
    """
    Prepares a raw Abalone frame for modelling.

    Args:
        df (pd.DataFrame): Raw frame containing at least the 'Sex' and
            'Length' columns.

    Returns:
        pd.DataFrame: Frame with 'Sex' one-hot encoded and 'Length' removed.
    """
    encoded = encode_sex_column(df)
    # NOTE(review): the reason for discarding 'Length' is not recorded in this
    # notebook — confirm and document it.
    features = encoded.drop(columns=['Length'])
    return features

In [9]:
def train_model(X_train, y_train, random_state=42):
    """
    Fits a random forest regressor on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        random_state (int or None): Seed passed to `RandomForestRegressor`.
            Defaults to 42 so that repeated runs train the same forest and
            logged metrics are comparable between runs; pass None for an
            unseeded model.

    Returns:
        RandomForestRegressor: The fitted model.
    """
    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_train, y_train)
    return model

def predict_rings(input_data, model):
    """
    Runs the given fitted model on the input features.

    Args:
        input_data: Feature matrix in whatever form `model.predict` accepts
            (this notebook passes the pandas DataFrames returned by
            `split_data`).
        model: Any fitted estimator exposing a `predict` method (this notebook
            passes the regressor returned by `train_model`).

    Returns:
        The value returned by `model.predict(input_data)`.
    """
    predictions = model.predict(input_data)
    return predictions


def evaluate_model(y_true, y_pred):
    """
    Scores predictions against the ground truth with root mean squared error.

    Args:
        y_true: The true target values.
        y_pred: The predicted target values, one per entry of `y_true`.

    Returns:
        The RMSE computed by `root_mean_squared_error(y_true, y_pred)`.
    """
    rmse = root_mean_squared_error(y_true, y_pred)
    return rmse

# Baseline fit and evaluation on the split produced earlier in the notebook.
# NOTE(review): the ValueError saved under this cell ([3341, 836]) is not what
# these lines produce when y_test and y_pred both come from the same test
# split — presumably stale output from out-of-order execution; confirm with
# Restart Kernel -> Run All.
model = train_model(X_train, y_train)
y_pred = predict_rings(X_test, model)
score = evaluate_model(y_test, y_pred)

ValueError: Found input variables with inconsistent numbers of samples: [3341, 836]

In [7]:
from mlflow import MlflowClient
import mlflow

# List the experiments known to the current tracking store (sanity check)
experiments = MlflowClient().search_experiments()
print(experiments)

# Set the experiment name so that runs started afterwards are grouped under
# "abalone_project" instead of MLflow's "Default" experiment, including after
# a kernel restart.
mlflow.set_experiment("abalone_project")

[<Experiment: artifact_location='file:///c:/Users/PC/Desktop/HEC/ML_OPS/xhec-mlops-2025-project/notebooks/mlruns/130990549742822623', creation_time=1761220381443, experiment_id='130990549742822623', last_update_time=1761220381443, lifecycle_stage='active', name='abalone_project', tags={}>, <Experiment: artifact_location='file:///c:/Users/PC/Desktop/HEC/ML_OPS/xhec-mlops-2025-project/notebooks/mlruns/0', creation_time=1761219468068, experiment_id='0', last_update_time=1761219468068, lifecycle_stage='active', name='Default', tags={}>]


In [10]:
# Start a run; everything logged inside the `with` block is attached to it
with mlflow.start_run() as run:
    run_id = run.info.run_id

    # Set tags for the run
    mlflow.set_tags({"model_type": "random_forest", "framework": "sklearn"})

    # Load the full dataset (despite its name, `train_df` is split into train
    # and test parts below)
    train_df = get_data()

    # Preprocess: one-hot encode 'Sex' and drop 'Length' (see `preprocessing`)
    train_df = preprocessing(train_df)

    # Separate features from target and split into train/test; this rebinds the
    # notebook-level X_train, X_test, y_train, y_test set by the earlier cell
    X_train, X_test, y_train, y_test = split_data(train_df)

    # Train model
    model = train_model(X_train, y_train)

    # Evaluate model on the training set
    prediction = predict_rings(X_train, model)
    train_me = evaluate_model(y_train, prediction)
    mlflow.log_metric("train_rmse", train_me)

    # Evaluate model on test set
    y_pred_test = predict_rings(X_test, model)
    test_me = evaluate_model(y_test, y_pred_test)
    mlflow.log_metric("test_rmse", test_me)

    # Log the fitted model to this run under the name "model"
    mlflow.sklearn.log_model(model, "model")

    # Register the logged model in the MLflow model registry; the URI uses the
    # same "model" name that was passed to log_model above
    result = mlflow.register_model(f"runs:/{run_id}/model", "abalone_rf_model")

Successfully registered model 'abalone_rf_model'.
Created version '1' of model 'abalone_rf_model'.


In [11]:
# Launch the MLflow UI on port 5002. The command keeps running, so the kernel
# stays busy until the cell is interrupted.
# NOTE(review): --host 0.0.0.0 listens on all network interfaces; use 127.0.0.1
# if the UI only needs to be reachable from this machine.
!mlflow ui --host 0.0.0.0 --port 5002

^C
