In this notebook, you should implement a first version of a working machine learning model to predict the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code needed to run the experiments should be pushed.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
import numpy as np
from typing import List

# Load the raw abalone dataset; the path is relative to the notebook's directory.
df = pd.read_csv('../data/abalone.csv')

In [2]:
def encode_sex_column(df) -> pd.DataFrame:
    """
    Replaces the categorical 'Sex' column with integer one-hot indicator columns.

    The first category level is dropped (drop_first=True), so a column with k
    distinct values yields k-1 indicator columns named 'Sex_<level>'.

    Args:
        df (pd.DataFrame): Input data containing a 'Sex' column.

    Returns:
        pd.DataFrame: A new frame with 'Sex' removed and the indicator columns
        appended on the right. The input frame is not modified.
    """
    # Build the 0/1 indicator columns from the categorical values.
    sex_dummies = pd.get_dummies(df['Sex'], prefix='Sex', drop_first=True, dtype=int)

    # Everything except the original categorical column is kept as-is.
    remaining_columns = df.drop(columns=['Sex'])

    return pd.concat([remaining_columns, sex_dummies], axis=1)

# Encode the raw frame once; the later cells work from train_df rather than df.
train_df = encode_sex_column(df)

In [3]:
def split_data(df, target='Rings', test_size=0.2, random_state=41):
    """
    Splits a DataFrame into train/test feature matrices and target vectors.

    Args:
        df (pd.DataFrame): Data holding the feature columns and the target column.
        target (str): Name of the column to predict. Defaults to 'Rings'.
        test_size (float): Fraction of rows held out for testing. Defaults to 0.2
            (80% training, 20% testing).
        random_state (int): Seed forwarded to train_test_split so the split is
            reproducible. Defaults to 41.

    Returns:
        tuple: (X_train, X_test, y_train, y_test).
    """
    y = df[target]
    X = df.drop(columns=[target])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

# Hold-out split of the encoded data, using split_data's built-in settings.
X_train, X_test, y_train, y_test = split_data(train_df)

In [None]:
def train_model(X_train=X_train, y_train=y_train, random_state=None):
    """
    Fits a random forest regressor on the training data.

    NOTE: the default arguments are evaluated once, when this cell runs, so they
    refer to the X_train / y_train objects that existed at definition time.
    Re-run this cell after re-splitting the data, or pass the data explicitly.

    Args:
        X_train: Training feature matrix. Defaults to the notebook-level X_train.
        y_train: Training target values. Defaults to the notebook-level y_train.
        random_state: Seed forwarded to RandomForestRegressor. Pass an int for
            reproducible fits; the default (None) leaves the forest unseeded.

    Returns:
        RandomForestRegressor: The fitted model.
    """
    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_train, y_train)
    return model

def predict_rings(input_data, model):
    """
    Predicts the target variable using the provided model and input data.

    Args:
        input_data: Feature matrix in whatever form ``model.predict`` accepts
            (in this notebook, a DataFrame such as X_test).
        model: A fitted estimator exposing a ``predict`` method (in this
            notebook, the RandomForestRegressor returned by train_model).

    Returns:
        The predictions returned by ``model.predict(input_data)``.
    """
    return model.predict(input_data)


def evaluate_model(y_true, y_pred):
    """
    Scores predictions against the ground truth with the root mean squared error.

    Args:
        y_true (np.ndarray): Observed target values.
        y_pred (np.ndarray): Model predictions for the same samples.

    Returns:
        float: RMSE between y_true and y_pred.
    """
    rmse = root_mean_squared_error(y_true=y_true, y_pred=y_pred)
    return rmse



In [None]:
def train_and_evaluate_cv(model: object, X: pd.DataFrame, y: pd.Series, cv_folds: int = 5) -> float:
    """
    Cross-validates a model and reports the per-fold RMSE.

    Args:
        model: Unfitted scikit-learn style estimator; cross_val_score fits it
            on each fold.
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Target values.
        cv_folds (int): Number of cross-validation folds. Defaults to 5.

    Returns:
        float: The mean RMSE across the folds (a single number, not the list
        of per-fold scores; those are only printed).
    """
    scoring_metric = 'neg_mean_squared_error'

    # scikit-learn scorers follow "higher is better", so MSE comes back negated.
    neg_mse_scores = cross_val_score(
        estimator=model,
        X=X,
        y=y,
        cv=cv_folds,
        scoring=scoring_metric
    )

    # Flip the sign back and take the root to get one RMSE per fold.
    rmse_scores = np.sqrt(-neg_mse_scores)
    mean_rmse = np.mean(rmse_scores)

    print("\nCross-Validation Results (RMSE):")
    print(f"Individual RMSE Scores: {rmse_scores}")
    print(f"Mean RMSE Score: {mean_rmse:.4f}")
    print(f"Standard Deviation of RMSE Score: {np.std(rmse_scores):.4f}")
    print("--------------------------------------------------")

    return mean_rmse

# Full feature matrix and target, built from train_df directly rather than
# from the train/test split, for use with cross-validation.
X = train_df.drop(columns=['Rings'])
y = train_df['Rings']

In [10]:
# Cross-validate a default (unseeded) random forest on the full dataset;
# the returned mean RMSE is not stored, only the printed report is kept.
train_and_evaluate_cv(RandomForestRegressor(),X, y)


Cross-Validation Results (RMSE):
Individual RMSE Scores: [3.11851885 1.69278108 2.33804822 1.88969557 1.98312516]
Mean RMSE Score: 2.2044
Standard Deviation of RMSE Score: 0.5026
--------------------------------------------------


In [30]:
from mlflow import MlflowClient
import mlflow

# experiments = MlflowClient().search_experiments()
# print(experiments)

In [32]:
def preprocessing(df):
    """Placeholder preprocessing step: returns ``df`` unchanged."""
    return df

In [31]:
# Make "abalone_project" the active MLflow experiment; per the log output
# below, MLflow creates the experiment when it does not exist yet.
mlflow.set_experiment("abalone_project")

2025/10/23 13:53:01 INFO mlflow.tracking.fluent: Experiment with name 'abalone_project' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/PC/Desktop/HEC/ML_OPS/xhec-mlops-2025-project/notebooks/mlruns/130990549742822623', creation_time=1761220381443, experiment_id='130990549742822623', last_update_time=1761220381443, lifecycle_stage='active', name='abalone_project', tags={}>