# Baseline Model — Ridge Regression with MLflow

In [1]:
# --- Setup ---
import os, pathlib, warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow

# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

# --- Load data ---
# Notebook kernels do not normally define `__file__`, so the dataset path is
# anchored on the kernel's working directory. Run the notebook from the folder
# that contains it so that "../data/abalone.csv" resolves correctly.
notebook_dir = os.getcwd()
data_path = os.path.join(notebook_dir, "..", "data", "abalone.csv")
df = pd.read_csv(data_path)

# Regression target: Age is derived from the ring count as Rings + 1.5.
df["Age"] = df["Rings"] + 1.5

# Basic feature selection.
# "Rings" is dropped together with "Age" because the target is computed
# directly from it; keeping it would leak the target into the features.
y = df["Age"].values
X = df.drop(columns=["Age", "Rings"]).copy()

# Identify categorical and numerical columns by dtype; these lists drive the
# ColumnTransformer built later in the notebook.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")
print("X shape:", X.shape, "y shape:", y.shape)
X.head()

  import pkg_resources  # noqa: TID251


Categorical columns: ['Sex']
Numerical columns: ['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight']
X shape: (4177, 8) y shape: (4177,)


Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055


In [2]:
# --- Train/valid split ---
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Outlier removal: Remove samples where Height > HEIGHT_MAX.
# Only the training split is filtered; the validation split is left untouched.
HEIGHT_MAX = 0.35
print(f"Training samples before outlier removal: {len(X_train)}")
outlier_mask = X_train['Height'] <= HEIGHT_MAX
X_train = X_train.loc[outlier_mask]
# y_train is a NumPy array while the mask is a Series carrying the shuffled
# DataFrame index, so filter the array with an explicit positional boolean
# array rather than relying on implicit Series -> array conversion.
y_train = y_train[outlier_mask.to_numpy()]
assert len(X_train) == len(y_train), "features and target went out of sync"
print(f"Training samples after outlier removal: {len(X_train)}")

# Numerical columns are standardised; categorical columns are one-hot encoded.
# With drop='first' and handle_unknown='ignore', a category unseen during fit
# is encoded as all zeros, i.e. the same encoding as the dropped first
# category (per the scikit-learn OneHotEncoder documentation).
# remainder='passthrough' forwards any column not listed in either group to
# the model without transformation.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

# Ridge hyperparameters, kept in a dict so they can be passed to the estimator
# in one place.
params = {"alpha": 1.0}

pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("ridge", Ridge(**params))
])

Training samples before outlier removal: 3341
Training samples after outlier removal: 3339


In [3]:
# --- Train, evaluate and track the run with MLflow ---
# Runs are stored in a local file-based tracking directory, resolved relative
# to the kernel's working directory.
os.makedirs("mlruns_local", exist_ok=True)
mlflow.set_tracking_uri("file:./mlruns_local")
mlflow.set_experiment("abalone-baseline")

with mlflow.start_run(run_name="ridge_baseline"):
    mlflow.log_param("model", "Ridge")
    mlflow.log_param("preprocessing", "ColumnTransformer")
    mlflow.log_param("categorical_encoding", "OneHotEncoder")
    mlflow.log_param("random_state", 42)
    mlflow.log_param("test_size", 0.2)
    # Log the estimator hyperparameters (e.g. alpha) so that runs with
    # different regularisation strength can be told apart in the tracking UI.
    mlflow.log_params(params)

    # Train
    pipe.fit(X_train, y_train)

    # Evaluate on the held-out validation split.
    y_pred = pipe.predict(X_valid)
    rmse = float(np.sqrt(mean_squared_error(y_valid, y_pred)))
    mae  = float(mean_absolute_error(y_valid, y_pred))
    r2   = float(r2_score(y_valid, y_pred))

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # Save model artifact (the whole fitted pipeline, preprocessing included).
    mlflow.sklearn.log_model(pipe, "model")

    print(f"RMSE: {rmse:.4f} | MAE: {mae:.4f} | R2: {r2:.4f}")
    

RMSE: 2.2172 | MAE: 1.6032 | R2: 0.5459
