In [1]:
import mlflow
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load scikit-learn's diabetes dataset. as_frame=True requests pandas objects,
# which the next cell reads through the .data, .target and .frame attributes.
diabetes_data = load_diabetes(as_frame=True)

In [2]:
# Feature matrix and regression target, pulled from the loaded dataset.
X, y = diabetes_data.data, diabetes_data.target

# Combined frame (features plus the `target` column), shown for a quick look.
df = diabetes_data.frame
df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930,220.0


In [3]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
# Select the MLflow experiment used by the runs started below. The recorded
# output of this cell shows MLflow creating the experiment when it does not exist yet.
mlflow.set_experiment("Diabetes Linear Regression Baseline")

def train_and_log_model(model_class, model_params, X_train, X_test, y_train, y_test, run_name):
    """Train one estimator and record it as a single MLflow run.

    The run logs the constructor parameters (plus a ``model_type`` entry),
    train/test MSE and R2, a two-panel diagnostic figure, and the fitted model.

    Args:
        model_class: Estimator class. It is instantiated as
            ``model_class(**model_params)`` and must provide ``fit`` and ``predict``.
        model_params: Keyword arguments for the estimator constructor; they are
            also logged as run parameters.
        X_train: Features used to fit the model.
        X_test: Features used for evaluation and the diagnostic plots.
        y_train: Targets matching ``X_train``.
        y_test: Targets matching ``X_test``. Must support ``.min()``, ``.max()``
            and subtraction with the predictions.
        run_name: Label for the MLflow run; also used in the console messages
            and the plot titles.

    Returns:
        None.

    Side effects:
        Writes ``<model_class.__name__>_plots.png`` to the current working
        directory before logging it as an artifact, and prints progress lines.
    """
    model = model_class(**model_params)

    run_params = {"model_type": model_class.__name__, **model_params}

    # Pass run_name to MLflow so the run carries the caller's label instead of
    # an auto-generated name.
    with mlflow.start_run(run_name=run_name) as run:

        print(f"--- Starting run for: {run_name} ---")

        mlflow.log_params(run_params)

        # Fit on the training split, then predict on both splits so train and
        # test scores can be compared.
        model.fit(X_train, y_train)
        y_pred_test = model.predict(X_test)
        y_pred_train = model.predict(X_train)

        r2_train = r2_score(y_train, y_pred_train)
        r2_test = r2_score(y_test, y_pred_test)

        mse_train = mean_squared_error(y_train, y_pred_train)
        mse_test = mean_squared_error(y_test, y_pred_test)

        mlflow.log_metric("mean_squared_error_train", mse_train)
        mlflow.log_metric("mean_squared_error_test", mse_test)

        mlflow.log_metric("r2_test", r2_test)
        mlflow.log_metric("r2_train", r2_train)

        # Diagnostic plots, logged as an artifact of the run.
        residuals = y_test - y_pred_test
        plot_filename = f"{model_class.__name__}_plots.png"
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
        try:
            # Predicted vs. actual; the dashed diagonal marks perfect predictions.
            ax1.scatter(y_test, y_pred_test, alpha=0.7)
            min_val = min(y_test.min(), y_pred_test.min())
            max_val = max(y_test.max(), y_pred_test.max())
            ax1.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2)
            ax1.set_title(f"Predicted vs. Actual ({run_name})")
            ax1.set_xlabel("Actual Values")
            ax1.set_ylabel("Predicted Values")

            # Residuals vs. predicted, with a zero reference line.
            ax2.scatter(y_pred_test, residuals, alpha=0.7)
            ax2.axhline(y=0, color='r', linestyle='-', lw=2)
            ax2.set_title(f"Residuals vs. Predicted ({run_name})")
            ax2.set_xlabel("Predicted Values")
            ax2.set_ylabel("Residuals")

            # Operate on this figure explicitly rather than pyplot's
            # "current figure" state.
            fig.tight_layout()
            fig.savefig(plot_filename)
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

        mlflow.log_artifact(plot_filename)

        # Persist the fitted estimator with the run.
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="model_artifact"
        )

        print(f"Logged run {run.info.run_id} | Test R2: {r2_test:.4f}")

  return FileStore(store_uri, store_uri)
2025/11/18 17:25:36 INFO mlflow.tracking.fluent: Experiment with name 'Diabetes Linear Regression Baseline' does not exist. Creating a new experiment.


In [5]:
# Baseline run: LinearRegression with its default constructor arguments.
train_and_log_model(
    model_class=LinearRegression,
    model_params={},
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    run_name="Linear_Baseline",
)

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



--- Starting run for: Linear_Baseline ---




Logged run 8e323279577946b685b2a2502e481ef9 | Test R2: 0.4773


In [6]:
# Ridge regression with regularisation strength alpha=1.0.
train_and_log_model(
    model_class=Ridge,
    model_params=dict(alpha=1.0, random_state=42),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    run_name="Ridge_Alpha_1.0",
)

--- Starting run for: Ridge_Alpha_1.0 ---




Logged run 94f0c1b951354eea8ea03358b44c9e8c | Test R2: 0.4233


In [7]:
# Random forest: 100 trees, depth capped at 5, fixed seed.
train_and_log_model(
    model_class=RandomForestRegressor,
    model_params=dict(n_estimators=100, max_depth=5, random_state=42),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    run_name="RandomForest_Depth_5",
)

--- Starting run for: RandomForest_Depth_5 ---




Logged run 08a262620517442fbedfec3f7f6c8217 | Test R2: 0.4871


In [8]:
# Random forest: 100 trees, depth capped at 6, fixed seed.
# The run name states the depth actually used, so this run is not confused
# with the max_depth=5 run.
train_and_log_model(
    model_class = RandomForestRegressor,
    model_params = {"n_estimators": 100, "max_depth": 6, "random_state": 42},
    X_train = X_train,
    y_train = y_train,
    X_test = X_test,
    y_test = y_test,
    run_name = "RandomForest_Depth_6"
)

--- Starting run for: RandomForest_Depth_5 ---




Logged run fb09ef10099242f9810b533d656f208a | Test R2: 0.4796
