In [1]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

In [2]:
# Generate synthetic data
def generate_data():
    np.random.seed(42)
    X = np.random.rand(1000, 3)
    y = 3 * X[:, 0] + 2 * X[:, 1] + X[:, 2] + np.random.randn(1000) * 0.5
    return pd.DataFrame(X, columns=['feature1', 'feature2', 'feature3']), y


In [4]:
X, y = generate_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X

Unnamed: 0,feature1,feature2,feature3
0,0.374540,0.950714,0.731994
1,0.598658,0.156019,0.155995
2,0.058084,0.866176,0.601115
3,0.708073,0.020584,0.969910
4,0.832443,0.212339,0.181825
...,...,...,...
995,0.967035,0.051669,0.504796
996,0.718454,0.862640,0.179256
997,0.800003,0.552707,0.396554
998,0.131715,0.865296,0.157273


In [11]:
# Define model and parameters
model = LinearRegression()
param_grid = {
    "fit_intercept": [True, False],
    "positive": [True,False]
}

In [9]:
# Custom scorer for MSE
def mse_scorer(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)

In [12]:
 # Set up GridSearchCV
grid_search = GridSearchCV(
     estimator=model,
     param_grid=param_grid,
     scoring=make_scorer(mse_scorer, greater_is_better=False),  # Negative MSE
     cv=5,
     return_train_score=True
)

# Run grid search
grid_search.fit(X_train, y_train)

In [16]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_fit_intercept,param_positive,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.01479,0.026088,0.001409,0.000378,True,True,"{'fit_intercept': True, 'positive': True}",-0.209474,-0.258977,-0.25827,...,-0.25007,0.020725,4,-0.257562,-0.244876,-0.244647,-0.245374,-0.242038,-0.246899,0.005456
1,0.00501,0.007269,0.001336,0.000272,True,False,"{'fit_intercept': True, 'positive': False}",-0.209474,-0.258977,-0.25827,...,-0.25007,0.020725,3,-0.257562,-0.244876,-0.244647,-0.245374,-0.242038,-0.246899,0.005456
2,0.001405,0.000493,0.001271,0.000346,False,True,"{'fit_intercept': False, 'positive': True}",-0.208393,-0.256488,-0.258299,...,-0.249278,0.020916,1,-0.257882,-0.245402,-0.244647,-0.245378,-0.242203,-0.247103,0.005515
3,0.001403,0.000486,0.00102,0.000666,False,False,"{'fit_intercept': False, 'positive': False}",-0.208393,-0.256488,-0.258299,...,-0.249278,0.020916,1,-0.257882,-0.245402,-0.244647,-0.245378,-0.242203,-0.247103,0.005515


In [19]:
grid_search.estimator.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

In [22]:
mlflow.set_tracking_uri(uri="http://localhost:5000")
exp_id = mlflow.create_experiment("lin_reg hyperparam with nested runs")
mlflow.set_experiment(exp_id)

2025/01/20 13:01:14 INFO mlflow.tracking.fluent: Experiment with name '5' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///D:/myscripts/ML-IPython-notebooks/mlflow/6', creation_time=1737358274500, experiment_id='6', last_update_time=1737358274500, lifecycle_stage='active', name='5', tags={}>

In [21]:
# Log results for each combination from GridSearchCV
def log_gridsearch_results(grid_search, X_test, y_test):
    for i, params in enumerate(grid_search.cv_results_['params']):
        with mlflow.start_run(nested=True):  # Use nested=True for sub-runs
            # Get metrics from cv_results_
            mean_test_score = grid_search.cv_results_['mean_test_score'][i]
            std_test_score = grid_search.cv_results_['std_test_score'][i]

            # Log parameters and cross-validation metrics
            mlflow.log_params(params)
            mlflow.log_metric("mean_cv_score", mean_test_score)
            mlflow.log_metric("std_cv_score", std_test_score)

            # Refit model on the best parameters and evaluate on test data
            model = grid_search.estimator.set_params(**params)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            # Log final evaluation metrics and the model
            mlflow.log_metric("mse_test", mse)
            mlflow.log_metric("r2_test", r2)
            mlflow.sklearn.log_model(model, "model")

            print(f"Logged run with params: {params}, mse: {mse:.4f}, r2: {r2:.4f}")

In [25]:
 with mlflow.start_run(run_name="2.GridSearch_LinearRegression"):
        # Log the best parameters and metrics
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_

        mlflow.log_params(best_params)
        mlflow.log_metric("best_mean_cv_score", best_score)

        # Log all runs for each parameter combination
        log_gridsearch_results(grid_search, X_test, y_test)

        # Log the overall best model
        mlflow.sklearn.log_model(grid_search.best_estimator_, "best_model", registered_model_name="GridSearch_LinearRegression")

2025/01/20 13:22:06 INFO mlflow.tracking._tracking_service.client: üèÉ View run unruly-mare-635 at: http://localhost:5000/#/experiments/6/runs/6d392f5dc8ee4e5289b12b09bbfa48ed.
2025/01/20 13:22:06 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/6.


Logged run with params: {'fit_intercept': True, 'positive': True}, mse: 0.2714, r2: 0.8305


2025/01/20 13:22:10 INFO mlflow.tracking._tracking_service.client: üèÉ View run adventurous-hare-238 at: http://localhost:5000/#/experiments/6/runs/443b24e3e7af4ba1a7f05facc0dff364.
2025/01/20 13:22:10 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/6.


Logged run with params: {'fit_intercept': True, 'positive': False}, mse: 0.2714, r2: 0.8305


2025/01/20 13:22:15 INFO mlflow.tracking._tracking_service.client: üèÉ View run bittersweet-calf-135 at: http://localhost:5000/#/experiments/6/runs/3f775c9dc2d14b879853aeef28f8a6aa.
2025/01/20 13:22:15 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/6.


Logged run with params: {'fit_intercept': False, 'positive': True}, mse: 0.2705, r2: 0.8311


2025/01/20 13:22:20 INFO mlflow.tracking._tracking_service.client: üèÉ View run peaceful-gnu-943 at: http://localhost:5000/#/experiments/6/runs/de845ec001ae4a5ebf8be9f7b49b66ec.
2025/01/20 13:22:20 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/6.


Logged run with params: {'fit_intercept': False, 'positive': False}, mse: 0.2705, r2: 0.8311


Successfully registered model 'GridSearch_LinearRegression'.
2025/01/20 13:22:24 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: GridSearch_LinearRegression, version 1
Created version '1' of model 'GridSearch_LinearRegression'.
2025/01/20 13:22:24 INFO mlflow.tracking._tracking_service.client: üèÉ View run 2.GridSearch_LinearRegression at: http://localhost:5000/#/experiments/6/runs/74cff775362e4f8aa8674c155e62e3cf.
2025/01/20 13:22:24 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000/#/experiments/6.
