In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore') # To suppress common warnings

# MLflow imports
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature


In [2]:
# Always point MLflow at the tracking server at the start of every session;
# otherwise the models and metrics will not be logged to MLflow.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [4]:
# --- Select the MLflow experiment that the runs below are logged under ---
# (the captured log shows MLflow creating the experiment when it does not exist yet)
EXPERIMENT_NAME = "Regression_Model_Comparison"
mlflow.set_experiment(EXPERIMENT_NAME)

2025/11/14 05:02:14 INFO mlflow.tracking.fluent: Experiment with name 'Regression_Model_Comparison' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/727291165338773283', creation_time=1763076734505, experiment_id='727291165338773283', last_update_time=1763076734505, lifecycle_stage='active', name='Regression_Model_Comparison', tags={}>

In [5]:
# Load the preprocessed train/test frames and split each into features and target.
# NOTE(review): the frames carry an 'Unnamed: 0' column (see the df_train.info() output),
# which looks like a saved row index and is currently kept as a model feature — confirm
# whether it should be dropped before training.
TARGET_COLUMN = 'price_transformed'

df_train = pd.read_csv("df_train_preprocessed_regression.csv")
df_test = pd.read_csv("df_test_preprocessed_regression.csv")

x_train, y_train = df_train.drop(columns=TARGET_COLUMN), df_train[TARGET_COLUMN]
x_test, y_test = df_test.drop(columns=TARGET_COLUMN), df_test[TARGET_COLUMN]

In [None]:
# Plan for the session
# Train some regression models and then log them in MLflow.
# Test the trained models with the test data.
# Log the metrics.
# After finishing with MLflow, transform the predicted values to test them against the actual values.
# Visualise the important features at the end.

# Perform the unsupervised learning too.

In [4]:
# Summarise the training frame: row count, column names, non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105903 entries, 0 to 105902
Data columns (total 40 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             105903 non-null  int64  
 1   model_photography      105903 non-null  int64  
 2   price_2                105903 non-null  int64  
 3   page                   105903 non-null  int64  
 4   price_transformed      105903 non-null  float64
 5   model_target_encoded   105903 non-null  float64
 6   country_9              105903 non-null  int64  
 7   country_24             105903 non-null  int64  
 8   country_29             105903 non-null  int64  
 9   country_46             105903 non-null  int64  
 10  country_RARE_GROUP     105903 non-null  int64  
 11  month_4                105903 non-null  int64  
 12  month_5                105903 non-null  int64  
 13  month_6                105903 non-null  int64  
 14  month_7                105903 non-nu

In [None]:
# Preprocess the target data: encode it as binary rather than leaving it as a label. XGBClassifier needs it in binary format.

In [None]:
# change classification models into regression models

In [6]:
# Candidate regressors, each paired with the hyperparameter grid to search.
# An empty grid marks a model that is not tuned.
models_to_tune = {
    "LinearRegression": dict(
        model=LinearRegression(),
        params={},  # no tuning for simple Linear Regression
    ),
    "Ridge": dict(
        model=Ridge(random_state=42),
        params={"alpha": [0.1, 1.0, 10.0]},
    ),
    "Lasso": dict(
        model=Lasso(random_state=42),
        params={"alpha": [0.001, 0.01, 0.1]},
    ),
    "GradientBoostingRegressor": dict(
        model=GradientBoostingRegressor(random_state=42),
        params=dict(
            n_estimators=[50, 100],
            learning_rate=[0.01, 0.1],
            max_depth=[3, 5],
        ),
    ),
}

In [7]:
# Train, evaluate and log every candidate model; each model gets its own MLflow run.
for model_name, config in models_to_tune.items():
    # Start a new MLflow run for each model so its params, metrics and artifact stay together
    with mlflow.start_run(run_name=model_name) as run:
        print(f"--- Starting run for {model_name} ---")

        estimator = config["model"]
        param_grid = config["params"]

        # 1. Hyperparameter tuning (simple GridSearchCV) and training
        if param_grid:
            # Use 'neg_mean_squared_error' for tuning since GridSearchCV maximizes the score
            grid_search = GridSearchCV(
                estimator,
                param_grid,
                cv=3,
                scoring='neg_mean_squared_error',
                n_jobs=-1
            )
            grid_search.fit(x_train, y_train)
            # With the default refit=True, best_estimator_ has already been refitted on the
            # whole training set with the best parameters, so no second fit is needed.
            best_estimator = grid_search.best_estimator_
            best_params = grid_search.best_params_

            # The CV score is a measured result, so it is logged as a metric (numeric and
            # comparable across runs) rather than as a parameter.
            mlflow.log_metric("best_cv_score", grid_search.best_score_)
            print(f"Best parameters: {best_params}")
        else:
            # No grid to search (e.g. Linear Regression): fit the estimator directly
            best_estimator = estimator.fit(x_train, y_train)
            best_params = {}

        # Log the selected hyperparameters (empty for untuned models)
        mlflow.log_params(best_params)

        # 2. Predict on the test set and evaluate
        y_pred = best_estimator.predict(x_test)
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)

        # 3. Log all test metrics to MLflow
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2_score", r2)

        # Print results
        print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R2 Score: {r2:.4f}")

        # 4. Log the model
        # Infer the model signature (input columns / output type) from the test data
        signature = infer_signature(x_test, y_pred)
        mlflow.sklearn.log_model(
            sk_model=best_estimator,
            artifact_path="model",
            signature=signature,
            registered_model_name=f"{model_name}_Regression_Model" # Optional: Register for easy deployment
        )

        print("-" * 30)

print("\nAll runs logged to MLflow UI.")

--- Starting run for LinearRegression ---
MAE: 0.1609, MSE: 0.0444, RMSE: 0.2107, R2 Score: 0.8564


Successfully registered model 'LinearRegression_Regression_Model'.
2025/11/14 05:07:41 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LinearRegression_Regression_Model, version 1
Created version '1' of model 'LinearRegression_Regression_Model'.


------------------------------
üèÉ View run LinearRegression at: http://127.0.0.1:5000/#/experiments/727291165338773283/runs/b677d0245f9e47b3a234da3bbf1173a4
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/727291165338773283
--- Starting run for Ridge ---
Best parameters: {'alpha': 1.0}
MAE: 0.1609, MSE: 0.0444, RMSE: 0.2107, R2 Score: 0.8564


Successfully registered model 'Ridge_Regression_Model'.
2025/11/14 05:08:02 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Ridge_Regression_Model, version 1
Created version '1' of model 'Ridge_Regression_Model'.


------------------------------
üèÉ View run Ridge at: http://127.0.0.1:5000/#/experiments/727291165338773283/runs/c577881ce4564609b91148dd72b780a6
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/727291165338773283
--- Starting run for Lasso ---
Best parameters: {'alpha': 0.001}
MAE: 0.1618, MSE: 0.0449, RMSE: 0.2119, R2 Score: 0.8547


Successfully registered model 'Lasso_Regression_Model'.
2025/11/14 05:08:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Lasso_Regression_Model, version 1
Created version '1' of model 'Lasso_Regression_Model'.


------------------------------
üèÉ View run Lasso at: http://127.0.0.1:5000/#/experiments/727291165338773283/runs/123fc3ae35bd49b689a2af6a5fde13b3
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/727291165338773283
--- Starting run for GradientBoostingRegressor ---
Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
MAE: 0.0488, MSE: 0.0076, RMSE: 0.0871, R2 Score: 0.9755


Successfully registered model 'GradientBoostingRegressor_Regression_Model'.
2025/11/14 05:11:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: GradientBoostingRegressor_Regression_Model, version 1
Created version '1' of model 'GradientBoostingRegressor_Regression_Model'.


------------------------------
üèÉ View run GradientBoostingRegressor at: http://127.0.0.1:5000/#/experiments/727291165338773283/runs/9df6140ca5f2468db7a484a8a6625e42
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/727291165338773283

All runs logged to MLflow UI.


In [8]:
# Load a logged model back through the sklearn flavour.
# Author's note: this may take some time (recorded by the author as "not worked even once").
# The run IDs are the ones shown in the "View run ..." links printed by the training cell.

run_id = 'b677d0245f9e47b3a234da3bbf1173a4'  # LinearRegression run
# run_id = '9df6140ca5f2468db7a484a8a6625e42'  # GradientBoostingRegressor run
logged_model_uri = f'runs:/{run_id}/model'
loaded_model = mlflow.sklearn.load_model(logged_model_uri)

Downloading artifacts:   0%|          | 0/1 [04:07<?, ?it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:00<00:00, 63.93it/s]


In [4]:
# Alternative: load the model through the generic pyfunc flavour (noted by the author as faster).
# The URI points directly at the logged model's artifact directory on the tracking server.
# NOTE(review): this rebinds `loaded_model` to a pyfunc wrapper; the later cells that read
# `coef_` / `feature_importances_` presumably need the native sklearn model instead —
# confirm which loader should be active before running them.
model_info_to_load = "mlflow-artifacts:/727291165338773283/models/m-e3ae57002f5848fdb64625d92086de80/artifacts"
loaded_model = mlflow.pyfunc.load_model(model_info_to_load)

Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:58<00:00, 11.65s/it] 


In [5]:
# Predict on the held-out features, then place features, actual target and prediction
# side by side for inspection.
# `DataFrame.assign` returns a new frame, so `x_test` itself is never modified; wrapping it
# in `pd.DataFrame(...)` is not guaranteed to copy, which risks leaking the two extra
# columns into the feature matrix that later cells pass to the model.
predictions = loaded_model.predict(x_test)
result = x_test.assign(**{
    'actual class': y_test,  # column names kept as-is; the values are regression targets
    'predicted class': predictions,
})

In [7]:
# Preview the first rows: features alongside the actual and predicted target values.
result.head()

Unnamed: 0.1,Unnamed: 0,model_photography,price_2,page,country_grouped_9,country_grouped_24,country_grouped_29,country_grouped_46,country_grouped_RARE_GROUP,page1_main_category_1,...,location_4,location_5,location_6,month_sin,month_cos,day_sin,day_cos,model_target_encoded_scaled,actual class,predicted class
0,31956,0,0,1,0,0,1,0,0,0,...,0,1,0,0.8660254,-0.5,0.571268,0.820763,-1.234297,6.044095,6.039656
1,97097,0,0,2,0,0,1,0,0,1,...,0,0,0,0.5,-0.866025,-0.988468,0.151428,0.144244,5.80053,5.902571
2,90252,0,0,1,0,0,1,0,0,1,...,0,1,0,0.5,-0.866025,-0.724793,0.688967,-1.234297,5.976924,6.058026
3,125226,0,1,2,0,0,1,0,0,0,...,1,0,0,1.224647e-16,-1.0,0.651372,-0.758758,0.144244,5.080116,5.119024
4,83215,0,0,1,1,0,0,0,0,0,...,0,0,0,0.5,-0.866025,0.998717,-0.050649,-1.234297,5.529871,5.515966


In [10]:
# Finding the important features of the loaded model
# ------ linear regression / Lasso / Ridge ------
# For linear models the fitted coefficients are read from `coef_`; features are ranked
# by the absolute size of their coefficient.

features = x_train.columns          # feature names, in training column order
coefficient = loaded_model.coef_    # one fitted coefficient per feature

importance_df = (
    pd.DataFrame({'Feature': features, 'Coefficient': coefficient})
    .assign(Absolute_Coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values(by='Absolute_Coefficient', ascending=False)
)

print(importance_df)

                        Feature   Coefficient  Absolute_Coefficient
2                       price_2 -8.437343e-01          8.437343e-01
20                     colour_8 -4.229371e-01          4.229371e-01
12        page1_main_category_4 -3.631125e-01          3.631125e-01
10        page1_main_category_2  2.933976e-01          2.933976e-01
9         page1_main_category_1  2.679667e-01          2.679667e-01
17                     colour_5  2.436069e-01          2.436069e-01
11        page1_main_category_3 -1.982518e-01          1.982518e-01
22                    colour_10 -1.844057e-01          1.844057e-01
15                     colour_3  1.699177e-01          1.699177e-01
31                   location_5 -9.597119e-02          9.597119e-02
1             model_photography -8.865918e-02          8.865918e-02
25                    colour_13 -8.187547e-02          8.187547e-02
29                   location_3  7.310736e-02          7.310736e-02
13                     colour_1 -6.669468e-02   

In [7]:
# Finding the important features of the loaded model
# ------ models exposing `feature_importances_` (e.g. GradientBoostingRegressor) ------
# Unlike the coefficient-based cell for the linear models, this one reads the
# importances stored on the fitted model's `feature_importances_` attribute.
importance = loaded_model.feature_importances_

# Pair every feature with its importance, most important first
importance_df = pd.DataFrame({'Features': x_train.columns, 'Importance': importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

importance_df.head(10)

Unnamed: 0,Features,Importance
2,price_2,0.581198
12,page1_main_category_4,0.16268
11,page1_main_category_3,0.092053
3,page,0.026634
10,page1_main_category_2,0.015803
15,colour_3,0.015043
31,location_5,0.011765
30,location_4,0.009713
9,page1_main_category_1,0.009142
14,colour_2,0.008943


In [None]:
# A more robust and model-agnostic method that works for any trained model 
# (including your Linear, Ridge, Lasso, and Gradient Boosting models) is Permutation Importance.
# It works by measuring the decrease in a model's score (e.g., R^2 or RMSE) when a single feature's values are randomly shuffled (permuted). 
# A feature that causes a large drop in the score is considered important.

In [6]:
# Model-agnostic approach (permutation importance)
from sklearn.inspection import permutation_importance

# `loaded_model` is the trained model being examined (e.g. Gradient Boosting or Ridge).
# No `scoring` argument is passed, so the estimator's default scorer is used.
# The outcome is stored as `perm_result` so that the `result` predictions frame built
# earlier in the notebook is not overwritten by an object of a different kind.
perm_result = permutation_importance(
    loaded_model,
    x_test,  # use the test set to measure importance on unseen data
    y_test,
    n_repeats=10,  # number of times to permute each feature
    random_state=42,
    n_jobs=-1
)

# Order the features from most to least important
sorted_idx = perm_result.importances_mean.argsort()[::-1]

importance_df = pd.DataFrame({
    'Feature': x_test.columns[sorted_idx],
    'Importance_Mean': perm_result.importances_mean[sorted_idx],
    'Importance_Std': perm_result.importances_std[sorted_idx]  # spread across repeats indicates reliability
})

print(importance_df)

                        Feature  Importance_Mean  Importance_Std
0                       price_2     1.245618e+00    1.164879e-02
1         page1_main_category_4     3.585880e-01    4.096960e-03
2         page1_main_category_3     1.714628e-01    1.401958e-03
3                          page     5.167028e-02    5.774063e-04
4                      colour_3     3.055580e-02    2.843731e-04
5             model_photography     2.976659e-02    6.475022e-04
6         page1_main_category_1     2.478847e-02    5.983050e-04
7         page1_main_category_2     2.220751e-02    4.170882e-04
8   model_target_encoded_scaled     2.188636e-02    3.962183e-04
9                    location_5     1.756603e-02    3.151836e-04
10                   location_4     1.584654e-02    2.841314e-04
11                     colour_2     1.561708e-02    1.216780e-04
12                     colour_8     1.042791e-02    1.462444e-04
13                   location_1     9.583885e-03    1.671195e-04
14                   loca

In [None]:
# Next Step
# 
# Train and log the regression model on MLflow
# Perform unsupervised learning and log the results on MLflow as well
# Build a Streamlit app
# Create pipelines

# Learning and Knowledge gain:
# Deepen the understanding on each model, their hyper parameters and grid search
# Learn to interpret the MLflow graphs and make the most use of MLflow