In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
import mlflow
import mlflow.sklearn
import pickle
import mlflow

In [3]:
# Load data
train_data = pd.read_csv(r"D:\Click_stream\train_data.csv")
test_data = pd.read_csv(r"D:\Click_stream\test_data.csv")

In [4]:
train_data.head()

Unnamed: 0,year,month,day,order,country,session_id,page1_main_category,page2_clothing_model,colour,location,model_photography,price,price_2,page
0,2008,6,22,21,29,15648,3,C20,13,1,2,48,1,2
1,2008,5,19,6,29,10018,2,B26,13,3,1,57,1,2
2,2008,7,15,2,29,19388,3,C13,9,5,1,48,1,1
3,2008,5,2,2,29,7181,2,B11,2,4,1,43,2,1
4,2008,6,9,16,29,13493,2,B31,9,5,1,57,1,2


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132379 entries, 0 to 132378
Data columns (total 14 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   year                  132379 non-null  int64 
 1   month                 132379 non-null  int64 
 2   day                   132379 non-null  int64 
 3   order                 132379 non-null  int64 
 4   country               132379 non-null  int64 
 5   session_id            132379 non-null  int64 
 6   page1_main_category   132379 non-null  int64 
 7   page2_clothing_model  132379 non-null  object
 8   colour                132379 non-null  int64 
 9   location              132379 non-null  int64 
 10  model_photography     132379 non-null  int64 
 11  price                 132379 non-null  int64 
 12  price_2               132379 non-null  int64 
 13  page                  132379 non-null  int64 
dtypes: int64(13), object(1)
memory usage: 14.1+ MB


In [6]:
from sklearn.preprocessing import LabelEncoder
le1 = LabelEncoder()
train_data['page2_clothing_model'] = le1.fit_transform(train_data['page2_clothing_model'])

le2 = LabelEncoder()
test_data['page2_clothing_model'] = le2.fit_transform(test_data['page2_clothing_model'])

In [7]:
from sklearn.preprocessing import StandardScaler
train_features = train_data[['page1_main_category', 'page2_clothing_model', 'colour']]
train_target = train_data['price']

test_features = test_data[['page1_main_category', 'page2_clothing_model', 'colour']]
test_target = test_data['price']

scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
test_features = scaler.transform(test_features)

In [8]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [9]:
model_params = {
    "Linear Regressor": (LinearRegression(), {}),
    
    "Ridge Regressor": (Ridge(), {
        "alpha": [0.01, 0.1, 1, 10, 100]
    }),
    
    "Lasso Regressor": (Lasso(), {
        "alpha": [0.01, 0.1, 1, 10, 100]
    }),
    
    "Gradient Boosting Regressor": (GradientBoostingRegressor(), {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 7]
    }),
    
    "Random Forest Regressor": (RandomForestRegressor(), {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5, 10]
    })
}

In [10]:
reports = []

for name, (model, param_grid) in model_params.items():
    if param_grid:
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring="r2", n_jobs=-1)
        grid_search.fit(train_features, train_target)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
    else:
        best_model = model
        best_model.fit(train_features, train_target)
        best_params = "Default Parameters"

    predictions = best_model.predict(test_features)
    mae = mean_absolute_error(test_target, predictions)
    r2 = r2_score(test_target, predictions)

    reports.append((name, best_model, best_params, mae, r2))

In [11]:
for name, model, best_params, mae, r2 in reports:
    print(f"Model: {name}")
    print(f"Best Parameters: {best_params}")
    print(f"MAE: {mae:.4f}")
    print(f"R2 Score: {r2:.4f}")
    print("\n")

Model: Linear Regressor
Best Parameters: Default Parameters
MAE: 10.0456
R2 Score: 0.1364


Model: Ridge Regressor
Best Parameters: {'alpha': 10}
MAE: 10.0454
R2 Score: 0.1364


Model: Lasso Regressor
Best Parameters: {'alpha': 0.01}
MAE: 10.0402
R2 Score: 0.1364


Model: Gradient Boosting Regressor
Best Parameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300}
MAE: 0.4828
R2 Score: 0.9574


Model: Random Forest Regressor
Best Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
MAE: 0.4361
R2 Score: 0.9553




In [12]:
import mlflow
import mlflow.sklearn
import mlflow.pyfunc

In [14]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Price_Prediction_(Regression)")

for name, model, best_params, mae, r2 in reports:
    with mlflow.start_run(run_name=name) as run:
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("R2", r2)
        
        if name == "Linear Regressor":
            mlflow.sklearn.log_model(model, "linear_model")
        elif name == "Ridge Regressor":
            mlflow.sklearn.log_model(model, "ridge_model")
        elif name == "Lasso Regressor":
            mlflow.sklearn.log_model(model, "lasso_model")
        elif name == "Gradient Boosting Regressor":
            mlflow.sklearn.log_model(model, "gradient_boosting_model")
        elif name == "Random Forest Regressor":
            mlflow.sklearn.log_model(model, "random_forest_model")
        else:
            pass

2025/12/09 11:15:41 INFO mlflow.tracking.fluent: Experiment with name 'Price_Prediction_(Regression)' does not exist. Creating a new experiment.




üèÉ View run Linear Regressor at: http://127.0.0.1:5000/#/experiments/601029064602870059/runs/1aed3ec981914119b8c0c3a8be70a989
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/601029064602870059




üèÉ View run Ridge Regressor at: http://127.0.0.1:5000/#/experiments/601029064602870059/runs/dd72677bf3634f8b9e1de9d373c838e3
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/601029064602870059




üèÉ View run Lasso Regressor at: http://127.0.0.1:5000/#/experiments/601029064602870059/runs/592fa7c298264152b593970b6b7c5444
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/601029064602870059




üèÉ View run Gradient Boosting Regressor at: http://127.0.0.1:5000/#/experiments/601029064602870059/runs/fa59a3c955424b8ca95f26a128e276a8
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/601029064602870059




üèÉ View run Random Forest Regressor at: http://127.0.0.1:5000/#/experiments/601029064602870059/runs/e71375fd31ca4c47ac869252d04527be
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/601029064602870059


In [18]:
model_name ='Gradient Boosting Regressor'
run_id = 'fa59a3c955424b8ca95f26a128e276a8'
model_uri = f'runs:/{run_id}/gradient_boosting_model'

with mlflow.start_run(run_id=run_id):
    mlflow.register_model(model_uri= model_uri , name= model_name)

Registered model 'Gradient Boosting Regressor' already exists. Creating a new version of this model...
2025/12/09 11:27:10 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Gradient Boosting Regressor, version 3
Created version '3' of model 'Gradient Boosting Regressor'.


üèÉ View run Gradient Boosting Regressor at: http://127.0.0.1:5000/#/experiments/601029064602870059/runs/fa59a3c955424b8ca95f26a128e276a8
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/601029064602870059


In [19]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
model_name = "Gradient Boosting Regressor"
model_version = "1"
model_uri = f"models:/{model_name}/{model_version}"
model = mlflow.pyfunc.load_model(model_uri)

 - mlflow (current: 3.6.0, required: mlflow==3.1.0)
 - cloudpickle (current: 3.1.2, required: cloudpickle==3.1.1)
 - numpy (current: 2.3.5, required: numpy==2.1.3)
 - pandas (current: 2.3.3, required: pandas==2.2.3)
 - psutil (current: 7.1.3, required: psutil==6.1.0)
 - scikit-learn (current: 1.7.2, required: scikit-learn==1.6.1)
 - scipy (current: 1.16.3, required: scipy==1.15.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [20]:
import pickle

with open('le1_clothing_model.pkl', 'wb') as f:
    pickle.dump(le1, f)

with open('le2_clothing_model.pkl', 'wb') as f:
    pickle.dump(le2, f)

with open('regression_standard_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('gradient_boosting_model.pkl', 'wb') as f:
    pickle.dump(model, f)