In [2]:
import pandas as pd
import pickle
import mlflow
import mlflow.sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV


In [3]:
# Read the pre-split click-stream datasets (train first, then test).
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs where D:\Click_stream exists.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"D:\Click_stream\train_data.csv",
        r"D:\Click_stream\test_data.csv",
    )
)

In [4]:
# Load the already-fitted label encoders used for the `page2_clothing_model` column.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles produced by a trusted run of this project.
# The `with` blocks close each file handle; the previous bare `open(...)` calls
# left both handles open for the lifetime of the kernel.
with open(r"D:\Click_stream\le1_clothing_model.pkl", "rb") as encoder_file:
    le1 = pickle.load(encoder_file)
with open(r"D:\Click_stream\le2_clothing_model.pkl", "rb") as encoder_file:
    le2 = pickle.load(encoder_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [5]:
# Encode the categorical `page2_clothing_model` column as integers.
# Both splits go through the SAME encoder (le1): the classifiers are fitted on
# the le1 encoding of the training split, so scoring a test split encoded by a
# different encoder (le2) is only valid if the two encoders happen to share an
# identical class ordering.
# NOTE(review): LabelEncoder.transform raises ValueError on labels it was not
# fitted on — confirm le1 covers every clothing model present in the test split.
# This cell overwrites the column in place, so run it once per kernel session.
train_data['page2_clothing_model'] = le1.transform(train_data['page2_clothing_model'])
test_data['page2_clothing_model'] = le1.transform(test_data['page2_clothing_model'])

In [6]:
# Feature/target definition, declared once so the train and test selections
# cannot drift apart (the column list used to be duplicated for each split).
FEATURE_COLUMNS = [
    'page1_main_category', 'page2_clothing_model', 'colour', 'order',
    'price', 'location', 'model_photography',
]
TARGET_COLUMN = 'price_2'

# NOTE(review): every model reports ~1.00 test accuracy; presumably `price_2`
# is derived from `price` (and the category), which would make this target
# leakage — verify that `price` is a legitimate feature for this task.
train_target = train_data[TARGET_COLUMN]
test_target = test_data[TARGET_COLUMN]

# Fit the scaler on the training split only, then reuse its statistics for the
# test split so no test information enters the scaling.
scaler = StandardScaler()
train_features = scaler.fit_transform(train_data[FEATURE_COLUMNS])
test_features = scaler.transform(test_data[FEATURE_COLUMNS])

In [7]:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV


In [8]:
# Fixed seed so the estimators (and therefore the grid-search winners and the
# reported metrics) are reproducible across Restart Kernel -> Run All.
SEED = 42

# Candidate models: name -> (estimator, hyper-parameter grid for GridSearchCV).
# An empty grid means "fit the estimator as configured, without a search".
model_params = {
    "Logistic_Regression": (LogisticRegression(random_state=SEED), {
        "C": [0.01, 0.1, 1, 10, 100],
        "solver": ["liblinear", "lbfgs"]
    }),

    "Random_Forest": (RandomForestClassifier(random_state=SEED), {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5, 10]
    }),

    "Decision_Tree": (DecisionTreeClassifier(random_state=SEED), {
        "max_depth": [None, 5, 10, 20],
        "min_samples_split": [2, 5, 10],
        "criterion": ["gini", "entropy"]
    })
}

In [9]:
# Tune (when a grid is given), fit and evaluate every candidate model.
# Each entry appended to `reports` is the tuple
# (name, fitted model, params dict, test accuracy, classification report text,
#  confusion matrix).
reports = []
for name, (model, param_grid) in model_params.items():
    if param_grid:
        # 5-fold cross-validated grid search on the training split, scored by accuracy
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
        grid_search.fit(train_features, train_target)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
    else:
        # No grid to search: fit the estimator exactly as configured.
        best_model = model
        best_model.fit(train_features, train_target)
        # Keep best_params a dict in both branches. The previous placeholder
        # string would make mlflow.log_params(best_params) fail, because
        # log_params expects a mapping of parameter names to values.
        best_params = best_model.get_params()

    # Evaluate on the held-out test split
    predictions = best_model.predict(test_features)
    accuracy = accuracy_score(test_target, predictions)
    report = classification_report(test_target, predictions)
    confusion = confusion_matrix(test_target, predictions)

    reports.append((name, best_model, best_params, accuracy, report, confusion))

In [10]:
# Print a plain-text summary of each evaluated model.
for entry in reports:
    name, model, best_params, accuracy, report, confusion = entry
    summary_lines = [
        f"Model: {name}",
        f"Best Parameters: {best_params}",
        f"Accuracy: {accuracy:.4f}",
        f"Classification Report:\n{report}",
        f"Confusion Matrix:\n{confusion}\n",
    ]
    # One print of the joined lines emits the same text as one print per line
    print("\n".join(summary_lines))

Model: Logistic_Regression
Best Parameters: {'C': 10, 'solver': 'liblinear'}
Accuracy: 0.9985
Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00     16981
           2       1.00      1.00      1.00     16114

    accuracy                           1.00     33095
   macro avg       1.00      1.00      1.00     33095
weighted avg       1.00      1.00      1.00     33095

Confusion Matrix:
[[16930    51]
 [    0 16114]]

Model: Random_Forest
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00     16981
           2       1.00      1.00      1.00     16114

    accuracy                           1.00     33095
   macro avg       1.00      1.00      1.00     33095
weighted avg       1.00      1.00      1.00     33095

Confusion Matrix:
[[16981     0]
 [  

In [None]:
# Log every evaluated model to the local MLflow tracking server: one run per
# model, holding the fitted estimator, its parameters, the test accuracy and
# the text evaluation artefacts.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Purchase_Classification_Models")

for entry in reports:
    name, model, best_params, accuracy, report, confusion = entry
    # artefact file name -> text content, logged in this order
    text_artifacts = {
        "classification_report.txt": report,
        "confusion_matrix.txt": str(confusion),
    }
    with mlflow.start_run(run_name=name) as run:
        mlflow.sklearn.log_model(model, f"{name}_model")
        mlflow.log_params(best_params)
        mlflow.log_metric("accuracy", accuracy)
        for artifact_file, text in text_artifacts.items():
            mlflow.log_text(text, artifact_file)




🏃 View run Logistic_Regression at: http://127.0.0.1:5000/#/experiments/191990390282879002/runs/8d3f002278db4f2aac4e2e56d3da73b9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/191990390282879002




🏃 View run Random_Forest at: http://127.0.0.1:5000/#/experiments/191990390282879002/runs/6e8146009b414202a7ceaf713ad56496
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/191990390282879002




🏃 View run Decision_Tree at: http://127.0.0.1:5000/#/experiments/191990390282879002/runs/dc3dd6d745544d7f90b00eef2f9c4946
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/191990390282879002


In [12]:
# Register the logged Random Forest model in the MLflow Model Registry.
# NOTE(review): run_id is hard-coded; confirm it points at the run whose
# `Random_Forest_model` artefact should be registered, and update it whenever
# the logging cell is re-run (each run gets a new id).
model_name = 'Random Forest Classifier'
run_id = 'd96ecf051ad24653858820abc91bbb59'
model_uri = f'runs:/{run_id}/Random_Forest_model'

# register_model only needs the model URI. Re-opening the already finished run
# with start_run(run_id=...) just to register it is unnecessary and modifies
# that run, so the model is registered directly.
registered_version = mlflow.register_model(model_uri=model_uri, name=model_name)

Registered model 'Random Forest Classifier' already exists. Creating a new version of this model...
2025/12/09 10:41:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest Classifier, version 3


🏃 View run Random_Forest at: http://127.0.0.1:5000/#/experiments/191990390282879002/runs/d96ecf051ad24653858820abc91bbb59
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/191990390282879002


Created version '3' of model 'Random Forest Classifier'.


In [13]:
# Fetch a registered model version from the local tracking server as a generic
# pyfunc model.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

model_name = "Random Forest Classifier"
# NOTE(review): the version is pinned to "1"; confirm this is the version
# meant to be exported rather than the most recently registered one.
model_version = "1"
model_uri = "models:/{}/{}".format(model_name, model_version)

model = mlflow.pyfunc.load_model(model_uri)


 - mlflow (current: 3.6.0, required: mlflow==3.1.0)
 - cloudpickle (current: 3.1.2, required: cloudpickle==3.1.1)
 - numpy (current: 2.3.5, required: numpy==2.1.3)
 - pandas (current: 2.3.3, required: pandas==2.2.3)
 - psutil (current: 7.1.3, required: psutil==6.1.0)
 - scikit-learn (current: 1.7.2, required: scikit-learn==1.6.1)
 - scipy (current: 1.16.3, required: scipy==1.15.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [14]:
# Persist everything needed at inference time: the model, the fitted feature
# scaler and the label encoder for `page2_clothing_model`.
# (`pickle` is already imported in the notebook's import cell.)
with open("random_forest_classifier_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("classification_standard_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Save le1 — the encoder the training split was encoded with — so that
# inference-time encoding matches what the classifiers saw while fitting.
# Saving le2 (as before) is only equivalent if both encoders share the exact
# same class ordering.
# NOTE(review): `model` is the registry copy loaded earlier in the notebook;
# confirm it was trained with this same encoder.
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le1, f)