In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import transformations as TR
import utils as UT
import json
import train_model as TM

In [2]:
# Configure the project logger through the local utils module; the recorded
# output of this cell reports the log file as ./logs.log.
UT.setup_logger()

[2025-01-02 14:21:51,971] [INFO] [utils]: Logger set up: ./logs.log


In [3]:
# Base arguments handed to the training pipeline: the path of the YAML config
# plus a JSON-encoded mapping of dotted config keys to override values.
args_dict = dict(
    config="./config.yaml",
    override=json.dumps(
        {
            "mlflow.mlflow_experiment_name": "n_clusters_investigation",
            "model.kmeans.params.model_params.n_clusters": 2,
        }
    ),
)

In [4]:
# Sweep the KMeans n_clusters setting over 2..10, launching one training
# pipeline run per value. Every run uses the same MLflow experiment name.
# NOTE: in the recorded output consecutive runs start roughly 45 minutes
# apart, so a full sweep is a multi-hour job.
cluster_range = range(2, 11)
experiment_name = "n_clusters_investigation"

for n_clusters in cluster_range:
    print(f"Running pipeline with n_clusters={n_clusters}")

    # Build a fresh argument dict per run instead of overwriting
    # args_dict["override"] in place: the shared base arguments stay
    # untouched, so this cell behaves the same regardless of which other
    # cells ran before it or how often it is re-run.
    run_args = {
        **args_dict,
        "override": json.dumps({
            "mlflow.mlflow_experiment_name": experiment_name,
            "model.kmeans.params.model_params.n_clusters": n_clusters,
        }),
    }

    TM.run_training_pipeline(run_args)


[2025-01-02 00:30:12,902] [INFO] [utils]: Logger set up: ./logs.log
[2025-01-02 00:30:12,914] [INFO] [utils]: Loading data from ./consumers_features.parquet.gzip


Running pipeline with n_clusters=2


[2025-01-02 00:30:13,114] [INFO] [utils]: Data loaded successfully with shape: (315462, 13)
2025/01/02 00:30:13 INFO mlflow.tracking.fluent: Experiment with name 'n_clusters_investigation' does not exist. Creating a new experiment.
[2025-01-02 00:30:13,135] [INFO] [train_model]: Applying Standard Scaling to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 'total_returned_items']
[2025-01-02 00:30:13,135] [INFO] [transformations]: Standard scaling pipeline built successfully
[2025-01-02 00:30:13,266] [INFO] [train_model]: Applying One-Hot Encoding to columns: {'favourite_metal': ['0', '1', '10', '13', '15', '16', '17', '22', '23', '24', '25', '3', '4', '5', '6', '7'], 'favourite_store_type': ['Concept Store', 'Online', 'Shop In Shop']}
[2025-01-02 00:30:13,266] [INFO] [transformations]: Pipeline with one-hot encoding built successfully
[2025-01-02 00:30

Running pipeline with n_clusters=3


[2025-01-02 01:15:17,309] [INFO] [train_model]: Applying Standard Scaling to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 'total_returned_items']
[2025-01-02 01:15:17,309] [INFO] [transformations]: Standard scaling pipeline built successfully
[2025-01-02 01:15:17,448] [INFO] [train_model]: Applying One-Hot Encoding to columns: {'favourite_metal': ['0', '1', '10', '13', '15', '16', '17', '22', '23', '24', '25', '3', '4', '5', '6', '7'], 'favourite_store_type': ['Concept Store', 'Online', 'Shop In Shop']}
[2025-01-02 01:15:17,449] [INFO] [transformations]: Pipeline with one-hot encoding built successfully
[2025-01-02 01:15:17,709] [INFO] [train_model]: Applying PCA to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 

Running pipeline with n_clusters=4


[2025-01-02 02:00:17,100] [INFO] [train_model]: Applying Standard Scaling to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 'total_returned_items']
[2025-01-02 02:00:17,101] [INFO] [transformations]: Standard scaling pipeline built successfully
[2025-01-02 02:00:17,232] [INFO] [train_model]: Applying One-Hot Encoding to columns: {'favourite_metal': ['0', '1', '10', '13', '15', '16', '17', '22', '23', '24', '25', '3', '4', '5', '6', '7'], 'favourite_store_type': ['Concept Store', 'Online', 'Shop In Shop']}
[2025-01-02 02:00:17,233] [INFO] [transformations]: Pipeline with one-hot encoding built successfully
[2025-01-02 02:00:17,508] [INFO] [train_model]: Applying PCA to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 

Running pipeline with n_clusters=5


[2025-01-02 02:44:56,443] [INFO] [train_model]: Applying One-Hot Encoding to columns: {'favourite_metal': ['0', '1', '10', '13', '15', '16', '17', '22', '23', '24', '25', '3', '4', '5', '6', '7'], 'favourite_store_type': ['Concept Store', 'Online', 'Shop In Shop']}
[2025-01-02 02:44:56,444] [INFO] [transformations]: Pipeline with one-hot encoding built successfully
[2025-01-02 02:44:56,705] [INFO] [train_model]: Applying PCA to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 'total_returned_items'] with pca_n_components=5
[2025-01-02 02:44:56,705] [INFO] [transformations]: PCA pipeline built successfully for columns ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 'total_returned_items'] with 5 components
[2025-01-02 02:44:56,

Running pipeline with n_clusters=6


[2025-01-02 03:29:22,049] [INFO] [train_model]: Applying Standard Scaling to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 'total_returned_items']
[2025-01-02 03:29:22,049] [INFO] [transformations]: Standard scaling pipeline built successfully
[2025-01-02 03:29:22,174] [INFO] [train_model]: Applying One-Hot Encoding to columns: {'favourite_metal': ['0', '1', '10', '13', '15', '16', '17', '22', '23', '24', '25', '3', '4', '5', '6', '7'], 'favourite_store_type': ['Concept Store', 'Online', 'Shop In Shop']}
[2025-01-02 03:29:22,174] [INFO] [transformations]: Pipeline with one-hot encoding built successfully
[2025-01-02 03:29:22,456] [INFO] [train_model]: Applying PCA to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 

Running pipeline with n_clusters=7


[2025-01-02 04:14:09,143] [INFO] [train_model]: Applying Standard Scaling to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 'total_returned_items']
[2025-01-02 04:14:09,143] [INFO] [transformations]: Standard scaling pipeline built successfully
[2025-01-02 04:14:09,279] [INFO] [train_model]: Applying One-Hot Encoding to columns: {'favourite_metal': ['0', '1', '10', '13', '15', '16', '17', '22', '23', '24', '25', '3', '4', '5', '6', '7'], 'favourite_store_type': ['Concept Store', 'Online', 'Shop In Shop']}
[2025-01-02 04:14:09,280] [INFO] [transformations]: Pipeline with one-hot encoding built successfully
[2025-01-02 04:14:09,547] [INFO] [train_model]: Applying PCA to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 

Running pipeline with n_clusters=8


[2025-01-02 04:58:10,527] [INFO] [transformations]: Standard scaling pipeline built successfully
[2025-01-02 04:58:10,659] [INFO] [train_model]: Applying One-Hot Encoding to columns: {'favourite_metal': ['0', '1', '10', '13', '15', '16', '17', '22', '23', '24', '25', '3', '4', '5', '6', '7'], 'favourite_store_type': ['Concept Store', 'Online', 'Shop In Shop']}
[2025-01-02 04:58:10,659] [INFO] [transformations]: Pipeline with one-hot encoding built successfully
[2025-01-02 04:58:10,929] [INFO] [train_model]: Applying PCA to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 'total_returned_items'] with pca_n_components=5
[2025-01-02 04:58:10,930] [INFO] [transformations]: PCA pipeline built successfully for columns ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 

Running pipeline with n_clusters=9


[2025-01-02 05:43:02,172] [INFO] [train_model]: Applying Standard Scaling to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 'total_returned_items']
[2025-01-02 05:43:02,172] [INFO] [transformations]: Standard scaling pipeline built successfully
[2025-01-02 05:43:02,301] [INFO] [train_model]: Applying One-Hot Encoding to columns: {'favourite_metal': ['0', '1', '10', '13', '15', '16', '17', '22', '23', '24', '25', '3', '4', '5', '6', '7'], 'favourite_store_type': ['Concept Store', 'Online', 'Shop In Shop']}
[2025-01-02 05:43:02,302] [INFO] [transformations]: Pipeline with one-hot encoding built successfully
[2025-01-02 05:43:02,572] [INFO] [train_model]: Applying PCA to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 

Running pipeline with n_clusters=10


[2025-01-02 06:27:53,179] [INFO] [train_model]: Applying One-Hot Encoding to columns: {'favourite_metal': ['0', '1', '10', '13', '15', '16', '17', '22', '23', '24', '25', '3', '4', '5', '6', '7'], 'favourite_store_type': ['Concept Store', 'Online', 'Shop In Shop']}
[2025-01-02 06:27:53,180] [INFO] [transformations]: Pipeline with one-hot encoding built successfully
[2025-01-02 06:27:53,476] [INFO] [train_model]: Applying PCA to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 'total_returned_items'] with pca_n_components=5
[2025-01-02 06:27:53,477] [INFO] [transformations]: PCA pipeline built successfully for columns ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 'total_returned_items'] with 5 components
[2025-01-02 06:27:53,

In [4]:
# Re-run the training pipeline for the two n_clusters values of interest
# (3 and 5), using a dedicated MLflow experiment name for these runs.
cluster_range = [3, 5]
experiment_name = "two_clusterings"

for n_clusters in cluster_range:
    print(f"Running pipeline with n_clusters={n_clusters}")

    # Build a fresh argument dict per run instead of overwriting
    # args_dict["override"] in place: the shared base arguments stay
    # untouched, so this cell behaves the same regardless of which other
    # cells ran before it or how often it is re-run.
    run_args = {
        **args_dict,
        "override": json.dumps({
            "mlflow.mlflow_experiment_name": experiment_name,
            "model.kmeans.params.model_params.n_clusters": n_clusters,
        }),
    }

    TM.run_training_pipeline(run_args)


[2025-01-02 14:22:00,806] [INFO] [utils]: Logger set up: ./logs.log
[2025-01-02 14:22:00,806] [INFO] [utils]: Loading data from ./consumers_features.parquet.gzip
[2025-01-02 14:22:01,001] [INFO] [utils]: Data loaded successfully with shape: (315462, 13)


Running pipeline with n_clusters=3


2025/01/02 14:22:01 INFO mlflow.tracking.fluent: Experiment with name 'two_clusterings' does not exist. Creating a new experiment.
[2025-01-02 14:22:01,022] [INFO] [train_model]: Applying Standard Scaling to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 'total_returned_items']
[2025-01-02 14:22:01,022] [INFO] [transformations]: Standard scaling pipeline built successfully
[2025-01-02 14:22:01,152] [INFO] [train_model]: Applying One-Hot Encoding to columns: {'favourite_metal': ['0', '1', '10', '13', '15', '16', '17', '22', '23', '24', '25', '3', '4', '5', '6', '7'], 'favourite_store_type': ['Concept Store', 'Online', 'Shop In Shop']}
[2025-01-02 14:22:01,152] [INFO] [transformations]: Pipeline with one-hot encoding built successfully
[2025-01-02 14:22:01,440] [INFO] [train_model]: Applying PCA to columns: ['recency', 'total_baskets', 'total_spend_mon

Running pipeline with n_clusters=5


[2025-01-02 15:07:18,053] [INFO] [train_model]: Applying Standard Scaling to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 'total_returned_items']
[2025-01-02 15:07:18,053] [INFO] [transformations]: Standard scaling pipeline built successfully
[2025-01-02 15:07:18,184] [INFO] [train_model]: Applying One-Hot Encoding to columns: {'favourite_metal': ['0', '1', '10', '13', '15', '16', '17', '22', '23', '24', '25', '3', '4', '5', '6', '7'], 'favourite_store_type': ['Concept Store', 'Online', 'Shop In Shop']}
[2025-01-02 15:07:18,185] [INFO] [transformations]: Pipeline with one-hot encoding built successfully
[2025-01-02 15:07:18,465] [INFO] [train_model]: Applying PCA to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 

In [4]:
# Second pass over n_clusters = 3 and 5, using a separate MLflow experiment
# name ("two_clusterings_2") for these runs.
cluster_range = [3, 5]
experiment_name = "two_clusterings_2"

for n_clusters in cluster_range:
    print(f"Running pipeline with n_clusters={n_clusters}")

    # Build a fresh argument dict per run instead of overwriting
    # args_dict["override"] in place: the shared base arguments stay
    # untouched, so this cell behaves the same regardless of which other
    # cells ran before it or how often it is re-run.
    run_args = {
        **args_dict,
        "override": json.dumps({
            "mlflow.mlflow_experiment_name": experiment_name,
            "model.kmeans.params.model_params.n_clusters": n_clusters,
        }),
    }

    TM.run_training_pipeline(run_args)


[2025-01-02 13:20:24,640] [INFO] [utils]: Logger set up: ./logs.log
[2025-01-02 13:20:24,642] [INFO] [utils]: Loading data from ./consumers_features.parquet.gzip
[2025-01-02 13:20:24,823] [INFO] [utils]: Data loaded successfully with shape: (315462, 13)
2025/01/02 13:20:24 INFO mlflow.tracking.fluent: Experiment with name 'two_clusterings_2' does not exist. Creating a new experiment.


Running pipeline with n_clusters=3


[2025-01-02 13:20:24,859] [INFO] [train_model]: Applying Standard Scaling to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 'total_returned_items']
[2025-01-02 13:20:24,860] [INFO] [transformations]: Standard scaling pipeline built successfully
[2025-01-02 13:20:24,988] [INFO] [train_model]: Applying One-Hot Encoding to columns: {'favourite_metal': ['0', '1', '10', '13', '15', '16', '17', '22', '23', '24', '25', '3', '4', '5', '6', '7'], 'favourite_store_type': ['Concept Store', 'Online', 'Shop In Shop']}
[2025-01-02 13:20:24,988] [INFO] [transformations]: Pipeline with one-hot encoding built successfully
[2025-01-02 13:20:25,256] [INFO] [train_model]: Applying PCA to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 

Running pipeline with n_clusters=5


[2025-01-02 14:04:02,404] [INFO] [train_model]: Applying Standard Scaling to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 'total_returned_items']
[2025-01-02 14:04:02,405] [INFO] [transformations]: Standard scaling pipeline built successfully
[2025-01-02 14:04:02,530] [INFO] [train_model]: Applying One-Hot Encoding to columns: {'favourite_metal': ['0', '1', '10', '13', '15', '16', '17', '22', '23', '24', '25', '3', '4', '5', '6', '7'], 'favourite_store_type': ['Concept Store', 'Online', 'Shop In Shop']}
[2025-01-02 14:04:02,530] [INFO] [transformations]: Pipeline with one-hot encoding built successfully
[2025-01-02 14:04:02,837] [INFO] [train_model]: Applying PCA to columns: ['recency', 'total_baskets', 'total_spend_money', 'total_refund_money', 'total_net_revenue', 'average_basket_spend', 'total_items_purchased', 'total_distinct_items_purchased', 

KeyboardInterrupt: 