In [1]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer, make_column_selector
# NOTE(review): StandardScaler is only reachable from discriminant_analysis as an
# incidental internal import; the public location is sklearn.preprocessing
# (imported below, which is the binding that takes effect). This line can be dropped.
from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from tools.AddCountFeatureTransformer import AddCountFeatureTransformer
from tools.ClusterPipeline import ClusterPipeline
from tools.DataLoader import DataLoader

In [2]:
# Step 1: load the dataset collection through the project's DataLoader and
# keep only the entry stored under the "concrete" key.
DATASET_PATH = "db/mod_05_topic_10_various_data.pkl"  # relative to the notebook's working directory

data_loader = DataLoader(request_type="local", path=DATASET_PATH)
df_dict = data_loader.load_data()  # presumably a dict of DataFrames keyed by dataset name — confirm in tools.DataLoader
df = df_dict["concrete"]

In [3]:
# Feature engineering stage: a one-step pipeline ("add_count") that hands the
# ingredient columns below to AddCountFeatureTransformer, which writes its
# result to a new "Count" feature.
# NOTE(review): presumably "Count" is the number of ingredients actually present
# in a mix — confirm in tools.AddCountFeatureTransformer.
MATERIAL_COLUMNS = [
    "Cement",
    "BlastFurnaceSlag",
    "FlyAsh",
    "Water",
    "Superplasticizer",
    "CoarseAggregate",
    "FineAggregate",
]

add_count_step = AddCountFeatureTransformer(
    material_columns=MATERIAL_COLUMNS,
    new_feature_name="Count",
)

feature_engineering_pipeline = Pipeline(steps=[("add_count", add_count_step)])

In [4]:
# Preprocessing stage.
# A categorical branch (most-frequent imputation followed by target encoding)
# was sketched here earlier but is disabled; only the numeric branch is active.

# Numeric branch: standardise every column whose dtype is a NumPy number.
numeric_columns = make_column_selector(dtype_include=np.number)
num_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Column-wise preprocessor. Columns not matched by a transformer fall under
# ColumnTransformer's default remainder="drop", so non-numeric columns are
# discarded. verbose_feature_names_out=False keeps the original column names
# (no "num__" prefix).
column_transformer = ColumnTransformer(
    transformers=[("num", num_transformer, numeric_columns)],
    n_jobs=-1,
    verbose_feature_names_out=False,
)

# set_output returns the same estimator, now configured to emit DataFrames.
preprocessor = column_transformer.set_output(transform="pandas")

In [5]:
# End-to-end KMeans pipeline: feature engineering -> preprocessing -> clusterer.
# The final step is named "model" so that grid keys of the form
# "model__<param>" address the KMeans estimator.
kmeans_steps = [
    ("feature_engineering", feature_engineering_pipeline),
    ("preprocessing", preprocessor),
    ("model", KMeans()),
]
full_pipeline_kmeans = Pipeline(steps=kmeans_steps)

In [6]:
# Hyperparameter search space, keyed by model name ("kmeans").
# Inner keys follow the "<step>__<param>" convention and target the pipeline's
# "model" step.
kmeans_grid = {
    "model__n_clusters": list(range(2, 11)),  # candidate cluster counts 2..10 inclusive
    "model__random_state": [40],  # single fixed seed
}
param_grids = {"kmeans": kmeans_grid}

In [7]:
# Registry of candidate pipelines by model name; the "kmeans" key matches the
# key used in param_grids.
model_pipelines = dict(kmeans=full_pipeline_kmeans)

In [8]:
# Create an instance of ClusterPipeline and train models.
# train() receives the data frame, the name -> pipeline mapping and the
# name -> parameter-grid mapping defined in the cells above.
# NOTE(review): scoring="silhouette_score" is presumably the criterion used to
# pick the best grid entry per model — confirm in tools.ClusterPipeline.
model_pipeline = ClusterPipeline()
model_pipeline.train(df, model_pipelines, param_grids, scoring="silhouette_score")

In [9]:
# Show the evaluation results for the trained models.
# NOTE(review): help_text=True presumably enables the "Metric Explanations"
# section seen in the output — confirm in tools.ClusterPipeline.
model_pipeline.display_results(df, help_text=True)
# Visualize the pipeline registered under the "kmeans" name.
model_pipeline.visualize_pipeline("kmeans")

Evaluation Metrics for Best Models:


Unnamed: 0,Model,Silhouette Score,Davies-Bouldin Index,Calinski-Harabasz Index
0,kmeans,0.233689,1.488261,213.323272



Best Parameters for Each Model:


Unnamed: 0,Model,model__random_state,model__n_clusters
0,kmeans,40,6



Metric Explanations:
Silhouette Score: Measures how similar an object is to its own cluster compared to other clusters.
  - Range: [-1, 1], higher is better.
  - Higher values indicate better-defined clusters.
Davies-Bouldin Index: Measures the average similarity ratio of each cluster with its most similar cluster.
  - Range: [0, ∞), lower is better.
  - Lower values indicate better clustering.
Calinski-Harabasz Index: Ratio of the sum of between-cluster dispersion to within-cluster dispersion.
  - Range: [0, ∞), higher is better.
  - Higher values indicate better-defined clusters.


In [10]:
# Generate cluster report for `df` and render it.
# NOTE(review): the table appears to hold one row per cluster with a summary
# value per feature plus an ObjectCount column — confirm the aggregation used
# in tools.ClusterPipeline.generate_cluster_report.
cluster_report = model_pipeline.generate_cluster_report(df)
# `display` is not imported in this notebook; it relies on the IPython built-in.
display(cluster_report)

Unnamed: 0_level_0,Cement,BlastFurnaceSlag,FlyAsh,Water,Superplasticizer,CoarseAggregate,FineAggregate,Age,CompressiveStrength,ObjectCount
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,213.7,0.0,121.6,167.0,8.1,1014.3,793.5,28.0,33.36,285
1,339.0,0.0,0.0,192.0,0.0,1013.2,784.0,28.0,26.06,197
2,386.0,118.0,0.0,162.0,11.6,938.0,782.0,28.0,56.615,188
3,183.9,185.3,0.0,192.0,0.0,965.4,749.1,28.0,26.59,166
4,252.05,110.75,123.0,193.3,8.5,877.6,749.15,28.0,32.325,138
5,340.5,38.0,0.0,228.0,0.0,932.0,670.0,270.0,43.355,56


In [11]:
# Evaluate feature importance
# TODO(review): the call below is disabled, so this cell does nothing;
# re-enable it or delete the cell before sharing the notebook.
# model_pipeline.feature_importance(df)