This notebook runs a K-Means clustering analysis on preprocessed data.
It evaluates the model using cluster quality metrics to determine the optimal number of groups and saves the best model for the next step.

### Configuration and Library Imports

In [1]:
# Execute the project's settings script; with %run, the names it defines at
# top level become available in this notebook's namespace once it finishes.
# NOTE(review): `load_config` is also imported from `src.settings` in the next
# cell — presumably this %run is what makes the `src` package importable from
# here (e.g. by adjusting sys.path); confirm, otherwise this cell is redundant.
%run ../src/settings.py

In [2]:
import pandas as pd
import plotly.express as px

from src.settings import load_config
from src.smart_k_means import calculate_best_k_with_entropy
from src.smart_k_means import evaluate_cluster
from src.smart_k_means import get_comparision_clusters_graph

In [3]:
# Bounds of the cluster-count search. Both ends are evaluated: the evaluation
# cell iterates over range(MIN_CLUSTERS, MAX_CLUSTERS + 1).
MIN_CLUSTERS, MAX_CLUSTERS = 3, 7

# Project configuration; the data-loading cell reads `config['datasets']`.
config = load_config()

In [4]:
# Aliases for the cluster-count bounds under the QTD_* names read by the
# Smart K-Means cell. They are bound to MIN_CLUSTERS / MAX_CLUSTERS (defined in
# the previous cell) instead of repeating the literals, so the search range has
# a single source of truth. `config` is already loaded in the previous cell, so
# it is not loaded a second time here.
QTD_MIN_CLUSTERS = MIN_CLUSTERS
QTD_MAX_CLUSTERS = MAX_CLUSTERS


### Data Loading and Overview

In [5]:
# Identifier column, plus the columns left out of the clustering features
# (the exclusion list follows the original notebook).
id_column = "Cod_Setor"
cols_to_ignore = ["RendaMedia"]

# The dataset location is taken from the project configuration.
path_file = config['datasets'][5]

# Read the identifier column with dtype "object" instead of letting pandas
# infer a numeric type for it.
df_original = pd.read_excel(path_file, dtype={id_column: "object"})

# Feature table: everything except the identifier and the ignored columns.
non_feature_columns = [id_column] + cols_to_ignore
df_processed = df_original.drop(columns=non_feature_columns)

# Show the raw table first, then summary statistics with one row per feature.
display(df_original, df_processed.describe().T)

Unnamed: 0,Cod_Setor,2010_VD1,2010_VD2,2010_VD3,2010_VD4,2010_VE1,2010_VE2,2010_VE3,2010_VED1,2010_VED2,2010_VH1,2010_VH2,2010_VH3,2010_VH4,2010_VH5,VA4_2010,RendaMedia
0,411520005010001,0.723636,0.923523,0.568170,0.774691,0.836859,0.153226,0.888877,1.000000,0.9856,1.000000,0.024468,1.0000,1.000000,0.572691,0.190181,4800.240000
1,411520005010002,0.669091,0.947785,0.659827,0.805556,0.910891,0.291475,0.891996,1.000000,0.9868,1.000000,0.257003,1.0000,0.588340,0.550909,0.060300,5965.110000
2,411520005010003,0.607273,0.948840,0.617171,0.811728,0.877588,0.387865,0.936590,1.000000,0.9688,1.000000,0.239116,1.0000,0.833664,0.645631,0.009710,4789.690000
3,411520005010004,0.567273,0.962553,0.571139,0.753086,0.928218,0.599078,0.910499,0.722431,0.9716,1.000000,0.107661,1.0000,0.070003,0.692930,0.148177,8623.780000
4,411520005010005,0.534545,0.979430,0.689390,0.780864,0.920567,0.361367,0.966112,1.000000,0.9844,1.000000,0.086061,0.9961,0.201024,0.700398,0.161849,8165.580000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,412625605000066,0.352727,0.974684,0.641199,0.608025,0.853285,0.009217,0.331393,0.929482,0.7972,1.000000,0.004050,0.0338,1.000000,0.646378,0.462668,0.576274
479,412625605000067,0.367273,0.952532,0.595437,0.595679,0.872412,0.000000,0.319647,0.760690,0.7136,1.000000,0.000000,0.0657,0.990597,0.706497,0.691973,0.575469
480,412625605000068,0.360000,0.946730,0.529968,0.574074,0.640414,0.003840,0.385759,0.888972,0.6440,0.994001,0.001687,0.0314,0.980984,0.704630,0.477806,0.544284
481,412625605000069,0.320000,0.621308,0.437635,0.293210,0.333933,0.000000,0.442100,0.882221,0.6436,0.982603,0.000000,0.0144,1.000000,0.686706,0.472028,0.475316


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
2010_VD1,483.0,0.418543,0.104599,0.0,0.352727,0.403636,0.465455,1.0
2010_VD2,483.0,0.919942,0.125474,0.0,0.918513,0.959388,0.983914,1.0
2010_VD3,483.0,0.612828,0.098168,0.0,0.563647,0.608936,0.659355,1.0
2010_VD4,483.0,0.636569,0.131548,0.0,0.552469,0.638889,0.725309,1.0
2010_VE1,483.0,0.811266,0.157044,0.0,0.753375,0.863186,0.917079,1.0
2010_VE2,483.0,0.04501,0.107741,0.0,0.0,0.0,0.029186,1.0
2010_VE3,483.0,0.555011,0.18745,0.0,0.415021,0.522037,0.698337,1.0
2010_VED1,483.0,0.930897,0.117805,0.0,0.882971,1.0,1.0,1.0
2010_VED2,483.0,0.816855,0.152264,0.0,0.7348,0.8472,0.9328,1.0
2010_VH1,483.0,0.986195,0.07194,0.0,1.0,1.0,1.0,1.0


### Evaluate the user-specified range of cluster counts

In [6]:


# Columns kept from each evaluation result, in display order.
metric_columns = [
    'cluster',
    'num_clusters',
    'silhouette_score',
    'davies_bouldin_score',
    'dunn_score',
]

# One evaluation per candidate cluster count; MAX_CLUSTERS itself is included.
candidate_ks = range(MIN_CLUSTERS, MAX_CLUSTERS + 1)
pre_results = [evaluate_cluster(df_processed, n_groups) for n_groups in candidate_ks]

df_pre_results = pd.DataFrame(pre_results, columns=metric_columns)

# The figure is only built here (from metrics rounded to 3 decimals);
# it is displayed by the following cell.
comparison_graph = get_comparision_clusters_graph(
    df_pre_results.round(3),
    title="Cluster Evaluation Metrics Comparison",
)

df_pre_results

Unnamed: 0,cluster,num_clusters,silhouette_score,davies_bouldin_score,dunn_score
0,3 groups,3,0.299611,1.509108,0.228483
1,4 groups,4,0.264569,1.975772,0.211635
2,5 groups,5,0.233157,1.935129,0.195972
3,6 groups,6,0.213813,1.822688,0.17576
4,7 groups,7,0.189489,1.928467,0.146368


In [7]:
# Display the metrics comparison figure built in the previous cell by
# get_comparision_clusters_graph.
comparison_graph

### Applying the Smart K-Means

In [8]:



# Run the Smart K-Means search over the configured cluster-count bounds.
# `results` is indexed by position in the cells that follow; element 0,
# displayed here, is the summary table with one row per cluster count.
cluster_bounds = (QTD_MIN_CLUSTERS, QTD_MAX_CLUSTERS)
results = calculate_best_k_with_entropy(df_processed, *cluster_bounds)

results[0]

Unnamed: 0,cluster,qty_subindicators,silhouette_score,davies_bouldin_score,dunn_score,subindicators,details
0,3 groups,7,0.568408,0.988493,0.378603,"[2010_VH2, 2010_VD2, 2010_VD1, 2010_VE2, 2010_...","{'cluster': '3 groups', 'num_clusters': 3, 'la..."
1,5 groups,7,0.521769,0.917974,0.330873,"[2010_VH2, 2010_VD2, 2010_VD1, 2010_VE2, 2010_...","{'cluster': '5 groups', 'num_clusters': 5, 'la..."
2,4 groups,7,0.515995,0.970469,0.296098,"[2010_VH2, 2010_VD2, 2010_VD1, 2010_VE2, 2010_...","{'cluster': '4 groups', 'num_clusters': 4, 'la..."
3,6 groups,4,0.529098,0.979725,0.170257,"[2010_VE2, 2010_VH4, 2010_VED1, 2010_VH1]","{'cluster': '6 groups', 'num_clusters': 6, 'la..."
4,7 groups,4,0.524111,0.858092,0.199247,"[2010_VE2, 2010_VH4, 2010_VED1, 2010_VH1]","{'cluster': '7 groups', 'num_clusters': 7, 'la..."


In [9]:
# Per-group breakdown stored under 'details' for the first row of the summary
# table. NOTE(review): presumably the first row is the top-ranked
# configuration — confirm how calculate_best_k_with_entropy orders its rows.
first_candidate = results[0].iloc[0]
pd.DataFrame(first_candidate['details']['details'])

Unnamed: 0,group,count,score,centroids
0,C1,413,0.629718,"[[0.027892432821587293, 0.9185911974744843, 0...."
1,C2,38,0.225383,"[[0.027892432821587293, 0.9185911974744843, 0...."
2,C3,32,0.184476,"[[0.027892432821587293, 0.9185911974744843, 0...."


In [10]:
# Horizontal bar chart of the entropy recorded for each subindicator.
# `results[1]` is the table holding the "subindicator" and "entropy" columns;
# rows are sorted by entropy in descending order before plotting.
# `px` (plotly.express) is imported in the imports cell at the top of the
# notebook rather than here, so every dependency is declared in one place.
df_entropies = results[1]
px.bar(
  df_entropies.sort_values(by=["entropy"], ascending=False),
  x="entropy",
  y="subindicator",
  color="entropy",
  color_continuous_scale="sunset",
  text_auto=True,
  title="Subindicators Entropy"
)

In [11]:
# Third element of the Smart K-Means result: a table with one row per
# (cluster count, iteration), listing the excluded indicator, its entropy and
# the three quality scores (see the columns in the output below).
results[2]

Unnamed: 0,cluster,iteration,excluded_indicator,entropy,silhouette_score,davies_bouldin_score,dunn_score
0,3 groups,1,VA4_2010,8.915879,0.299611,1.509108,0.228483
1,3 groups,2,2010_VE3,8.820641,0.311467,1.464015,0.229442
2,3 groups,3,2010_VH5,8.7497,0.362337,1.592643,0.221977
3,3 groups,4,2010_VD3,8.694307,0.375873,1.41239,0.269605
4,3 groups,5,2010_VE1,8.571906,0.360978,1.58881,0.174847
5,3 groups,6,2010_VED2,8.274006,0.314741,1.304022,0.250992
6,3 groups,7,2010_VH3,8.160091,0.491032,1.114875,0.287805
7,3 groups,8,2010_VD4,7.035503,0.477056,1.434891,0.256312
8,3 groups,9,2010_VH2,6.798749,0.568408,0.988493,0.378603
9,4 groups,1,VA4_2010,8.915879,0.264569,1.975772,0.211635
