<b> Clustering Analysis

In [2]:
from clustering_functions import *
from sklearn.metrics import silhouette_score
from pre_processing_functions import preprocess
from pre_processing_functions import preprocess_semscalling
%load_ext autoreload
%autoreload 2

In [15]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
def clustering(path):
    df = preprocess(path)
    df['som_cluster'] = som_cluster(df,som_size=3, iterations=5000, sigma=1.0, learning_rate=0.5)['som_cluster']
    df['Kmeans_cluster'] = kmeans_clustering(df, n_clusters=9, random_state=42)['Kmeans_cluster']
    df['hierachical_cluster'] = hierarchical_clustering(df, n_clusters=9, linkage='ward')['hierarchical_cluster']

    return df

In [4]:
df= clustering('customer_info.csv')

 [ 5000 / 5000 ] 100% - 0:00:00 left 
 quantization error: 2.411554776690376


In [9]:
data = preprocess_semscalling('customer_info.csv')
data['kmeans_cluster'] = df['Kmeans_cluster']   
data['som_cluster'] = df['som_cluster']     
data['hierarchical_cluster'] = df['hierachical_cluster']    

In [28]:
def calcular_silhouettes(X, dicionario_labels):
    for nome, labels in dicionario_labels.items():
        if len(set(labels)) > 1:
            try:
                score = silhouette_score(X, labels)
                print(f"{nome}: Silhouette Score = {score:.3f}")
            except Exception as e:
                print(f"{nome}: Erro ao calcular o silhouette score → {e}")
        else:
            print(f"{nome}: apenas um cluster encontrado (não é possível calcular o silhouette score)")

In [25]:
label_dict = {
    "KMeans": df['Kmeans_cluster'].values,
    "SOM": df['som_cluster'].values,
    "Hierarchical": df['hierachical_cluster'].values
}

data = df[['lifetime_spend_groceries', 'lifetime_spend_electronics',
        'typical_hour', 'lifetime_spend_vegetables',
        'lifetime_spend_nonalcohol_drinks', 'lifetime_spend_alcohol_drinks',
        'lifetime_spend_meat', 'lifetime_spend_fish', 'lifetime_spend_hygiene',
        'lifetime_spend_videogames', 'lifetime_spend_petfood',
        'lifetime_total_distinct_products']]
scores = calcular_silhouettes(data, label_dict)


KMeans: Silhouette Score = 0.195
SOM: Erro ao calcular o silhouette score → unhashable type: 'numpy.ndarray'
Hierarchical: Silhouette Score = 0.163


In [26]:
import umap
import numpy as np

# Reduzir para 2 dimensões
umap_2d = umap.UMAP(n_components=2, random_state=42)
X_umap = umap_2d.fit_transform(data)

  warn(
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [27]:
scores_umap = calcular_silhouettes(X_umap, label_dict)

KMeans: Silhouette Score = 0.402
SOM: Erro ao calcular o silhouette score → unhashable type: 'numpy.ndarray'
Hierarchical: Silhouette Score = 0.437


In [6]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import silhouette_samples, silhouette_score

def plot_silhouette(df, feature_cols, cluster_col):
    X = df[feature_cols].values
    labels = df[cluster_col].values
    n_clusters = len(np.unique(labels))

    silhouette_vals = silhouette_samples(X, labels)
    silhouette_avg = silhouette_score(X, labels)

    fig, ax = plt.subplots(figsize=(8, 6))
    y_lower = 10

    for i in np.unique(labels):
        ith_cluster_vals = silhouette_vals[labels == i]
        ith_cluster_vals.sort()

        size_cluster_i = ith_cluster_vals.shape[0]
        y_upper = y_lower + size_cluster_i

        color = plt.cm.nipy_spectral(float(i) / n_clusters)
        ax.fill_betweenx(np.arange(y_lower, y_upper),
                         0, ith_cluster_vals,
                         facecolor=color, edgecolor=color, alpha=0.7)

        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10

    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax.set_xlabel("Coeficiente de Silhueta")
    ax.set_ylabel("Label do Cluster")
    ax.set_title(f"Silhouette Plot - {cluster_col}")
    plt.show()

In [10]:
data

Unnamed: 0,customer_id,customer_name,customer_gender,kids_home,teens_home,number_complaints,distinct_stores_visited,lifetime_spend_groceries,lifetime_spend_electronics,typical_hour,...,year_first_transaction,loyalty_card_number,latitude,longitude,customer_age,years_active,customer_educlevel,kmeans_cluster,som_cluster,hierarchical_cluster
0,29930.0,April Clark,1.0,2.0,2.0,1.0,4.0,7789.0,5601.0,13.0,...,2018.0,0.0,38.721807,-9.125534,53.0,7.0,,3,"(0, 0)",2
1,6813.0,Paul Ketchum,0.0,0.0,1.0,0.0,4.0,8653.0,35.0,14.0,...,2013.0,1.0,38.734668,-9.163533,81.0,12.0,Bsc.,6,"(2, 1)",6
2,39451.0,Mary Downing,1.0,2.0,3.0,0.0,7.0,15605.0,4275.0,14.0,...,2011.0,0.0,38.787126,-9.147077,36.0,14.0,,5,"(0, 0)",2
3,21557.0,Manuel Kueny,0.0,0.0,0.0,1.0,1.0,13440.0,16366.0,14.0,...,2009.0,0.0,38.741816,-9.159700,49.0,16.0,,0,"(0, 2)",5
4,16415.0,Curtis Tharp,0.0,1.0,1.0,1.0,5.0,49250.0,3197.0,14.0,...,2012.0,1.0,38.785921,-9.149221,59.0,13.0,Phd.,1,"(1, 0)",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34055,37071.0,Mary Diaz,1.0,1.0,1.0,0.0,2.0,62321.0,5250.0,16.0,...,2018.0,0.0,38.750813,-9.103808,26.0,7.0,,7,"(0, 1)",4
34056,27422.0,John Gross,0.0,1.0,1.0,0.0,3.0,16084.0,3796.0,13.0,...,2019.0,1.0,38.745020,-9.169168,41.0,6.0,,1,"(1, 0)",1
34057,39540.0,Kathleen Harper,1.0,1.0,0.0,1.0,2.0,14631.0,8784.0,17.0,...,2018.0,0.0,38.767834,-9.172368,25.0,7.0,,0,"(0, 2)",5
34058,18367.0,Jenny Landry,1.0,1.0,1.0,1.0,4.0,37420.0,4702.0,9.0,...,2015.0,1.0,38.732462,-9.156155,64.0,10.0,Msc.,1,"(1, 2)",1


In [16]:
data.groupby('kmeans_cluster').mean(numeric_only=True)

Unnamed: 0_level_0,customer_id,customer_gender,kids_home,teens_home,number_complaints,distinct_stores_visited,lifetime_spend_groceries,lifetime_spend_electronics,typical_hour,lifetime_spend_vegetables,lifetime_spend_nonalcohol_drinks,lifetime_spend_alcohol_drinks,lifetime_spend_meat,lifetime_spend_fish,lifetime_spend_hygiene,lifetime_spend_videogames,lifetime_spend_petfood,lifetime_total_distinct_products,percentage_of_products_bought_promotion,year_first_transaction,loyalty_card_number,latitude,longitude,customer_age,years_active,hierarchical_cluster
kmeans_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
0,19932.298036,0.505832,0.260712,0.291958,0.932842,1.381768,22249.964088,10942.453039,15.654942,214.625844,520.707182,1018.053898,894.942603,870.569982,284.906691,1126.809085,301.053407,115.484653,28.217978,2017.062308,0.468386,38.748017,-9.169661,56.292634,7.937692,5.088398
1,20083.97063,0.484745,0.959395,0.802908,0.957514,3.095637,28985.078985,3834.643855,13.745081,386.370174,587.155118,694.516111,987.229541,968.819618,656.37582,393.561449,328.693185,229.244083,39.241801,2013.572854,0.680354,38.747764,-9.15726,57.728087,11.427146,3.650128
2,20198.922628,0.494057,1.224804,0.834582,1.30222,2.625163,10341.357479,898.489123,11.763624,268.441848,192.847275,567.794393,569.703072,404.575241,383.193541,161.90536,250.696793,55.344472,56.93291,2017.969276,0.524333,38.745519,-9.156571,31.997668,7.030724,0.895492
3,19838.25646,0.498385,2.516021,1.759044,0.695478,2.930103,20180.420866,5002.034884,10.39761,853.628165,722.351744,1499.979845,1494.519057,1382.187209,861.543605,444.530685,527.255814,265.054264,18.832403,2013.078488,0.721576,38.748549,-9.154966,57.178359,11.921512,2.661176
4,20300.247803,0.489675,0.9971,0.305053,0.719903,3.161819,10559.988137,1983.849297,13.88739,1995.597232,396.022408,396.504086,54.268893,77.810281,704.785808,199.460457,246.162346,80.447056,10.334141,2014.63181,0.600835,38.747606,-9.156566,56.177065,10.36819,3.018234
5,19755.079508,0.49865,2.869427,1.894149,0.794419,3.009301,20065.552955,5003.437144,10.211881,845.164116,794.182718,1552.350435,1525.735374,1338.188599,891.159616,410.122112,217.985899,301.967597,19.780071,2012.892589,0.69757,38.748015,-9.156383,56.573837,12.107411,2.183318
6,19965.336767,0.501577,0.551104,0.971705,0.858146,3.833593,10704.255834,547.137061,14.727643,470.193399,167.345386,178.273155,820.667227,351.075047,428.849275,189.89342,316.450284,52.993483,66.660528,2014.638007,0.865882,38.74757,-9.157761,63.592096,10.361993,5.867353
7,19992.90657,0.505726,0.532369,0.978059,0.093791,2.095359,61713.006631,5109.671489,14.56311,1111.449247,490.044304,1481.882942,1379.315853,1717.614165,851.451477,595.778481,389.699819,146.107896,12.12982,2013.766727,0.785413,38.751634,-9.146159,56.377517,11.233273,6.301085
8,19993.096497,0.498156,1.099631,1.012477,0.977935,3.837984,30966.52858,3217.098341,13.182176,336.759988,705.463737,714.003503,1004.374616,955.437246,1602.735403,313.058697,350.49201,251.830363,40.436282,2012.272588,0.826675,38.747196,-9.156324,55.34161,12.727412,1.336202


In [13]:
data.groupby('hierarchical_cluster').mean(numeric_only=True)

Unnamed: 0_level_0,customer_id,customer_gender,kids_home,teens_home,number_complaints,distinct_stores_visited,lifetime_spend_groceries,lifetime_spend_electronics,typical_hour,lifetime_spend_vegetables,...,lifetime_spend_petfood,lifetime_total_distinct_products,percentage_of_products_bought_promotion,year_first_transaction,loyalty_card_number,latitude,longitude,customer_age,years_active,kmeans_cluster
hierarchical_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,20105.090517,0.49214,1.310091,0.841734,1.370994,2.495081,10118.279412,881.824544,11.475963,242.201927,...,238.773327,54.286004,55.94758,2017.910751,0.494929,38.74516,-9.156531,32.350761,7.089249,2.163793
1,19912.046714,0.492417,1.110698,1.014034,0.976421,3.778038,30693.500101,3186.683923,12.535288,334.766714,...,350.701112,250.838625,40.034652,2012.393327,0.815369,38.747284,-9.156491,55.372781,12.606673,5.392922
2,19783.435503,0.496342,2.994032,1.946477,0.84097,3.097767,20055.011937,4970.890836,10.141972,815.876781,...,346.350019,311.609164,20.407369,2012.798229,0.696188,38.747585,-9.157129,57.154293,12.201771,4.154987
3,20314.573481,0.489282,1.000398,0.301569,0.719558,3.164066,10553.444199,1985.454144,13.896796,1996.765923,...,245.846851,80.492155,10.07356,2014.625414,0.6,38.7476,-9.156564,56.169326,10.374586,4.0
4,19985.62925,0.496161,0.954223,1.130018,0.23042,2.277514,33079.009506,5133.99159,13.776746,1072.913492,...,414.578428,156.783912,13.429004,2013.908958,0.766362,38.75092,-9.148227,53.377916,11.091042,5.038391
5,19950.049345,0.504417,0.264149,0.29473,0.931648,1.382638,22331.508681,10646.502589,15.588242,210.343893,...,294.844959,119.143162,28.33984,2017.080719,0.467256,38.748038,-9.169523,56.935425,7.919281,0.130673
6,20042.717419,0.503133,0.52122,0.985255,0.716207,4.00472,10123.329574,420.355054,14.707435,478.184252,...,321.559106,51.143066,69.734551,2015.030702,0.891813,38.747532,-9.156713,59.854261,9.969298,5.513576
7,20262.933272,0.50139,0.82493,0.58823,1.249954,2.441427,26982.771548,4395.144115,15.505653,439.440315,...,316.673309,178.540778,40.540275,2014.677016,0.549583,38.74842,-9.160082,59.290361,10.322984,2.550973
8,19975.972987,0.508665,0.532008,0.991131,0.092966,2.086646,71374.930683,4991.282875,13.677064,1054.404179,...,386.118756,147.421509,11.351863,2013.399592,0.794088,38.751799,-9.146116,60.963405,11.600408,6.824669


In [14]:
data.groupby('som_cluster').mean(numeric_only=True)

Unnamed: 0_level_0,customer_id,customer_gender,kids_home,teens_home,number_complaints,distinct_stores_visited,lifetime_spend_groceries,lifetime_spend_electronics,typical_hour,lifetime_spend_vegetables,...,lifetime_total_distinct_products,percentage_of_products_bought_promotion,year_first_transaction,loyalty_card_number,latitude,longitude,customer_age,years_active,kmeans_cluster,hierarchical_cluster
som_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(0, 0)",19854.611996,0.480176,2.797357,1.864859,0.779058,3.001627,22298.76042,4974.291427,10.423246,960.779939,...,310.318536,19.191009,2013.316503,0.699085,38.747791,-9.156,52.240935,11.683497,4.710268,2.312098
"(0, 1)",19833.368772,0.513142,0.527972,0.978972,0.098313,2.09541,69003.898784,5101.787368,14.261671,1110.277756,...,146.11887,12.109307,2013.625736,0.794821,38.751493,-9.146201,58.087485,11.374264,6.945861,6.747352
"(0, 2)",19946.104564,0.503177,0.277354,0.301271,0.890468,1.408781,23186.613518,10842.231947,15.839977,247.268746,...,118.691219,28.33139,2017.017331,0.47227,38.748127,-9.168469,55.240497,7.982669,0.253033,5.117562
"(1, 0)",20075.407562,0.492461,1.153421,1.046764,0.975922,3.799304,31137.780329,3201.509627,13.964045,347.38302,...,252.075852,39.723576,2012.286244,0.819068,38.747432,-9.156278,54.823289,12.713756,6.126885,1.825099
"(1, 1)",19901.120401,0.496199,1.995379,1.509638,0.587717,2.785041,23846.477349,4866.797507,11.273943,836.383825,...,231.760413,19.295982,2013.469444,0.726969,38.749251,-9.153636,55.032593,11.530556,3.535117,3.134995
"(1, 2)",19972.281357,0.502742,1.570459,1.160452,0.889925,2.93708,22756.161412,4420.515079,11.194928,482.829746,...,247.391021,32.08981,2013.010624,0.668609,38.747783,-9.156861,65.657025,11.989376,2.763879,2.994859
"(2, 0)",20295.644948,0.489544,0.998415,0.307638,0.720757,3.162492,10576.788906,1989.468413,13.842395,1995.761699,...,80.679947,10.397933,2014.638125,0.601365,38.747617,-9.156541,56.078142,10.361875,3.99912,3.020471
"(2, 1)",20004.429243,0.501382,0.588723,0.929961,0.920011,3.678386,11895.842178,722.577114,15.75152,477.244002,...,60.199005,63.335075,2014.754008,0.832504,38.74752,-9.158436,63.831951,10.245992,5.459646,5.927861
"(2, 2)",20095.352113,0.499406,1.050263,0.870626,1.168335,2.948345,10521.573222,804.982861,11.981707,312.764704,...,54.69048,59.953353,2017.066011,0.613949,38.746172,-9.156596,39.262481,7.933989,3.033939,2.030714


In [13]:
df_semscalling = preprocess_semscalling('customer_info.csv')

In [14]:
df_semscalling

Unnamed: 0,customer_id,customer_name,customer_gender,kids_home,teens_home,number_complaints,distinct_stores_visited,lifetime_spend_groceries,lifetime_spend_electronics,typical_hour,lifetime_spend_vegetables,lifetime_spend_nonalcohol_drinks,lifetime_spend_alcohol_drinks,lifetime_spend_meat,lifetime_spend_fish,lifetime_spend_hygiene,lifetime_spend_videogames,lifetime_spend_petfood,lifetime_total_distinct_products,percentage_of_products_bought_promotion,year_first_transaction,loyalty_card_number,latitude,longitude,customer_age,years_active,customer_educlevel
0,29930.0,April Clark,1.0,2.0,2.0,1.0,4.0,7789.0,5601.0,13.0,726.0,962.0,1213.0,1598.0,1894.0,457.0,412.0,428.0,386.0,15.874075,2018.0,0.0,38.721807,-9.125534,53.0,7.0,
1,6813.0,Paul Ketchum,0.0,0.0,1.0,0.0,4.0,8653.0,35.0,14.0,792.0,102.0,104.0,741.0,346.0,394.0,75.0,226.0,73.0,122.789042,2013.0,1.0,38.734668,-9.163533,81.0,12.0,Bsc.
2,39451.0,Mary Downing,1.0,2.0,3.0,0.0,7.0,15605.0,4275.0,14.0,1585.0,980.0,1872.0,1323.0,1971.0,920.0,335.0,192.0,319.0,10.159789,2011.0,0.0,38.787126,-9.147077,36.0,14.0,
3,21557.0,Manuel Kueny,0.0,0.0,0.0,1.0,1.0,13440.0,16366.0,14.0,28.0,269.0,1855.0,939.0,785.0,139.0,679.0,270.0,221.0,25.994254,2009.0,0.0,38.741816,-9.159700,49.0,16.0,
4,16415.0,Curtis Tharp,0.0,1.0,1.0,1.0,5.0,49250.0,3197.0,14.0,258.0,726.0,547.0,983.0,1492.0,1046.0,112.0,144.0,244.0,31.782174,2012.0,1.0,38.785921,-9.149221,59.0,13.0,Phd.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34055,37071.0,Mary Diaz,1.0,1.0,1.0,0.0,2.0,62321.0,5250.0,16.0,813.0,528.0,1524.0,687.0,1687.0,1048.0,379.0,258.0,163.0,16.233149,2018.0,0.0,38.750813,-9.103808,26.0,7.0,
34056,27422.0,John Gross,0.0,1.0,1.0,0.0,3.0,16084.0,3796.0,13.0,502.0,748.0,694.0,1200.0,568.0,949.0,263.0,319.0,379.0,61.417938,2019.0,1.0,38.745020,-9.169168,41.0,6.0,
34057,39540.0,Kathleen Harper,1.0,1.0,0.0,1.0,2.0,14631.0,8784.0,17.0,402.0,262.0,484.0,509.0,1472.0,368.0,804.0,110.0,228.0,53.561736,2018.0,0.0,38.767834,-9.172368,25.0,7.0,
34058,18367.0,Jenny Landry,1.0,1.0,1.0,1.0,4.0,37420.0,4702.0,9.0,195.0,485.0,518.0,1231.0,904.0,1088.0,401.0,376.0,151.0,43.501363,2015.0,1.0,38.732462,-9.156155,64.0,10.0,Msc.


In [10]:
# Suponha que as colunas de features sejam todas menos as de cluster
feature_cols = [col for col in df.columns if col not in ['som_cluster', 'hierarchical_cluster', 'kmeans_cluster', 'customer_name', 'customer_educlevel']]

# Plot para cada método
plot_silhouette(df, feature_cols, 'som_cluster')
plot_silhouette(df, feature_cols, 'hierarchical_cluster')
plot_silhouette(df, feature_cols, 'kmeans_cluster')

TypeError: unhashable type: 'numpy.ndarray'