In [28]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

In [29]:
# Load the clustered data
df_labeled = pd.read_parquet('../../data/processed/clustered_data.parquet')

In [30]:
# Extract clusters and true labels
clusters = df_labeled['cluster'].to_numpy()
# Remove the cluster column to get the feature data
feature_columns = df_labeled.drop(columns=['cluster'])

In [31]:
feature_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460509 entries, 0 to 460508
Data columns (total 17 columns):
 #   Column                              Non-Null Count   Dtype              
---  ------                              --------------   -----              
 0   id_prenotazione                     460509 non-null  object             
 1   id_paziente                         460509 non-null  object             
 2   data_nascita                        460509 non-null  datetime64[ns, UTC]
 3   sesso                               460509 non-null  object             
 4   regione_residenza                   460509 non-null  object             
 5   tipologia_servizio                  460509 non-null  object             
 6   descrizione_attivita                460509 non-null  object             
 7   data_contatto                       460509 non-null  object             
 8   tipologia_struttura_erogazione      460509 non-null  object             
 9   id_professionista_sanitari

In [32]:
# NOTE this is not needed, cause these columns are note used in the clustering

# Convert the datetime columns to Unix timestamp
feature_columns['data_erogazione'] = feature_columns['data_erogazione'].apply(lambda x: x.timestamp() if pd.notnull(x) else np.nan)
feature_columns['data_nascita'] = feature_columns['data_nascita'].apply(lambda x: x.timestamp() if pd.notnull(x) else np.nan)

In [33]:
# Convert non-numeric columns to numeric using LabelEncoder
for column in feature_columns.columns:
    if feature_columns[column].dtype == 'object':
        le = LabelEncoder()
        feature_columns[column] = le.fit_transform(feature_columns[column])
        
# NOTE Sarebbe meglio usare OneHotEncoder ma avendo un dataset molto grande non posso permettermi di fare one hot encoding in quanto la memoria non basta

In [34]:
# supponendo che X sia il tuo dataframe o array di feature
scaler = StandardScaler()
X_standardized = scaler.fit_transform(feature_columns)

In [38]:
X_standardized_df = pd.DataFrame(X_standardized)
print(X_standardized_df.head())

         0         1         2         3         4    5         6         7   \
0  0.171870 -0.246634 -0.010057  1.088312  0.250476  0.0 -0.308295 -1.732474   
1  1.478943 -0.173594 -0.340067 -0.918854  1.284751  0.0  0.515240 -1.732413   
2 -1.073324 -0.454155  0.237424  1.088312 -0.956180  0.0  0.412298 -1.732376   
3  1.308585  0.854205 -0.511492 -0.918854 -1.128559  0.0  1.184363 -1.732368   
4  1.477928  1.289516 -0.678198  1.088312 -1.645697  0.0  0.412298 -1.732285   

         8         9         10        11        12        13        14  \
0 -1.828384 -0.006025 -0.086304 -1.983333  0.624216 -0.037041 -1.578466   
1 -1.416034 -1.203662 -0.086304 -1.991300 -0.286234  0.450240 -1.578466   
2  0.645713  0.330431 -1.538514 -1.986431 -0.931505 -0.037041 -1.578466   
3 -1.828384  1.136801  1.849977 -1.991742 -0.220823  0.450240 -1.578466   
4  0.645713 -0.895148 -1.538514 -1.980234  0.778020  0.937521 -1.578466   

         15        16  
0 -1.409541 -1.088845  
1 -1.409541 -1.08884

In [None]:
# Function to calculate and normalize Silhouette Score
def calculate_silhouette_score(features , clusters):
    # Calculate the silhouette scores for each sample
    silhouette_vals = silhouette_samples(features, clusters)

    # Calculate the mean silhouette score
    mean_silhouette = silhouette_score(features, clusters)

    # Normalize the silhouette scores to a range between 0 and 1
    normalized_silhouette_vals = (silhouette_vals - silhouette_vals.min()) / (silhouette_vals.max() - silhouette_vals.min())
    normalized_mean_silhouette = (mean_silhouette - silhouette_vals.min()) / (silhouette_vals.max() - silhouette_vals.min())

    return normalized_mean_silhouette, normalized_silhouette_vals, mean_silhouette, silhouette_vals

In [None]:
# Calculate the normalized silhouette values
normalized_mean_silhouette, normalized_silhouette_vals, mean_silhouette, silhouette_vals = calculate_silhouette_score(X_standardized_df, clusters)  

KeyboardInterrupt: 

In [None]:
# Print the results
print('Normalized Mean Silhouette Score:', normalized_mean_silhouette)
print('Normalized Silhouette Values:', normalized_silhouette_vals)

In [None]:
# Plot the silhouette values for each sample
plt.figure(figsize=(10, 7))
y_lower = 10
for i in np.unique(clusters):
    ith_cluster_silhouette_vals = silhouette_vals[clusters == i]
    ith_cluster_silhouette_vals.sort()
    size_cluster_i = ith_cluster_silhouette_vals.shape[0]
    y_upper = y_lower + size_cluster_i

    plt.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_vals, alpha=0.7)
    plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    y_lower = y_upper + 10

In [None]:
plt.title('Silhouette plot for the various clusters')
plt.xlabel('Silhouette coefficient values')
plt.ylabel('Cluster label')
plt.axvline(x=mean_silhouette, color="red", linestyle="--")
plt.yticks([])
plt.show()