In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

In [2]:
# Read the clustering output (feature columns plus the 'cluster' column) from disk
clustered_data_path = '../../data/processed/clustered_data.parquet'
df_labeled = pd.read_parquet(clustered_data_path)

In [3]:
# Split the loaded frame into a feature table and the cluster assignments.
# Every column except 'cluster' is kept as a feature.
feature_columns = df_labeled.drop(columns='cluster')
# Cluster assignment of each row, as a NumPy array
clusters = df_labeled['cluster'].to_numpy()

In [4]:
# Inspect column names, non-null counts and dtypes of the feature frame
feature_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460509 entries, 0 to 460508
Data columns (total 5 columns):
 #   Column                              Non-Null Count   Dtype 
---  ------                              --------------   ----- 
 0   regione_residenza                   460509 non-null  object
 1   tipologia_struttura_erogazione      460509 non-null  object
 2   tipologia_professionista_sanitario  460509 non-null  object
 3   fascia_eta                          460509 non-null  object
 4   incremento_teleassistenze           460509 non-null  object
dtypes: object(5)
memory usage: 17.6+ MB


In [5]:
# NOTE: this is not needed, because these columns are not used in the clustering

# Convert the datetime columns to Unix timestamp
# feature_columns['data_erogazione'] = feature_columns['data_erogazione'].apply(lambda x: x.timestamp() if pd.notnull(x) else np.nan)
# feature_columns['data_nascita'] = feature_columns['data_nascita'].apply(lambda x: x.timestamp() if pd.notnull(x) else np.nan)

In [6]:
# Replace every object-dtype column with integer codes so the silhouette
# computation receives numeric input. Each column gets its own LabelEncoder.
object_columns = [
    name for name in feature_columns.columns
    if feature_columns[name].dtype == 'object'
]
for column in object_columns:
    feature_columns[column] = LabelEncoder().fit_transform(feature_columns[column])

# NOTE: OneHotEncoder would be preferable, but the dataset is very large and one-hot encoding it would not fit in memory

In [7]:
# Preview the first rows of the feature frame to check the encoded values
feature_columns.head()

Unnamed: 0,regione_residenza,tipologia_struttura_erogazione,tipologia_professionista_sanitario,fascia_eta,incremento_teleassistenze
0,11,0,4,3,4
1,17,1,4,4,2
2,4,6,1,3,2
3,3,0,8,4,2
4,0,6,1,5,2


In [8]:
def calculate_silhouette_score(features, clusters, metric='euclidean'):
    """Compute raw and min-max normalized silhouette coefficients.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
        Numeric feature matrix handed to ``silhouette_samples``.
    clusters : array-like of shape (n_samples,)
        Cluster label of each sample.
    metric : str or callable, default='euclidean'
        Distance metric forwarded to ``silhouette_samples``. The default
        matches the previous hard-coded behavior. A categorical metric such
        as ``'hamming'`` may be more appropriate for label-encoded nominal
        features.

    Returns
    -------
    normalized_mean_silhouette : float
        Mean silhouette rescaled with the same min/max as the samples.
    normalized_silhouette_vals : ndarray
        Per-sample silhouette coefficients rescaled to [0, 1].
    mean_silhouette : float
        Mean of the per-sample silhouette coefficients.
    silhouette_vals : ndarray
        Per-sample silhouette coefficients as returned by scikit-learn.
    """
    # Per-sample coefficients; this is the expensive pairwise-distance step.
    silhouette_vals = silhouette_samples(features, clusters, metric=metric)

    # The overall silhouette score is defined as the mean of the per-sample
    # coefficients, so reuse them instead of calling silhouette_score, which
    # would repeat the whole pairwise-distance computation.
    mean_silhouette = float(silhouette_vals.mean())

    lowest = silhouette_vals.min()
    value_range = silhouette_vals.max() - lowest
    # When all coefficients are identical the range is zero; divide by 1 so
    # the normalized values become 0 instead of NaN.
    if value_range == 0:
        value_range = 1.0

    # Min-max normalization of the samples, and of the mean on the same scale
    normalized_silhouette_vals = (silhouette_vals - lowest) / value_range
    normalized_mean_silhouette = (mean_silhouette - lowest) / value_range

    return normalized_mean_silhouette, normalized_silhouette_vals, mean_silhouette, silhouette_vals

In [9]:
# Run the silhouette computation on the encoded features. The call returns, in order:
# normalized mean, normalized per-sample values, raw mean, raw per-sample values.
(
    normalized_mean_silhouette,
    normalized_silhouette_vals,
    mean_silhouette,
    silhouette_vals,
) = calculate_silhouette_score(feature_columns, clusters)

In [None]:
# Report the normalized mean score followed by the normalized per-sample values
for label, value in (
    ('Normalized Mean Silhouette Score:', normalized_mean_silhouette),
    ('Normalized Silhouette Values:', normalized_silhouette_vals),
):
    print(label, value)

In [None]:
# Silhouette plot: one horizontal band per cluster, samples sorted by coefficient.
# The figure is built, decorated and shown in a single cell: with the inline
# backend a figure is rendered and closed when its cell finishes, so a title,
# axis labels or mean line added from a later cell would end up on a new,
# empty figure instead of this one.
fig, ax = plt.subplots(figsize=(10, 7))
y_lower = 10  # vertical offset (in sample rows) of the first band
for i in np.unique(clusters):
    # Boolean-mask indexing yields a copy, so the in-place sort below does not
    # reorder silhouette_vals itself.
    ith_cluster_silhouette_vals = silhouette_vals[clusters == i]
    ith_cluster_silhouette_vals.sort()
    size_cluster_i = ith_cluster_silhouette_vals.shape[0]
    y_upper = y_lower + size_cluster_i

    # Filled band from 0 to each sample's coefficient, stacked along the y axis
    ax.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_vals, alpha=0.7)
    # Cluster label placed at the vertical middle of its band
    ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    # Leave a 10-row gap before the next cluster's band
    y_lower = y_upper + 10

ax.set_title('Silhouette plot for the various clusters')
ax.set_xlabel('Silhouette coefficient values')
ax.set_ylabel('Cluster label')
# Dashed red line marks the mean silhouette over all samples
ax.axvline(x=mean_silhouette, color="red", linestyle="--")
# y positions are just stacked sample rows, so tick values carry no meaning
ax.set_yticks([])
plt.show()