In [2]:
import pandas as pd
import plotly.express as px

In [9]:
# Load the data
file_path = '../../data/processed/clustered_data_all_feature.parquet'
data = pd.read_parquet(file_path)


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460509 entries, 0 to 460508
Data columns (total 18 columns):
 #   Column                              Non-Null Count   Dtype              
---  ------                              --------------   -----              
 0   id_prenotazione                     460509 non-null  object             
 1   id_paziente                         460509 non-null  object             
 2   data_nascita                        460509 non-null  datetime64[ns, UTC]
 3   sesso                               460509 non-null  object             
 4   regione_residenza                   460509 non-null  object             
 5   tipologia_servizio                  460509 non-null  object             
 6   descrizione_attivita                460509 non-null  object             
 7   data_contatto                       460509 non-null  object             
 8   tipologia_struttura_erogazione      460509 non-null  object             
 9   id_professionista_sanitari

In [11]:
data.head()

Unnamed: 0,id_prenotazione,id_paziente,data_nascita,sesso,regione_residenza,tipologia_servizio,descrizione_attivita,data_contatto,tipologia_struttura_erogazione,id_professionista_sanitario,tipologia_professionista_sanitario,data_erogazione,durata_erogazione_sec,fascia_eta,anno,quadrimestre,incremento_teleassistenze,cluster
0,JUOCPQCUG1H96QO,LCWFXF28X79P382V,1980-04-24 00:00:00+00:00,male,Piemonte,Teleassistenza,Insegnamento al paziente e/o alla sua famiglia...,2019-01-09T11:21:54+01:00,Azienda Ospedaliera,MZRAER00D13A287J,Infermiere,2019-02-04 10:00:00+00:00,2925,36-47,2019,1,high_increment,3
1,XDP99NXRHPOYFYU,LRFZOI94E58U012I,1972-01-30 00:00:00+00:00,female,Toscana,Teleassistenza,Prestazione infermieristica connessa alla nece...,2019-01-10T11:57:43+01:00,Azienda Ospedaliera Universitaria,EBCRPU12T24U316I,Infermiere,2019-02-01 10:00:00+00:00,1895,48-59,2019,1,high_increment,1
2,6VYI0AK57ECO5WU,JOBCGJ98A04B765H,1986-06-27 00:00:00+00:00,male,Emilia romagna,Teleassistenza,Prestazione dietistica,2019-01-11T07:04:17+01:00,Ospedale a gestione diretta,PMEJNL62M27F357I,Dietista,2019-02-03 06:00:00+00:00,1165,36-47,2019,1,high_increment,1
3,VLFS5J9E3Q74Q2W,TKROSE51Z77Q065E,1967-10-21 00:00:00+00:00,female,Campania,Teleassistenza,Psicoterapia individuale,2019-01-11T07:41:52+01:00,Azienda Ospedaliera,VNNRMQ15C69W200K,Psicologo,2019-02-01 06:00:00+00:00,1969,48-59,2019,1,high_increment,0
4,XD8SZX9JOVCMEXW,WRRVSK73Z45L640S,1963-08-24 00:00:00+00:00,male,Abruzzo,Teleassistenza,Prestazione dietistica,2019-01-11T15:33:29+01:00,Ospedale a gestione diretta,GIQCNR34T04P154U,Dietista,2019-02-05 14:00:00+00:00,3099,60-69,2019,1,high_increment,1


## Visualizing Feature Distributions Across Clusters

### Introduction
To gain a deeper understanding of how different features are distributed within the clusters identified by our clustering algorithm, we will create a series of visualizations. These visualizations will help us identify the unique characteristics of each cluster and validate our clustering results.

### Proposed Visualizations
We will leverage the power of Plotly to create interactive visualizations:

#### **Bar Charts**
* **Age distribution per cluster:** A bar chart will visualize the distribution of ages within each cluster.
* **Teleassistance increase per cluster:** Another bar chart will show how the increase in teleassistance varies across clusters.
* **Distribution of healthcare professionals per cluster:** A bar chart can be used to compare the distribution of different healthcare professionals types within each cluster.
* **Distribution of residence regions per cluster:** A bar chart will visualize the distribution of residence regions of patients across clusters.

#### **Scatter Map**
* **Geographic distribution (residence region) per cluster:** A scatter map will visualize the geographic distribution of patients within each cluster.

#### **Technologies Used**
* **Plotly:** The Plotly library was used to create these interactive graphs, offering a dynamic and engaging visualization.


### Analysis of age distribution by cluster
The following code was developed to analyze the distribution of age groups within the different clusters identified by a previous clustering algorithm. By calculating the percentage of each age group belonging to each cluster, you can identify the dominant cluster for each age group and get a clearer view of the relationship between age and cluster.

In [12]:
# Calculate the percentage of each age group belonging to each cluster
df_crosstab = pd.crosstab(data['fascia_eta'], data['cluster'], normalize='index') * 100

# Find the cluster with the highest percentage for each age group
df_max_cluster = df_crosstab.idxmax(axis=1)

# Extract the corresponding highest percentages for each age group
df_max_percentage = df_crosstab.max(axis=1)

print("Highest percentage for each age group:")
print(df_max_percentage)
# Display the percentage table
print(df_crosstab)

# Display the cluster with the highest percentage for each age group
print("Cluster with the highest percentage for each age group:")
print(df_max_cluster)

Highest percentage for each age group:
fascia_eta
0-11     75.587685
12-23    85.492546
24-35    38.669138
36-47    44.283325
48-59    46.305465
60-69    43.455497
70+      83.033496
dtype: float64
cluster             0          1          2          3
fascia_eta                                            
0-11        11.545887  12.791263   0.075165  75.587685
12-23       85.492546   7.910705   0.013484   6.583265
24-35       38.669138  16.296571  33.123262  11.911029
36-47       23.893628  19.682221  44.283325  12.140826
48-59       22.512844  46.305465  27.011410   4.170281
60-69       43.455497  12.653817  32.225631  11.665055
70+          7.563678   7.304825  83.033496   2.098002
Cluster with the highest percentage for each age group:
fascia_eta
0-11     3
12-23    0
24-35    0
36-47    2
48-59    1
60-69    0
70+      2
dtype: uint16


In [13]:
import nbformat
print(nbformat.__version__)


5.10.4


Visual representation of the relationship between age group and cluster, highlighting the dominant cluster for each group.

In [14]:
# Create a DataFrame for the bar chart
pie_data = pd.DataFrame({
    'age_group': df_max_cluster.index,
    'percentage': df_max_percentage,
    'cluster': df_max_cluster.values
})

# Define a color map for clusters
cluster_colors = {
    0: 'skyblue',
    1: 'lightgreen',
    2: 'lightcoral',
    3: 'gold',
    # Add more colors if there are more clusters
}

# Create a bar chart using Plotly
fig = px.bar(
    pie_data,
    x='age_group',
    y='percentage',
    color='cluster',
    color_discrete_map=cluster_colors,
    title='Distrbuzione delle fasce d\'età per cluster',
    labels={'age_group': 'fascia età', 'percentage': 'Percentuale massima di appartenenza al cluster (%)', 'cluster': 'Cluster'},
)

# Customize the chart
fig.update_layout(
    xaxis_title='Fascia d\'età',
    yaxis_title='Percentuale (%)',
    legend_title='Cluster',
    xaxis_tickangle=-45,
    width=900  # Increase the width of the chart
)

# Show the chart
fig.show()

### Analysis of the distribution of the increase in remote assistance by cluster
This code allows you to visualize how the variations in remote assistance are distributed in the different clusters, providing an overview of the relationship between the behavior of remote assistance and data segmentation.


In [15]:
# Calculate the frequency of each 'incremento_teleassistenze' category per cluster
cluster_counts = data.groupby(['cluster', 'incremento_teleassistenze']).size().reset_index(name='count')

# Create an interactive bar chart with Plotly
fig = px.bar(
    cluster_counts,
    x='cluster',
    y='count',
    color='incremento_teleassistenze',
    title='Distribuzione delle variazioni delle teleassistenze per cluster',
    labels={'cluster': 'Cluster', 'count': 'Numero di occorrenze', 'incremento_teleassistenze': 'Teleassistance Variation'},
    barmode='group',
    color_discrete_sequence=px.colors.qualitative.Pastel
)

# Customize the chart
fig.update_layout(
    xaxis_title='Cluster',
    yaxis_title='Numero di occorrenze',
    legend_title='Variazione Teleassistenza',
)

# Show the chart
fig.show()

### Analysis of the distribution of types of healthcare professionals by cluster
The following code calculates the frequency of each type of healthcare professional per cluster and visualizes the results in an interactive bar chart. This visualization provides insights into the distribution of different healthcare professional types within each cluster.



In [16]:
# Calculate the frequency of each type of healthcare professional per cluster
cluster_counts = data.groupby(['cluster', 'tipologia_professionista_sanitario']).size().reset_index(name='count')

# Create an interactive bar chart with Plotly
fig = px.bar(
    cluster_counts,
    x='cluster',
    y='count',
    color='tipologia_professionista_sanitario',
    title='Distribuzione dei professionisti sanitari per cluster',
    labels={'cluster': 'Cluster', 'count': 'Numero di professionisti', 'healthcare_professional_type': 'Tipo di Professionista'},
    barmode='group',
    color_discrete_sequence=px.colors.qualitative.Pastel
)

# Customize the chart
fig.update_layout(
    xaxis_title='Cluster',
    yaxis_title='Numero di professionisti',
    legend_title='Tipo di Professionista',
    legend=dict(
        x=1.05,  # Horizontal position of the legend
        y=1,     # Vertical position of the legend
        traceorder='normal'  # Order of items in the legend
    )
)

# Show the chart
fig.show()

### Analysis of the distribution of regions of residence by cluster
The following code calculates the frequency of each geographic area per cluster and visualizes the results in an interactive bar chart. This visualization provides insights into the distribution of residence regions within each cluster.

In [17]:
# Calculate the frequency of each geographic area per cluster
area_counts = data.groupby(['cluster', 'regione_residenza']).size().reset_index(name='count')

# Create an interactive bar chart with Plotly
fig = px.bar(
    area_counts,
    x='cluster',
    y='count',
    color='regione_residenza',
    barmode='group',
    title='Distribuzione delle regioni di residenza per cluster',
    labels={'cluster': 'Cluster', 'count': 'Numero di occorrenze', 'regione_residenza': 'Regione di Residenza'},
)

# Customize the chart
fig.update_layout(legend_title_text='Regione di Residenza')

# Show the chart
fig.show()

### Analysis of sex distribution by cluster
The code first calculates the conditional probabilities of belonging to a specific gender given a cluster. This is done using a cross-tabulation and normalizing by row. Subsequently, it determines the gender category with the highest probability for each cluster. The maximum probability for each cluster is also extracted for further analysis. This analysis can be useful in understanding potential gender-based biases or differences within the clustering results.

In [18]:
# Calculate the percentage of each gender within each cluster
sex_crosstab = pd.crosstab(data['sesso'], data['cluster'], normalize='index') * 100

# Identify the gender with the highest percentage for each cluster
max_sex_per_cluster = sex_crosstab.idxmax(axis=1)

# Extract the corresponding highest percentages for each cluster
max_percentage_per_cluster = sex_crosstab.max(axis=1)

print("Percentage of each gender within each cluster:")
print(sex_crosstab)

print("\nGender with the highest percentage for each cluster:")
print(max_sex_per_cluster)

print("\nHighest percentage for each cluster:")
print(max_percentage_per_cluster)

Percentage of each gender within each cluster:
cluster          0          1          2          3
sesso                                              
female   31.525788  18.062509  33.599526  16.812178
male     33.252852  17.716956  30.469369  18.560823

Gender with the highest percentage for each cluster:
sesso
female    2
male      0
dtype: uint16

Highest percentage for each cluster:
sesso
female    33.599526
male      33.252852
dtype: float64


This code analyzes the distribution of men and women within the different clusters and displays it in an interactive bar graph with Plotly.

In [19]:
# Melt the crosstab DataFrame for easier plotting
melted_gender_data = sex_crosstab.reset_index().melt(id_vars='sesso', var_name='cluster', value_name='percentage')

# Create a bar chart using Plotly
fig = px.bar(
    melted_gender_data,
    x='cluster',
    y='percentage',
    color='sesso',
    title='Distribuzione di uomini e donne per cluster',
    labels={'cluster': 'Cluster', 'percentage': 'Percentuale (%)', 'sesso': 'Sesso'},
    barmode='group',
    color_discrete_map={'female': '#FF69B4', 'male': '#1E90FF'}
    
)

# Customize the chart
fig.update_layout(
    xaxis_title='Cluster',
    yaxis_title='Percentuale',
    legend_title='Sesso',
    bargap=0.4
)

# Show the chart
fig.show()

### Analysis of the geographical distribution (region_residence) by cluster
This code integrates the geographic coordinates (latitude and longitude) of the regions into a DataFrame and analyzes the percentage distribution of clusters in each region. Calculate the percentage of membership of each cluster for each region and identify the most representative cluster, reporting the result with the corresponding percentage

In [20]:
# Add latitude and longitude for each region
region_coords = {
    'Abruzzo': (42.351221, 13.398438),
    'Basilicata': (40.639470, 15.805148),
    'Calabria': (38.905975, 16.594401),
    'Campania': (40.839565, 14.250849),
    'Emilia-Romagna': (44.494887, 11.342616),
    'Friuli Venezia Giulia': (45.649526, 13.776818),
    'Lazio': (41.892770, 12.482520),
    'Liguria': (44.411308, 8.932699),
    'Lombardia': (45.466797, 9.190498),
    'Marche': (43.616759, 13.518875),
    'Molise': (41.561918, 14.668747),
    'Piemonte': (45.070312, 7.686856),
    'Puglia': (41.125595, 16.866667),
    'Sardegna': (39.215311, 9.110616),
    'Sicilia': (37.600000, 14.015356),
    'Toscana': (43.769560, 11.255814),
    'Trentino-Alto Adige': (46.499334, 11.356624),
    'Umbria': (43.112203, 12.388784),
    'Valle d\'Aosta': (45.737502, 7.320149),
    'Veneto': (45.434904, 12.338452)
}

# Convert the dictionary to a DataFrame
coords_df = pd.DataFrame.from_dict(region_coords, orient='index', columns=['latitude', 'longitude']).reset_index()
coords_df.rename(columns={'index': 'regione_residenza'}, inplace=True)

# Merge the data with the coordinates
data = pd.merge(data, coords_df, on='regione_residenza')

# Calculate the percentage of each cluster for each region
region_cluster_crosstab = pd.crosstab(data['regione_residenza'], data['cluster'], normalize='index') * 100

# Identify the cluster with the highest percentage for each region
max_cluster_per_region = region_cluster_crosstab.idxmax(axis=1)

# Extract the corresponding highest percentages for each region
max_percentage_per_region = region_cluster_crosstab.max(axis=1)

# Print the cluster with the highest percentage for each region
for region, cluster in max_cluster_per_region.items():
    print(f"Regione: {region}, Cluster: {cluster}, Percentuale: {max_percentage_per_region[region]:.2f}%")

Regione: Abruzzo, Cluster: 2, Percentuale: 37.14%
Regione: Basilicata, Cluster: 0, Percentuale: 36.93%
Regione: Calabria, Cluster: 0, Percentuale: 36.27%
Regione: Campania, Cluster: 0, Percentuale: 69.16%
Regione: Lazio, Cluster: 2, Percentuale: 42.74%
Regione: Liguria, Cluster: 0, Percentuale: 43.43%
Regione: Lombardia, Cluster: 2, Percentuale: 41.73%
Regione: Marche, Cluster: 0, Percentuale: 35.38%
Regione: Molise, Cluster: 0, Percentuale: 38.73%
Regione: Piemonte, Cluster: 0, Percentuale: 41.38%
Regione: Puglia, Cluster: 2, Percentuale: 35.54%
Regione: Sardegna, Cluster: 0, Percentuale: 32.99%
Regione: Sicilia, Cluster: 2, Percentuale: 34.01%
Regione: Toscana, Cluster: 0, Percentuale: 39.91%
Regione: Umbria, Cluster: 0, Percentuale: 33.87%
Regione: Veneto, Cluster: 2, Percentuale: 33.53%


Graph that displays the map of Italy, and each point represents a region, the color of the point represents the cluster with the highest percentage of membership for that region.

In [21]:
# Create a DataFrame for the map
map_data = pd.DataFrame({
    'regione_residenza': max_cluster_per_region.index,
    'cluster': max_cluster_per_region.values,
    'percentage': max_percentage_per_region.values
})

# Merge with coordinates
map_data = pd.merge(map_data, coords_df, on='regione_residenza')

# Create a scatter map with Plotly
fig = px.scatter_mapbox(
    map_data,
    lat='latitude',
    lon='longitude',
    color='cluster',
    size='percentage',
    hover_name='regione_residenza',
    hover_data=['percentage'],
    title='Cluster con percentuale di appartenenza maggiore per regione in Italia',
    color_continuous_scale=px.colors.cyclical.IceFire,
    mapbox_style='carto-positron',
    zoom=5
)

# Customize the map
fig.update_layout(
    mapbox=dict(
        center=dict(lat=41.8719, lon=12.5674),  # Centered on Italy
        zoom=5
    ),
    margin={"r":0,"t":0,"l":0,"b":0},
    legend=dict(
        x=0.99,  # Positioned at the top right
        y=0.99,  # Positioned at the top right
        xanchor='right',
        yanchor='top',
        traceorder='normal',
        font=dict(size=12),
        bgcolor='rgba(0, 0, 0, 0.7)',  # Dark background color with transparency
        bordercolor='white',  # White border color
        borderwidth=1  # Border width
    )
)

# Show the map
fig.show()

Graph that displays the map of Italy, and each point represents a region, the color of the point represents the cluster with the highest percentage of membership for that region. The size of the points is proportional to the percentage of membership of the dominant cluster.

In [22]:
# Melt the crosstab DataFrame for easier plotting
melted_data = region_cluster_crosstab.reset_index().melt(id_vars='regione_residenza', var_name='cluster', value_name='percentage')

# Merge with coordinates
melted_data = pd.merge(melted_data, coords_df, on='regione_residenza')

# Create a scatter map with Plotly
fig = px.scatter_mapbox(
    melted_data,
    lat='latitude',
    lon='longitude',
    color='cluster',
    size='percentage',
    hover_name='regione_residenza',
    hover_data=['percentage'],
    title='Percentuale di appartenenza a ciascun cluster per regione in Italia',
    color_continuous_scale=px.colors.cyclical.IceFire,
    mapbox_style='carto-positron',
    zoom=5
)

# Customize the map
fig.update_layout(
    mapbox=dict(
        center=dict(lat=41.8719, lon=12.5674),  # Centered on Italy
        zoom=5
    ),
    margin={"r":0,"t":0,"l":0,"b":0},
    legend=dict(
        x=0.99,  # Positioned at the top right
        y=0.99,  # Positioned at the top right
        xanchor='right',
        yanchor='top',
        traceorder='normal',
        font=dict(size=12),
        bgcolor='rgba(0, 0, 0, 0.7)',  # Dark background color with transparency
        bordercolor='white',  # White border color
        borderwidth=1  # Border width
    )
)

# Adjust marker size to ensure visibility
fig.update_traces(marker=dict(sizemin=5))

# Show the map
fig.show()

In [23]:
# Analyzes the age group distribution (fascia_eta) by teleassistance variation and dominant cluster using a bar chart
'''
This function visualizes the dominant teleassistance increment category in each age group 
and includes the dominant cluster information for each age group.
'''

# Step 1: Create crosstab for teleassistance increment per age group
df_crosstab_increment = pd.crosstab(data['fascia_eta'], data['incremento_teleassistenze'], normalize='index') * 100

# Identify the increment category with the highest percentage per age group
df_max_increment = df_crosstab_increment.idxmax(axis=1)
df_max_percentage_increment = df_crosstab_increment.max(axis=1)

# Step 2: Create crosstab for clusters per age group
df_crosstab_cluster = pd.crosstab(data['fascia_eta'], data['cluster'], normalize='index') * 100

# Identify the cluster with the highest percentage per age group
df_max_cluster = df_crosstab_cluster.idxmax(axis=1)

# Step 3: Create a DataFrame for visualization
pie_data = pd.DataFrame({
    'age_group': df_max_increment.index,
    'incremento_teleassistenze': df_max_increment.values,
    'percentage_increment': df_max_percentage_increment,
    'dominant_cluster': df_max_cluster.values  # Add dominant cluster information
})

# Step 4: Create the bar chart
fig = px.bar(
    pie_data,
    x='age_group',
    y='percentage_increment',
    color='incremento_teleassistenze',
    text='dominant_cluster',  # Add dominant cluster as text inside bars
    title='Distribuzione delle fasce d\'età per variazione teleassistenza e cluster dominante',
    labels={'age_group': 'Fascia d\'età', 'percentage_increment': 'Percentuale (%)', 'incremento_teleassistenze': 'Incremento Teleassistenza'}
)

# Customize layout
fig.update_layout(
    xaxis_title='Fascia d\'età',
    yaxis_title='Percentuale (%)',
    xaxis_tickangle=-45,
    width=900,
    height=600
)

# Show text (cluster numbers) on top of the bars
fig.update_traces(textposition='outside')

# Show the bar chart
fig.show()

# Print results for debugging
print(df_max_increment, df_max_percentage_increment, df_max_cluster)



fascia_eta
0-11         high_increment
12-23        high_increment
24-35    constant_increment
36-47         low_increment
48-59         low_increment
60-69         low_increment
70+           low_increment
dtype: object fascia_eta
0-11     87.546922
12-23    58.058911
24-35    36.417972
36-47    55.809764
48-59    55.067804
60-69    43.402318
70+      61.739809
dtype: float64 fascia_eta
0-11     3
12-23    0
24-35    0
36-47    2
48-59    1
60-69    0
70+      2
dtype: uint16


In [24]:

# Calculate the frequency of each 'incremento_teleassistenze' category per cluster
cluster_counts = data.groupby(['cluster', 'incremento_teleassistenze']).size().reset_index(name='count')

# Identify the 'incremento_teleassistenze' category with the highest count for each cluster
dominant_increment_per_cluster = cluster_counts.loc[cluster_counts.groupby('cluster')['count'].idxmax()]

# Merge the frequency data with the dominant 'incremento_teleassistenze' category
cluster_counts_merged = cluster_counts.merge(dominant_increment_per_cluster, on='cluster', suffixes=('', '_dominant'))

# Custom function to format text annotations for bars
def format_bar_text(row):
    if row['incremento_teleassistenze'] == row['incremento_teleassistenze_dominant']:
        return f"{row['incremento_teleassistenze']} ({row['count']})"
    return ""

# Create the bar chart
fig = px.bar(
    cluster_counts_merged,
    x='cluster',
    y='count',
    color='incremento_teleassistenze',
    title='Distribuzione delle variazioni delle teleassistenze per cluster',
    labels={'cluster': 'Cluster', 'count': 'Numero di occorrenze', 'incremento_teleassistenze': 'Teleassistance Variation'},
    barmode='group',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    text=cluster_counts_merged.apply(format_bar_text, axis=1)  # Apply custom formatting function
)

# Customize the chart
fig.update_layout(
    xaxis_title='Cluster',
    yaxis_title='Numero di occorrenze',
    legend_title='Variazione Teleassistenza',
    showlegend=True  # Ensure legend is displayed
)

#Customize text annotations
fig.update_traces(
    textposition='outside', # Show text outside the bars
    textfont_size=100,  # Set text font size
    textfont_color='white'  # Ensure text color is readable
)
# Show the chart
fig.show()