In [23]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

from sklearn.mixture import GaussianMixture

# Chargement data

In [24]:
data_2020 = pd.read_csv('data/2020.csv')
data_2021 = pd.read_csv('data/2021.csv')
data_2022 = pd.read_csv('data/2022.csv')

print(data_2020.columns)
print(data_2020.head())

Index(['Country name', 'Regional indicator', 'Ladder score',
       'Standard error of ladder score', 'upperwhisker', 'lowerwhisker',
       'Logged GDP per capita', 'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Ladder score in Dystopia',
       'Explained by: Log GDP per capita', 'Explained by: Social support',
       'Explained by: Healthy life expectancy',
       'Explained by: Freedom to make life choices',
       'Explained by: Generosity', 'Explained by: Perceptions of corruption',
       'Dystopia + residual'],
      dtype='object')
  Country name Regional indicator  Ladder score  \
0      Finland     Western Europe        7.8087   
1      Denmark     Western Europe        7.6456   
2  Switzerland     Western Europe        7.5599   
3      Iceland     Western Europe        7.5045   
4       Norway     Western Europe        7.4880   

   Standard error of ladder score  upperwhisker  lowerwhis

In [25]:
# Histogramme

data_2020 = data_2020.sort_values(by='Ladder score', ascending=False)
fig = px.bar(
    data_2020,
    x='Country name',
    y='Ladder score',
    title='Histogramme du score de bonheur par pays en 2020',
    color='Regional indicator'
)
fig.update_xaxes(categoryorder='total descending')
fig.show()

In [26]:
df_numeric = data_2020[['Ladder score', 'Logged GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices','Perceptions of corruption', 'Generosity']]



## Carte chloropèthe

In [27]:
# carte 
carte = px.choropleth(
    data_2020,
    locations='Country name',
    locationmode='country names',
    color='Ladder score',
    title='Carte du score de bonheur par pays en 2020',
    color_continuous_scale='thermal',
)
carte.update_geos(showframe=False, showcoastlines=False)
carte.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
carte.show()

In [28]:
# Calculate the correlation matrix

correlation_matrix = df_numeric.corr()

# Create the heatmap
fig = px.imshow(
    correlation_matrix,
    title="Correlation Heatmap",
    labels=dict(x="Columns", y="Columns", color="Correlation"),
    color_continuous_scale="RdBu",
    zmin=-1, zmax=1, text_auto=True
)
fig.update_layout(width=900, height=800)
fig.show()

In [29]:
# Standardisation du df

df_standardise = (df_numeric - df_numeric.mean()) / df_numeric.std()

print(df_standardise.head())

   Ladder score  Logged GDP per capita  Social support  \
0      2.099724               1.118155        1.198886   
1      1.953087               1.230285        1.212563   
2      1.876037               1.401668        1.104339   
3      1.826229               1.229085        1.366357   
4      1.811394               1.491442        1.183710   

   Healthy life expectancy  Freedom to make life choices  \
0                 1.056313                      1.407737   
1                 1.127394                      1.427026   
2                 1.368253                      1.171414   
3                 1.212051                      1.405357   
4                 1.240499                      1.463584   

   Perceptions of corruption  Generosity  
0                  -3.069408   -0.295857  
1                  -3.223286    0.532051  
2                  -2.451252    0.793626  
3                  -0.122225    1.722644  
4                  -2.682511    0.982163  


In [30]:
from sklearn.decomposition import PCA
# ACP sur donnees standardisees

pca = PCA(n_components=2)  # Choisissez le nombre de composantes souhaité
principal_components = pca.fit_transform(df_standardise)
principal_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
principal_df[['Country name','Regional indicator', 'Ladder score', 'Generosity', 'Social support', 'Logged GDP per capita', 'Healthy life expectancy', 'Freedom to make life choices', 'Perceptions of corruption']] = data_2020[['Country name','Regional indicator', 'Ladder score', 'Generosity', 'Social support', 'Logged GDP per capita', 'Healthy life expectancy', 'Freedom to make life choices', 'Perceptions of corruption']]


# Visualisation
fig = px.scatter(
    principal_df, 
    template='plotly_white',
    x='PC1', 
    y="PC2", 
    hover_name='Country name',  # Le nom principal affiché
    color='Regional indicator',  # Couleur basée sur cette colonne
    hover_data={ 
        'Country name': False,  # Masquer ces colonnes
        'Regional indicator': False,
        'Ladder score': True,
        'Generosity': True,
        'Social support': True,
        'Logged GDP per capita': True,
        'Healthy life expectancy': True,
        'Freedom to make life choices': True,
        'Perceptions of corruption': True
    },
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.update_traces(marker=dict(size=8))

fig.show()

# DBSCAN

# K-means

## Score silhouette

In [31]:
# Standardisation des données
scaler = StandardScaler()
df_standardise = scaler.fit_transform(df_numeric)

# Liste pour stocker les scores de silhouette
silhouette_scores = []

# Tester de 2 à 10 clusters
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(df_standardise)  # ou df_numeric
    score = silhouette_score(df_standardise, kmeans.labels_)
    silhouette_scores.append(score)

# Affichage des scores de silhouette
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=list(range(2, 11)),
    y=silhouette_scores,
    mode='lines+markers',
    name='Silhouette Score',
    line=dict(color='blue'),
    marker=dict(size=8)
))

fig.update_layout(
    title='Méthode de la silhouette pour KMeans',
    xaxis_title='Nombre de clusters',
    yaxis_title='Indice de silhouette moyen',
    template='plotly_white'
)

fig.show()

#choix du nombre de cluster 
max_silhouette_score = max(silhouette_scores)
n_clusters = silhouette_scores.index(max_silhouette_score) + 2

n_clusters 


3

## Clustering

In [32]:
# Clustering avec K-Means
kmeans = KMeans(n_clusters=n_clusters, random_state=42)  # 3 clusters par exemple
kmeans.fit(df_standardise)
labels = kmeans.labels_

# Ajout des labels au dataframe
principal_df['cluster'] = labels

# Convertir les clusters en type catégoriel pour légende continue
principal_df["cluster"] = principal_df["cluster"].astype(str)

# Visualisation
fig_kmeans = px.scatter(
    template='plotly_white',
    data_frame=principal_df, 
    x='PC1', 
    y='PC2', 
    color='cluster', 
    hover_name='Country name',
    color_discrete_sequence=px.colors.qualitative.G10,
)
# Cacher la légende
fig_kmeans.update_layout(showlegend=False)
fig_kmeans.show()


# Gaussian Mixture

### Silhouette Score

In [33]:
# Liste pour stocker les scores de silhouette
silhouette_scores_gmm = []

# Tester de 2 à 10 clusters
for n_clusters in range(2, 11):
    gmm = GaussianMixture(n_components=n_clusters, random_state=42)
    gmm.fit(df_standardise)  # ou df_numeric
    labels = gmm.predict(df_standardise)
    score = silhouette_score(df_standardise, labels)
    silhouette_scores_gmm.append(score)

# Création du graphique avec Plotly
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=list(range(2, 11)),
    y=silhouette_scores_gmm,
    mode='lines+markers',
    name='Silhouette Score',
    line=dict(color='red'),
    marker=dict(size=8)
))

fig.update_layout(
    title='Méthode de la silhouette pour GMM',
    xaxis_title='Nombre de clusters',
    yaxis_title='Indice de silhouette moyen',
    template='plotly_white'  # Utiliser le thème blanc
)

fig.show()

max_silhouette_score = max(silhouette_scores_gmm)
n_clusters = silhouette_scores_gmm.index(max_silhouette_score) + 2

n_clusters 


3

### Clustering

In [34]:

# Standardisation des données
scaler = StandardScaler()
df_standardise = scaler.fit_transform(df_numeric)

# Clustering avec Gaussian Mixture
gmm = GaussianMixture(n_components=n_clusters, random_state=42)  # 3 clusters par exemple
gmm.fit(df_standardise)
labels = gmm.predict(df_standardise)

# Ajout des labels au dataframe
principal_df['cluster'] = labels

# Convertir les clusters en type catégoriel pour légende continue
principal_df["cluster"] = principal_df["cluster"].astype(str)

# Visualisation
fig = px.scatter(
    template='plotly_white',
    data_frame=principal_df, 
    x='PC1', 
    y='PC2', 
    color='cluster', 
    hover_name='Country name',
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.update_layout(showlegend=False)
fig.show()

# Consensus clustering


### Avec les deux méthodes

In [35]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from sklearn.cluster import SpectralClustering
import plotly.express as px

# 1. Générer un jeu de données pour l'exemple (remplacer par vos propres données)
from sklearn.datasets import make_blobs

df_standardise = make_blobs(n_samples=300, centers=3, random_state=42)[0]

# Standardisation des données
scaler = StandardScaler()
df_standardise = scaler.fit_transform(df_standardise)

# 2. Appliquer K-Means
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans_labels = kmeans.fit_predict(df_standardise)

# 3. Appliquer Gaussian Mixture Models (GMM)
gmm = GaussianMixture(n_components=3, random_state=42)
gmm_labels = gmm.fit_predict(df_standardise)

# 4. Construction de la matrice de consensus
n_samples = df_standardise.shape[0]
n_iterations = 2  # Nombre de méthodes de clustering

C = np.zeros((n_samples, n_samples))  # Matrice de consensus initialisée à 0

# Comparaison des labels pour K-Means et GMM
for i in range(n_samples):
    for j in range(i + 1, n_samples):
        if kmeans_labels[i] == kmeans_labels[j]:
            C[i, j] += 1
            C[j, i] += 1
        if gmm_labels[i] == gmm_labels[j]:
            C[i, j] += 1
            C[j, i] += 1

# Normaliser la matrice de consensus
C = C / n_iterations

# 5. Créer la matrice de distance (en évitant les divisions par 0)
distance_matrix = 1 / (C + np.eye(n_samples))  # Ajout de l'identité pour éviter division par zéro
distance_matrix[np.isinf(distance_matrix)] = 0  # Remplacer les infinis par 0
distance_matrix[np.isnan(distance_matrix)] = 0  # Remplacer les NaN par 0

# 6. Appliquer Spectral Clustering sur la matrice de distance
spectral = SpectralClustering(n_clusters=3, affinity='precomputed', random_state=42)
final_labels = spectral.fit_predict(distance_matrix)

# 7. Visualisation avec Plotly
# Convertir les données en DataFrame pour une meilleure gestion avec Plotly
principal_df = pd.DataFrame(df_standardise, columns=["PC1", "PC2"])
principal_df['cluster'] = final_labels

# Visualisation avec Plotly
fig = px.scatter(
    principal_df,
    x='PC1', 
    y='PC2', 
    color='cluster',
    title="Clustering Consensuel avec K-Means et GMM",
    labels={'cluster': 'Cluster'},
    color_continuous_scale="Viridis"
)

fig.update_layout(
    template="plotly_white",  # Fond blanc pour un look propre
    showlegend=True
)

fig.show()



divide by zero encountered in divide


Graph is not fully connected, spectral embedding may not work as expected.



### DBSCAN

In [36]:
# Clustering DBSCAN

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Standardisation des données
scaler = StandardScaler()
df_standardise = scaler.fit_transform(df_numeric)

# Clustering
dbscan = DBSCAN(eps=1.5, min_samples=2)
dbscan.fit(df_standardise)
labels = dbscan.labels_

# Ajout des labels au dataframe
principal_df['cluster'] = labels

# Convertir les clusters en type catégoriel pour légende continue
principal_df["cluster"] = principal_df["cluster"].astype(str)

# Visualisation

fig = px.scatter(
    template='plotly_white',
    data_frame= principal_df, 
    x='PC1', 
    y='PC2', 
    color='cluster', 
    hover_name='Country name',
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.show()

ValueError: Length of values (153) does not match length of index (300)

Heatmap / Correlation
Clustering

ACP

Chloropleth