In [None]:
import pandas as pd
import plotly.express as px
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import tools as t

# Chargement des données

In [2]:
# Load the yearly datasets; paths are relative to the notebook's working directory.
data_2020, data_2021 = (
    pd.read_csv(csv_path)
    for csv_path in ('data/2020.csv', 'data/2021.csv')
)
# The 2022 load is currently disabled:
# data_2022 = pd.read_csv('data/2022.csv')

In [3]:
# Bar chart of the 2020 happiness score, one bar per country, coloured by region.
# NOTE: data_2020 is rebound to its sorted version here, so every later cell
# sees the rows in descending 'Ladder score' order (original row labels kept).
data_2020 = data_2020.sort_values('Ladder score', ascending=False)

fig = px.bar(
    data_2020,
    y='Ladder score',
    x='Country name',
    color='Regional indicator',
    title='Histogramme du score de bonheur par pays en 2020',
).update_xaxes(categoryorder='total descending')  # order countries by bar height
fig.show()

In [4]:
# Numeric indicators kept for the correlation, PCA and clustering steps.
numeric_columns = [
    'Ladder score',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Perceptions of corruption',
    'Generosity',
]
df_numeric = data_2020[numeric_columns]



## Carte choroplèthe

In [5]:
# Choropleth world map coloured by the 2020 happiness score.
carte = px.choropleth(
    data_2020,
    color='Ladder score',
    locations='Country name',
    locationmode='country names',  # rows are matched to map regions by country name
    color_continuous_scale='thermal',
    title='Carte du score de bonheur par pays en 2020',
)
# Hide the frame and coastlines, and drop the margins except for the title band.
(
    carte
    .update_geos(showcoastlines=False, showframe=False)
    .update_layout(margin=dict(r=0, t=40, l=0, b=0))
    .show()
)

In [6]:
# Pairwise correlations between the numeric indicators.
correlation_matrix = df_numeric.corr()

# Annotated heatmap on a fixed [-1, 1] colour range so the diverging
# palette is centred on zero correlation.
fig = px.imshow(
    correlation_matrix,
    zmin=-1,
    zmax=1,
    text_auto=True,
    color_continuous_scale="RdBu",
    labels={"x": "Columns", "y": "Columns", "color": "Correlation"},
    title="Correlation Heatmap",
).update_layout(width=900, height=800)
fig.show()

In [7]:
# Z-score standardisation: centre every column on its mean, then scale it by
# its standard deviation (as computed by DataFrame.std()).
df_standardise = df_numeric.sub(df_numeric.mean()).div(df_numeric.std())
print(df_standardise.head())

   Ladder score  Logged GDP per capita  Social support  \
0      2.099724               1.118155        1.198886   
1      1.953087               1.230285        1.212563   
2      1.876037               1.401668        1.104339   
3      1.826229               1.229085        1.366357   
4      1.811394               1.491442        1.183710   

   Healthy life expectancy  Freedom to make life choices  \
0                 1.056313                      1.407737   
1                 1.127394                      1.427026   
2                 1.368253                      1.171414   
3                 1.212051                      1.405357   
4                 1.240499                      1.463584   

   Perceptions of corruption  Generosity  
0                  -3.069408   -0.295857  
1                  -3.223286    0.532051  
2                  -2.451252    0.793626  
3                  -0.122225    1.722644  
4                  -2.682511    0.982163  


In [8]:
from sklearn.decomposition import PCA

# PCA on the standardised data, keeping the first two principal components.
pca = PCA(n_components=2)  # number of components to keep
principal_components = pca.fit_transform(df_standardise)
principal_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

# Columns copied from data_2020 next to the components: identifiers used for
# the hover title / colour, and the raw indicators shown in the hover box.
identity_columns = ['Country name', 'Regional indicator']
indicator_columns = [
    'Ladder score',
    'Generosity',
    'Social support',
    'Logged GDP per capita',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Perceptions of corruption',
]
metadata_columns = identity_columns + indicator_columns

# principal_df is built from a plain array, so it carries a fresh 0..n-1 index,
# whereas data_2020 keeps its original row labels after being sorted. Assigning
# the frame directly aligns on those labels and would attach the wrong country
# to a point whenever the file is not already sorted. Resetting the index makes
# the assignment positional, matching the row order the PCA was fitted on.
principal_df[metadata_columns] = data_2020[metadata_columns].reset_index(drop=True)

# Scatter plot of the two components, coloured by region.
fig = px.scatter(
    principal_df,
    template='plotly_white',
    x='PC1',
    y="PC2",
    hover_name='Country name',   # main label shown on hover
    color='Regional indicator',  # colour driven by this column
    hover_data={
        # Hide the identifiers (already shown as hover title / legend) ...
        **{column: False for column in identity_columns},
        # ... and show every raw indicator.
        **{column: True for column in indicator_columns},
    },
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.update_traces(marker=dict(size=8))

fig.show()

# DBSCAN

# K-means

## Score silhouette

Le score de silhouette permet de choisir le nombre de clusters à utiliser pour le k-means.

In [None]:
# Standardise the data with the project helper.
df_standardise = t.standardise_data(df_numeric)

# Silhouette score for every candidate number of clusters (2 to 10 inclusive).
# A dedicated loop variable is used so that n_clusters is only ever bound to
# the selected value, never to a half-finished loop state.
silhouette_scores = {}
for candidate_k in range(2, 11):
    # Only the labels are needed here; the fitted model and third value are discarded.
    _, candidate_labels, _ = t.kmeans_clustering(df_standardise, candidate_k)
    silhouette_scores[candidate_k] = silhouette_score(df_standardise, candidate_labels)

# Show the scores through the project helper.
t.display_silhouette_scores(silhouette_scores, 'KMeans')

# Keep the candidate with the highest score. max() returns the first maximal
# key in insertion order, i.e. the smallest k on ties.
n_clusters = max(silhouette_scores, key=silhouette_scores.get)

NameError: name 't' is not defined

## Clustering

In [None]:
# K-Means clustering on the standardised data with the current n_clusters.
# NOTE(review): t.kmeans_clustering is a project helper; its result is unpacked
# here as (model, labels, centroids) — confirm that contract in tools.
kmeans, labels, centroids = t.kmeans_clustering(df_standardise, n_clusters)

In [None]:
# Store each row's cluster label on the PCA frame as text, so the column holds
# category names rather than numbers.
# NOTE(review): presumably this gives a discrete legend in the plot — confirm
# against t.display_clustering.
principal_df['cluster'] = pd.Series(labels, index=principal_df.index).astype(str)

# Draw the clustered points with the project helper and keep the figure
# for later cells.
fig = t.display_clustering(principal_df)


## Ajouter les centroïdes


In [None]:
# Label the centroid coordinates with the feature names so the frame passed
# to the PCA has the same columns as df_numeric.
centroids_df = pd.DataFrame(centroids, columns=df_numeric.columns)

# Project the centroids into the PC1/PC2 plane with the already-fitted PCA.
centroids_pca = pca.transform(centroids_df)

# One row per centroid, tagged with its cluster number (0 .. n_clusters - 1).
centroids_pca_df = pd.DataFrame(centroids_pca, columns=['PC1', 'PC2']).assign(
    cluster=range(n_clusters)
)

t.add_centroids(fig, centroids_pca_df)


# Gaussian Mixture

### Silhouette Score

In [13]:
# Silhouette score for every candidate number of mixture components (2 to 10 inclusive).
# A dedicated loop variable is used so that n_clusters is only ever bound to
# the selected value, never to a half-finished loop state.
silhouette_scores_gmm = {}
for candidate_k in range(2, 11):
    # Only the labels are needed here; the fitted model and third value are discarded.
    _, candidate_labels, _ = t.gausian_mixture_clustering(df_standardise, candidate_k)
    silhouette_scores_gmm[candidate_k] = silhouette_score(df_standardise, candidate_labels)

# Show the scores through the project helper.
t.display_silhouette_scores(silhouette_scores_gmm, 'Gausian Mixture')

# Keep the candidate with the highest score. max() returns the first maximal
# key in insertion order, i.e. the smallest k on ties.
n_clusters = max(silhouette_scores_gmm, key=silhouette_scores_gmm.get)


### Clustering

In [14]:
# Gaussian-mixture clustering with the selected number of components.
# The fitted model is bound to `gmm` rather than `kmeans`, so the K-Means
# model from the previous section is not overwritten by a different estimator.
gmm, labels, centroids = t.gausian_mixture_clustering(df_standardise, n_clusters)

In [15]:

# Re-standardise the data with scikit-learn's StandardScaler.
# NOTE(review): this rebinds df_standardise to the value returned by
# fit_transform, which the following cells then receive — confirm that the
# downstream helpers accept it.
scaler = StandardScaler()
df_standardise = scaler.fit_transform(df_numeric)

# Gaussian-mixture clustering. The labels are bound to `labels`, the name
# written to principal_df below, so the plotted clusters and `centroids`
# come from this same fit instead of mixing in labels from an earlier cell.
gmm, labels, centroids = t.gausian_mixture_clustering(df_standardise, n_clusters)

# Attach the cluster labels to the PCA frame.
principal_df['cluster'] = labels

# Store the labels as text so the column holds category names, not numbers.
principal_df["cluster"] = principal_df["cluster"].astype(str)

fig = t.display_clustering(principal_df)

## Avec les centroïdes

In [16]:
# Wrap the `centroids` returned by the Gaussian-mixture step in a frame that
# has the same columns as df_numeric.
gmm_centroids = pd.DataFrame(centroids, columns=df_numeric.columns)

# Project them into the PC1/PC2 plane with the already-fitted PCA.
centroids_pca_gmm = pca.transform(gmm_centroids)

# One row per centroid, tagged with its cluster number (0 .. n_clusters - 1).
centroids_pca_gmm_df = pd.DataFrame(centroids_pca_gmm, columns=['PC1', 'PC2']).assign(
    cluster=range(n_clusters)
)

t.add_centroids(fig, centroids_pca_gmm_df)


# Consensus clustering


### Avec les deux méthodes

In [None]:
# Consensus clustering through the project helpers.
# NOTE(review): presumably t.consensus_plot combines the K-Means and Gaussian
# mixture results and returns one label per row — confirm in tools.
final_label = t.consensus_plot(df_standardise, n_clusters)
# Visualise the consensus labels together with the country data.
t.visualise_consensus(df_standardise, data_2020, final_label)


divide by zero encountered in divide

