In [1]:
import pandas as pd
import plotly.express as px
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import tools as t
from sklearn.decomposition import PCA

# Données

In [2]:
data_2020 = pd.read_csv('data/2020.csv')
data_2021 = pd.read_csv('data/2021.csv')
#data_2022 = pd.read_csv('data/2022.csv')

df_total = data_2020[['Country name','Regional indicator', 'Ladder score', 'Generosity', 'Social support', 'Logged GDP per capita', 'Healthy life expectancy', 'Freedom to make life choices', 'Perceptions of corruption']]
df_total = pd.DataFrame(df_total)
df_numeric = df_total[['Ladder score', 'Logged GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices','Perceptions of corruption', 'Generosity']]
df_standardise = t.standardise_data(df_numeric)

In [3]:
pca = PCA(n_components=2)  # Choisissez le nombre de composantes souhaité
principal_components = pca.fit_transform(df_standardise)
df_total['PC1'] = principal_components[:,0]
df_total['PC2'] = principal_components[:,1]

# Clustering avec kmeans

### Silhouette Score

Principe : calculer le score pour chaque nb de cluster. Cela permet de connaitre le nombre de cluster optimal

In [4]:
#calculer tous les silhouette score
silhouette_scores = {}
for n_clusters in range(2, 11):
    kmeans, label, _ = t.kmeans_clustering(df_standardise, n_clusters)
    score = silhouette_score(df_standardise, label)
    silhouette_scores[n_clusters] = score

#Afficher le silhouette score
t.display_silhouette_scores(silhouette_scores, 'KMeans')

#nombre optimal de cluster = n_clusters
n_clusters = [k for k, v in silhouette_scores.items() if v == max(silhouette_scores.values())][0]



### Clustering

In [5]:
kmeans, labels, centroids = t.kmeans_clustering(df_standardise, n_clusters)

In [6]:
df_total['cluster'] = labels.astype(str)
fig = t.display_clustering(df_total)


### Avec centroides

In [7]:
centroids_df = pd.DataFrame(centroids, columns=df_numeric.columns)

#replacer les centroides dans l'espace ACP
centroids_pca = pca.transform(centroids_df)

# Convertir en DataFrame pour manipulation
centroids_pca_df = pd.DataFrame(centroids_pca, columns=['PC1', 'PC2'])
centroids_pca_df['cluster'] = range(n_clusters)

#afficher
t.add_centroids(fig, centroids_pca_df)


X has feature names, but PCA was fitted without feature names



# Gausian mixture

In [8]:
silhouette_scores_gmm = {}
for n_clusters in range(2, 11):
    gmm, label, _ = t.gausian_mixture_clustering(df_standardise, n_clusters)
    score = silhouette_score(df_standardise, label)
    silhouette_scores_gmm[n_clusters] = score

t.display_silhouette_scores(silhouette_scores_gmm, 'Gausian Mixture')

#choix du nombre de cluster 
n_clusters = [k for k, v in silhouette_scores_gmm.items() if v == max(silhouette_scores_gmm.values())][0]


In [9]:
gmm, labels, centroids = t.gausian_mixture_clustering(df_standardise, n_clusters)

In [10]:
# Ajout des labels au dataframe
df_total['cluster'] = labels

# Convertir les clusters en type catégoriel pour légende continue
df_total["cluster"] = df_total["cluster"].astype(str)

fig = t.display_clustering(df_total)

In [11]:
# Calcul des centroïdes des clusters à partir des moyennes de GMM
gmm_centroids = pd.DataFrame(centroids, columns=df_numeric.columns)  # Moyennes des clusters

# Appliquer la transformation PCA sur les centroïdes
centroids_pca_gmm = pca.transform(gmm_centroids)

# Créer un DataFrame pour les centroïdes projetés
centroids_pca_gmm_df = pd.DataFrame(centroids_pca_gmm, columns=['PC1', 'PC2'])
centroids_pca_gmm_df['cluster'] = range(n_clusters)  # Ajouter les étiquettes des clusters

t.add_centroids(fig, centroids_pca_gmm_df)



X has feature names, but PCA was fitted without feature names



# Consensus

In [12]:
final_label = t.consensus_plot(df_standardise, n_clusters)
t.visualise_consensus(df_total, final_label)


divide by zero encountered in divide

