In [3]:
import pandas as pd

In [4]:
# Load the yearly datasets (2020, 2021 and 2022)
data_2020, data_2021, data_2022 = map(
    pd.read_csv,
    ('data/2020.csv', 'data/2021.csv', 'data/2022.csv'),
)

# Preview the 2020 table: column names first, then the leading rows
print(data_2020.columns, data_2020.head(), sep='\n')

Index(['Country name', 'Regional indicator', 'Ladder score',
       'Standard error of ladder score', 'upperwhisker', 'lowerwhisker',
       'Logged GDP per capita', 'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Ladder score in Dystopia',
       'Explained by: Log GDP per capita', 'Explained by: Social support',
       'Explained by: Healthy life expectancy',
       'Explained by: Freedom to make life choices',
       'Explained by: Generosity', 'Explained by: Perceptions of corruption',
       'Dystopia + residual'],
      dtype='object')
  Country name Regional indicator  Ladder score  \
0      Finland     Western Europe        7.8087   
1      Denmark     Western Europe        7.6456   
2  Switzerland     Western Europe        7.5599   
3      Iceland     Western Europe        7.5045   
4       Norway     Western Europe        7.4880   

   Standard error of ladder score  upperwhisker  lowerwhis

In [5]:
# Bar chart of the 2020 ladder score, one bar per country

import plotly.express as px

# Rank the rows from the highest to the lowest ladder score.
data_2020 = data_2020.sort_values('Ladder score', ascending=False)

# Bars are coloured by region; the x-axis categories are ordered by
# descending total bar value.
fig = px.bar(
    data_2020,
    y='Ladder score',
    x='Country name',
    color='Regional indicator',
    title='Histogramme du score de bonheur par pays en 2020',
).update_xaxes(categoryorder='total descending')
fig.show()

In [6]:
# Numeric indicators used by the correlation, standardisation and clustering cells
numeric_columns = [
    'Ladder score',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Perceptions of corruption',
    'Generosity',
]
df_numeric = data_2020[numeric_columns]



In [7]:
# Pairwise correlation between the numeric indicators
correlation_matrix = df_numeric.corr()

# Draw the matrix as an annotated heatmap on a fixed [-1, 1] colour range,
# then size the figure before displaying it.
fig = px.imshow(
    correlation_matrix,
    text_auto=True,
    zmin=-1,
    zmax=1,
    color_continuous_scale="RdBu",
    labels={"x": "Columns", "y": "Columns", "color": "Correlation"},
    title="Correlation Heatmap",
).update_layout(width=900, height=800)
fig.show()

In [10]:
# Standardise every column: subtract its mean, then divide by its standard deviation

df_standardise = df_numeric.sub(df_numeric.mean()).div(df_numeric.std())

# Show the first standardised rows
print(df_standardise.head())

   Ladder score  Logged GDP per capita  Social support  \
0      2.099724               1.118155        1.198886   
1      1.953087               1.230285        1.212563   
2      1.876037               1.401668        1.104339   
3      1.826229               1.229085        1.366357   
4      1.811394               1.491442        1.183710   

   Healthy life expectancy  Freedom to make life choices  \
0                 1.056313                      1.407737   
1                 1.127394                      1.427026   
2                 1.368253                      1.171414   
3                 1.212051                      1.405357   
4                 1.240499                      1.463584   

   Perceptions of corruption  Generosity  
0                  -3.069408   -0.295857  
1                  -3.223286    0.532051  
2                  -2.451252    0.793626  
3                  -0.122225    1.722644  
4                  -2.682511    0.982163  


In [26]:
from sklearn.decomposition import PCA

# PCA on the standardised data, keeping the first two components.
pca = PCA(n_components=2)  # number of principal components to keep
principal_components = pca.fit_transform(df_standardise)

# Build the result on the row labels of the PCA input. data_2020 was
# re-ordered with sort_values() and keeps its original index labels, whereas
# a default RangeIndex here would be purely positional: the label-aligned
# column assignment below could then attach a country to the coordinates of
# a different row. df_numeric has the same rows, in the same order, as the
# matrix given to the PCA.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['PC1', 'PC2'],
    index=df_numeric.index,
)

# Indicators shown in the hover box, in display order.
indicator_columns = [
    'Ladder score',
    'Generosity',
    'Social support',
    'Logged GDP per capita',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Perceptions of corruption',
]
# Columns copied from the source table next to the PCA coordinates.
context_columns = ['Country name', 'Regional indicator'] + indicator_columns
principal_df[context_columns] = data_2020[context_columns]


# Scatter plot of the countries in the PC1/PC2 plane
fig = px.scatter(
    principal_df,
    x='PC1',
    y="PC2",
    hover_name='Country name',  # main label shown on hover
    color='Regional indicator',  # one colour per region
    hover_data={
        # Already shown through hover_name / color, so hidden from the box
        'Country name': False,
        'Regional indicator': False,
        # Every indicator is listed in the hover box
        **dict.fromkeys(indicator_columns, True),
    },
)
fig.show()



In [29]:
# DBSCAN clustering

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaled array gets its own name so that the
# df_standardise DataFrame built earlier is not silently replaced by an
# ndarray when this cell runs.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(df_numeric)

# Clustering
dbscan = DBSCAN(eps=1.5, min_samples=2)
dbscan.fit(features_scaled)
labels = dbscan.labels_  # one integer label per row; scikit-learn uses -1 for noise

# Attach the labels to the PCA frame. `labels` is a plain array, so the
# assignment is positional; principal_df rows follow the order of df_numeric.
principal_df['cluster'] = labels

# Visualisation
# Cluster ids are categories, not quantities: plot from a copy whose labels are
# strings so Plotly Express draws one discrete colour per cluster instead of a
# continuous colour bar. principal_df itself keeps the integer labels.
cluster_plot_df = principal_df.assign(cluster=principal_df['cluster'].astype(str))

fig = px.scatter(
    cluster_plot_df,
    x='PC1',
    y='PC2',
    color='cluster',
    hover_name='Country name',
    # Legend entries in numeric order rather than order of appearance
    category_orders={'cluster': sorted(cluster_plot_df['cluster'].unique(), key=int)},
)
fig.show()



Heatmap / Correlation
Clustering

ACP

Choropleth