In [49]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [50]:
# Raw CSV locations; both are relative paths, so they resolve against the kernel's working directory.
path_2024 = "../data/raw/world-happiness-report-2024-yearly-updated/World-happiness-report-2024.csv"
path_all_years = "../data/raw/world-happiness-report-2024-yearly-updated/World-happiness-report-updated_2024.csv"

# Load datasets (only the multi-year file is read with an explicit Latin-1 encoding)
df_2024, df_all_years = (
    pd.read_csv(path_2024),
    pd.read_csv(path_all_years, encoding='latin1'),
)

# Quick sanity check: print each frame's (rows, columns) shape
print('df_2024 shape:', df_2024.shape)
print('df_all_years shape:', df_all_years.shape)

df_2024 shape: (143, 12)
df_all_years shape: (2363, 11)


In [51]:
# Quality check of data

def data_quality_df(df):
    """Summarise every column of ``df`` for a quick data-quality review.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with three columns:
        ``dtype`` (the column dtype rendered as a string),
        ``missing`` (number of null cells) and
        ``unique`` (number of distinct values, nulls excluded).
    """
    dtype_names = df.dtypes.astype(str)
    null_counts = df.isna().sum()
    distinct_counts = df.nunique()

    summary = {
        'dtype': dtype_names,
        'missing': null_counts,
        'unique': distinct_counts,
    }
    return pd.DataFrame(summary)

# Render the per-column quality summary of each dataset
display(data_quality_df(df_2024))
display(data_quality_df(df_all_years))

# Sanity check: every 2024 Ladder score should lie inside its whisker interval
# (lowerwhisker <= Ladder score <= upperwhisker)
print(df_2024['Ladder score'].ge(df_2024['lowerwhisker']).all())  # should be True
print(df_2024['Ladder score'].le(df_2024['upperwhisker']).all())  # should be True


Unnamed: 0,dtype,missing,unique
Country name,object,0,143
Regional indicator,object,0,10
Ladder score,float64,0,140
upperwhisker,float64,0,140
lowerwhisker,float64,0,136
Log GDP per capita,float64,3,134
Social support,float64,3,124
Healthy life expectancy,float64,3,119
Freedom to make life choices,float64,3,122
Generosity,float64,3,110


Unnamed: 0,dtype,missing,unique
Country name,object,0,165
year,int64,0,19
Life Ladder,float64,0,1814
Log GDP per capita,float64,28,1760
Social support,float64,13,484
Healthy life expectancy at birth,float64,63,1126
Freedom to make life choices,float64,36,550
Generosity,float64,81,650
Perceptions of corruption,float64,125,613
Positive affect,float64,24,442


True
True


In [52]:
# Compare country names across the two datasets
# (needed before the Regional indicator column can be merged into the historical data later)

# Distinct country names in each dataset
set_hist = set(df_all_years['Country name'])
set_2024 = set(df_2024['Country name'])

len_hist = len(set_hist)
len_2024 = len(set_2024)
print('Unique countries: hist=', len_hist, ' 2024=', len_2024)

# Names that occur in one dataset but not in the other, as sorted lists
only_in_hist = sorted(set_hist.difference(set_2024))
only_in_2024 = sorted(set_2024.difference(set_hist))

print('In historical only:', only_in_hist)
print('In 2024 only:', only_in_2024)

# Show the first 30 historical rows of the non-matching countries to inspect spelling/labels
display(df_all_years.loc[df_all_years['Country name'].isin(only_in_hist)].head(30))


Unique countries: hist= 165  2024= 143
In historical only: ['Angola', 'Belarus', 'Belize', 'Bhutan', 'Burundi', 'Central African Republic', 'Cuba', 'Djibouti', 'Guyana', 'Haiti', 'Maldives', 'Oman', 'Qatar', 'Rwanda', 'Somalia', 'Somaliland region', 'South Sudan', 'Sudan', 'Suriname', 'Syria', 'Trinidad and Tobago', 'Turkmenistan', 'Türkiye']
In 2024 only: ['Turkiye']


Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
42,Angola,2011,5.589,8.944,0.723,51.22,0.584,0.05,0.911,0.667,0.361
43,Angola,2012,4.36,8.989,0.753,51.84,0.456,-0.141,0.906,0.591,0.305
44,Angola,2013,3.937,9.0,0.722,52.46,0.41,-0.109,0.816,0.65,0.371
45,Angola,2014,3.795,9.01,0.755,53.08,0.375,-0.173,0.834,0.595,0.368
160,Belarus,2006,5.658,9.489,0.918,60.06,0.707,-0.252,0.708,0.535,0.269
161,Belarus,2007,5.617,9.576,0.858,60.62,0.667,-0.23,0.695,0.502,0.235
162,Belarus,2008,5.463,9.677,0.904,61.18,0.64,-0.226,0.696,,0.246
163,Belarus,2009,5.564,9.681,0.908,61.74,0.679,-0.209,0.676,0.544,0.223
164,Belarus,2010,5.526,9.759,0.918,62.3,0.7,-0.168,0.706,0.532,0.208
165,Belarus,2011,5.225,9.813,0.91,62.86,0.656,-0.174,0.672,0.493,0.249


In [53]:
# Harmonize the single known naming mismatch (2024 uses "Turkiye" instead of "Türkiye")
df_all_years['Country name'] = df_all_years['Country name'].replace('Türkiye', 'Turkiye')

# Country -> region lookup taken from the 2024 dataset and applied to the historical dataset
region_map = dict(zip(df_2024['Country name'], df_2024['Regional indicator']))
df_all_years['Regional indicator'] = df_all_years['Country name'].map(region_map)

# Countries for which the lookup produced no region
missing_regions = df_all_years.loc[df_all_years['Regional indicator'].isna(), 'Country name'].unique()
print("Countries without mapped region:", missing_regions)

# Turkiye is now matched correctly. The 22 countries still without a region are those absent from the 2024 data.

Countries without mapped region: ['Angola' 'Belarus' 'Belize' 'Bhutan' 'Burundi' 'Central African Republic'
 'Cuba' 'Djibouti' 'Guyana' 'Haiti' 'Maldives' 'Oman' 'Qatar' 'Rwanda'
 'Somalia' 'Somaliland region' 'South Sudan' 'Sudan' 'Suriname' 'Syria'
 'Trinidad and Tobago' 'Turkmenistan']


In [None]:
# Missing values in 2005-2023 data (absolute counts, then percentage of rows per column)
display(data_quality_df(df_all_years))
display(df_all_years.isnull().mean().sort_values(ascending=False) * 100)

# 'Perceptions of corruption' is the only column above 5% missing (about 5.3%), and only marginally.
# 'Regional indicator' (about 4.6%) is missing only for countries absent from the 2024 data used for the mapping.
# Missingness is low enough overall that we proceed without dropping any columns.

Unnamed: 0,dtype,missing,unique
Country name,object,0,165
year,int64,0,19
Life Ladder,float64,0,1814
Log GDP per capita,float64,28,1760
Social support,float64,13,484
Healthy life expectancy at birth,float64,63,1126
Freedom to make life choices,float64,36,550
Generosity,float64,81,650
Perceptions of corruption,float64,125,613
Positive affect,float64,24,442


Perceptions of corruption           5.289886
Regional indicator                  4.570461
Generosity                          3.427846
Healthy life expectancy at birth    2.666102
Freedom to make life choices        1.523487
Log GDP per capita                  1.184934
Positive affect                     1.015658
Negative affect                     0.677105
Social support                      0.550148
Country name                        0.000000
year                                0.000000
Life Ladder                         0.000000
dtype: float64

In [None]:
# Missing values by country for key columns
cols = ['Log GDP per capita','Social support','Healthy life expectancy at birth']

miss_frac_by_country = (
    df_all_years[cols]
    .isnull()                                  # True where a key value is missing
    .groupby(df_all_years['Country name'])     # one group per country
    .mean()                                    # fraction missing per column per country
    .mean(axis=1)                              # average across the selected columns
    .sort_values(ascending=False)
)

display(miss_frac_by_country.head(30))

# Missingness is not spread evenly: some countries are systematically missing large amounts of data.
# Somaliland region has over 50% missing values in key columns, so will be dropped.

Country name
Somaliland region            0.666667
State of Palestine           0.375000
Taiwan Province of China     0.354167
Kosovo                       0.352941
Oman                         0.333333
South Sudan                  0.333333
Cuba                         0.333333
Hong Kong S.A.R. of China    0.333333
Qatar                        0.133333
Djibouti                     0.083333
Venezuela                    0.074074
Yemen                        0.047619
Afghanistan                  0.044444
Libya                        0.041667
Algeria                      0.030303
Bahrain                      0.027778
Kuwait                       0.025641
Morocco                      0.025641
Malta                        0.022222
Tunisia                      0.022222
Cyprus                       0.020833
Singapore                    0.020833
United Arab Emirates         0.020833
United States                0.018519
Canada                       0.018519
Vietnam                      0.018519

In [None]:
# Count number of distinct years per country in historical data (most-covered countries first)
years_per_country = df_all_years.groupby('Country name')['year'].nunique().sort_values(ascending=False)
display(years_per_country.head(20))
display(years_per_country.tail(20))

# The best-covered countries have 18 distinct years. That is the observed maximum, not complete coverage:
# the dataset contains 19 distinct years overall, so every country is missing at least one year.
# Some countries have only 1–5 years, and those also tend to have higher missingness.
# TODO: decide whether countries with very few years should be dropped.


Country name
Lebanon       18
Jordan        18
Nicaragua     18
Nepal         18
Moldova       18
Mexico        18
Lithuania     18
Kyrgyzstan    18
Kenya         18
Kazakhstan    18
Japan         18
Egypt         18
Italy         18
Israel        18
Indonesia     18
India         18
Ghana         18
Germany       18
Georgia       18
France        18
Name: year, dtype: int64

Country name
Gambia                      5
Lesotho                     5
Qatar                       5
Sudan                       5
Trinidad and Tobago         5
Central African Republic    5
Burundi                     5
Angola                      4
Eswatini                    4
Djibouti                    4
South Sudan                 4
Somaliland region           4
Somalia                     3
Bhutan                      3
Belize                      2
Cuba                        1
Suriname                    1
Guyana                      1
Oman                        1
Maldives                    1
Name: year, dtype: int64

In [55]:
# Missing values in 2024 data
display(data_quality_df(df_2024))
# Rows lacking at least one of the three key indicators (rendered as the cell's result)
df_2024.loc[df_2024[['Log GDP per capita','Social support','Healthy life expectancy']].isna().any(axis=1)]


Unnamed: 0,dtype,missing,unique
Country name,object,0,143
Regional indicator,object,0,10
Ladder score,float64,0,140
upperwhisker,float64,0,140
lowerwhisker,float64,0,136
Log GDP per capita,float64,3,134
Social support,float64,3,124
Healthy life expectancy,float64,3,119
Freedom to make life choices,float64,3,122
Generosity,float64,3,110


Unnamed: 0,Country name,Regional indicator,Ladder score,upperwhisker,lowerwhisker,Log GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Dystopia + residual
61,Bahrain,Middle East and North Africa,5.959,6.153,5.766,,,,,,,
87,Tajikistan,Commonwealth of Independent States,5.281,5.361,5.201,,,,,,,
102,State of Palestine,Middle East and North Africa,4.879,5.006,4.753,,,,,,,
