In [None]:
import pandas as pd
import plotly.express as px
import numpy as np
import scipy
import geopandas as gpd
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import seaborn as sns

World Happiness Report from 2015-2019.
Kaggle File: https://www.kaggle.com/datasets/unsdsn/world-happiness?select=2015.csv

World map, region definitions: https://ourworldindata.org/world-region-map-definitions

In [None]:
#read in all the files
# One CSV per report year, named '<year>.csv'; unpacked in year order.
happy_2015, happy_2016, happy_2017, happy_2018, happy_2019 = (
    pd.read_csv(f'{year}.csv') for year in range(2015, 2020)
)
# Country -> continent lookup table (Our World in Data region definitions).
country_region = pd.read_csv('continents-according-to-our-world-in-data.csv')

In [None]:
#Add "Year" column
# The frames are listed in chronological order, so position + 2015 is the report year.
file_list = [happy_2015, happy_2016, happy_2017, happy_2018, happy_2019]
for year, data in enumerate(file_list, start=2015):
    data['Year'] = year

In [None]:
#Rename columns
# The yearly files use three different header layouts; each layout gets one
# mapping onto the shared short column names used in the rest of the notebook.

# 2015 and 2016: space-separated headers with parenthesised qualifiers.
names_2015_2016 = {
    'Happiness Rank': 'Rank',
    'Happiness Score': 'Happy_Score',
    'Economy (GDP per Capita)': 'Economy',
    'Health (Life Expectancy)': 'Life_Expect',
    'Trust (Government Corruption)': 'Gov_Trust',
    'Dystopia Residual': 'Dys_Res',
}

# 2017: same fields, but dot-separated headers.
names_2017 = {
    'Happiness.Rank': 'Rank',
    'Happiness.Score': 'Happy_Score',
    'Economy..GDP.per.Capita.': 'Economy',
    'Health..Life.Expectancy.': 'Life_Expect',
    'Trust..Government.Corruption.': 'Gov_Trust',
    'Dystopia.Residual': 'Dys_Res',
}

# 2018 and 2019: reworded headers; 'Social support' is mapped onto 'Family'.
names_2018_2019 = {
    'Overall rank': 'Rank',
    'Country or region': 'Country',
    'Score': 'Happy_Score',
    'GDP per capita': 'Economy',
    'Social support': 'Family',
    'Healthy life expectancy': 'Life_Expect',
    'Freedom to make life choices': 'Freedom',
    'Perceptions of corruption': 'Gov_Trust',
}

happy_2015 = happy_2015.rename(columns=names_2015_2016)
happy_2016 = happy_2016.rename(columns=names_2015_2016)
happy_2017 = happy_2017.rename(columns=names_2017)
happy_2018 = happy_2018.rename(columns=names_2018_2019)
happy_2019 = happy_2019.rename(columns=names_2018_2019)

# The lookup table calls the country column 'Entity'; align it with the happiness frames.
country_region = country_region.rename(columns={'Entity': 'Country'})

In [None]:
#stack all dataframes into one, drop the columns that are not needed
stacked = pd.concat([happy_2015, happy_2016, happy_2017, happy_2018, happy_2019], ignore_index=True)

# Columns that only exist in some years (or are not analysed) are removed from the stacked frame.
unused_columns = ['Standard Error', 'Dys_Res', 'Lower Confidence Interval', 'Upper Confidence Interval',
                  'Whisker.high', 'Whisker.low', 'Region']
happy_data = stacked.drop(columns=unused_columns)

# country_region is overwritten in place, so a second run of this cell would
# otherwise raise KeyError because 'Code' and 'Year' are already gone.
country_region = country_region.drop(columns=['Code', 'Year'], errors='ignore')

In [None]:
# Spellings used in the happiness files -> spellings expected by the continent lookup.
# Applied to the whole frame in one pass; no replacement value is itself a key,
# so the result matches applying the replacements one after another.
country_spellings = {
    'Czech Republic': 'Czechia',
    'North Cyprus': 'Northern Cyprus',
    'Somaliland Region': 'Somaliland',
    'Somaliland region': 'Somaliland',
    'Swaziland': 'Eswatini',
    'Palestinian Territories': 'Palestine',
    'Congo (Kinshasa)': 'Democratic Republic of Congo',
    'Ivory Coast': "Cote d'Ivoire",
    'Taiwan Province of China': 'Taiwan',
    'Hong Kong S.A.R., China': 'Hong Kong',
    'Trinidad & Tobago': 'Trinidad and Tobago',
}
happy_data = happy_data.replace(country_spellings)

In [None]:
def merge_region(df):
    """Left-join the notebook-level ``country_region`` table onto ``df``.

    No join keys are given, so pandas joins on every column name the two
    frames share. Rows of ``df`` without a match are kept.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    return df.merge(country_region, how='left')

# Left join on the shared column names so every happiness row is kept.
happy_data = happy_data.merge(country_region, how='left')

In [None]:
#Fill in the rows with missing continents
def fill_in_rest(df):
    """Patch continent values for two specific countries.

    * 'Macedonia' rows get Continent 'Europe'.
    * 'Congo (Brazzaville)' rows are renamed to 'Brazzaville', and every
      'Brazzaville' row then gets Continent 'Africa'.

    ``df`` is modified in place and the same object is returned so the
    function can be used with ``DataFrame.pipe``.
    """
    is_macedonia = df['Country'].eq('Macedonia')
    df.loc[is_macedonia, 'Continent'] = 'Europe'

    is_congo_brazzaville = df['Country'].eq('Congo (Brazzaville)')
    df.loc[is_congo_brazzaville, 'Country'] = 'Brazzaville'

    # Evaluated after the rename above so the freshly renamed rows are included.
    is_brazzaville = df['Country'].eq('Brazzaville')
    df.loc[is_brazzaville, 'Continent'] = 'Africa'
    return df



In [None]:
#Check the columns that have missing values
# For every column, print a header followed by the rows where that column is NaN
# (an empty frame is printed when nothing is missing).
for col in happy_data.columns:
    rows_missing = happy_data.loc[happy_data[col].isna()]
    print(f'no values for : {col}', rows_missing, ' ', sep='\n')

# Show all rows for the United Arab Emirates.
is_uae = happy_data['Country'] == 'United Arab Emirates'
print(happy_data[is_uae])


In [None]:
#impute the missing value for the UAE's trust number by filling it in with the average for all five years
def fill_nan_trust(df):
    """Fill missing ``Gov_Trust`` values with the United Arab Emirates average.

    The average is taken over the non-missing ``Gov_Trust`` values of the
    'United Arab Emirates' rows in ``df`` and rounded to 7 decimals.

    NOTE(review): every missing ``Gov_Trust`` value in ``df`` receives this
    average, not only UAE rows. The notebook appears to expect the UAE to be
    the only country with a gap; confirm before reusing on other data.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'Country' and 'Gov_Trust' columns.

    Returns
    -------
    pandas.DataFrame
        The same object, with the 'Gov_Trust' column replaced in place.

    Raises
    ------
    KeyError
        If ``df`` has no 'United Arab Emirates' rows.
    """
    uae_trust = df.loc[df['Country'] == 'United Arab Emirates', 'Gov_Trust']
    if uae_trust.empty:
        raise KeyError("no 'United Arab Emirates' rows to average Gov_Trust from")

    avg_num = round(uae_trust.mean(), 7)

    # Assign the filled column back rather than calling fillna(inplace=True) on
    # df['Gov_Trust']: the chained in-place form acts on a temporary Series and
    # leaves df untouched when pandas Copy-on-Write is active.
    df['Gov_Trust'] = df['Gov_Trust'].fillna(avg_num)
    return df

In [None]:
# for col in ['Economy', 'Family', 'Life_Expect','Freedom', 'Gov_Trust', 'Generosity']:
#     fig = px.scatter(happy_data, x=f'{col}', y='Happy_Score', title=f'{col} vs. Happiness Score').show()
#     fig = px.histogram(happy_data, x=f'{col}', title=f'{col} distribution').show()
# fig = px.histogram(happy_data, x='Happy_Score').show()

Question 1a: Does a better economy mean a higher happiness score?

Question 1b: How does the happiness score change for the top ten economies throughout the years?

In [None]:
def happy_economy(df):
    """Plot how the economy score relates to the happiness score.

    Three Plotly figures are shown, in this order:

    1. a scatter of 'Economy' against 'Happy_Score' for every row,
    2. a bar chart of the mean 'Happy_Score' per 'Economy' quartile,
    3. a line chart of 'Happy_Score' by 'Year' for the ten countries with
       the largest 'Economy' value in 2015.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'Country', 'Year', 'Economy' and 'Happy_Score' columns.

    Returns
    -------
    pandas.DataFrame
        The same object, with a categorical 'Quantile' column added in place.
    """
    px.scatter(df, x='Economy', y='Happy_Score', title='Economy vs. Happiness Score').show()

    # Rank the 2015 economies from df itself instead of the notebook-level
    # happy_2015 frame: the function then depends only on its argument, and the
    # country names are guaranteed to match the ones used in df.
    top_10_econ_2015 = df[df['Year'] == 2015].nlargest(10, 'Economy')
    countries_list = top_10_econ_2015['Country'].to_list()
    top_10 = df[df['Country'].isin(countries_list)]

    #separate out into quantiles
    df['Quantile'] = pd.qcut(df['Economy'], q=4, labels=['Bottom 25%', '25-50%', '50-75%', 'Top 25%'])
    # observed=False spells out the behaviour the original relied on implicitly
    # (grouping by a categorical without it triggers a FutureWarning in recent pandas).
    group_quant = df.groupby('Quantile', observed=False)['Happy_Score'].mean().reset_index()

    #bar graph to show the quantiles
    fig = px.bar(group_quant, x='Quantile', y='Happy_Score',
                 title='Average happiness scores of four quartiles (based on the economy)',
                 text_auto=True, color_discrete_sequence=['darkgreen'])
    fig.update_layout(xaxis_title='Economy (in Quantiles)').show()

    #line chart to showcase the change in happiness for top ten economies
    px.line(top_10, x='Year', y='Happy_Score',
            title='Happiness score for the top 10 economies (2015) over the years',
            color='Country', markers=True).show()
    return df


Question 2: Does trust in government correlate with the Happiness Score of countries in Asia?

In [None]:
def Asian_country(df):
    """Plot government trust and happiness for rows whose Continent is 'Asia'.

    Shows a Plotly scatter of 'Gov_Trust' against 'Happy_Score', then two
    matplotlib maps coloured by the per-country mean of 'Happy_Score' and of
    'Gov_Trust'. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'Country', 'Continent', 'Gov_Trust' and 'Happy_Score' columns.
    """
    asia = df.loc[df['Continent'] == 'Asia']

    px.scatter(
        asia,
        x='Gov_Trust',
        y='Happy_Score',
        title='Happiness score vs. Government Trust (for Asian and Middle Eastern Countries)',
        color_discrete_sequence=['blue'],
    ).show()

    # NOTE(review): gpd.datasets was deprecated in GeoPandas 0.14 and removed in
    # 1.0 - confirm the installed version still ships 'naturalearth_lowres'.
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    # Only shapes whose 'name' matches a country name in the Asia subset are drawn.
    asia_shapes = world[world['name'].isin(asia['Country'].to_list())]

    def draw_map(metric, title):
        """Colour the selected country shapes by the per-country mean of ``metric``."""
        country_means = asia.groupby('Country')[metric].mean().reset_index()
        shaded = asia_shapes.merge(country_means, how='left', left_on='name', right_on='Country')

        fig, ax = plt.subplots(figsize=(10, 8))
        # Fixed longitude/latitude window for the map.
        ax.set_xlim(20, 180)
        ax.set_ylim(-10, 60)

        shaded.plot(ax=ax, column=metric, cmap='YlOrRd', legend=True)
        plt.title(title)
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')
        plt.show()

    #Heatmap for happiness score of countries in Asia
    draw_map('Happy_Score', 'Happiness Score of Asian and Middle Eastern Countries')

    #Heatmap for Government Trust of countries in Asia
    draw_map('Gov_Trust', 'Government Trust for Asian and Middle Eastern Countries')


In [None]:
# Bind the cleaned frame back to happy_data: merge_region returns a new frame,
# so without this assignment the continent fixes and the Gov_Trust imputation
# never reach the happy_data used by the cells below.
happy_data = (
    happy_data
    .pipe(merge_region)
    .pipe(fill_in_rest)
    .pipe(fill_nan_trust)
    .pipe(happy_economy)
)

# Asian_country only draws figures and returns None, so it is called outside the chain.
Asian_country(happy_data)

Interesting finds:

In [None]:
# Yearly trend of each metric for a hand-picked set of countries.
some_countries = ['South Korea', 'United States', 'China']
few_countries = happy_data.loc[happy_data['Country'].isin(some_countries)]

metrics = ['Happy_Score', 'Economy', 'Family', 'Life_Expect', 'Freedom', 'Gov_Trust', 'Generosity']
for metric in metrics:
    # One line chart per metric; the column name doubles as the chart title.
    px.line(few_countries, x='Year', y=metric, title=metric, color='Country').show()

In [None]:
# Rows that ranked in the top ten in any year.
top_10 = happy_data.loc[happy_data['Rank'].le(10)]

# Mean happiness score per country across its top-ten appearances.
top_10_group = top_10.groupby('Country')['Happy_Score'].mean().reset_index()
px.bar(top_10_group, x='Country', y='Happy_Score', color='Country').show()

# Number of top-ten rows per continent.
px.histogram(
    top_10,
    x='Continent',
    title='Distribution of continents with top 10 happiest countries',
    color_discrete_sequence=['Navy'],
    text_auto=True,
).show()