In [44]:
import pandas as pd
from pylab import rcParams
rcParams['figure.figsize'] = 20, 8
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="darkgrid")
import plotly_express as px
import warnings
warnings.filterwarnings('ignore')

### Data preprocessing

In [2]:
df1 = pd.read_csv('./data/2015.csv')  
df1.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [3]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        158 non-null    object 
 1   Region                         158 non-null    object 
 2   Happiness Rank                 158 non-null    int64  
 3   Happiness Score                158 non-null    float64
 4   Standard Error                 158 non-null    float64
 5   Economy (GDP per Capita)       158 non-null    float64
 6   Family                         158 non-null    float64
 7   Health (Life Expectancy)       158 non-null    float64
 8   Freedom                        158 non-null    float64
 9   Trust (Government Corruption)  158 non-null    float64
 10  Generosity                     158 non-null    float64
 11  Dystopia Residual              158 non-null    float64
dtypes: float64(9), int64(1), object(2)
memory usage: 1

So we can see that in this table, columns: 'Dystopia Residual', 'Generosity', 'Trust (Government Corruption)' , 'Freedom', 'Health (Life Expectancy)', 'Family', 'Economy (GDP per Capita)', 'Standard Error' have type float.  
Colunm 'Happiness Rank' has type int.  
And columns 'Region', 'Country' have type object(string).  
There are no missing(null) values in any column.


Remove unnecessary columns and add column Date. For convenience, move the Date column to the first place

In [4]:
def drop_and_add_cols(df, drop_cols, year):
    df.drop(drop_cols, axis=1, inplace=True)
    df['Date'] = year
    df['Date'] = pd.to_datetime(df.Date, format='%Y')
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df = df[cols]
    return df

In [5]:
df1 = drop_and_add_cols(df1, ['Standard Error', 'Happiness Rank', 'Dystopia Residual'], 2015)

As a result, we got such  data set

In [6]:
df1.head()

Unnamed: 0,Date,Country,Region,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity
0,2015-01-01,Switzerland,Western Europe,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678
1,2015-01-01,Iceland,Western Europe,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363
2,2015-01-01,Denmark,Western Europe,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139
3,2015-01-01,Norway,Western Europe,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699
4,2015-01-01,Canada,North America,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811


We will carry out a similar preprocessing of features with the data sets for the years from 2016-2019. And then merge all them in one.

In [7]:
df2 = pd.read_csv('./data/2016.csv')
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        157 non-null    object 
 1   Region                         157 non-null    object 
 2   Happiness Rank                 157 non-null    int64  
 3   Happiness Score                157 non-null    float64
 4   Lower Confidence Interval      157 non-null    float64
 5   Upper Confidence Interval      157 non-null    float64
 6   Economy (GDP per Capita)       157 non-null    float64
 7   Family                         157 non-null    float64
 8   Health (Life Expectancy)       157 non-null    float64
 9   Freedom                        157 non-null    float64
 10  Trust (Government Corruption)  157 non-null    float64
 11  Generosity                     157 non-null    float64
 12  Dystopia Residual              157 non-null    flo

In [8]:
df2 = drop_and_add_cols(df2, ['Happiness Rank','Lower Confidence Interval', 'Upper Confidence Interval','Dystopia Residual'], 2016)

In [9]:
df3 = pd.read_csv('./data/2017.csv')
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        155 non-null    object 
 1   Happiness.Rank                 155 non-null    int64  
 2   Happiness.Score                155 non-null    float64
 3   Whisker.high                   155 non-null    float64
 4   Whisker.low                    155 non-null    float64
 5   Economy..GDP.per.Capita.       155 non-null    float64
 6   Family                         155 non-null    float64
 7   Health..Life.Expectancy.       155 non-null    float64
 8   Freedom                        155 non-null    float64
 9   Generosity                     155 non-null    float64
 10  Trust..Government.Corruption.  155 non-null    float64
 11  Dystopia.Residual              155 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 

Since we will concatenate tables, names of the columns in them must match.  
So rename columns in data sets 3-5.

In [10]:
columns = {'Happiness.Score':'Happiness Score','Economy..GDP.per.Capita.': 'Economy (GDP per Capita)', 
           'Health..Life.Expectancy.': 'Health (Life Expectancy)','Trust..Government.Corruption.':'Trust (Government Corruption)', 
           'Trust..Government.Corruption.':'Trust (Government Corruption)'
          }
df3 = df3.rename(columns=columns)

In [11]:
df3 = drop_and_add_cols(df3, ['Happiness.Rank', 'Dystopia.Residual', 'Whisker.high', 'Whisker.low'], 2017)

In [12]:
df4 = pd.read_csv('./data/2018.csv')
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     155 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB


In [13]:
columns = {'Country or region': 'Country', 'Score': 'Happiness Score', 'GDP per capita': 'Economy (GDP per Capita)',
          'Social support': 'Family', 'Healthy life expectancy': 'Health (Life Expectancy)', 
          'Freedom to make life choices': 'Freedom', 'Perceptions of corruption': 'Trust (Government Corruption)'}
df4 = df4.rename(columns=columns)

In [14]:
df4 = drop_and_add_cols(df4, 'Overall rank', 2018)

In [15]:
df4.head()

Unnamed: 0,Date,Country,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Generosity,Trust (Government Corruption)
0,2018-01-01,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2018-01-01,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,2018-01-01,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408
3,2018-01-01,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,2018-01-01,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357


In [16]:
df5 = pd.read_csv('./data/2019.csv')
df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     156 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB


In [17]:
df5 = df5.rename(columns=columns)

In [18]:
df5 = drop_and_add_cols(df5, 'Overall rank', 2019)

In [19]:
df5.head()

Unnamed: 0,Date,Country,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Generosity,Trust (Government Corruption)
0,2019-01-01,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2019-01-01,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,2019-01-01,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,2019-01-01,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,2019-01-01,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


Now we will find those countries that are present in all 5 data sets. In each data set, we will leave information only about these countries. Then merge the tables.

In [20]:
from functools import reduce
dfs = [df1['Country'], df2['Country'], df3['Country'], df4['Country'], df5['Country']]
countries = reduce(lambda left, right: pd.merge(left, right), dfs)

In [21]:
def new_cols(df, num, countries):
    df = df.merge(countries).sort_values('Country')
    if (num >= 3):
        df['Region'] = df1['Region']
        cols = df.columns.tolist()
        cols = cols[0:1] + cols[1:2] + cols[-1:] + cols[2:-1]
        df = df[cols]
    return df

In [22]:
df1 = new_cols(df1, 1, countries)
df2 = new_cols(df2, 2, countries)
df3 = new_cols(df3, 3, countries)
df4 = new_cols(df4, 4, countries)
df5 = new_cols(df5, 5, countries)
df_all = pd.concat([df1, df2, df3, df4, df5])

### Descriptive statistics

Let's group the data by region and look at some aggregates.

In [78]:
df1.sort_values('Happiness Score', ascending=False).head() # swland, Denmark, Norway, Finland, Finland

Unnamed: 0,Date,Country,Region,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity
0,2015-01-01,Switzerland,Western Europe,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678
1,2015-01-01,Iceland,Western Europe,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363
2,2015-01-01,Denmark,Western Europe,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139
3,2015-01-01,Norway,Western Europe,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699
4,2015-01-01,Canada,North America,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811


In [68]:
df_region = df_all.groupby(['Date', 'Region']).agg({
    'Happiness Score': ['mean', 'median', 'std', 'max', 'min'],
    'Economy (GDP per Capita)': ['mean'],
    'Family': ['mean'],
    'Freedom': ['mean']
})
df_region = df_region.sort_values(['Date', ('Happiness Score', 'mean')], ascending=False)
df_region

Unnamed: 0_level_0,Unnamed: 1_level_0,Happiness Score,Happiness Score,Happiness Score,Happiness Score,Happiness Score,Economy (GDP per Capita),Family,Freedom
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,std,max,min,mean,mean,mean
Date,Region,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2019-01-01,North America,7.271,7.271,0.306884,7.488,7.054,1.3645,1.53,0.5035
2019-01-01,Australia and New Zealand,7.262,7.262,0.022627,7.278,7.246,1.3705,1.49,0.558
2019-01-01,Western Europe,6.73235,6.9075,0.789418,7.769,5.082,1.308,1.45095,0.49755
2019-01-01,Latin America and Caribbean,6.14215,6.178,0.659646,7.167,4.628,1.06385,1.35555,0.48155
2019-01-01,Eastern Asia,5.67125,5.696,0.510437,6.118,5.175,0.9685,1.3375,0.40225
2019-01-01,Central and Eastern Europe,5.466429,5.535,0.53617,6.321,4.366,0.980679,1.287821,0.3405
2019-01-01,Southeastern Asia,5.43125,5.527,0.866301,6.595,4.015,0.891375,1.141,0.45
2019-01-01,Middle East and Northern Africa,5.419737,5.339,0.953914,7.228,3.334,0.914316,1.213579,0.420053
2019-01-01,Southern Asia,4.681857,4.681,0.665673,5.425,3.462,0.699286,0.981429,0.279429
2019-01-01,Sub-Saharan Africa,4.256129,4.332,0.609154,5.631,3.203,0.555516,0.986903,0.302645


From 2015 to 2019, Australia and New Zealand ranked first in terms of happiness. However, in 2019 North America managed to overtake them. In spite of the fact that the first 4 places over the course of 5 years were occupied by the countries of Western Europe, this regoin is only in third place in terms of the average indicator of happiness. Perhaps this is due to the fact that this region is quite large. It includes countries whose living standards are very different. Also we can take into account level of standard deviation in Western Europe. It is quite large, which proves our assumption.

### Build plots

In [79]:
fig = px.line(df_all, 
        title='Happiness Score',
        x='Country', 
        y='Happiness Score',
        color='Date')
fig.update_xaxes(tickangle=45)
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="bottom",
        xanchor="right",
        y = 1.02,
        x = 1,
        title_font_family="Times New Roman",
        font=dict(
            family="Courier",
            size=12,
            color="black"
        ),
        bgcolor="LightSteelBlue",
        bordercolor="Black",
        borderwidth=2
    )
)

This chart shows the ranking of happiness for various countries. Let's take a closer look at some of the countries with interesting indicators. You can see that in Syria, the level of happiness dropped dramatically by 2016. Perhaps this is due to the civil war. Russian military intervention in the Syrian Civil War began in September 2015, after an official request by the Syrian government for military aid against rebel groups.  
Another example is Venezuela. In 2016, inflation in this country reached 800 percent, again due to the civil war. Accordingly, the level of happiness of the population has been decreasing every year since 2015

In [81]:
fig = px.line(df_all, 
        title='Economy (GDP)',
        x='Country', 
        y='Economy (GDP per Capita)',
        color='Date')
fig.update_xaxes(tickangle=45)
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="bottom",
        xanchor="right",
        y = 1.02,
        x = 1,
        title_font_family="Times New Roman",
        font=dict(
            family="Courier",
            size=12,
            color="black"
        ),
        bgcolor="LightSteelBlue",
        bordercolor="Black",
        borderwidth=2
    )
)

This diagram shows the level of significance of the indicator 'Economy (GDP per Capita)' in different countries. You can note that this factor in the emirates is much higher than in other countries. This is logical, because the economy is actively developing here due to to oil production. In countries with a low level of economics, this factor is at a low level.

Let's make a table in which for each country we will take factor that contribute the most in evaluating the happiness in country

In [58]:
df_importance = pd.DataFrame(df_all.iloc[:,4:].idxmax(axis="columns"))
df_importance['Country'] = df_all['Country']
df_importance['Date'] = df_all['Date']

df_importance = df_importance.rename(columns={0:'most important factor'})

df_importance.loc[df_importance['most important factor'] == 'Family', 'most important factor'] = 1
df_importance.loc[df_importance['most important factor'] == 'Economy (GDP per Capita)', 'most important factor'] = 2
df_importance.loc[df_importance['most important factor'] == 'Freedom', 'most important factor'] = 3
df_importance.loc[df_importance['most important factor'] == 'Health (Life Expectancy)', 'most important factor'] = 4
df_importance.loc[df_importance['most important factor'] == 'Generosity', 'most important factor'] = 5
df_importance.loc[df_importance['most important factor'] == 'Trust (Government Corruption)', 'most important factor'] = 6
 
df_importance

Unnamed: 0,most important factor,Country,Date
135,5,Afghanistan,2015-01-01
85,2,Albania,2015-01-01
62,1,Algeria,2015-01-01
28,1,Argentina,2015-01-01
112,1,Armenia,2015-01-01
...,...,...,...
101,1,Venezuela,2019-01-01
88,1,Vietnam,2019-01-01
137,1,Yemen,2019-01-01
126,1,Zambia,2019-01-01


In [60]:
import plotly.graph_objects as go


def drow(year):
    fig = go.Figure()
    
    fig.add_trace(go.Bar(name = year,
        x=df_importance[df_importance['Date'] == year]['Country'],
        y=df_importance[df_importance['Date'] == year]['most important factor']))
    
    fig.update_layout(title = 'Most important factor ' + year)
    fig.update_xaxes(tickangle=45)
    fig.update_yaxes(ticktext=["Family", "Economy", "Freedom", 'Health', 'Generosity'],
                tickvals=[1, 2, 3, 4, 5])
    return fig
    
    
fig1 = drow('2015-01-01')
fig2 = drow('2017-01-01')
fig3 = drow('2019-01-01')

fig1.show()
fig2.show()
fig3.show()

This histogram shows the most important factor in evaluating the happiness in country. As we can see from year to year there are more and more countries for which the family factor makes the greatest contribution to happiness. Only in some Arab countries the economic factor remains the most important