In [1]:
import pandas as pd
from pylab import rcParams
rcParams['figure.figsize'] = 20, 8
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="darkgrid")
import plotly_express as px
import warnings
warnings.filterwarnings('ignore')

### Data preprocessing

In [2]:
df1 = pd.read_csv('./data/2015.csv')  
df1.head()

FileNotFoundError: [Errno 2] File ../Downloads/2015.csv does not exist: '../Downloads/2015.csv'

In [None]:
df1.info()

So we can see that in this table, columns: 'Dystopia Residual', 'Generosity', 'Trust (Government Corruption)' , 'Freedom', 'Health (Life Expectancy)', 'Family', 'Economy (GDP per Capita)', 'Standard Error' have type float.  
Column 'Happiness Rank' has type int.  
And columns 'Region', 'Country' have type object(string).  
There are no missing(null) values in any column.


Remove unnecessary columns and add column Date. For convenience, move the Date column to the first place

In [None]:
def drop_and_add_cols(df, drop_cols, year):
    """Return a copy of ``df`` without ``drop_cols`` and with a leading 'Date' column.

    The input frame is left untouched: the columns are dropped on a copy
    rather than in place, so the caller's original data is not mutated as a
    hidden side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform.
    drop_cols : str or list of str
        Column label(s) to remove.
    year : int or str
        Year the rows belong to; parsed with the ``'%Y'`` format.

    Returns
    -------
    pandas.DataFrame
        New frame whose first column is 'Date' (the parsed year, repeated on
        every row), followed by the remaining columns in their original order.

    Raises
    ------
    KeyError
        If any label in ``drop_cols`` is not a column of ``df``.
    """
    out = df.drop(columns=drop_cols)
    # Build the year as a Series first so the parsed column has the same
    # row labels as the frame it is attached to.
    out['Date'] = pd.to_datetime(pd.Series(year, index=out.index), format='%Y')
    ordered = ['Date'] + [col for col in out.columns if col != 'Date']
    return out[ordered]

In [None]:
df1 = drop_and_add_cols(df1, ['Standard Error', 'Happiness Rank', 'Dystopia Residual'], 2015)

As a result, we get the following data set:

In [None]:
df1.head()

We will do similar transformations with the data sets for the years 2016-2019. Then we are going to merge them all in one.

In [None]:
df2 = pd.read_csv('./data/2016.csv')
df2.info()

In [None]:
df2 = drop_and_add_cols(df2, ['Happiness Rank','Lower Confidence Interval', 'Upper Confidence Interval','Dystopia Residual'], 2016)

In [None]:
# Read the 2017 report from ./data, the directory the 2015 and 2016 cells
# load from, instead of a machine-specific ../Downloads folder.
df3 = pd.read_csv('./data/2017.csv')
df3.info()

Since we plan to merge the tables into one later, the column names must be the same in all data sets. That is why we rename the columns in tables 3, 4 and 5.

In [None]:
# Map the dotted 2017 headers onto the column names used by the earlier files.
# Each source header is listed exactly once (the Trust entry used to be duplicated).
columns = {
    'Happiness.Score': 'Happiness Score',
    'Economy..GDP.per.Capita.': 'Economy (GDP per Capita)',
    'Health..Life.Expectancy.': 'Health (Life Expectancy)',
    'Trust..Government.Corruption.': 'Trust (Government Corruption)',
}
df3 = df3.rename(columns=columns)

In [None]:
df3 = drop_and_add_cols(df3, ['Happiness.Rank', 'Dystopia.Residual', 'Whisker.high', 'Whisker.low'], 2017)

In [None]:
# Read the 2018 report from ./data, the directory the 2015 and 2016 cells
# load from, instead of a machine-specific ../Downloads folder.
df4 = pd.read_csv('./data/2018.csv')
df4.info()

In [None]:
# Translate the 2018 headers into the column names shared by the other years.
columns = {
    'Country or region': 'Country',
    'Score': 'Happiness Score',
    'GDP per capita': 'Economy (GDP per Capita)',
    'Social support': 'Family',
    'Healthy life expectancy': 'Health (Life Expectancy)',
    'Freedom to make life choices': 'Freedom',
    'Perceptions of corruption': 'Trust (Government Corruption)',
}
df4 = df4.rename(columns=columns)

In [None]:
df4 = drop_and_add_cols(df4, 'Overall rank', 2018)

In [None]:
df4.head()

In [None]:
# Read the 2019 report from ./data, the directory the 2015 and 2016 cells
# load from, instead of a machine-specific ../Downloads folder.
df5 = pd.read_csv('./data/2019.csv')
df5.info()

In [None]:
# Reuses the `columns` mapping most recently assigned above (the one built for
# the 2018 file); assumes the 2019 file has the same headers — TODO confirm.
df5 = df5.rename(columns=columns)

In [None]:
df5 = drop_and_add_cols(df5, 'Overall rank', 2019)

In [None]:
df5.head()

Now we will find those countries that are present in all 5 data sets. In each data set, we will leave information only about these countries. Then merge the tables.


In [None]:
from functools import reduce

# Fold the five 'Country' columns together with pd.merge (default inner join on
# the shared 'Country' label), keeping only countries present in every year.
dfs = [frame['Country'] for frame in (df1, df2, df3, df4, df5)]
countries = reduce(pd.merge, dfs)

In [None]:
def new_cols(df, num, countries, region_source=None):
    """Keep only the shared countries and, for later years, attach a 'Region' column.

    The frame is inner-merged with ``countries`` and sorted by 'Country'.
    When ``num >= 3`` a 'Region' column is added and moved to the third
    position (after the first two columns).

    The region is looked up by country name. Assigning the raw 'Region'
    column of another frame would align on row labels, which refer to
    different countries in different years, so regions could end up on the
    wrong rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Yearly frame containing a 'Country' column.
    num : int
        Ordinal of the data set; values of 3 and above get a 'Region' column.
    countries : pandas.DataFrame
        Frame with a 'Country' column listing the countries to keep.
    region_source : pandas.DataFrame, optional
        Frame with 'Country' and 'Region' columns used for the lookup.
        Defaults to the module-level ``df1``.

    Returns
    -------
    pandas.DataFrame
        Filtered and sorted frame; countries missing from ``region_source``
        get NaN in 'Region'.
    """
    df = df.merge(countries).sort_values('Country')
    if num >= 3:
        if region_source is None:
            region_source = df1
        # One region per country; duplicates are dropped so the lookup index is unique.
        region_by_country = (
            region_source.drop_duplicates('Country').set_index('Country')['Region']
        )
        df['Region'] = df['Country'].map(region_by_country)
        cols = df.columns.tolist()
        # Move the freshly appended 'Region' column right after the first two columns.
        cols = cols[0:1] + cols[1:2] + cols[-1:] + cols[2:-1]
        df = df[cols]
    return df

In [None]:
# Restrict every yearly frame to the shared countries. The order matters:
# new_cols reads the global df1 for its 'Region' column when num >= 3, so df1
# is already the filtered frame by the time df3-df5 are processed.
df1 = new_cols(df1, 1, countries)
df2 = new_cols(df2, 2, countries)
df3 = new_cols(df3, 3, countries)
df4 = new_cols(df4, 4, countries)
df5 = new_cols(df5, 5, countries)
# Stack the five yearly frames; concat keeps each frame's own row labels, so
# labels repeat across years in df_all.
df_all = pd.concat([df1, df2, df3, df4, df5])

### Descriptive statistics

Let's group the data by region and look at some aggregates.

In [None]:
df1.sort_values('Happiness Score', ascending=False).head() # NOTE(review): presumably the top-ranked country of each year 2015-2019: Switzerland, Denmark, Norway, Finland, Finland — confirm

In [None]:
# Per (year, region) summary: spread of the happiness score plus the mean of
# three contributing factors, ordered by year and mean happiness, both descending.
df_region = (
    df_all
    .groupby(['Date', 'Region'])
    .agg({
        'Happiness Score': ['mean', 'median', 'std', 'max', 'min'],
        'Economy (GDP per Capita)': ['mean'],
        'Family': ['mean'],
        'Freedom': ['mean'],
    })
    .sort_values(['Date', ('Happiness Score', 'mean')], ascending=False)
)
df_region

From 2015 to 2018, Australia and New Zealand ranked first in terms of happiness. However, in 2019 North America managed to take first place. Although the top spot in each of the 5 years was occupied by a Western European country, this region is only in third place in terms of the average happiness score. Perhaps this is because the region is quite large and includes countries with very different living standards. The standard deviation in Western Europe is also quite large, which supports this assumption.

### Build plots

In [None]:
# Happiness score per country, drawn as one line per 'Date' value.
fig = px.line(
    df_all,
    x='Country',
    y='Happiness Score',
    color='Date',
    title='Happiness Score',
)
fig.update_xaxes(tickangle=45)
# Horizontal legend whose bottom-right corner sits just above the plot's top-right corner.
fig.update_layout(
    legend={
        'orientation': "h",
        'yanchor': "bottom",
        'xanchor': "right",
        'y': 1.02,
        'x': 1,
        'title_font_family': "Times New Roman",
        'font': {
            'family': "Courier",
            'size': 12,
            'color': "black",
        },
        'bgcolor': "LightSteelBlue",
        'bordercolor': "Black",
        'borderwidth': 2,
    }
)

This chart shows the level of happiness in various countries for each year. Let's take a closer look at some of the countries with interesting indicators. We can see that in Syria the level of happiness has dropped dramatically since 2016. Perhaps this is caused by the civil war. Russian military intervention in the Syrian Civil War began in September 2015, after an official request by the Syrian government for military aid against rebel groups.
Another interesting case is Venezuela. In 2016, inflation in this country reached 800%; the reason is probably again the civil war. Accordingly, the level of happiness in the society has been decreasing every year since 2015.

In [None]:
# GDP-per-capita factor per country, drawn as one line per 'Date' value.
fig = px.line(
    df_all,
    x='Country',
    y='Economy (GDP per Capita)',
    color='Date',
    title='Economy (GDP)',
)
fig.update_xaxes(tickangle=45)
# Horizontal legend whose bottom-right corner sits just above the plot's top-right corner.
fig.update_layout(
    legend={
        'orientation': "h",
        'yanchor': "bottom",
        'xanchor': "right",
        'y': 1.02,
        'x': 1,
        'title_font_family': "Times New Roman",
        'font': {
            'family': "Courier",
            'size': 12,
            'color': "black",
        },
        'bgcolor': "LightSteelBlue",
        'bordercolor': "Black",
        'borderwidth': 2,
    }
)

This diagram shows the level of significance of the indicator 'Economy (GDP per Capita)' in different countries. Note that this factor is much higher in the UAE than in other countries. This is logical because the economy is actively developing there due to oil production. In countries with a low level of economic development this factor is less important.

Let's make a table that records, for each country, the factor that contributes the most to the evaluation of happiness in that country.

In [None]:
# For every row, take the name of the column (from the fifth column onwards)
# holding the largest value; assumes the first four columns are not factors — TODO confirm.
df_importance = df_all.iloc[:, 4:].idxmax(axis="columns").to_frame('most important factor')
df_importance['Country'] = df_all['Country']
df_importance['Date'] = df_all['Date']

# Replace each factor name with a numeric code so it can be drawn on a numeric axis.
factor_codes = {
    'Family': 1,
    'Economy (GDP per Capita)': 2,
    'Freedom': 3,
    'Health (Life Expectancy)': 4,
    'Generosity': 5,
    'Trust (Government Corruption)': 6,
}
for factor, code in factor_codes.items():
    is_factor = df_importance['most important factor'] == factor
    df_importance.loc[is_factor, 'most important factor'] = code

df_importance

In [None]:
import plotly.graph_objects as go


def drow(year):
    """Build a bar chart of the most important factor per country for one year.

    Reads the module-level ``df_importance`` frame and plots the numeric
    factor code of every country whose 'Date' equals ``year``.

    Parameters
    ----------
    year : str
        Date to select, compared against the 'Date' column and appended to
        the figure title (e.g. ``'2015-01-01'``).

    Returns
    -------
    plotly.graph_objects.Figure
        The bar chart; it is not shown by this function.
    """
    yearly = df_importance[df_importance['Date'] == year]

    fig = go.Figure()
    fig.add_trace(go.Bar(name=year,
        x=yearly['Country'],
        y=yearly['most important factor']))

    fig.update_layout(title='Most important factor ' + year)
    fig.update_xaxes(tickangle=45)
    # Label all six factor codes; code 6 (Trust) previously had no tick label.
    fig.update_yaxes(ticktext=["Family", "Economy", "Freedom", 'Health', 'Generosity', 'Trust'],
                tickvals=[1, 2, 3, 4, 5, 6])
    return fig
    
    
# Build the three yearly charts first, then render them in chronological order.
fig1, fig2, fig3 = [drow(day) for day in ('2015-01-01', '2017-01-01', '2019-01-01')]

for figure in (fig1, fig2, fig3):
    figure.show()

This bar chart shows the most important factor in evaluating happiness in each country. As we can see, from year to year there are more and more countries for which the family factor makes the greatest contribution to happiness. Only in some Arab countries does the economic factor remain the most important.