## Getting Started

Need `plotly` version to be 4.6.0 in order for the code to work

In [72]:
try:
    import plotly
    if plotly.__version__ == '4.6.0':
        print('Success')
    else:
        print('Upgrading...')
        !pip install plotly --upgrade
except ImportError:
    print('No available module')
    print('Installing...')
    !pip install plotly==4.6.0
    print('Success')
    
!pip install geopandas
!pip instal

Success


In [73]:
import os
import sys
import re

import pandas as pd
import geopandas as gpd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import folium

import descartes
import country_converter as coco

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

---
## Load Data

In [74]:
# read happiness data into a list of dataframe
base_url = 'https://raw.githubusercontent.com/Benjaminnnnnn/CSE-163-FINAL-PROJECT/master/data'
files = [base_url + '/' + str(i) + '.csv' for i in range(2015, 2020)]

dataframes = [pd.read_csv(file) for file in files]

Let's see if each year's happiness data has the same dataformat (Unfortunately they don't neccesarily have the same column length nor column names) 

In [75]:
print('Length of Each Year\'s Happiness Data')
print([len(df.columns) for df in dataframes])
print()

print('Column Names for Each Year\'s Happiness Data')
print()
for df in dataframes:
    # A peek of the column differences
    print(df.columns.values[0:5])
    print()

Length of Each Year's Happiness Data
[12, 13, 12, 9, 9]

Column Names for Each Year's Happiness Data

['Country' 'Region' 'Happiness Rank' 'Happiness Score' 'Standard Error']

['Country' 'Region' 'Happiness Rank' 'Happiness Score'
 'Lower Confidence Interval']

['Country' 'Happiness.Rank' 'Happiness.Score' 'Whisker.high' 'Whisker.low']

['Overall rank' 'Country or region' 'Score' 'GDP per capita'
 'Social support']

['Overall rank' 'Country or region' 'Score' 'GDP per capita'
 'Social support']



Because they use slightly different column names to record the happiness data, we need to do similar name matching. Besides, there is no `Year` column in any of the dataset. So we want to add a `Year` column for computing the trend of happiness over year.

---
## Data Processing

In [76]:
# add an year column to each year's happiness data
year = 2015
for df in dataframes:
    df['Year'] = year
    year+=1

Compare the data format (column names) between `2015` and `2017`

In [77]:
dataframes[0].head(2)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual,Year
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,2015
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201,2015


In [78]:
dataframes[2].head(2)

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual,Year
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027,2017
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707,2017


In [79]:
print('Column numbers for each dataset')
print([len(df.columns) for df in dataframes])

Column numbers for each dataset
[13, 14, 13, 10, 10]


Since each year has slightly different style of documenting the happiness scores (i.e. they have different number of columns and column names), we need to make some data transformation that filter out all the columns that we need, and standardize their names by using `re`.

In [80]:
# create the columns to be extracted
# store the regex pattern of column and its full name in
# a pair of tuple, [0] for pattern, [1] for full name

columns = list(map(lambda x: ('(?i).*' + x[0] + '(?i).*', x[1]),
                       [('rank', 'happiness rank'),
                        ('country', 'country'),
                        ('score', 'happiness score'),
                        ('GDP', 'GDP per capita'),
                        ('family', 'family'),
                        ('social', 'family'), # same as "family" but some year use "social support" as the column name
                        ('freedom', 'freedom'),
                        ('health', 'life expectancy'),
                        ('corruption', 'government corruption'), 
                        ('trust', 'government corruption'), # same as "corruption"
                        ('generosity', 'generosity'),
                        ('year', 'year')
                        ])
              )


# using regex to match similar column names
# case-insensitive
column_filter = re.compile('|'.join([col_name[0] for col_name in columns]))

In [81]:
# filtering out all the matching columns
for i, df in enumerate(dataframes):
    all_matches = list(filter(column_filter.match, df.columns))
    dataframes[i] = df[all_matches]

Rename all the extracted columns for each dataset and make them consistent

In [82]:
def rename_cols(dataframes, sub_patterns):
    '''
    Rename the columns of the given dataframes to
    the sub_pattern defined.
    
    Parameters:
        - dataframes        - a list of dataframes to be renamed
        - sub_patterns      - a list of tuples consists matching patterns
                              on the first entry, and the replacements on
                              the second entry
        
    Returns:
        - renamed_dfs       - renamed dataframes
    '''
    # col  - column
    # pat  - pattern
    # repl - replacement
    for i, df in enumerate(dataframes):
        new_cols = {}
        for col in df.columns:
            for pat, repl in columns:
                if re.match(pat, col):
                    new_cols[col] = re.sub(pat, repl, col)
        dataframes[i] = df.rename(columns=new_cols)
    return dataframes

In [83]:
dataframes = rename_cols(dataframes, columns)
# check if the rename is correct by peeking one dataframe
dataframes[2].head(1)

Unnamed: 0,country,happiness rank,happiness score,GDP per capita,family,life expectancy,freedom,generosity,government corruption,year
0,Norway,1,7.537,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2017


Since all the column names are consistent now, let's concatnate the data for each year into one dataframe

In [84]:
happiness_data = pd.concat(dataframes, ignore_index=True)

In [85]:
happiness_data.shape

(782, 10)

In [86]:
print('Null Value in the Dataset')
happiness_data.isnull()

Null Value in the Dataset


Unnamed: 0,GDP per capita,country,family,freedom,generosity,government corruption,happiness rank,happiness score,life expectancy,year
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
777,False,False,False,False,False,False,False,False,False,False
778,False,False,False,False,False,False,False,False,False,False
779,False,False,False,False,False,False,False,False,False,False
780,False,False,False,False,False,False,False,False,False,False


In [87]:
happiness_data.describe()

Unnamed: 0,GDP per capita,family,freedom,generosity,government corruption,happiness rank,happiness score,life expectancy,year
count,782.0,782.0,782.0,782.0,781.0,782.0,782.0,782.0,782.0
mean,0.916047,1.078392,0.411091,0.218576,0.125436,78.69821,5.379018,0.612416,2016.993606
std,0.40734,0.329548,0.15288,0.122321,0.105816,45.182384,1.127456,0.248309,1.417364
min,0.0,0.0,0.0,0.0,0.0,1.0,2.693,0.0,2015.0
25%,0.6065,0.869363,0.309768,0.13,0.054,40.0,4.50975,0.440183,2016.0
50%,0.982205,1.124735,0.431,0.201982,0.091,79.0,5.322,0.64731,2017.0
75%,1.236187,1.32725,0.531,0.278832,0.15603,118.0,6.1895,0.808,2018.0
max,2.096,1.644,0.724,0.838075,0.55191,158.0,7.769,1.141,2019.0


## Research Questions
---

### Question 1
What is the distribution of the happiness scores? (I'm just plotting some graphs, and I will choose whichever that is more informative later)

In [88]:
fig = px.box(happiness_data, x='year', y='happiness score', 
             color='year', title='Happiness Yearly Distribution')
fig.update_layout(
        xaxis=dict(title='Year'),
        yaxis=dict(title='Happiness Score'),
        legend_title_text='Year'
)
fig.show()

In [89]:
fig = px.histogram(happiness_data, x='happiness score', color='year',
                   labels={'x':'Happiness Score', 'y':'Counts'}, opacity=0.6,
                   title='Happiness Data Distribution')
fig.show()

In [103]:
fig = make_subplots(rows=1, cols=5, shared_yaxes=True, vertical_spacing=0.04)

color_discrete_sequence= px.colors.sequential.Plasma_r
for i, color in list(zip(range(2015, 2020), color_discrete_sequence)):
    data = happiness_data[happiness_data['year']==i]['happiness score']
    fig.add_trace(go.Histogram(x=data, name=i), row=1, col=i-2014)
    #fig.add_trace(go.Line(y=[data.mean()], name='Mean Score', showlegend=False), row=1, col=i-2014)
fig.update_layout(height=430, title='Happiness Score Yearly Distribution', legend_title_text='Year')
fig.show()