In [165]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC, SVR
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.ensemble import RandomForestRegressor
import string
from wordcloud import WordCloud, ImageColorGenerator
import scipy
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, hp
import plotly.offline as py


In [74]:
cols = ["Country", "Rank", "Score", "GDP", "Family", "Health", "Freedom", "Generosity", "Corruption"]
def preprocess(df, year):
    temp = pd.DataFrame()
    temp_cols = []
    for col in cols:
        temp_cols.extend([x for x in df.columns if col in x])
    temp[cols] = df[temp_cols]
    temp["Year"] = year
    temp.set_index(["Country","Year"])
    return temp
def preprocess_2018(df, year):
    temp = pd.DataFrame()
    temp_cols = ['Country or region', 'Overall rank', 'Score', 'GDP per capita', 'Social support','Healthy life expectancy','Freedom to make life choices','Generosity','Perceptions of corruption'  ]
    temp[cols] = df[temp_cols]
    temp["Year"] = year
    temp.set_index(["Country","Year"])
    return temp

# Read in dataset
df_2015 = preprocess(pd.read_csv('./data/2015.csv'), 2015)
df_2016 = preprocess(pd.read_csv('./data/2016.csv'), 2016)
df_2017 = preprocess(pd.read_csv('./data/2017.csv'), 2017)
df_2018 = preprocess_2018(pd.read_csv('./data/2018.csv'), 2018)
df_2019 = preprocess_2018(pd.read_csv('./data/2019.csv'), 2019)

#Combine to one corpus
df = df_2015
df = df.append(df_2016)
df = df.append(df_2017)
df = df.append(df_2018)
df = df.append(df_2019)
df.head()


Unnamed: 0,Country,Rank,Score,GDP,Family,Health,Freedom,Generosity,Corruption,Year
0,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.29678,0.41978,2015
1,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.4363,0.14145,2015
2,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.34139,0.48357,2015
3,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.34699,0.36503,2015
4,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.45811,0.32957,2015


In [75]:
df.isnull().sum()

Country       0
Rank          0
Score         0
GDP           0
Family        0
Health        0
Freedom       0
Generosity    0
Corruption    1
Year          0
dtype: int64

In [76]:
#Drop NULL Values
df = df.dropna()

In [77]:
df.describe()

Unnamed: 0,Rank,Score,GDP,Family,Health,Freedom,Generosity,Corruption,Year
count,781.0,781.0,781.0,781.0,781.0,781.0,781.0,781.0,781.0
mean,78.773367,5.377232,0.914537,1.07878,0.612342,0.411254,0.218618,0.125436,2016.992318
std,45.162398,1.127071,0.405403,0.329581,0.248459,0.152911,0.122394,0.105816,1.417814
min,1.0,2.693,0.0,0.0,0.0,0.0,0.0,0.0,2015.0
25%,40.0,4.509,0.605,0.87021,0.44006,0.31048,0.13,0.054,2016.0
50%,79.0,5.321,0.982,1.125,0.647239,0.431,0.202,0.091,2017.0
75%,118.0,6.182,1.233748,1.328,0.808,0.531,0.27906,0.15603,2018.0
max,158.0,7.769,1.870766,1.644,1.141,0.724,0.838075,0.55191,2019.0


In [78]:
spearman_cormatrix= df.iloc[:,2:9].corr(method='spearman')
spearman_cormatrix

Unnamed: 0,Score,GDP,Family,Health,Freedom,Generosity,Corruption
Score,1.0,0.805734,0.647784,0.762476,0.544712,0.122515,0.272934
GDP,0.805734,1.0,0.588111,0.796573,0.366675,0.000841,0.219516
Family,0.647784,0.588111,1.0,0.586374,0.434294,-0.039661,0.050104
Health,0.762476,0.796573,0.586374,1.0,0.361724,0.009504,0.149732
Freedom,0.544712,0.366675,0.434294,0.361724,1.0,0.33216,0.426509
Generosity,0.122515,0.000841,-0.039661,0.009504,0.33216,1.0,0.271888
Corruption,0.272934,0.219516,0.050104,0.149732,0.426509,0.271888,1.0


In [91]:
#Combine latitude and longitude information
location = pd.read_csv('./data/countries.csv')
location = location[["name","latitude","longitude"]].rename(columns={"name": "Country"})
df_map = df.merge(location, on='Country', how='left')
df_map.to_csv("happy_map.csv")

#Load NULL filled happy_map.csv as final df
df = pd.read_csv('./data/happy_map_filled.csv')

In [103]:
def map_score_by_country(average_score):
    countries = average_score.index
    data = [ dict(
        type = 'choropleth',
        locations = countries,
        z = average_score,
        locationmode = 'country names',
        text = countries,
        marker = dict(
            line = dict(color = 'rgb(0,0,0)', width = 1)),
            colorbar = dict(autotick = True, tickprefix = '', 
            title = 'Happiness Score')
        )
    ]
    
    layout = dict(
        title = 'Average happiness score in countries',
        geo = dict(
            showframe = False,
            showocean = False,
            oceancolor = 'rgb(0,255,255)',
            projection = dict(
                type = 'orthographic',
                rotation = dict(
                    lon = 60,
                    lat = 10
                ),
            ),
            lonaxis =  dict(
                showgrid = True,
                gridcolor = 'rgb(102, 102, 102)'
            ),
            lataxis = dict(
                showgrid = True,
                gridcolor = 'rgb(102, 102, 102)'
            )
        ),
    )
    return dict(data=data, layout=layout)

In [104]:
#Plot 3D world map of happiness score
worldmap = df.groupby('Country')['Score'].mean()
map_fig = map_score_by_country(worldmap)
py.iplot(map_fig, validate=False, filename='worldmap')

In [133]:
def get_metrics(y_test, y_predicted):
    print("Mean Squared Error: ", mean_squared_error(y_test, pred))
    print("R2 Error: ", r2_score(y_test, pred))

In [141]:
# Split into Training and Testing Datasets
# train_col = ['GDP','Family','Health','Freedom','Generosity','Corruption']
train_col = ['GDP','Family','Health']
df_train = df[df['Year']!=2019]
X_train = df_train[train_col]
y_train = df_train[['Score']]
df_test = df[df['Year']==2019]
X_test = df_test[train_col]
y_test = df_test[['Score']]

In [142]:
# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
get_metrics(y_test, pred)

Mean Squared Error:  0.4290011279070632
R2 Error:  0.6515283397167903


In [148]:
# Ridge Regression
rg = Ridge(alpha=0.1)
rg.fit(X_train, y_train)
pred = rg.predict(X_test)
get_metrics(y_test, pred)

Mean Squared Error:  0.4289401332931755
R2 Error:  0.6515778847948497


In [149]:
# Lasso Regression
lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)
pred = lasso.predict(X_test)
get_metrics(y_test, pred)

Mean Squared Error:  0.40763638864034396
R2 Error:  0.6688826208118377


In [170]:
# SVM
svc = SVR()
svc.fit(X_train,np.ravel(y_train))
pred = svc.predict(X_test)
get_metrics(np.ravel(y_test),pred)

Mean Squared Error:  0.41756712213287417
R2 Error:  0.660816023866382






In [169]:
# Random Forest
rf = RandomForestRegressor()
rf.fit(X_train,np.ravel(y_train))
pred = rf.predict(X_test)
get_metrics(np.ravel(y_test),pred)

Mean Squared Error:  0.4417513181373022
R2 Error:  0.6411715372064233



The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.

