In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import logging
# Emit everything from DEBUG upwards on the root logger so the INFO messages
# logged by the project helpers show up in the notebook output.
logging.getLogger().setLevel(logging.DEBUG)

import sys
import json
import os
# Make the repository root (the parent of the notebook directory) importable
# and use it as the working directory, so `from src import ...` and the
# relative 'config.json' path both resolve.
#
# The starting directory is remembered the first time this cell runs: a bare
# os.chdir('..') is relative to the *current* directory, so re-running the
# cell would climb one more level each time and push another sys.path entry.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
PROJECT_ROOT = os.path.realpath(os.path.join(NOTEBOOK_DIR, os.pardir))

# Keep the root at the front of sys.path without stacking duplicates on re-run.
if sys.path[:1] != [PROJECT_ROOT]:
    sys.path.insert(0, PROJECT_ROOT)
os.chdir(PROJECT_ROOT)

from src import data
from src.model import get_splits

# Read the project configuration; the path is relative to the working
# directory selected above. JSON is decoded as UTF-8 explicitly, because
# open() otherwise falls back to the platform's locale encoding.
with open('config.json', 'r', encoding='utf-8') as cfg:
    config = json.load(cfg)

# Seed forwarded to get_splits so the train/test partition is reproducible.
random_seed = 42

In [2]:
# Load the dataset; the second positional argument is False here, whereas the
# later parent-level load passes True.
dataset = data.load_data(config, False)

# Filter out unusable rows (this cell's log output reports the dropped counts).
clean_data = data.filter_bad_data(dataset)

# create_samples returns an indexable result; only its first element is used.
sample_sets = data.create_samples(clean_data)
samples_pcnt = sample_sets[0]

INFO:root:Dropped 0 rows with NaN values
INFO:root:Dropped 0 rows with empty string values
INFO:root:Dropped 0 rows with invalid essential columns


In [23]:
# Partition the samples using the configured test fraction and the fixed seed.
train, test = get_splits(
    samples_pcnt,
    test_size=config["test_size"],
    random_seed=random_seed,
)

# Scale the expenditure column down by a factor of 1000 in both partitions.
spend_col = 'Total expenditure per capita (1000s USD)'
for split in (train, test):
    split[spend_col] /= 1000

# Features: every column except the first two and the last two.
# NOTE(review): presumably the excluded columns hold identifiers and the
# label -- confirm against the layout produced by create_samples.
train_fts, test_fts = (part.iloc[:, 2:-2].to_numpy() for part in (train, test))

# Regression target.
train_labels, test_labels = (part["Happiness score"].to_numpy() for part in (train, test))

In [24]:
from sklearn.linear_model import LinearRegression

# Best model was LinearRegression with no polynomial features.
# No intercept term is fitted, so predictions are a pure weighted sum of the
# feature columns.
regressor = LinearRegression(fit_intercept=False)
model = regressor.fit(train_fts, train_labels)  # fit() returns the estimator itself

In [25]:
from sklearn.metrics import mean_squared_error

# Predictions for both partitions.
train_preds, test_preds = model.predict(train_fts), model.predict(test_fts)

# Mean squared error of those predictions against the true labels.
train_mse, test_mse = (
    mean_squared_error(train_labels, train_preds),
    mean_squared_error(test_labels, test_preds),
)

# Coefficient of determination as reported by the estimator's score().
train_r2_score, test_r2_score = (
    model.score(train_fts, train_labels),
    model.score(test_fts, test_labels),
)

# One summary line per partition.
print("Train: R2 score: {r}, MSE: {m}".format(m=train_mse, r=train_r2_score))
print("Test: R2 score: {r}, MSE: {m}".format(m=test_mse, r=test_r2_score))

Train: R2 score: 0.9355945525735124, MSE: 0.044114646738409216
Test: R2 score: 0.8522101301946099, MSE: 0.07847607385224795


In [139]:
# One row per distinct (Function code, Function) pair, ordered by code and
# indexed by code. The pairs are de-duplicated directly rather than via
# groupby(...).sum(), which aggregated every other column of `dataset` only
# for the result to be discarded. dropna() + sort_values() reproduce the
# groupby defaults (rows with a missing key are skipped, keys come out sorted).
named_weights = (
    dataset[['Function code', 'Function']]
    .dropna()
    .drop_duplicates()
    .sort_values(['Function code', 'Function'])
    .set_index('Function code')
)

# Coefficients are attached positionally.
# NOTE(review): this assumes the feature columns used for fitting follow the
# same sorted function-code order as the rows above -- confirm against
# create_samples. A length mismatch raises ValueError; a mis-ordering is silent.
named_weights["Weight"] = model.coef_

In [140]:
# Load the dataset a second time with the flag set to True; its function
# codes and names are used below as the parent-level lookup.
parent_data = data.load_data(config, True)

# Distinct (code, name) pairs of the parent-level data, sorted by code and
# renamed for the join. De-duplicated directly instead of going through
# groupby(...).sum(), which aggregated every other column only to drop it.
parent_code_mapping = (
    parent_data[['Function code', 'Function']]
    .dropna()
    .drop_duplicates()
    .sort_values(['Function code', 'Function'])
    .rename(columns={"Function code": "Parent code", "Function": "Parent function"})
    .reset_index(drop=True)
)

# The parent code is taken as the first three characters of the function code.
named_weights["Parent code"] = named_weights.index.str[:3]

# Inner join on the parent code.
# NOTE: the merge returns a frame with a fresh 0..n-1 index, so after this
# line `named_weights` no longer carries the function code, and this cell
# cannot be re-run without first re-running the cell that builds
# `named_weights`.
named_weights = pd.merge(named_weights, parent_code_mapping, on=["Parent code"])

# The same weight expressed per one hundredth of a feature unit.
named_weights["Happiness gain per 1% investment"] = named_weights["Weight"]/100
named_weights

Unnamed: 0,Function,Weight,Parent code,Parent function,Happiness gain per 1% investment
0,"Executive and legislative organs, financial, f...",-0.413015,010,General public services,-0.004130
1,Foreign economic aid,49.168607,010,General public services,0.491686
2,General services,17.290954,010,General public services,0.172910
3,Basic research,-1.226327,010,General public services,-0.012263
4,RandD General public services,-113.564723,010,General public services,-1.135647
...,...,...,...,...,...
64,Unemployment,0.589917,100,Social protection,0.005899
65,Housing,-0.965967,100,Social protection,-0.009660
66,Social exclusion n.e.c.,17.236207,100,Social protection,0.172362
67,RandD Social protection,-391.729622,100,Social protection,-3.917296


In [141]:
# Ten functions with the most positive fitted weight.
named_weights.nlargest(n=10, columns=["Weight"])

Unnamed: 0,Function,Weight,Parent code,Parent function,Happiness gain per 1% investment
44,RandD Health,78.120285,70,Health,0.781203
1,Foreign economic aid,49.168607,10,General public services,0.491686
24,Communication,45.218878,40,Economic affairs,0.452189
47,Cultural services,39.540543,80,"Recreation, culture and religion",0.395405
56,Education not definable by level,37.893231,90,Education,0.378932
15,Law courts,36.243701,30,Public order and safety,0.362437
30,Pollution abatement,31.356956,50,Environment protection,0.31357
21,Fuel and energy,27.735115,40,Economic affairs,0.277351
22,"Mining, manufacturing and construction",27.015524,40,Economic affairs,0.270155
12,Defence n.e.c.,25.527021,20,Defence,0.25527


In [142]:
# Ten functions with the most negative fitted weight.
named_weights.nsmallest(n=10, columns=["Weight"])

Unnamed: 0,Function,Weight,Parent code,Parent function,Happiness gain per 1% investment
67,RandD Social protection,-391.729622,100,Social protection,-3.917296
37,Street lighting,-242.08715,60,Housing and community amenities,-2.420871
38,RandD Housing and community amenities,-170.847295,60,Housing and community amenities,-1.708473
17,RandD Public order and safety,-157.302738,30,Public order and safety,-1.573027
50,"RandD Recreation, culture and religion",-147.816854,80,"Recreation, culture and religion",-1.478169
4,RandD General public services,-113.564723,10,General public services,-1.135647
11,RandD Defence,-113.50554,20,Defence,-1.135055
33,Environmental protection n.e.c.,-81.694626,50,Environment protection,-0.816946
51,"Recreation, culture and religion n.e.c.",-77.842583,80,"Recreation, culture and religion",-0.778426
39,Housing and community amenities n.e.c.,-61.725333,60,Housing and community amenities,-0.617253


In [144]:
# Negated magnitude of each weight: the closer a weight is to zero, the larger
# (less negative) this value, so nlargest lists the weights nearest zero first.
weight_magnitude = named_weights["Weight"].abs()
named_weights["Difference to 0"] = 0 - weight_magnitude
named_weights.nlargest(n=10, columns=["Difference to 0"])

Unnamed: 0,Function,Weight,Parent code,Parent function,Happiness gain per 1% investment,Difference to 0
61,Old age,0.038013,100,Social protection,0.00038,-0.038013
52,Pre-primary and primary education,0.245861,90,Education,0.002459,-0.245861
18,Public order and safety n.e.c.,-0.365145,30,Public order and safety,-0.003651,-0.365145
0,"Executive and legislative organs, financial, f...",-0.413015,10,General public services,-0.00413,-0.413015
13,Police services,-0.565712,30,Public order and safety,-0.005657,-0.565712
64,Unemployment,0.589917,100,Social protection,0.005899,-0.589917
65,Housing,-0.965967,100,Social protection,-0.00966,-0.965967
3,Basic research,-1.226327,10,General public services,-0.012263,-1.226327
43,Public health services,1.956107,70,Health,0.019561,-1.956107
20,"Agriculture, forestry, fishing and hunting",2.54477,40,Economic affairs,0.025448,-2.54477
