In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import logging
# Show DEBUG-and-above records from every logger (logging.DEBUG == 10).
logging.root.setLevel(logging.DEBUG)

import sys
import json
import os

# Resolve the parent directory exactly once per kernel. The original
# `os.chdir('..')` was relative to the *current* working directory, so
# re-running this cell climbed one more level each time and then failed to
# find config.json / the `src` package. Guarding on PROJECT_ROOT makes the
# cell safe to re-run.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.realpath('..')
    os.chdir(PROJECT_ROOT)

# Make the project-local `src` package importable without stacking duplicate
# entries on sys.path when the cell is executed again.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src import data
from src.model import get_splits

# Read the JSON configuration that sits in the directory resolved above.
with open(os.path.join(PROJECT_ROOT, 'config.json'), 'r') as cfg:
    config = json.load(cfg)

# Seed passed to get_splits further down so the split is repeatable.
random_seed = 42

In [2]:
# Load the dataset through the project's `data` module using the parsed config.
# NOTE(review): the meaning of the positional `False` is not visible from this
# notebook -- confirm against the signature of data.load_data.
dataset = data.load_data(config, False)
# Pass the loaded dataset through the project's bad-data filter.
# NOTE(review): the "Dropped N rows ..." log lines printed under this cell
# presumably originate from this step -- verify in src/data.
clean_data = data.filter_bad_data(dataset)
# Keep only the first element of whatever create_samples returns.
# NOTE(review): the other elements and the meaning of the `_pcnt` suffix are
# not shown here -- confirm against data.create_samples.
samples_pcnt = data.create_samples(clean_data)[0]

INFO:root:Dropped 0 rows with NaN values
INFO:root:Dropped 0 rows with empty string values
INFO:root:Dropped 0 rows with invalid essential columns


In [5]:
# Column names and the scaling factor are named once instead of being repeated
# as literals throughout the cell.
EXPENDITURE_COL = 'Total expenditure per capita (1000s USD)'
HAPPINESS_COL = "Happiness score"
EXPENDITURE_DIVISOR = 1000

# Split the samples into train/test partitions using the configured test
# fraction and the notebook-wide seed.
train, test = get_splits(samples_pcnt, test_size=config["test_size"], random_seed=random_seed)

# Rescale the expenditure column on *new* frames rather than with an in-place
# `/=`. If get_splits hands back slices of `samples_pcnt`, in-place division
# can write through to the upstream frame (or raise chained-assignment
# warnings); `assign` always returns a fresh DataFrame with the column
# replaced in its original position.
# NOTE(review): the column name already says "1000s USD", yet the values are
# divided by 1000 here -- confirm the units of the raw data.
train = train.assign(**{EXPENDITURE_COL: train[EXPENDITURE_COL] / EXPENDITURE_DIVISOR})
test = test.assign(**{EXPENDITURE_COL: test[EXPENDITURE_COL] / EXPENDITURE_DIVISOR})

# Single-feature design matrices: reshape(-1, 1) turns the 1-D column into a
# 2-D array with one column.
train_fts = train[EXPENDITURE_COL].to_numpy().reshape(-1, 1)
test_fts = test[EXPENDITURE_COL].to_numpy().reshape(-1, 1)

# Regression targets as 1-D arrays.
train_labels = train[HAPPINESS_COL].to_numpy()
test_labels = test[HAPPINESS_COL].to_numpy()

In [6]:
from sklearn.linear_model import LinearRegression

# The best-performing model was a LinearRegression without polynomial
# features, so the estimator is built with defaults and fitted on the single
# expenditure feature against the happiness labels.
model = LinearRegression()
model.fit(train_fts, train_labels)

In [7]:
from sklearn.metrics import mean_squared_error

# Predictions of the fitted model on both partitions.
train_preds = model.predict(train_fts)
test_preds = model.predict(test_fts)

# Mean squared error between the true labels and those predictions.
train_mse = mean_squared_error(train_labels, train_preds)
test_mse = mean_squared_error(test_labels, test_preds)

# The estimator's own score() on each partition (reported below as R2).
train_r2_score = model.score(train_fts, train_labels)
test_r2_score = model.score(test_fts, test_labels)

# One summary line per partition.
print(
    "Train: R2 score: {r}, MSE: {m}".format(
        r=train_r2_score,
        m=train_mse,
    )
)
print(
    "Test: R2 score: {r}, MSE: {m}".format(
        r=test_r2_score,
        m=test_mse,
    )
)

Train: R2 score: 0.3583214258247438, MSE: 0.43951908961827574
Test: R2 score: 0.4000304924958994, MSE: 0.31858239974084757
