# World Happiness Analysis

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Widen the notebook container so wide tables and figures fit on screen.
# `display` is imported from the public `IPython.display` module: importing it
# from `IPython.core.display` is deprecated, and the rest of this notebook
# already uses `IPython.display`.
from IPython.display import HTML, display

display(HTML("<style>.container { width:80% !important; }</style>"))

## Get the data

In [None]:
%%bash

# *Warning*: You need kaggle-api to be set up, for more information: https://github.com/Kaggle/kaggle-api

# Stop at the first failing command so a failed download is reported by the
# cell instead of being silently followed by mkdir/unzip.
set -e

if [ ! -d data ]; then
    kaggle datasets download -d unsdsn/world-happiness
    mkdir -p data
    # If extraction fails, remove the freshly created directory: otherwise the
    # `-d data` guard above would skip the download on every later run.
    unzip world-happiness.zip -d data || { rm -rf data; exit 1; }
fi

ls data

## Simple analysis of the 2015 dataset

In [None]:
# Load the 2015 report from the ./data directory into a DataFrame.
happiness_2015 = pd.read_csv(filepath_or_buffer="./data/2015.csv")

In [None]:
# Bare expression: the notebook renders the 2015 table as this cell's output.
happiness_2015

In [None]:
# Average every numeric column per region, happiest regions first.
# `numeric_only=True` is required on pandas >= 2.0, where `mean()` no longer
# silently drops non-numeric columns and raises TypeError on them instead
# (the CSV presumably holds at least one text column besides "Region", such
# as the country name; confirm against the data).
(
    happiness_2015.groupby("Region")
    .mean(numeric_only=True)
    .sort_values("Happiness Score", ascending=False)
)

In [None]:
# Explanatory columns that are compared against "Happiness Score" below.
variables = [
    "Economy (GDP per Capita)",
    "Family",
    "Health (Life Expectancy)",
    "Freedom",
    "Trust (Government Corruption)",
    "Generosity",
]

# Sub-frame holding only the explanatory columns (all rows kept).
reduced_2015 = happiness_2015.loc[:, variables]

In [None]:
# One regression plot (variable vs. happiness score) per explanatory column,
# laid out on a two-column grid.
fig, axs = plt.subplots(
    nrows=len(variables) // 2,
    ncols=2,
    figsize=(16, 2 * len(variables)),
)

for i, column in enumerate(variables):
    # Position i maps to row i // 2 and column i % 2 of the grid.
    row, col = divmod(i, 2)
    sns.regplot(
        x=column,
        y="Happiness Score",
        data=happiness_2015,
        ax=axs[row, col],
    )

In [None]:
# Distinct values of the "Region" column; the bare name below shows them.
regions = pd.unique(happiness_2015["Region"])
regions

In [None]:
# One correlation heatmap per region, laid out on a two-column grid.
# Ceiling division and `squeeze=False` keep the grid valid (always a 2-D array
# of axes with enough cells) for an odd number of regions or a single row.
fig, axs = plt.subplots(
    ncols=2,
    nrows=(len(regions) + 1) // 2,
    figsize=(12, 2 * len(regions)),
    squeeze=False,
)
fig.tight_layout(pad=10)
fig.autofmt_xdate(rotation=45)

reduced_with_region = happiness_2015[variables + ["Region"]]

for i, region in enumerate(regions):
    ax = axs[i // 2, i % 2]
    ax.set_title(f"{region} correlation plot")
    # Correlate the explanatory columns only: on pandas >= 2.0 `corr()` no
    # longer drops the text "Region" column silently and raises on it instead.
    region_rows = reduced_with_region.loc[
        reduced_with_region["Region"] == region, variables
    ]
    sns.heatmap(data=region_rows.corr(), ax=ax)

# Hide the grid cell left empty when the number of regions is odd.
for unused_ax in axs.ravel()[len(regions):]:
    unused_ax.set_visible(False)

In [None]:
# Box plot of the happiness score distribution within each region.
plot = sns.catplot(
    x="Region",
    y="Happiness Score",
    kind="box",
    data=happiness_2015,
    aspect=2,
)
# Tilt the x tick labels by 45 degrees; the trailing `;` hides the cell output.
plot.set_xticklabels(rotation=45);

In [None]:
# Happiness score against "Family": colour encodes the region and marker size
# encodes GDP per capita.
sns.scatterplot(
    x="Family",
    y="Happiness Score",
    hue="Region",
    size="Economy (GDP per Capita)",
    data=happiness_2015,
)
# Two-column legend anchored to the right of the axes origin box.
plt.legend(ncol=2, loc="center right", bbox_to_anchor=(2.2, 0.5));

## ML testing

In [None]:
import numpy as np
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    MultiTaskLasso,
    PassiveAggressiveRegressor,
    SGDRegressor,
)
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

### Helpers

In [None]:
# Empty scoreboard with one column per reported field: the model name, its
# mean squared error and its R^2. Rows are added after each model is evaluated.
errors = pd.DataFrame(data={column: [] for column in ("name", "mse", "r2")})

In [None]:
def pred_real_plot(predictions, test_y):
    """
    Draw a joint plot of the ground truth against the model predictions.

    predictions: prediction list from model.predict() (one value per sample)
    test_y: ground truth; must expose `.values` (it is flattened to 1-D)

    The ground truth is plotted on the x axis and the predictions on the
    y axis, matching the axis labels.

    The dotted line represents a perfect result: the farther apart
    from that line the dots are, the more the model predicted
    a wrong result.
    """
    ground_truth = test_y.values.flatten()
    g = sns.jointplot(x=ground_truth, y=predictions)
    g.ax_joint.set_xlabel("ground truth")
    g.ax_joint.set_ylabel("predictions")

    # Draw the identity line (prediction == truth) over the range that is
    # visible on both axes.
    x0, x1 = g.ax_joint.get_xlim()
    y0, y1 = g.ax_joint.get_ylim()
    lims = [max(x0, y0), min(x1, y1)]
    g.ax_joint.plot(lims, lims, ":k")

In [None]:
def regression(reg, train_x, train_y, test_x, test_y, name):
    """
    Fit a model on the training data and score it on the testing data.

    reg: regression model to use (must provide fit() and predict())
    train_x, train_y: training features and targets
    test_x, test_y: testing features and targets
    name: model name, stored alongside the scores

    The scores are printed and also returned. Two scoring methods are used:
    - mse: mean squared error
    - r2: r squared, which measures how close we are to the regression line

    Returns a tuple (predictions, scores) where scores is a dict with the
    keys "name", "mse" and "r2".
    """
    reg.fit(train_x, train_y)
    predictions = reg.predict(test_x)

    scores = {
        "name": name,
        "mse": mean_squared_error(test_y, predictions),
        "r2": r2_score(test_y, predictions),
    }
    print("MSE:", scores["mse"])
    print("R^2:", scores["r2"])

    return predictions, scores

### Split train/test

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
train, test = train_test_split(happiness_2015, random_state=42, test_size=0.2)

In [None]:
# Features: the explanatory columns listed in `variables`.
train_x, test_x = train[variables], test[variables]

# Target: the happiness score, kept as a single-column DataFrame (note the
# double brackets) rather than a Series.
train_y, test_y = train[["Happiness Score"]], test[["Happiness Score"]]

### Linear Regression

Ordinary least squares

In [None]:
pred, error = regression(
    LinearRegression(), train_x, train_y, test_x, test_y, "linear regression"
)

# Add this model's scores as a new row. `DataFrame.append` was removed in
# pandas 2.0, so the row is wrapped in a one-row frame and concatenated.
errors = pd.concat([errors, pd.DataFrame([error])], ignore_index=True)

In [None]:
# Flatten the predictions to 1-D before comparing them with the ground truth.
pred_real_plot(predictions=pred.flatten(), test_y=test_y)

### Ridge Regression

Also called Tikhonov regularization, this is linear least squares with L2 regularization.

In [None]:
Ridge?

In [None]:
from sklearn.linear_model import Ridge

In [None]:
pred, error = regression(Ridge(), train_x, train_y, test_x, test_y, "ridge regression")

# Add this model's scores as a new row. `DataFrame.append` was removed in
# pandas 2.0, so the row is wrapped in a one-row frame and concatenated.
errors = pd.concat([errors, pd.DataFrame([error])], ignore_index=True)

In [None]:
# Flatten the predictions to 1-D before comparing them with the ground truth.
pred_real_plot(predictions=pred.flatten(), test_y=test_y)

### Decision Tree Regressor

Basic decision tree

In [None]:
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_graphviz

In [None]:
# Depth is capped at 4; the fitted tree is kept in `tree` so it can be
# exported and drawn afterwards.
tree = DecisionTreeRegressor(max_depth=4)
pred, error = regression(
    tree, train_x, train_y, test_x, test_y, "decision tree regressor"
)

# Add this model's scores as a new row. `DataFrame.append` was removed in
# pandas 2.0, so the row is wrapped in a one-row frame and concatenated.
errors = pd.concat([errors, pd.DataFrame([error])], ignore_index=True)

In [None]:
from graphviz import Source
from IPython.display import SVG

# Export the fitted tree as DOT text (out_file=None returns it as a string),
# wrap it in a graphviz Source and render it inline as SVG.
graph = Source(
    export_graphviz(decision_tree=tree, out_file=None, feature_names=variables)
)
SVG(data=graph.pipe(format="svg"))

In [None]:
# Compare the decision tree predictions with the ground truth.
pred_real_plot(predictions=pred, test_y=test_y)

### Random Forest Regressor

An estimator that averages multiple decision trees, which improves predictive accuracy and helps to control over-fitting.

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# The target is passed as a 1-D array (`.values.ravel()`) instead of the
# single-column DataFrame.
pred, error = regression(
    RandomForestRegressor(max_depth=4, random_state=42),
    train_x,
    train_y.values.ravel(),
    test_x,
    test_y,
    "random forest regressor",
)

# Add this model's scores as a new row. `DataFrame.append` was removed in
# pandas 2.0, so the row is wrapped in a one-row frame and concatenated.
errors = pd.concat([errors, pd.DataFrame([error])], ignore_index=True)

In [None]:
# Compare the random forest predictions with the ground truth.
pred_real_plot(predictions=pred, test_y=test_y)

### SGD

In [None]:
# `random_state` is fixed so the scores are reproducible between runs, like
# the other seeded models in this notebook. The target is passed as a 1-D
# array (`.values.ravel()`).
pred, error = regression(
    SGDRegressor(max_iter=10000, tol=1e-5, random_state=42),
    train_x,
    train_y.values.ravel(),
    test_x,
    test_y,
    "stochastic gradient descent",
)

# Add this model's scores as a new row. `DataFrame.append` was removed in
# pandas 2.0, so the row is wrapped in a one-row frame and concatenated.
errors = pd.concat([errors, pd.DataFrame([error])], ignore_index=True)

In [None]:
# Compare the SGD predictions with the ground truth.
pred_real_plot(predictions=pred, test_y=test_y)

### Support Vector Regression

This allows us to be less sensitive to outliers by tolerating some error up to a certain threshold.

In [None]:
from sklearn.svm import SVR

In [None]:
# SVR with its default settings; the target is passed as a 1-D array.
pred, error = regression(
    SVR(), train_x, train_y.values.ravel(), test_x, test_y, "support vector regression"
)

# Add this model's scores as a new row. `DataFrame.append` was removed in
# pandas 2.0, so the row is wrapped in a one-row frame and concatenated.
errors = pd.concat([errors, pd.DataFrame([error])], ignore_index=True)

In [None]:
# Compare the SVR predictions with the ground truth.
pred_real_plot(predictions=pred, test_y=test_y)

### MLP Regressor

Multi-layer Perceptron regressor that optimizes the squared loss

In [None]:
MLPRegressor?

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Single hidden layer of 100 units, seeded for reproducibility; the target is
# passed as a 1-D array (`.values.ravel()`).
pred, error = regression(
    MLPRegressor(random_state=4, hidden_layer_sizes=(100,)),
    train_x,
    train_y.values.ravel(),
    test_x,
    test_y,
    "mlp regressor",
)

# Add this model's scores as a new row. `DataFrame.append` was removed in
# pandas 2.0, so the row is wrapped in a one-row frame and concatenated.
errors = pd.concat([errors, pd.DataFrame([error])], ignore_index=True)

In [None]:
# Compare the MLP predictions with the ground truth.
pred_real_plot(predictions=pred, test_y=test_y)

### Voting Regressor

In [None]:
VotingRegressor?

In [None]:
from sklearn.ensemble import VotingRegressor

In [None]:
# Base estimators combined by the voting regressor. Every stochastic model is
# seeded (the SGD regressor included) so the ensemble score is reproducible.
lr = LinearRegression()
rfr = RandomForestRegressor(max_depth=5, random_state=42)
sgd = SGDRegressor(max_iter=10000, tol=1e-5, random_state=42)
mlp = MLPRegressor(random_state=4)

models = [("lr", lr), ("rf", rfr), ("sgd", sgd), ("mlp", mlp)]

voting_reg = VotingRegressor(estimators=models)

pred, error = regression(
    voting_reg, train_x, train_y.values.ravel(), test_x, test_y, "voting regressor"
)

# Add this model's scores as a new row. `DataFrame.append` was removed in
# pandas 2.0, so the row is wrapped in a one-row frame and concatenated.
errors = pd.concat([errors, pd.DataFrame([error])], ignore_index=True)

In [None]:
# Compare the voting regressor predictions with the ground truth.
pred_real_plot(predictions=pred, test_y=test_y)

## Compare ML models

In [None]:
# Index the scoreboard by model name, modifying `errors` in place.
errors.set_index(keys="name", inplace=True)

In [None]:
# MSE of every model on a single row: the lowest value is highlighted in
# light green and the highest in light coral.
(
    errors[["mse"]]
    .T.style
    .highlight_min(axis=1, color="lightgreen")
    .highlight_max(axis=1, color="lightcoral")
)

In [None]:
# R^2 of every model on a single row: the highest value is highlighted in
# light green and the lowest in light coral.
(
    errors[["r2"]]
    .T.style
    .highlight_max(axis=1, color="lightgreen")
    .highlight_min(axis=1, color="lightcoral")
)