# World Happiness Analysis

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## Get the data

In [None]:
%%bash

# *Warning*: the kaggle-api CLI must be set up before running this cell.
# More information: https://github.com/Kaggle/kaggle-api

# Stop at the first failing command. Without this, a failed download would
# still create an empty `data` directory and every later run would skip the
# download because the directory already exists.
set -euo pipefail

# Guard on the file the notebook actually reads rather than on the directory,
# so an empty or partially populated `data` directory triggers a fresh download.
if [ ! -f data/2015.csv ]; then
    kaggle datasets download -d unsdsn/world-happiness
    mkdir -p data
    # -o: overwrite without prompting (the cell has no interactive stdin).
    unzip -o world-happiness.zip -d data
fi

ls data

## Simple analysis of the 2015 dataset

In [None]:
# Load the 2015 CSV that the download cell above extracts into ./data.
happiness_2015 = pd.read_csv("./data/2015.csv")

In [None]:
# Preview the loaded table.
happiness_2015

In [None]:
# Mean of every numeric column per region, happiest regions first.
# numeric_only=True is required on pandas >= 2.0, where averaging the
# remaining text columns raises instead of silently dropping them.
(
    happiness_2015.groupby("Region")
    .mean(numeric_only=True)
    .sort_values("Happiness Score", ascending=False)
)

In [None]:
# Explanatory columns: plotted against "Happiness Score" below and later used
# as the model features (train_x / test_x).
variables = [
    "Economy (GDP per Capita)",
    "Family",
    "Health (Life Expectancy)",
    "Freedom",
    "Trust (Government Corruption)",
    "Generosity",
]
variables

In [None]:
# One regression plot per explanatory variable, laid out two per row.
n_cols = 2
# Ceiling division: an odd number of variables still gets a row for the last one.
n_rows = (len(variables) + n_cols - 1) // n_cols

# squeeze=False keeps `axs` two-dimensional even when there is a single row.
fig, axs = plt.subplots(
    ncols=n_cols, nrows=n_rows, figsize=(16, 2 * len(variables)), squeeze=False
)

# axs.flat walks the grid row by row, i.e. the same order as [i // 2, i % 2].
for ax, column in zip(axs.flat, variables):
    sns.regplot(data=happiness_2015, y="Happiness Score", x=column, ax=ax)

# Hide the spare panel left over when the number of variables is odd.
for ax in axs.flat[len(variables):]:
    ax.set_visible(False)

In [None]:
# Correlate the numeric columns only: DataFrame.corr() raises on text columns
# such as "Region" in pandas >= 2.0.
sns.heatmap(happiness_2015.select_dtypes("number").corr());

In [None]:
# Distinct values of the "Region" column; one heatmap per region is drawn below.
regions = happiness_2015["Region"].unique()
regions

In [None]:
# One correlation heatmap per region, laid out two per row.
n_cols = 2
# Ceiling division: an odd number of regions still gets a row for the last one.
n_rows = (len(regions) + n_cols - 1) // n_cols

# squeeze=False keeps `axs` two-dimensional even when there is a single row.
fig, axs = plt.subplots(
    ncols=n_cols, nrows=n_rows, figsize=(12, 2 * len(regions)), squeeze=False
)
fig.tight_layout(pad=10)
fig.autofmt_xdate(rotation=45)

# axs.flat walks the grid row by row, i.e. the same order as [i // 2, i % 2].
for ax, region in zip(axs.flat, regions):
    region_rows = happiness_2015[happiness_2015["Region"] == region]
    ax.set_title(f"{region} correlation plot")
    # Numeric columns only: DataFrame.corr() raises on text columns such as
    # "Region" in pandas >= 2.0.
    sns.heatmap(data=region_rows.select_dtypes("number").corr(), ax=ax)

# Hide the spare panel left over when the number of regions is odd.
for ax in axs.flat[len(regions):]:
    ax.set_visible(False)

In [None]:
# Distribution of "Happiness Score" within each region, drawn as box plots.
plot = sns.catplot(
    data=happiness_2015, kind="box", x="Region", y="Happiness Score", aspect=2
)
# Tilt the x tick labels by 45 degrees (presumably so the region names do not
# overlap).
plot.set_xticklabels(rotation=45);

In [None]:
# "Happiness Score" against "Family", coloured by region and with marker size
# driven by "Economy (GDP per Capita)".
sns.scatterplot(
    data=happiness_2015,
    y="Happiness Score",
    x="Family",
    hue="Region",
    size="Economy (GDP per Capita)",
)
# Anchor the two-column legend outside the axes, to the right of the plot.
plt.legend(loc="center right", bbox_to_anchor=(2.2, 0.5), ncol=2);

## ML testing

In [None]:
import numpy as np
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    MultiTaskLasso,
    PassiveAggressiveRegressor,
    SGDRegressor,
)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

### Helpers

In [None]:
def pred_real_plot(predictions, test_y):
    """Plot model predictions against the ground truth with an identity line.

    Draws a seaborn joint plot with the ground truth on the x axis and the
    predictions on the y axis, so the plotted data matches the axis labels.
    A dotted black ``y = x`` line marks where perfect predictions would fall.

    Parameters
    ----------
    predictions : array-like
        Model predictions. Flattened to 1-D, so both ``(n,)`` and ``(n, 1)``
        inputs are accepted.
    test_y : pandas.DataFrame, pandas.Series or array-like
        Ground-truth target values. Flattened to 1-D before plotting.
    """
    truth = np.asarray(test_y).ravel()
    predicted = np.asarray(predictions).ravel()

    g = sns.jointplot(x=truth, y=predicted)
    g.ax_joint.set_xlabel("ground truth")
    g.ax_joint.set_ylabel("predictions")

    # Limit the identity line to the range that is visible on both axes.
    x0, x1 = g.ax_joint.get_xlim()
    y0, y1 = g.ax_joint.get_ylim()
    lims = [max(x0, y0), min(x1, y1)]
    g.ax_joint.plot(lims, lims, ":k")

In [None]:
def regression(reg, train_x, train_y, test_x, test_y):
    """Fit ``reg`` on the training data and report its test-set error.

    Parameters
    ----------
    reg : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    train_x, train_y
        Training features and targets passed to ``reg.fit``.
    test_x
        Features passed to ``reg.predict``.
    test_y
        Ground-truth targets the predictions are scored against.

    Returns
    -------
    The predictions for ``test_x`` exactly as returned by ``reg.predict``.
    The mean squared error is printed, not returned.
    """
    reg.fit(train_x, train_y)
    test_predictions = reg.predict(test_x)

    mse = mean_squared_error(test_y, test_predictions)
    print("MSE:", mse)

    return test_predictions

### Split train/test

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
train, test = train_test_split(happiness_2015, test_size=0.2, random_state=42)

In [None]:
# Features: the explanatory columns listed in `variables`.
train_x, test_x = train[variables], test[variables]

# Target: selected with double brackets, so it stays a one-column DataFrame
# rather than a Series.
train_y, test_y = train[["Happiness Score"]], test[["Happiness Score"]]

### Linear Regression

In [None]:
# Baseline: ordinary least squares on the six explanatory variables.
pred = regression(LinearRegression(), train_x, train_y, test_x, test_y)

In [None]:
# Visual check of the predictions against the held-out scores; the predictions
# are flattened to 1-D first.
pred_real_plot(pred.flatten(), test_y)

### Multi Task Lasso

In [None]:
# MultiTaskLasso with a small regularisation strength (alpha = 0.001).
# train_y is a one-column DataFrame, so there is only a single task here.
pred = regression(MultiTaskLasso(alpha=1 / 1000), train_x, train_y, test_x, test_y)

In [None]:
# Visual check of the predictions against the held-out scores; the predictions
# are flattened to 1-D first.
pred_real_plot(pred.flatten(), test_y)

### Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge regression with the estimator's default settings.
pred = regression(Ridge(), train_x, train_y, test_x, test_y)

In [None]:
# Visual check of the predictions against the held-out scores; the predictions
# are flattened to 1-D first.
pred_real_plot(pred.flatten(), test_y)

### Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Fix the seed: a decision tree permutes the features randomly at each split,
# so without random_state the reported MSE can change from run to run.
pred = regression(
    DecisionTreeRegressor(random_state=42), train_x, train_y, test_x, test_y
)

In [None]:
# Visual check of the decision-tree predictions against the held-out scores.
pred_real_plot(pred, test_y)

### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest limited to depth-5 trees; the fixed random_state keeps runs
# reproducible. The target is flattened to 1-D with .values.ravel() for fitting.
pred = regression(
    RandomForestRegressor(max_depth=5, random_state=42),
    train_x,
    train_y.values.ravel(),
    test_x,
    test_y,
)

In [None]:
# Visual check of the random-forest predictions against the held-out scores.
pred_real_plot(pred, test_y)

### SGD

In [None]:
# Stochastic gradient descent shuffles the training data between epochs, so
# the seed is fixed to make the reported MSE reproducible. The target is
# flattened to 1-D with .values.ravel() for fitting.
pred = regression(
    SGDRegressor(max_iter=10000, tol=1e-5, random_state=42),
    train_x,
    train_y.values.ravel(),
    test_x,
    test_y,
)

In [None]:
# Visual check of the SGD predictions against the held-out scores.
pred_real_plot(pred, test_y)

### Passive Aggressive Regressor

In [None]:
# Passive-aggressive regressor capped at 100 iterations, with a fixed seed.
# The target is flattened to 1-D with .values.ravel() for fitting.
pred = regression(
    PassiveAggressiveRegressor(max_iter=100, random_state=0),
    train_x,
    train_y.values.ravel(),
    test_x,
    test_y,
)

In [None]:
# Visual check of the passive-aggressive predictions against the held-out scores.
pred_real_plot(pred, test_y)

### Support Vector Regression

In [None]:
from sklearn.svm import SVR

In [None]:
# Support vector regression with the estimator's default settings; the target
# is flattened to 1-D with .values.ravel() for fitting.
pred = regression(SVR(), train_x, train_y.values.ravel(), test_x, test_y)

In [None]:
# Visual check of the SVR predictions against the held-out scores.
pred_real_plot(pred, test_y)

### MLP Regressor

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Multi-layer perceptron with a single hidden layer of 100 units and a fixed
# seed. The target is flattened to 1-D with .values.ravel() for fitting.
pred = regression(
    MLPRegressor(random_state=4, hidden_layer_sizes=(100,)),
    train_x,
    train_y.values.ravel(),
    test_x,
    test_y,
)

In [None]:
# Visual check of the MLP predictions against the held-out scores.
pred_real_plot(pred, test_y)

### Voting Regressor

In [None]:
from sklearn.ensemble import VotingRegressor

In [None]:
# Base learners for the ensemble. Every stochastic member gets a fixed seed
# (the SGD regressor previously had none) so the ensemble's MSE is reproducible.
lr = LinearRegression()
rfr = RandomForestRegressor(max_depth=5, random_state=42)
sgd = SGDRegressor(max_iter=10000, tol=1e-5, random_state=42)
mlp = MLPRegressor(random_state=4)

models = [("lr", lr), ("rf", rfr), ("sgd", sgd), ("mlp", mlp)]

# The voting regressor combines the predictions of the fitted base learners.
voting_reg = VotingRegressor(estimators=models)

# The target is flattened to 1-D with .values.ravel() for fitting.
pred = regression(voting_reg, train_x, train_y.values.ravel(), test_x, test_y)

In [None]:
# Visual check of the ensemble predictions against the held-out scores.
pred_real_plot(pred, test_y)