# Q2.1

1. Use `matplotlib` to show scatterplots of each variable

In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

data = pd.read_csv("happiness.csv")

# fill impossible data with NaN
data.loc[data['inflation_rate[%]'] > 100, 'inflation_rate[%]'] = np.nan


y = data.loc[:,'happiness_score']

data.drop(columns = ['country', 'happiness_rank', 'map_reference', 'biggest_official_language'], inplace=True)

s = 1

plt.subplot(4, 4, 1)
plt.scatter(data.loc[:,'economy'], y, s)
plt.title('economy')

plt.subplot(4, 4, 2)
plt.scatter(data.loc[:,'family'], y, s)
plt.title('family')

plt.subplot(4, 4, 3)
plt.scatter(data.loc[:,'health'], y, s)
plt.title('health')

plt.subplot(4, 4, 4)
plt.scatter(data.loc[:,'freedom'], y, s)
plt.title('freedom')

plt.subplot(4, 4, 5)
plt.scatter(data.loc[:,'generosity'], y, s)
plt.title('generosity')

plt.subplot(4, 4, 6)
plt.scatter(data.loc[:,'corruption'], y, s)
plt.title('corruption')

plt.subplot(4, 4, 7)
plt.scatter(data.loc[:,'dystopia_residual'], y, s)
plt.title('dystopia_residual')

plt.subplot(4, 4, 8)
plt.scatter(data.loc[:,'internet_access_population[%]'], y, s)
plt.title('internet_access_population[%]')

plt.subplot(4, 4, 9)
plt.scatter(data.loc[:,'cellular_subscriptions'], y, s)
plt.title('cellular_subscriptions')

plt.subplot(4, 4, 10)
plt.scatter(data.loc[:,'surplus_deficit_GDP[%]'], y, s)
plt.ylim(plt.ylim()[0],100)
plt.title('surplus_deficit_GDP[%]')

plt.subplot(4, 4, 11)
plt.scatter(data.loc[:,'familiy_income_gini_coeff'], y, s)
plt.title('familiy_income_gini_coeff')

plt.subplot(4, 4, 12)
plt.scatter(data.loc[:,'GDP_per_capita[$]'], y, s)
plt.title('GDP_per_capita[$]')

plt.subplot(4, 4, 13)
plt.scatter(data.loc[:,'inflation_rate[%]'], y, s)
plt.ylim(plt.ylim()[0],100)
plt.title('inflation_rate[%]')

plt.subplot(4, 4, 14)
plt.scatter(data.loc[:,'military_expenditures[%]'], y, s)
plt.ylim(plt.ylim()[0],100)
plt.title('military_expenditures[%]')

plt.subplot(4, 4, 15)
plt.scatter(data.loc[:,'population'], y, s)
plt.title('population')

plt.subplots_adjust(top=2, bottom=0, left=0, right=2, hspace=0.35, wspace=0.35)

#np.corrcoef(x, data.loc[:,'population'])



#np.corrcoef(data.loc[:,'economy'], data.loc[:,'family'])
#np.corrcoef(data.loc[:,'freedom'], data.loc[:,'cellular_subscriptions'])
#np.corrcoef([data.loc[:,'freedom'], data.loc[:,'familiy_income_gini_coeff']])
#np.corrcoef([x, data.loc[:,'familiy_income_gini_coeff']])



\begin{enumerate}
    \item economy: positive linear correlation
    \item family: positive linear/quadratic correlation
    \item health: positive linear correlation
    \item freedom: linear / quadratic / cubic correlation
    \item generosity: weak quadratic correlation / no correlation
    \item corruption: quadratic correlation / no correlation
    \item dystopia residual: no correlation
    \item internet acces population $\left[ \% \right]$: positive linear correlation / cubic correlation
    \item cellular subscriptions: quadratic correlation
    \item surplus deficit GDP $\left[ \% \right]$: no correlation
    \item family income gini coeff: no correlation / weak negative linear correlation
    \item GDP per capital $\left[ \$ \right]$: quadratic correlation
    \item inflation rate $\left[ \% \right]$: no correlation
    \item military expenditures $\left[ \% \right]$: no correlation
    \item population: no correlation
\end{enumerate}

For the linear model only linear (and possibly linear) correlations were considered. Also, the VIF was calculated to check for multicolinearity (i.e., to check if independent variables are correlated among each other). This is ofcourse necessary as $R^2$ predicted falls of if multicolinearity is too high). A VIF of 1 is good, a VIF between 5 and 10 indicates high correlation. From the left table below, the economy and internet access population columns are too high and could cause trouble.

\begin{tabular}{|l|c|}
    \hline
    Variance inflation factor (VIF) &\\
    \hline
    economy &                          6.330479\\
    family &                           2.052008\\
    health &                           3.974670\\
    freedom &                          1.246693\\
    dystopia\_residual &                1.029444\\
    internet\_access\_population $\left[ \% \right]$ &    5.689814\\
    \hline
\end{tabular}

By removing the 'internet access population' variable, the VIF is relatively low for all independent variables. 

\begin{tabular}{|l|c|}
    \hline
    Variance inflation factor (VIF) &\\
    \hline
    economy &              4.161531\\
    family &               2.044164\\
    health &               3.502881\\
    freedom &              1.245993\\
    dystopia\_residual &    1.013095\\
    \hline
\end{tabular}

The VIF was calculated using the \textit{scipy} package and therefore, only the results are presented here.

# Q2.2

Load data and set up packages

In [2]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold

Check the dataset for missing values and, if any are found, address them programmatically

In [16]:
# we dropped unwanted columns before and replaced extreme outliers with NaNs.
data.fillna(data.mean(), inplace=True);


# analyzing the linear model with scipy statistics
#import scipy as sp
#X = data.loc[:,['family', 'health', 'internet_access_population[%]', 'freedom', 'familiy_income_gini_coeff']]
#cc = sp.corrcoef(X, rowvar=False)
#VIF = np.linalg.inv(cc)
#VIF.diagonal()

#sp.stats.f_oneway(data['happiness_score'], data['freedom'], data['familiy_income_gini_coeff'])
#sp.stats.kruskal(data['happiness_score'], data['internet_access_population[%]'])


from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
X = data.loc[:,['economy', 'family', 'health', 'freedom', 'dystopia_residual', 'internet_access_population[%]']]
X = add_constant(X)
pd.Series([variance_inflation_factor(X.values, i)
          for i in range(X.shape[1])],
          index=X.columns)


const                            33.742991
economy                           6.330479
family                            2.052008
health                            3.974670
freedom                           1.246693
dystopia_residual                 1.029444
internet_access_population[%]     5.689814
dtype: float64

## Linear model

In [18]:
from sklearn.model_selection import cross_val_score

#linearModel = LinearRegression();

#x_reshaped = np.array(data.happiness_score).reshape(-1,1)
#linearModel.fit(x_reshaped, data.family)

#reg_x = np.linspace(2.5,7.5, 100).reshape(-1,1)
#reg_y = linearModel.predict(reg_x)

#plt.scatter(data.happiness_score, data.family)
#plt.plot(reg_x, reg_y, c ='red')

X = data.loc[:,['economy', 'family', 'health', 'freedom', 'dystopia_residual']]
y = data['happiness_score']

kr = KernelRidge(alpha=1e-6, kernel='linear', gamma=None, coef0=1, kernel_params=None)
print(kr.fit(X,y).score(X,y))


scores = cross_val_score(kr, X, y, cv=5)
print(scores)
scores.sum()
#kf = KFold(5)
#kf.get_n_splits(X,y)
#for train_index, test_index in kf.split(X,y):
#    X_train, X_test = X[train_index], X[test_index]
#    y_train, y_test = y[train_index], y[test_index]
#    y_train = lab_enc.fit_transform(y_train)
#    y_test = lab_enc.fit_transform(y_test)
#    clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
#    print(clf.score(X_test, y_test))

    
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)



0.9775544582666984
[ 0.59447929  0.33292534 -0.12752385  0.45375285  0.6455585 ]


1.8991921331703663

{text results here}

## Quadratic model

In [5]:

#sklearn.metrics.pairwise.polynomial_kernel(x_reshaped, np.array(data.family).reshape(-1,1), degree=2, gamma=None, coef0=1)

#reg_x = np.linspace(2.5,7.5, 100).reshape(-1,1)
#reg_y = linearModel.predict(reg_x)

#plt.scatter(data.happiness_score, data.family)
#plt.plot(reg_x, reg_y, c ='red')


{text results here}

## Gaussian model

{text results here}

## Comparison

{replace with your comparison paragraph/images}