In [1]:
import pandas as pd
import numpy as np
import math as m
from scipy import stats
import matplotlib.pyplot as plt
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant

In [2]:
# Read data
df_hscore = pd.read_csv('Happiness.csv')
df_hscore.columns

Index(['Country', 'Region', 'Happiness Score', 'Economy (GDP per Capita)',
       'Family', 'Health (Life Expectancy)', 'Freedom',
       'Trust (Government Corruption)', 'Generosity'],
      dtype='object')

In [3]:
# Split the data into Y (output) and X (input)
Y = df_hscore[['Happiness Score']]
X = df_hscore[['Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity']]

In [4]:
# Normalize data
x_data = np.array(X)
for i in range(0,len(X.columns)-1):
       x_data[:,i] = (x_data[:,i]-x_data[:,i].mean())/x_data[:,i].std()

df_norm = pd.DataFrame(x_data, columns = X.columns)

In [5]:
# Fit an Ordinary Least Squares (OLS) Regression estimator
ols_model = OLS(Y, add_constant(df_norm)).fit()
ols_model.summary()

0,1,2,3
Dep. Variable:,Happiness Score,R-squared:,0.788
Model:,OLS,Adj. R-squared:,0.779
Method:,Least Squares,F-statistic:,92.65
Date:,"Sat, 11 Jul 2020",Prob (F-statistic):,6.489999999999999e-48
Time:,16:01:16,Log-Likelihood:,-121.49
No. Observations:,157,AIC:,257.0
Df Residuals:,150,BIC:,278.4
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.3435,0.098,54.668,0.000,5.150,5.537
Economy (GDP per Capita),0.2967,0.089,3.323,0.001,0.120,0.473
Family,0.3269,0.061,5.353,0.000,0.206,0.448
Health (Life Expectancy),0.3284,0.080,4.117,0.000,0.171,0.486
Freedom,0.2196,0.056,3.902,0.000,0.108,0.331
Trust (Government Corruption),0.1017,0.051,1.977,0.050,6.67e-05,0.203
Generosity,0.1595,0.362,0.440,0.660,-0.556,0.875

0,1,2,3
Omnibus:,3.267,Durbin-Watson:,1.731
Prob(Omnibus):,0.195,Jarque-Bera (JB):,2.881
Skew:,-0.23,Prob(JB):,0.237
Kurtosis:,3.479,Cond. No.,14.8


In [6]:
# New dataframe column with the OLS predictions
df_hscore['OLS_fit'] = ols_model.predict()

In [7]:
# Calculate residuals, degrees of freedom and
res = df_hscore['Happiness Score'] - df_hscore['OLS_fit']
n = len(df_norm)
p = len(df_norm.columns)
dof = n - p - 1

In [8]:
# Calculate sigma^2(hat) and (X*X^T)^-1
sigma_sq_est = (res**2).sum()/dof
X_sq_mat_inv = np.linalg.inv(np.matmul(np.transpose(x_data),x_data))

In [9]:
# A vector with 0.5 incrementes in Happiness Score
selected_values = np.arange(np.array(Y).min().round(),np.array(Y).max().round(),0.5)
sel_values = np.reshape(selected_values, (len(selected_values),1))