In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random as rd
import seaborn as sns
import statistics as st
import statsmodels.api as sm
import tensorflow as tf

from scipy.stats import skew

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import classification_report, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR, SVR

# Notebook-wide plot styling: apply seaborn's default theme first, then switch
# matplotlib to its 'classic' style sheet. The style sheet is applied last, so
# it takes precedence for any setting both of them touch.
sns.set()
mpl.style.use('classic')

In [None]:
class Happiness_Index():
    """Loader and query helper for a happiness-index CSV file.

    Usage: create an instance, set ``file_name`` and call ``load_file()``.
    After loading, ``file_data`` holds the full table, the column attributes
    (``year``, ``country``, ``score`` ...) hold the matching columns, and
    ``region_countries`` maps each region to the unique country names in it.
    """

    # Class-level placeholders kept for backward compatibility; load_file()
    # replaces them with per-instance values.
    country = ''
    file_data = ''
    file_data_preprocessed = ''
    file_name = ''
    iso_alpha2 = ''
    iso_alpha3 = ''
    iso_numeric = ''
    iso2_alpha2 = ''
    region = ''
    score = ''
    year = ''

    # Default only. Instances get their own dict (see __init__ / load_file) so
    # that loading one data set never leaks regions into another object.
    region_countries = {}

    def __init__(self):
        """Create an empty, not-yet-loaded instance."""
        self.region_countries = {}

    def _rows_where(self, column, value):
        """Return the rows of ``file_data`` whose ``column`` equals ``value``."""
        return self.file_data[self.file_data[column] == value]

    def get_by_country_alpha2(self, filter_country):
        """Rows whose 'ISO_3166-1_Alpha2' column equals ``filter_country``."""
        return self._rows_where('ISO_3166-1_Alpha2', filter_country)

    def get_by_country_alpha3(self, filter_country):
        """Rows whose 'ISO_3166-1_Alpha3' column equals ``filter_country``."""
        return self._rows_where('ISO_3166-1_Alpha3', filter_country)

    def get_by_country_numeric(self, filter_country):
        """Rows whose 'ISO_3166-1_Numeric' column equals ``filter_country``."""
        return self._rows_where('ISO_3166-1_Numeric', filter_country)

    def get_by_country_iso2(self, filter_country):
        """Rows whose 'ISO_3166-2' column equals ``filter_country``."""
        return self._rows_where('ISO_3166-2', filter_country)

    def get_by_region(self, filter_region):
        """Rows whose 'Region' column equals ``filter_region``."""
        return self._rows_where('Region', filter_region)

    def get_by_year(self, filter_year):
        """Rows whose 'Year' column equals ``filter_year``."""
        return self._rows_where('Year', filter_year)

    def load_file(self):
        """Read ``self.file_name`` with pandas and populate the instance.

        Sets ``file_data``, the per-column attributes and ``region_countries``,
        then prints the frame summary via ``DataFrame.info()``.
        """
        self.file_data = pd.read_csv(self.file_name)

        self.year = self.file_data['Year']
        self.iso_alpha2 = self.file_data['ISO_3166-1_Alpha2']
        self.iso_alpha3 = self.file_data['ISO_3166-1_Alpha3']
        self.iso_numeric = self.file_data['ISO_3166-1_Numeric']
        self.iso2_alpha2 = self.file_data['ISO_3166-2']
        self.country = self.file_data['Country']
        self.region = self.file_data['Region']
        self.score = self.file_data['Score']

        # Build a fresh mapping on every load instead of mutating the shared
        # class-level dict, so stale regions from a previous load (or from
        # another instance) cannot survive.
        self.region_countries = {
            region: self.get_by_region(region)['Country'].unique()
            for region in self.region.unique()
        }

        self.file_data.info()

    def normalize_data(self):
        """Add a 'Score_Nml' column: 'Score' scaled to unit L2 norm.

        ``axis=0`` normalizes the column as a whole. The sklearn default
        (``axis=1``) normalizes each row, which for a single-column input
        collapses every value to +/-1.
        """
        score_column = self.file_data[['Score']]
        # Assign the raw array so the result is positional and does not depend
        # on file_data having a default RangeIndex.
        self.file_data['Score_Nml'] = normalize(score_column, axis=0)[:, 0]

    def standardize_data(self):
        """Add a 'Score_Std' column: 'Score' run through ``StandardScaler``."""
        score_column = self.file_data[['Score']]
        scaled_scores = StandardScaler().fit_transform(score_column)
        self.file_data['Score_Std'] = scaled_scores[:, 0]

# Load the master happiness-index table into a module-level object.
HAPPINESS_INDEX_CSV = '../input/happiness-index-dataset/Happiness_Index_Master.csv'

happiness_index = Happiness_Index()
happiness_index.file_name = HAPPINESS_INDEX_CSV
happiness_index.load_file()

In [None]:
class Health_Care(Happiness_Index):
    """Happiness scores combined with health-care indicators.

    Adds the indicator columns to what ``Happiness_Index`` loads and offers
    ``regression_models()``, which fits several regressors on the indicators
    and searches for the indicator combination with the highest predicted
    score.
    """

    # Indicator columns read from the CSV; they are also the model inputs.
    FEATURE_COLUMNS = [
        'Spend_GDP',
        'Spend_Out_Of_Pocket',
        'Spend_Per_Capita',
        'Spend_Per_Capita_PPP',
        'Spend_Public',
        'Nurse_per_1000',
        'Physicians_per_1000',
    ]

    # Class-level placeholders, replaced per instance by load_file().
    Spend_GDP = ''
    Spend_Out_Of_Pocket = ''
    Spend_Per_Capita = ''
    Spend_Per_Capita_PPP = ''
    Spend_Public = ''
    Nurse_per_1000 = ''
    Physicians_per_1000 = ''

    # Best hypothetical feature vector / predicted score found so far.
    # Both are rebound (never mutated) on the instance by run_algorithm().
    Optimal_Features = []
    Optimal_Happiness = 0

    def __init__(self):
        """Create an empty, not-yet-loaded instance."""
        super().__init__()

    def load_file(self):
        """Read ``self.file_name`` and populate the instance attributes.

        Sets ``file_data``, the identifier/score columns, one attribute per
        entry of ``FEATURE_COLUMNS`` and ``region_countries``, then prints the
        frame summary via ``DataFrame.info()``.
        """
        self.file_data = pd.read_csv(self.file_name)

        # NOTE(review): the year is hard-coded here rather than read from the
        # file - confirm the data set really is a single 2016 snapshot.
        self.year = 2016
        self.iso_alpha2 = self.file_data['ISO_3166-1_Alpha2']
        self.iso_alpha3 = self.file_data['ISO_3166-1_Alpha3']
        self.iso_numeric = self.file_data['ISO_3166-1_Numeric']
        self.iso2_alpha2 = self.file_data['ISO_3166-2']
        self.country = self.file_data['Country']
        self.region = self.file_data['Region']
        self.score = self.file_data['Score']

        for feature_name in self.FEATURE_COLUMNS:
            setattr(self, feature_name, self.file_data[feature_name])

        # Assign a fresh dict instead of writing into the inherited class-level
        # one, which would be shared with every other instance.
        self.region_countries = {
            region: self.get_by_region(region)['Country'].unique()
            for region in self.region.unique()
        }

        self.file_data.info()

    def run_algorithm(self,
                      algorithm,
                      predictive_features_train,
                      predictive_features_test,
                      targets_train,
                      targets_test,
                      algorithm_description,
                      n_samples=99999,
                      random_state=None):
        """Fit ``algorithm``, report its test score and search for an optimum.

        After fitting, ``n_samples`` hypothetical feature vectors are drawn
        uniformly between the per-feature minimum and maximum of the training
        data, and the one with the highest predicted score is kept.

        Parameters
        ----------
        algorithm : estimator with ``fit`` / ``predict`` / ``score``.
        predictive_features_train, predictive_features_test : 2-D array-likes
            with one row per sample and one column per feature.
        targets_train, targets_test : matching target values.
        algorithm_description : label printed in front of the test score.
        n_samples : number of random candidates to evaluate.
        random_state : seed for the candidate generator (``None`` = unseeded).

        Side effects
        ------------
        Prints the test score, then ``Optimal_Features`` and
        ``Optimal_Happiness``. Those two attributes are only replaced when a
        candidate beats the stored optimum, so across several calls they hold
        the best prediction seen by any algorithm so far.
        """
        predictive_model = algorithm
        predictive_model.fit(predictive_features_train, targets_train)

        # score() of a regressor is the coefficient of determination (R^2),
        # not a correlation coefficient.
        print(algorithm_description, 'R^2: ', predictive_model.score(predictive_features_test, targets_test))

        # Per-FEATURE bounds: reduce over rows (axis 0). Indexing the array
        # with [i] would select sample i, i.e. mix all features of one row.
        train_matrix = np.asarray(predictive_features_train, dtype=float)
        lower_bounds = train_matrix.min(axis=0)
        upper_bounds = train_matrix.max(axis=0)

        if n_samples > 0:
            rng = np.random.default_rng(random_state)
            candidates = rng.uniform(lower_bounds, upper_bounds,
                                     size=(n_samples, train_matrix.shape[1]))

            # One batched predict call instead of one call per candidate.
            predicted_scores = np.ravel(predictive_model.predict(candidates))
            best = int(np.argmax(predicted_scores))

            if predicted_scores[best] > np.max(self.Optimal_Happiness):
                # Keep the (1, n_features) / (1,) shapes a single-row
                # predict() call would produce.
                self.Optimal_Features = candidates[best:best + 1]
                self.Optimal_Happiness = predicted_scores[best:best + 1]

        print(self.Optimal_Features)
        print(self.Optimal_Happiness)

    def regression_models(self, random_state=42):
        """Fit LinearRegression, Ridge and SVR on the health-care indicators.

        The data is split 80/20, the features are standardized with a scaler
        fitted on the training part only, and every model is passed to
        ``run_algorithm``. ``random_state`` seeds both the split and the
        candidate search, so all models are compared on the same candidates.
        """
        targets = self.file_data.loc[:, 'Score']
        predictive_features = self.file_data.loc[:, self.FEATURE_COLUMNS]

        (predictive_features_train,
         predictive_features_test,
         targets_train,
         targets_test) = train_test_split(predictive_features,
                                          targets,
                                          test_size=0.2,
                                          random_state=random_state)

        targets_train = np.ravel(targets_train)

        scaler = StandardScaler()
        predictive_features_train_scaled = scaler.fit_transform(predictive_features_train)
        predictive_features_test_scaled = scaler.transform(predictive_features_test)

        estimators = (
            (LinearRegression(), 'Linear Regression'),
            (Ridge(), 'Ridge'),
            (SVR(), 'Support Vector Regression'),
        )

        for estimator, description in estimators:
            self.run_algorithm(estimator,
                               predictive_features_train_scaled,
                               predictive_features_test_scaled,
                               targets_train,
                               targets_test,
                               description,
                               random_state=random_state)
    
# Load the health-care table into a module-level object used by later cells.
HEALTH_CARE_CSV = '../input/health-care/Health_Care.csv'

health_care = Health_Care()
health_care.file_name = HEALTH_CARE_CSV
health_care.load_file()

In [None]:
def plot_simple_regression(feature_name):
    """Regress the happiness score on one indicator and plot the fit.

    Fits ``score ~ const + feature`` with statsmodels OLS, prints the summary
    and draws a scatter plot of the data with the fitted line on top.

    Parameters
    ----------
    feature_name : name of a column in ``health_care.file_data``.

    Returns
    -------
    The fitted statsmodels results object.
    """
    feature = health_care.file_data[feature_name]

    constant = sm.add_constant(feature)
    regression_result = sm.OLS(health_care.score, constant).fit()
    print(regression_result.summary())

    # params is indexed by label ('const', <feature name>); use .iloc for
    # positional access, since integer keys on a labeled Series are deprecated.
    intercept = regression_result.params.iloc[0]
    slope = regression_result.params.iloc[1]
    regression_function = slope * feature + intercept

    plt.scatter(feature, health_care.score)
    plt.plot(feature, regression_function, lw = 5, c = 'red', label = 'Regression Line')
    plt.xlabel(feature_name, fontsize = 15)
    plt.ylabel('score', fontsize = 15)
    plt.show()

    return regression_result


# One summary + plot per indicator, in the same order as before.
for feature_name in ['Spend_GDP',
                     'Spend_Out_Of_Pocket',
                     'Spend_Per_Capita',
                     'Spend_Per_Capita_PPP',
                     'Spend_Public',
                     'Nurse_per_1000',
                     'Physicians_per_1000']:
    regression_result = plot_simple_regression(feature_name)

In [None]:
# Run Health_Care.regression_models(): it fits the regressors on the
# health-care indicators and prints each model's score plus the best
# hypothetical feature vector found by the random search.
health_care.regression_models()

In [None]:
# Scatter plot of every candidate predictor (columns from position 7 onwards)
# against the happiness score, laid out on a 7 x 5 grid.
target = ['Score']
predictors = health_care.file_data.columns[7:].tolist()
# Column vector of scores, shape (n_rows, 1); kept as a module-level name
# because other cells may read `y`.
y = np.array(health_care.file_data[target])

fig = plt.figure(figsize=(20, 25))

for i, x in enumerate(predictors):
    ax = fig.add_subplot(7, 5, i + 1)
    ax.scatter(health_care.file_data[x], health_care.file_data['Score'])
    ax.set_xlabel(x)
    ax.set_ylabel('Score')

# Show once, after the loop: calling plt.show() inside the loop flushes the
# figure after the first subplot and leaves the rest of the grid undrawn.
plt.show()

In [None]:
# Skewness of every column from position 7 onwards; keep the names of those
# whose skewness exceeds 1.
candidate_columns = health_care.file_data.iloc[:, 7:]
skews = candidate_columns.apply(skew, axis=0)
skewed = skews[skews > 1].index
skewed

In [None]:
# Same scatter grid as before, restricted to the skewed columns and with the
# x values replaced by log(value + 10).
fig = plt.figure(figsize=(20, 25))

for position, column_name in enumerate(skewed, start=1):
    panel = fig.add_subplot(7, 5, position)
    log_values = np.log(health_care.file_data[column_name] + 10)
    panel.scatter(log_values, health_care.file_data['Score'])
    panel.set_xlabel(column_name)
    panel.set_ylabel('Score')

plt.show()

In [None]:
class do_preprocess(BaseEstimator, TransformerMixin):
    """Impute, de-skew and scale a fixed set of columns of a DataFrame.

    For every column in ``cols``:

    1. missing values are replaced by the mean of the next and previous valid
       value within the same 'Country', and anything still missing is filled
       forward and then backward over the whole column;
    2. columns whose skewness exceeds 1 are replaced by ``log(value + 10)``.

    Finally the columns are passed through ``scaler``.

    Parameters
    ----------
    cols : list of column names to process.
    scaler : scikit-learn style scaler (``fit`` / ``transform``).

    Notes
    -----
    The scaler is fitted on the data *after* steps 1-2, and the set of skewed
    columns is decided once in ``fit`` and reused in ``transform``. This keeps
    the scaling statistics consistent with the values they are applied to.
    The input frame is never modified; ``transform`` returns a copy.
    """

    def __init__(self, cols, scaler):
        self.cols = cols
        self.scaler = scaler

    def _fill_missing(self, X, col):
        """Return ``X[col]`` with missing values imputed (see class docstring)."""
        by_country = X.groupby('Country')[col]
        # Mean of the following and preceding valid value inside each country;
        # stays NaN when either neighbour is missing.
        averaged = (by_country.bfill() + by_country.ffill()) / 2.0
        # Fallback over the whole column for whatever is still missing.
        return averaged.ffill().bfill()

    def _prepare(self, X, skewed_cols=None):
        """Apply imputation and the log transform to a copy of ``X``.

        When ``skewed_cols`` is None the skewed columns are detected from the
        data; otherwise exactly the given columns are log-transformed.
        Returns ``(prepared_frame, log_transformed_columns)``.
        """
        X = X.copy()
        log_transformed = []

        for col in self.cols:
            if X[col].isna().any():
                X[col] = self._fill_missing(X, col)

            if skewed_cols is None:
                is_skewed = skew(X[col]) > 1
            else:
                is_skewed = col in skewed_cols

            if is_skewed:
                logged = np.log(X[col] + 10)
                # The log is undefined for values below -10; use the smallest
                # valid result for those entries.
                X[col] = logged.fillna(logged.min())
                log_transformed.append(col)

        return X, log_transformed

    def fit(self, X, y=None):
        """Learn which columns are skewed and fit the scaler on prepared data."""
        prepared, self.skewed_cols_ = self._prepare(X)
        self.scaler.fit(prepared[self.cols])
        return self

    def transform(self, X, y=None):
        """Return a processed copy of ``X`` indexed by country code + '2019'."""
        # Fall back to detecting skew on the incoming data when fit() of this
        # object was never called (e.g. the scaler was fitted externally).
        skewed_cols = getattr(self, 'skewed_cols_', None)
        X, _ = self._prepare(X, skewed_cols)

        X[self.cols] = self.scaler.transform(X[self.cols])
        # NOTE(review): the '2019' suffix is hard-coded - confirm it matches
        # the year of the data being processed.
        X.index = X['ISO_3166-1_Alpha2'] + '2019'
        return X
    
# Model target and predictors for the preprocessing pipeline.
# NOTE(review): predictors start at column position 8 here, while the scatter
# grid earlier in the notebook starts at position 7 - confirm which is intended.
target = ['Score']
predictors = list(health_care.file_data.columns[8:])

preprocess_step = do_preprocess(cols=predictors, scaler=StandardScaler())
run_pipeline = Pipeline(steps=[('do_preprocess', preprocess_step)])

In [None]:
# Run the preprocessing pipeline on a copy so the raw frame stays available
# for the before/after comparison below.
health_care.file_data_preprocessed = run_pipeline.fit_transform(health_care.file_data.copy())

health_care.file_data.isna().sum().plot.bar(figsize=(15,3),title='Nans before processing')
plt.show()

# The "after" chart must read the processed frame; plotting file_data here
# would just repeat the "before" chart.
health_care.file_data_preprocessed.isna().sum().plot.bar(figsize=(15,3),title='Nans after processing')
plt.show()

health_care.file_data[predictors].plot.box(figsize=(15,5),title='Distribution before processing')
plt.show()

health_care.file_data_preprocessed[predictors].plot.box(figsize=(15,5),title='Distribution after processing')
plt.show()

In [None]:
# Pairwise scatter plots of all predictors together with the target.
pairplot_columns = predictors + target
sns.pairplot(health_care.file_data_preprocessed[pairplot_columns])
plt.show()

In [None]:
# Pearson correlations between predictors and target, shown as a clustered
# heat map. `correlation_matrix` is displayed again in a later cell.
correlation_inputs = health_care.file_data_preprocessed[predictors + target]
correlation_matrix = correlation_inputs.corr(method='pearson')

sns.clustermap(correlation_matrix, figsize=(15, 15))
plt.show()

In [None]:
# Multiple linear regression of the target on all preprocessed predictors
# (with an intercept term); `model` ends up holding the fitted results.
ols_response = health_care.file_data_preprocessed[target]
ols_design = sm.add_constant(health_care.file_data_preprocessed[predictors])

model = sm.OLS(ols_response, ols_design).fit()

print(model.summary())

In [None]:
# Influence plot for the OLS fit held in `model`.
figure = plt.figure(figsize=(15, 10))
ax = figure.add_subplot(1, 1, 1)
sm.graphics.influence_plot(model, ax=ax)
figure.tight_layout(pad=1.0)

In [None]:
# Hold-out evaluation of a plain linear regression on the preprocessed data.
# Xtrain / Xvalid / ytrain / yvalid are reused by the following model cells.
Xall = np.array(health_care.file_data_preprocessed[predictors])
yall = np.array(health_care.file_data_preprocessed[target[0]])
Xtrain, Xvalid, ytrain, yvalid = train_test_split(
    Xall, yall, test_size=0.25, random_state=2020
)

model = LinearRegression()
model.fit(Xtrain, ytrain)

ypred = model.predict(Xvalid)

print('MSE = ', mean_squared_error(yvalid, ypred))
print('MAE = ', mean_absolute_error(yvalid, ypred))

# Residuals against fitted values, with a zero reference line.
figure, ax = plt.subplots(figsize=(15, 5))
residuals = yvalid - ypred
ax.plot(ypred, residuals, 'b.')
ax.axhline(y=0, color='k')
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
ax.set_title('Residual plot')
plt.show()

print('Intercept=', model.intercept_)
coefficients = pd.DataFrame({'Coefficient': model.coef_}, index=predictors)
coefficient_ax = coefficients.plot.bar(figsize=(15, 5))
coefficient_ax.set_title('Predictor coefficients')
plt.show()

In [None]:
# Hold-out evaluation of a linear support-vector regressor on the same split.
# random_state pins the solver's internal shuffling so the printed metrics and
# coefficients are reproducible on Restart & Run All (same seed as the split).
model = LinearSVR(max_iter = 10**6, random_state = 2020).fit(Xtrain, ytrain)

ypred = model.predict(Xvalid)

print('MSE = ', mean_squared_error(yvalid, ypred))
print('MAE = ', mean_absolute_error(yvalid, ypred))

# Residuals against fitted values, with a zero reference line.
figure, ax = plt.subplots(figsize=(15,5))
ax.plot(ypred, yvalid-ypred, 'b.')
ax.axhline(y=0, color='k')
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
ax.set_title('Residual plot')
plt.show()

pd.DataFrame({'Coefficient':model.coef_},index=predictors).plot.bar(figsize=(15,5))
plt.title('Predictor coefficients')
plt.show()

In [None]:
tf.random.set_seed(1234)


def get_model():
    """Build and compile a small Keras regressor for the training features.

    Input width is taken from the global ``Xtrain``; the network is one
    64-unit dense layer followed by a single output unit, both with linear
    activation, compiled with MSE loss and the Adam optimizer.
    """
    inputs = tf.keras.layers.Input(shape=(Xtrain.shape[1],), name='X')
    hidden = tf.keras.layers.Dense(64, activation='linear')(inputs)
    outputs = tf.keras.layers.Dense(units=1, activation='linear', name='out')(hidden)

    network = tf.keras.Model(inputs, outputs)
    network.compile(loss='mse', optimizer='adam', metrics=['mse', 'mae'])
    return network


model = get_model()
hist = model.fit(
    x=Xtrain,
    y=ytrain,
    validation_data=(Xvalid, yvalid),
    verbose=0,
    epochs=20,
    batch_size=4,
)

# predict() returns one column per output unit; keep the single column as 1-D.
ypred = model.predict(Xvalid)[:, 0]

print('MSE = ', mean_squared_error(yvalid, ypred))
print('MAE = ', mean_absolute_error(yvalid, ypred))

# Residuals against fitted values, with a zero reference line.
figure, ax = plt.subplots(figsize=(15, 5))
ax.plot(ypred, yvalid - ypred, 'b.')
ax.axhline(y=0, color='k')
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
ax.set_title('Residual plot')
plt.show()

# Training vs. validation loss per epoch.
figure = plt.figure()
epochs = range(len(hist.history['loss']))
plt.plot(epochs, hist.history['loss'], label='Training')
plt.plot(epochs, hist.history['val_loss'], label='Validation')
plt.legend()
plt.title('Training and validation loss')
plt.xlabel("Epoch")
plt.ylabel("Value")
figure.set_size_inches(15, 4)

In [None]:
# Display the Pearson correlation table computed in the clustermap cell.
correlation_matrix

In [None]:
# Greedy forward selection: starting from no predictors, repeatedly add the
# predictor that gives the lowest 5-fold cross-validated MSE, and plot the
# candidates' MSE plus true-vs-predicted values after every addition.
file_data_new = run_pipeline.fit_transform(health_care.file_data.copy())

model = LinearRegression()
kfolds = KFold(n_splits=5, random_state=1989, shuffle=True)

# Target values defined in this cell (1-D) instead of relying on a `y` left
# behind by a much earlier cell.
y_selection = np.asarray(health_care.file_data[target]).ravel()


def cross_validated_predictions(columns):
    """Out-of-fold LinearRegression predictions using only ``columns``.

    Each sample is predicted by a model trained on the folds that do not
    contain it. The integer ``random_state`` on ``kfolds`` makes the splits
    identical on every call, so results for different column sets are
    directly comparable.
    """
    X = np.asarray(file_data_new[columns])
    out_of_fold = np.zeros(len(y_selection))

    for train_index, test_index in kfolds.split(X, y_selection):
        model.fit(X[train_index], y_selection[train_index])
        out_of_fold[test_index] = model.predict(X[test_index])

    return out_of_fold


current = []

while len(current) < len(predictors):

    candidates = [name for name in predictors if name not in current]

    # Evaluate every remaining predictor on top of the ones already chosen.
    candidate_predictions = {
        name: cross_validated_predictions(current + [name]) for name in candidates
    }
    performance = pd.DataFrame(
        {'MSE': [mean_squared_error(y_selection, candidate_predictions[name])
                 for name in candidates]},
        index=candidates,
    ).sort_values(by='MSE')

    best = performance.index[0]
    current = current + [best]

    # Predictions for the winning set were already computed above; reuse them
    # rather than repeating the same cross-validation.
    yp_test = candidate_predictions[best]

    fig, (mse_ax, fit_ax) = plt.subplots(1, 2, figsize=(15, 4))
    performance.plot.bar(ax=mse_ax)
    fit_ax.scatter(y_selection, yp_test)
    fit_ax.set_xlabel('True values')
    fit_ax.set_ylabel('Predicted values')
    fit_ax.set_title(current[-1] + ', MSE = ' + str(mean_squared_error(y_selection, yp_test)))
    plt.show()