In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import KFold

In [2]:
with open('CSV_Creation/OECD_countries_data.csv') as f:
    overall_data_by_country = pd.read_csv(f, error_bad_lines=False)

#we want to predict Death_rate by using countries' features 
features = list(overall_data_by_country.columns)
features.remove('Death_rate')
features.remove('Country')
overall_data_by_country = overall_data_by_country.set_index('Country')

#countries are classified according their death rate. Countries with the label 'True' means 
#that these countries have a high mortality rate compare to other countries

def classify_countries(threshold, overall_data_by_country):
    death_rate_class = []
    for i in range(overall_data_by_country.shape[0]):
        if overall_data_by_country.values[i,0] > threshold:
            death_rate_class.append(True)
        else:
            death_rate_class.append(False)
    return np.array(death_rate_class)

death_rate_class = classify_countries(8, overall_data_by_country) #the label (y)
other_data = overall_data_by_country[features].values #features to find the label (X)

In [3]:
#separating the data for training and testing 
train_death_rate, test_death_rate, train_other_data, test_other_data = train_test_split(death_rate_class, other_data, test_size=0.25)

In [4]:
#building of the model
Gbc = GradientBoostingClassifier()
cv = KFold(n_splits=5)

Gbc.fit(train_other_data, train_death_rate)
pred_death_rate = Gbc.predict(test_other_data)
pred_train = Gbc.predict(train_other_data)

In [5]:
#tool to assess the model with the training dataset
print('Accuracy:', metrics.accuracy_score(train_death_rate, pred_train))
print('Precision:', metrics.precision_score(train_death_rate, pred_train))
print('Recall: ', metrics.recall_score(train_death_rate, pred_train))

Accuracy: 1.0
Precision: 1.0
Recall:  1.0


In [6]:
#tools to assess the model with the test dataset 
print('Accuracy:', metrics.accuracy_score(test_death_rate, pred_death_rate))
print('Precision:', metrics.precision_score(test_death_rate, pred_death_rate))
print('Recall: ', metrics.recall_score(test_death_rate, pred_death_rate))
print(metrics.confusion_matrix(test_death_rate, pred_death_rate))

Accuracy: 0.875
Precision: 0.6666666666666666
Recall:  1.0
[[5 1]
 [0 2]]


In [7]:
#contributions of the feature for classifying the data
feature_imp = pd.Series(Gbc.feature_importances_,index=features).sort_values(ascending=False)
print(feature_imp)

Number of tests per confirmed cases    0.675798
Proportion of edler people             0.224285
Proportion of overweight people        0.034324
Health employment per 1000 hab         0.033348
Hospital beds per 1000 hab             0.032245
dtype: float64


In [8]:
#this function return 3 lists of metrics obtained by cross validation
def build_list_scores(Method, X, y):
    accuracies = []
    recalls = []
    precisions = []
    for i, (train, test) in enumerate(cv.split(X,y)):
        Method.fit(X[train], y[train])
        y_pred = Method.predict(X[test])
        accuracies.append(metrics.accuracy_score(y[test], y_pred))
        if True in y[test]:#test dataset can have no postive sample due to the imbalance
            recalls.append(metrics.recall_score(y[test], y_pred))
        else: 
            recalls.append(np.nan)
        if True in y_pred:#predictions can have no positive results 
            precisions.append(metrics.precision_score(y[test], y_pred))
        else:
            precisions.append(np.nan)
    return (accuracies, recalls, precisions)
            
(accuracies, recalls, precisions) = build_list_scores(Gbc, other_data, death_rate_class)

In [9]:
print('Number of predictions whithout positive results:',len([i for i in precisions if i is np.nan]))
print('The average accuracy is: %0.2f +/- %0.2f' %(np.nanmean(accuracies), np.nanstd(accuracies)))
print('The average recall is: %0.2f +/- %0.2f' %(np.nanmean(recalls), np.nanstd(recalls)))
print('The average precision is: %0.2f +/- %0.2f' %(np.nanmean(precisions), np.nanstd(precisions)))

Number of predictions whithout positive results: 1
The average accuracy is: 0.78 +/- 0.15
The average recall is: 0.60 +/- 0.37
The average precision is: 0.69 +/- 0.32
