In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from pprint import pprint

In [2]:
# Load the per-country OECD table.
# NOTE(review): `error_bad_lines=False` asks pandas to drop malformed rows instead of
# raising; the keyword was deprecated in pandas 1.3 and removed in 2.0 (replacement:
# `on_bad_lines='skip'`) -- confirm which pandas version this notebook targets.
with open('CSV_Creation/OECD_countries_data.csv') as csv_file:
    overall_data_by_country = pd.read_csv(csv_file, error_bad_lines=False)

# We want to predict Death_rate from the countries' other columns, so the target
# and the country name are taken out of the feature list.
features = overall_data_by_country.columns.tolist()
for excluded in ('Death_rate', 'Country'):
    features.remove(excluded)
overall_data_by_country = overall_data_by_country.set_index('Country')

#Countries are classified according to their death rate. The label 'True' means
#that a country has a high mortality rate compared to other countries.

def classify_countries(threshold, overall_data_by_country):
    """Label each row by whether its first-column value exceeds ``threshold``.

    Parameters
    ----------
    threshold : number
        Cut-off value; a row is labelled True only when its value is strictly
        greater than this.
    overall_data_by_country : pandas.DataFrame
        Table whose column at position 0 holds the value to classify.
        NOTE(review): the notebook relies on that column being 'Death_rate'
        once 'Country' has become the index -- confirm against the CSV layout.

    Returns
    -------
    numpy.ndarray
        Boolean array with one entry per row, in row order. It is boolean even
        when the table has no rows.
    """
    # Read the column once; indexing `.values[i, 0]` inside a loop would rebuild the
    # array for the whole table on every iteration.
    first_column = overall_data_by_country.iloc[:, 0].to_numpy()
    # NaN compares False against any threshold, so missing values are labelled False.
    return np.asarray(first_column > threshold, dtype=bool)

# Label (y): True for countries whose death rate is above the cut-off of 8.
death_rate_class = classify_countries(threshold=8, overall_data_by_country=overall_data_by_country)
# Features (X): every remaining column, as a plain array.
other_data = overall_data_by_country.loc[:, features].values

In [3]:
# Baseline forest with default settings, used to inspect the default hyperparameters.
Rfc = RandomForestClassifier()
# 5-fold splitter; build_list_scores reads it as a module-level name.
cv = KFold(n_splits=5)
print('Parameters currently in use:\n')
pprint(Rfc.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [4]:
# Candidate values for each hyperparameter sampled by the randomized search.

# Number of trees in the forest: 200, 400, ..., 2000
n_estimators = list(range(200, 2001, 200))
# Number of features to consider at every split
# NOTE(review): scikit-learn documents 'auto' for RandomForestClassifier as
# sqrt(n_features), i.e. the same setting as 'sqrt' -- confirm whether a different
# second option was intended.
max_features = ['auto', 'sqrt']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None
max_depth = list(range(10, 111, 10)) + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Assemble the random grid
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [5]:
# Randomized hyperparameter search over random_grid:
# 50 sampled combinations, each scored with 5-fold cross validation,
# running on all available cores.
Rfc_random = RandomizedSearchCV(
    estimator=Rfc,
    param_distributions=random_grid,
    n_iter=50,
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
# Fit the search on the full data set.
Rfc_random.fit(other_data, death_rate_class)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   44.0s finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [6]:
# Keep the best estimator found by the randomized search and show its hyperparameters.
best_random = Rfc_random.best_estimator_
pprint(Rfc_random.best_params_)

{'bootstrap': True,
 'max_depth': 30,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 400}


In [7]:
def build_list_scores(Method, X, y):
    """Score ``Method`` fold by fold and collect accuracy, recall and precision.

    The folds come from the module-level ``cv`` splitter. ``Method`` is refit in
    place on each training split, so after the call it holds the fit from the
    last fold.

    Parameters
    ----------
    Method : estimator exposing ``fit`` and ``predict``
    X : array indexable by the fold index arrays (features)
    y : array indexable by the fold index arrays (labels)

    Returns
    -------
    tuple of three lists ``(accuracies, recalls, precisions)``, one entry per fold.
    Recall is ``np.nan`` for a fold whose test labels contain no True; precision
    is ``np.nan`` for a fold where nothing was predicted True.
    """
    accuracies, recalls, precisions = [], [], []
    for train, test in cv.split(X, y):
        Method.fit(X[train], y[train])
        y_true = y[test]
        y_pred = Method.predict(X[test])

        accuracies.append(metrics.accuracy_score(y_true, y_pred))
        # Recall is only meaningful when the fold actually contains positives.
        recalls.append(metrics.recall_score(y_true, y_pred) if True in y_true else np.nan)
        # Precision is only meaningful when at least one positive was predicted.
        precisions.append(metrics.precision_score(y_true, y_pred) if True in y_pred else np.nan)
    return (accuracies, recalls, precisions)
            
# Per-fold scores of the best estimator from the randomized search.
accuracies, recalls, precisions = build_list_scores(Method=best_random, X=other_data, y=death_rate_class)

In [8]:
# Count the folds whose precision is undefined (recorded as NaN by build_list_scores).
# np.isnan is used rather than `is np.nan`: an identity test only matches the np.nan
# singleton and would miss a NaN produced in any other way.
print('Number of predictions whithout positive results:',sum(1 for p in precisions if np.isnan(p)))
# Mean and spread over folds, ignoring the NaN entries.
print('The average accuracy is: %0.2f +/- %0.2f' %(np.nanmean(accuracies), np.nanstd(accuracies)))
print('The average recall is: %0.2f +/- %0.2f' %(np.nanmean(recalls), np.nanstd(recalls)))
print('The average precision is: %0.2f +/- %0.2f' %(np.nanmean(precisions), np.nanstd(precisions)))

Number of predictions whithout positive results: 1
The average accuracy is: 0.93 +/- 0.08
The average recall is: 0.70 +/- 0.40
The average precision is: 1.00 +/- 0.00


In [9]:
# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 20, 30, 40, 50],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1],
    # scikit-learn requires an integer min_samples_split to be at least 2, so the
    # grid starts at 2; a value of 1 is rejected by the estimator and would waste
    # one sixth of the candidates.
    'min_samples_split': [2, 3, 5, 7, 9, 11],
    'n_estimators': [400]
}

# Fresh base model for the grid search
Rfc = RandomForestClassifier()

# Grid search over param_grid with 5-fold cross validation on all available cores
grid_search = GridSearchCV(
    estimator=Rfc,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [10]:
# Run the grid search on the full data set, then display the winning combination.
grid_search.fit(X=other_data, y=death_rate_class)
grid_search.best_params_

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   12.3s finished


{'bootstrap': True,
 'max_depth': 5,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 400}

In [11]:
# Re-score the best estimator found by the grid search, fold by fold.
best_grid = grid_search.best_estimator_
(accuracies, recalls, precisions) = build_list_scores(best_grid, other_data, death_rate_class)
# Count the folds whose precision is undefined (recorded as NaN by build_list_scores).
# np.isnan is used rather than `is np.nan`: an identity test only matches the np.nan
# singleton and would miss a NaN produced in any other way.
print('Number of predictions whithout positive results:',sum(1 for p in precisions if np.isnan(p)))
# Mean and spread over folds, ignoring the NaN entries.
print('The average accuracy is: %0.2f +/- %0.2f' %(np.nanmean(accuracies), np.nanstd(accuracies)))
print('The average recall is: %0.2f +/- %0.2f' %(np.nanmean(recalls), np.nanstd(recalls)))
print('The average precision is: %0.2f +/- %0.2f' %(np.nanmean(precisions), np.nanstd(precisions)))

Number of predictions whithout positive results: 1
The average accuracy is: 0.93 +/- 0.08
The average recall is: 0.70 +/- 0.40
The average precision is: 1.00 +/- 0.00
