In [14]:
from collections import ChainMap

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from util import load_data, fdr, plot_report
pd.set_option('display.max_rows', 100)

In [15]:
x_train, x_test, x_oot, y_train, y_test, y_oot = load_data('var_30.csv')
x_train.head()

Unnamed: 0,ssn_count_3,fulladdress_count_0,address_count_0,name_dob_count_3,homephone_count_14,name_dob_count_7,name_day_since,ssn_firstname_count_0_by_14,name_count_7,fulladdress_homephone_count_0_by_14,...,ssn_count_0_by_14,ssn_firstname_count_7,ssn_count_7,fulladdress_count_1_by_14,fulladdress_homephone_count_7,ssn_name_dob_count_7,name_count_14,ssn_firstname_count_0_by_30,ssn_lastname_count_0_by_30,ssn_count_14
236003,-0.047591,-0.041351,-0.042158,-0.046415,-0.437943,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
736281,-0.047591,-0.041351,-0.042158,-0.046415,-0.437943,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
572600,-0.047591,-0.041351,-0.042158,-0.046415,0.135423,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
148694,-0.047591,-0.041351,-0.042158,-0.046415,-1.011308,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
305769,-0.047591,-0.041351,-0.042158,-0.046415,-1.011308,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421


In [4]:
rf = RandomForestClassifier(n_jobs=4)
params = {'n_estimators': [50, 150, 450, 750],
          'max_depth': [15, 30, 50],
          'max_features':[5, 10, 20]}
grid = GridSearchCV(rf, params, cv=3, refit='FDR', 
                    verbose=1, n_jobs=2,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 27.2min
[Parallel(n_jobs=2)]: Done 108 out of 108 | elapsed: 95.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   27.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 750 out of 750 | elapsed:  1.8min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=4,
                                              oob_score=False,
                                              random_s

In [7]:
print(f"The best parameters are {grid.best_params_} "
      f"with a score of {grid.best_score_:0.5f}")
pd.DataFrame(grid.cv_results_).sort_values('rank_test_FDR').filter(
    items=['param_n_estimators', 'param_max_features',
           'param_max_depth', 'mean_test_FDR',
           'std_test_FDR', 'rank_test_FDR',
           'mean_test_ROC', 'std_test_ROC'])

The best parameters are {'max_depth': 15, 'max_features': 10, 'n_estimators': 750} with a score of 0.55193


Unnamed: 0,param_n_estimators,param_max_features,param_max_depth,mean_test_FDR,std_test_FDR,rank_test_FDR
7,750,10,15,0.551929,0.014313,1
5,150,10,15,0.551929,0.013605,2
4,50,10,15,0.55182,0.01411,3
6,450,10,15,0.551601,0.014313,4
11,750,20,15,0.551492,0.014926,5
9,150,20,15,0.551274,0.014881,6
10,450,20,15,0.551165,0.014671,7
8,50,20,15,0.550509,0.014362,8
2,450,5,15,0.549198,0.012297,9
3,750,5,15,0.549089,0.012245,10


In [10]:
params = {'max_depth': 15, 'max_features': 10, 'n_estimators': 750, 'n_jobs': 4}
score = cross_val_score(RandomForestClassifier(**params), 
                        x_train, y_train, 
                        scoring=fdr, cv=5, n_jobs=2)
print(score)
print(sum(score) / len(score))

[0.55925724 0.51010377 0.55816494 0.55021834 0.58296943]
0.5521427430067802


In [11]:
params = {'max_depth': 15, 'max_features': 10, 'n_estimators': 150, 'n_jobs': 4}
score = cross_val_score(RandomForestClassifier(**params), 
                        x_train, y_train, 
                        scoring=fdr, cv=5, n_jobs=2)
print(score)
print(sum(score) / len(score))

[0.55871109 0.50955762 0.55816494 0.55021834 0.58406114]
0.552142623760133


In [12]:
rf = RandomForestClassifier(n_jobs=4)
params = {'n_estimators': [100, 150, 200, 300],
          'max_depth': [5, 10, 15],
          'max_features':[7, 10, 15]}
grid = GridSearchCV(rf, params, cv=4, refit='FDR', 
                    verbose=1, n_jobs=2,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 4 folds for each of 36 candidates, totalling 144 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  6.4min
[Parallel(n_jobs=2)]: Done 144 out of 144 | elapsed: 36.4min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=4,
                                              oob_score=False,
                                              random_s

In [13]:
print(f"The best parameters are {grid.best_params_} "
      f"with a score of {grid.best_score_:0.5f}")
pd.DataFrame(grid.cv_results_).sort_values('rank_test_FDR').filter(
    items=['param_n_estimators', 'param_max_features',
           'param_max_depth', 'mean_test_FDR',
           'std_test_FDR', 'rank_test_FDR',
           'mean_test_ROC', 'std_test_ROC'])

The best parameters are {'max_depth': 15, 'max_features': 10, 'n_estimators': 300} with a score of 0.55247


Unnamed: 0,param_n_estimators,param_max_features,param_max_depth,mean_test_FDR,std_test_FDR,rank_test_FDR
31,300,10,15,0.55247,0.019692,1
28,100,10,15,0.552361,0.019544,2
26,200,7,15,0.552361,0.01971,3
29,150,10,15,0.552252,0.019729,4
24,100,7,15,0.552252,0.020498,5
30,200,10,15,0.552143,0.019993,6
34,200,15,15,0.552034,0.019692,7
27,300,7,15,0.551924,0.020153,8
32,100,15,15,0.551924,0.019852,8
25,150,7,15,0.551924,0.020175,10


In [14]:
rf = RandomForestClassifier(n_estimators=300, max_features=10, n_jobs=4)
params = {'max_depth': [12, 15, 17],
          'ccp_alpha': [0, 1e-7, 1e-6, 1e-5]}
grid = GridSearchCV(rf, params, cv=4, refit='FDR', 
                    verbose=1, n_jobs=2,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  48 out of  48 | elapsed: 25.9min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=10,
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=300, n_jobs=4,
                                              oob_score=False,
                                              random_state

In [16]:
print(f"The best parameters are {grid.best_params_} "
      f"with a score of {grid.best_score_:0.5f}")
pd.DataFrame(grid.cv_results_).sort_values('rank_test_FDR').filter(
    items=['param_ccp_alpha',
           'param_max_depth', 'mean_test_FDR',
           'std_test_FDR', 'rank_test_FDR',
           'mean_test_ROC', 'std_test_ROC'])

The best parameters are {'ccp_alpha': 1e-06, 'max_depth': 17} with a score of 0.55258


Unnamed: 0,param_ccp_alpha,param_max_depth,mean_test_FDR,std_test_FDR,rank_test_FDR
8,1e-06,17,0.55258,0.020147,1
7,1e-06,15,0.55258,0.019996,2
11,1e-05,17,0.552361,0.019572,3
4,1e-07,15,0.552252,0.019841,4
1,0.0,15,0.552143,0.020301,5
2,0.0,17,0.552034,0.019717,6
5,1e-07,17,0.551815,0.019992,7
6,1e-06,12,0.550942,0.017422,8
10,1e-05,15,0.550942,0.018968,9
3,1e-07,12,0.550832,0.019993,10


In [17]:
params = {'max_depth': 17, 'max_features': 10, 'n_estimators': 300, 'ccp_alpha': 1e-06, 'n_jobs': 8}
score = cross_val_score(RandomForestClassifier(**params), 
                        x_train, y_train, 
                        scoring=fdr, cv=5, n_jobs=1)
print(score)
print(sum(score) / len(score))

[0.56089569 0.51010377 0.55980339 0.55076419 0.58351528]
0.5530164631921374


In [18]:
rf = RandomForestClassifier(n_estimators=300,
                            max_depth=17, 
                            max_features=10,
                            ccp_alpha=1e-6, 
                            n_jobs=8).fit(x_train, y_train)
print(fdr(rf, x_train, y_train))
print(fdr(rf, x_test, y_test))

0.55585890575516
0.5369330453563715


In [19]:
rf = RandomForestClassifier(n_estimators=300, max_features=10, n_jobs=8)
params = {'max_depth': [17, 20, 40],
          'ccp_alpha': [0, 1e-6, 1e-5, 1e-4]}
grid = GridSearchCV(rf, params, cv=4, refit='FDR', 
                    verbose=1, n_jobs=1,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 36.5min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=10,
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=300, n_jobs=8,
                                              oob_score=False,
                                              random_state

In [21]:
print(f"The best parameters are {grid.best_params_} with a score of {grid.best_score_:0.5f}")
pd.DataFrame(grid.cv_results_).sort_values('rank_test_FDR').filter(
    items=['param_ccp_alpha', 'param_max_depth', 'mean_test_FDR',
           'std_test_FDR', 'rank_test_FDR', 'mean_test_ROC', 'std_test_ROC'])

The best parameters are {'ccp_alpha': 1e-06, 'max_depth': 17} with a score of 0.55247


Unnamed: 0,param_ccp_alpha,param_max_depth,mean_test_FDR,std_test_FDR,rank_test_FDR
3,1e-06,17,0.55247,0.019837,1
6,1e-05,17,0.552143,0.01861,2
4,1e-06,20,0.552143,0.019687,3
7,1e-05,20,0.552034,0.018777,4
0,0.0,17,0.551924,0.019572,5
5,1e-06,40,0.551488,0.019684,6
8,1e-05,40,0.551379,0.017531,7
1,0.0,20,0.550068,0.01957,8
11,0.0001,40,0.544935,0.019039,9
10,0.0001,20,0.544826,0.019534,10


In [4]:
rf = RandomForestClassifier(n_estimators=300, max_features=10, ccp_alpha=0, n_jobs=8)
params = {'max_depth': [10, 17, 40],
          'min_samples_leaf': [4, 16, 64, 256]}
grid = GridSearchCV(rf, params, cv=4, refit='FDR', 
                    verbose=1, n_jobs=1,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 32.7min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=10,
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=300, n_jobs=-1,
                                              oob_score=False,
                                              random_state=

In [5]:
print(f"The best parameters are {grid.best_params_} with a score of {grid.best_score_:0.5f}")
pd.DataFrame(grid.cv_results_).sort_values('rank_test_FDR').filter(
    items=['param_max_depth', 'param_min_samples_leaf', 'mean_test_FDR',
           'std_test_FDR', 'rank_test_FDR', 'mean_test_ROC', 'std_test_ROC'])

The best parameters are {'max_depth': 17, 'min_samples_leaf': 4} with a score of 0.54923


Unnamed: 0,param_max_depth,param_min_samples_leaf,mean_test_FDR,std_test_FDR,rank_test_FDR
4,17,4,0.549229,0.003725,1
5,17,16,0.548795,0.004466,2
6,17,64,0.548034,0.004084,3
10,40,64,0.547382,0.003564,4
9,40,16,0.545752,0.00277,5
2,10,64,0.545317,0.003166,6
1,10,16,0.544882,0.002487,7
0,10,4,0.544447,0.002972,8
7,17,256,0.543578,0.003044,9
3,10,256,0.543361,0.003122,10


In [6]:
rf = RandomForestClassifier(n_estimators=300, max_features=10, ccp_alpha=1e-6, n_jobs=8)
params = {'max_depth': [15, 17, 20],
          'min_samples_leaf': [1, 2, 4, 8]}
grid = GridSearchCV(rf, params, cv=4, refit='FDR', 
                    verbose=1, n_jobs=1,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 35.5min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=1e-06,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=10,
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=300, n_jobs=8,
                                              oob_score=False,
                                              random_sta

In [7]:
print(f"The best parameters are {grid.best_params_} with a score of {grid.best_score_:0.5f}")
pd.DataFrame(grid.cv_results_).sort_values('rank_test_FDR').filter(
    items=['param_max_depth', 'param_min_samples_leaf', 'mean_test_FDR',
           'std_test_FDR', 'rank_test_FDR', 'mean_test_ROC', 'std_test_ROC'])

The best parameters are {'max_depth': 15, 'min_samples_leaf': 2} with a score of 0.55075


Unnamed: 0,param_max_depth,param_min_samples_leaf,mean_test_FDR,std_test_FDR,rank_test_FDR
1,15,2,0.550751,0.003802,1
4,17,1,0.550642,0.003164,2
0,15,1,0.550533,0.003758,3
5,17,2,0.550533,0.003258,4
7,17,8,0.550425,0.003998,5
8,20,1,0.550425,0.003457,6
9,20,2,0.550424,0.003296,7
6,17,4,0.550316,0.003497,8
11,20,8,0.550207,0.003847,9
10,20,4,0.550207,0.00343,10


In [8]:
rf = RandomForestClassifier(n_jobs=4)
params = {'n_estimators': [150, 300, 450], 
          'min_samples_leaf': [1, 2, 4],
          'max_depth': [13, 17, 20],
          'ccp_alpha': [1e-8, 1e-6, 1e-5]}
grid = GridSearchCV(rf, params, cv=3, refit='FDR', 
                    verbose=1, n_jobs=1,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 243 out of 243 | elapsed: 107.4min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=4,
                                              oob_score=False,
                                              random_s

In [11]:
print(f"The best parameters are {grid.best_params_} with a score of {grid.best_score_:0.5f}")
pd.DataFrame(grid.cv_results_).sort_values('rank_test_FDR').filter(
    items=['param_n_estimators', 'param_max_depth', 'param_ccp_alpha', 'param_min_samples_leaf', 'mean_test_FDR',
           'std_test_FDR', 'rank_test_FDR', 'mean_test_ROC', 'std_test_ROC'])

The best parameters are {'ccp_alpha': 1e-06, 'max_depth': 20, 'min_samples_leaf': 2, 'n_estimators': 150} with a score of 0.55021


Unnamed: 0,param_n_estimators,param_max_depth,param_ccp_alpha,param_min_samples_leaf,mean_test_FDR,std_test_FDR,rank_test_FDR
48,150,20,1e-06,2,0.550206,0.003062,1
49,300,20,1e-06,2,0.550206,0.002406,2
36,150,17,1e-06,1,0.550098,0.002454,3
46,300,20,1e-06,1,0.549989,0.002601,4
45,150,20,1e-06,1,0.549989,0.002289,5
53,450,20,1e-06,4,0.549989,0.003203,6
12,150,17,1e-08,2,0.54988,0.002689,7
41,450,17,1e-06,2,0.549772,0.002019,8
51,150,20,1e-06,4,0.549772,0.003092,9
14,450,17,1e-08,2,0.549772,0.002578,10


In [17]:
rf = RandomForestClassifier(ccp_alpha=1e-6, n_jobs=4, n_estimators=300)
params = {'min_samples_leaf': [1, 2, 4],
          'max_depth': [15, 17, 20],
          'max_features': [10, 13, 15]}
grid = GridSearchCV(rf, params, cv=4, refit='FDR', 
                    verbose=1, n_jobs=2,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 4 folds for each of 27 candidates, totalling 108 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 80.3min
[Parallel(n_jobs=2)]: Done 108 out of 108 | elapsed: 214.2min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=1e-06,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=300, n_jobs=4,
                                              oob_score=False,
                                              random

In [18]:
print(f"The best parameters are {grid.best_params_} with a score of {grid.best_score_:0.5f}")
pd.DataFrame(grid.cv_results_).sort_values('rank_test_FDR').filter(
    items=['param_n_estimators', 'param_max_depth', 'param_ccp_alpha', 
           'param_min_samples_leaf', 'param_max_features', 'mean_test_FDR',
           'std_test_FDR', 'rank_test_FDR', 'mean_test_ROC', 'std_test_ROC'])

The best parameters are {'max_depth': 20, 'max_features': 10, 'min_samples_leaf': 2} with a score of 0.55082


Unnamed: 0,param_max_depth,param_min_samples_leaf,param_max_features,mean_test_FDR,std_test_FDR,rank_test_FDR
19,20,2,10,0.550819,0.002024,1
26,20,4,15,0.550558,0.001926,2
22,20,2,13,0.550558,0.001926,2
23,20,4,13,0.550471,0.002039,4
21,20,1,13,0.550471,0.001932,4
11,17,4,10,0.550471,0.002024,4
17,17,4,15,0.550384,0.001995,7
25,20,2,15,0.550384,0.00207,8
5,15,4,13,0.550296,0.00247,9
20,20,4,10,0.550296,0.001818,9


In [27]:
rf = RandomForestClassifier(max_depth=20,
                            max_features=10,
                            ccp_alpha=1e-6,
                            min_samples_leaf=2,
                            n_jobs=4)
params = {'n_estimators': [20, 50, 100, 200, 300]}
grid = GridSearchCV(rf, params, cv=4, refit='FDR', 
                    verbose=1, n_jobs=2,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed: 16.3min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=1e-06,
                                              class_weight=None,
                                              criterion='gini', max_depth=20,
                                              max_features=10,
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=2,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=4,
                                              oob_score=False,
                                              random_state

In [28]:
print(f"The best parameters are {grid.best_params_} with a score of {grid.best_score_:0.5f}")
pd.DataFrame(grid.cv_results_).sort_values('rank_test_FDR').filter(
    items=['param_n_estimators', 'param_max_depth', 'param_ccp_alpha', 
           'param_min_samples_leaf', 'param_max_features', 'mean_test_FDR',
           'std_test_FDR', 'rank_test_FDR', 'mean_test_ROC', 'std_test_ROC'])

The best parameters are {'n_estimators': 200} with a score of 0.55021


Unnamed: 0,param_n_estimators,mean_test_FDR,std_test_FDR,rank_test_FDR
3,200,0.550209,0.011834,1
1,50,0.550122,0.011568,2
4,300,0.550122,0.011568,2
2,100,0.550035,0.011877,4
0,20,0.549686,0.011662,5


In [29]:
x_train, _, x_oot, y_train, _, y_oot = load_data('var_30.csv', test_size=0)
x_train = x_train.to_numpy()

In [30]:
# use tuned hyper-parameters
rf = RandomForestClassifier(n_estimators=200,
                            max_depth=20,
                            max_features=10,
                            ccp_alpha=1e-6,
                            min_samples_leaf=2,
                            n_jobs=8)
score = cross_validate(rf, x_train, y_train, scoring={'FDR': fdr}, 
                       cv=5, return_train_score=True, return_estimator=True)
print('train_score:', np.mean(score['train_FDR']))
print('test_score:', np.mean(score['test_FDR']))
oot_score = [fdr(clf, x_oot, y_oot) for clf in score['estimator']]
print('oot_score', np.mean(oot_score))

train_score: 0.554851008541183
test_score: 0.5506458640803993
oot_score 0.540362889421431


In [31]:
# use tuned hyper-parameters
rf = RandomForestClassifier(n_estimators=50,
                            max_depth=20,
                            max_features=10,
                            ccp_alpha=1e-6,
                            min_samples_leaf=2,
                            n_jobs=8)
score = cross_validate(rf, x_train, y_train, scoring={'FDR': fdr}, 
                       cv=5, return_train_score=True, return_estimator=True)
print('train_score:', np.mean(score['train_FDR']))
print('test_score:', np.mean(score['test_FDR']))
oot_score = [fdr(clf, x_oot, y_oot) for clf in score['estimator']]
print('oot_score', np.mean(oot_score))

train_score: 0.5536305922508528
test_score: 0.5498614363889507
oot_score 0.5394727832933927


In [25]:
x_train_40, _, x_oot_40, y_train_40, _, y_oot_40 = load_data('var_40.csv', test_size=0)
rf = RandomForestClassifier(n_estimators=200,
                            max_depth=20,
                            max_features=10,
                            ccp_alpha=1e-6,
                            min_samples_leaf=2,
                            n_jobs=8)
score = cross_validate(rf, x_train, y_train, scoring=fdr, 
                       cv=5, return_train_score=True, return_estimator=True)
oot_score = []
for clf in score['estimator']:
    oot_score.append(fdr(clf, x_oot, y_oot))
print('train_score:', np.mean(score['train_score']))
print('test_score:', np.mean(score['test_score']))
print('oot_score', np.mean(oot_score))

train_score: 0.5548945530282355
test_score: 0.5503839323194162
oot_score 0.5398151318041767
