In [1]:
from collections import ChainMap

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from util import load_data, fdr, plot_report

In [2]:
x_train, x_test, x_oot, y_train, y_test, y_oot = load_data('var_30.csv')
x_train.head()

Unnamed: 0,fulladdress_count_1_by_7,address_count_0,ssn_count_3,fulladdress_count_0,homephone_count_7,address_count_1_by_7,name_dob_count_3,name_count_7,homephone_count_3,fulladdress_homephone_count_0_by_14,...,ssn_dob_count_7,ssn_name_count_7,name_day_since,ssn_firstname_count_14,ssn_count_7,name_count_14,fulladdress_count_1_by_14,fulladdress_homephone_count_0_by_30,ssn_lastname_count_14,ssn_name_count_14
215565,0.09144,-0.042158,-0.047591,-0.041351,-0.769214,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
94591,0.09144,-0.042158,-0.047591,-0.041351,0.879479,0.103441,-0.046415,-0.115648,0.639149,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
250735,0.09144,-0.042158,-0.047591,-0.041351,-0.769214,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
306329,0.09144,-0.042158,-0.047591,-0.041351,-0.769214,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
481678,0.09144,-0.042158,-0.047591,-0.041351,-0.769214,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962


In [4]:
rf = RandomForestClassifier(n_jobs=4)
params = {'n_estimators': [50, 150, 450, 750],
          'max_depth': [15, 30, 50],
          'max_features':[5, 10, 20]}
grid = GridSearchCV(rf, params, cv=3, refit='FDR', 
                    verbose=1, n_jobs=2,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 27.2min
[Parallel(n_jobs=2)]: Done 108 out of 108 | elapsed: 95.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   27.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 750 out of 750 | elapsed:  1.8min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=4,
                                              oob_score=False,
                                              random_s

In [7]:
print(f"The best parameters are {grid.best_params_} "
      f"with a score of {grid.best_score_:0.5f}")
pd.DataFrame(grid.cv_results_).sort_values('rank_test_FDR').filter(
    items=['param_n_estimators', 'param_max_features',
           'param_max_depth', 'mean_test_FDR',
           'std_test_FDR', 'rank_test_FDR',
           'mean_test_ROC', 'std_test_ROC'])

The best parameters are {'max_depth': 15, 'max_features': 10, 'n_estimators': 750} with a score of 0.55193


Unnamed: 0,param_n_estimators,param_max_features,param_max_depth,mean_test_FDR,std_test_FDR,rank_test_FDR
7,750,10,15,0.551929,0.014313,1
5,150,10,15,0.551929,0.013605,2
4,50,10,15,0.55182,0.01411,3
6,450,10,15,0.551601,0.014313,4
11,750,20,15,0.551492,0.014926,5
9,150,20,15,0.551274,0.014881,6
10,450,20,15,0.551165,0.014671,7
8,50,20,15,0.550509,0.014362,8
2,450,5,15,0.549198,0.012297,9
3,750,5,15,0.549089,0.012245,10


In [10]:
params = {'max_depth': 15, 'max_features': 10, 'n_estimators': 750, 'n_jobs': 4}
score = cross_val_score(RandomForestClassifier(**params), 
                        x_train, y_train, 
                        scoring=fdr, cv=5, n_jobs=2)
print(score)
print(sum(score) / len(score))

[0.55925724 0.51010377 0.55816494 0.55021834 0.58296943]
0.5521427430067802


In [11]:
params = {'max_depth': 15, 'max_features': 10, 'n_estimators': 150, 'n_jobs': 4}
score = cross_val_score(RandomForestClassifier(**params), 
                        x_train, y_train, 
                        scoring=fdr, cv=5, n_jobs=2)
print(score)
print(sum(score) / len(score))

[0.55871109 0.50955762 0.55816494 0.55021834 0.58406114]
0.552142623760133


In [12]:
rf = RandomForestClassifier(n_jobs=4)
params = {'n_estimators': [100, 150, 200, 300],
          'max_depth': [5, 10, 15],
          'max_features':[7, 10, 15]}
grid = GridSearchCV(rf, params, cv=4, refit='FDR', 
                    verbose=1, n_jobs=2,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 4 folds for each of 36 candidates, totalling 144 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  6.4min
[Parallel(n_jobs=2)]: Done 144 out of 144 | elapsed: 36.4min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=4,
                                              oob_score=False,
                                              random_s

In [13]:
print(f"The best parameters are {grid.best_params_} "
      f"with a score of {grid.best_score_:0.5f}")
pd.DataFrame(grid.cv_results_).sort_values('rank_test_FDR').filter(
    items=['param_n_estimators', 'param_max_features',
           'param_max_depth', 'mean_test_FDR',
           'std_test_FDR', 'rank_test_FDR',
           'mean_test_ROC', 'std_test_ROC'])

The best parameters are {'max_depth': 15, 'max_features': 10, 'n_estimators': 300} with a score of 0.55247


Unnamed: 0,param_n_estimators,param_max_features,param_max_depth,mean_test_FDR,std_test_FDR,rank_test_FDR
31,300,10,15,0.55247,0.019692,1
28,100,10,15,0.552361,0.019544,2
26,200,7,15,0.552361,0.01971,3
29,150,10,15,0.552252,0.019729,4
24,100,7,15,0.552252,0.020498,5
30,200,10,15,0.552143,0.019993,6
34,200,15,15,0.552034,0.019692,7
27,300,7,15,0.551924,0.020153,8
32,100,15,15,0.551924,0.019852,8
25,150,7,15,0.551924,0.020175,10


In [14]:
rf = RandomForestClassifier(n_estimators=300, max_features=10, n_jobs=4)
params = {'max_depth': [12, 15, 17],
          'ccp_alpha': [0, 1e-7, 1e-6, 1e-5]}
grid = GridSearchCV(rf, params, cv=4, refit='FDR', 
                    verbose=1, n_jobs=2,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  48 out of  48 | elapsed: 25.9min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=10,
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=300, n_jobs=4,
                                              oob_score=False,
                                              random_state

In [16]:
print(f"The best parameters are {grid.best_params_} "
      f"with a score of {grid.best_score_:0.5f}")
pd.DataFrame(grid.cv_results_).sort_values('rank_test_FDR').filter(
    items=['param_ccp_alpha',
           'param_max_depth', 'mean_test_FDR',
           'std_test_FDR', 'rank_test_FDR',
           'mean_test_ROC', 'std_test_ROC'])

The best parameters are {'ccp_alpha': 1e-06, 'max_depth': 17} with a score of 0.55258


Unnamed: 0,param_ccp_alpha,param_max_depth,mean_test_FDR,std_test_FDR,rank_test_FDR
8,1e-06,17,0.55258,0.020147,1
7,1e-06,15,0.55258,0.019996,2
11,1e-05,17,0.552361,0.019572,3
4,1e-07,15,0.552252,0.019841,4
1,0.0,15,0.552143,0.020301,5
2,0.0,17,0.552034,0.019717,6
5,1e-07,17,0.551815,0.019992,7
6,1e-06,12,0.550942,0.017422,8
10,1e-05,15,0.550942,0.018968,9
3,1e-07,12,0.550832,0.019993,10


In [17]:
params = {'max_depth': 17, 'max_features': 10, 'n_estimators': 300, 'ccp_alpha': 1e-06, 'n_jobs': 8}
score = cross_val_score(RandomForestClassifier(**params), 
                        x_train, y_train, 
                        scoring=fdr, cv=5, n_jobs=1)
print(score)
print(sum(score) / len(score))

[0.56089569 0.51010377 0.55980339 0.55076419 0.58351528]
0.5530164631921374


In [18]:
rf = RandomForestClassifier(n_estimators=300,
                            max_depth=17, 
                            max_features=10,
                            ccp_alpha=1e-6, 
                            n_jobs=8).fit(x_train, y_train)
print(fdr(rf, x_train, y_train))
print(fdr(rf, x_test, y_test))

0.55585890575516
0.5369330453563715


In [19]:
rf = RandomForestClassifier(n_estimators=300, max_features=10, n_jobs=8)
params = {'max_depth': [17, 20, 40],
          'ccp_alpha': [0, 1e-6, 1e-5, 1e-4]}
grid = GridSearchCV(rf, params, cv=4, refit='FDR', 
                    verbose=1, n_jobs=1,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 36.5min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=10,
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=300, n_jobs=8,
                                              oob_score=False,
                                              random_state

In [21]:
print(f"The best parameters are {grid.best_params_} with a score of {grid.best_score_:0.5f}")
pd.DataFrame(grid.cv_results_).sort_values('rank_test_FDR').filter(
    items=['param_ccp_alpha', 'param_max_depth', 'mean_test_FDR',
           'std_test_FDR', 'rank_test_FDR', 'mean_test_ROC', 'std_test_ROC'])

The best parameters are {'ccp_alpha': 1e-06, 'max_depth': 17} with a score of 0.55247


Unnamed: 0,param_ccp_alpha,param_max_depth,mean_test_FDR,std_test_FDR,rank_test_FDR
3,1e-06,17,0.55247,0.019837,1
6,1e-05,17,0.552143,0.01861,2
4,1e-06,20,0.552143,0.019687,3
7,1e-05,20,0.552034,0.018777,4
0,0.0,17,0.551924,0.019572,5
5,1e-06,40,0.551488,0.019684,6
8,1e-05,40,0.551379,0.017531,7
1,0.0,20,0.550068,0.01957,8
11,0.0001,40,0.544935,0.019039,9
10,0.0001,20,0.544826,0.019534,10


In [3]:
# use tuned hyper-parameters
rf = RandomForestClassifier(n_estimators=300,
                            max_depth=17, 
                            max_features=10,
                            ccp_alpha=1e-6, 
                            n_jobs=8).fit(x_train, y_train)
print(fdr(rf, x_train, y_train))
print(fdr(rf, x_test, y_test))
print(fdr(rf, x_oot, y_oot))

0.5492605480643759
0.56195079086116
0.5395412529955494


In [4]:
# use parameters tuned last time
rf = RandomForestClassifier(n_estimators=700,
                            max_depth=20,
                            max_features=8,
                            ccp_alpha=1e-7, 
                            n_jobs=8).fit(x_train, y_train)
print(fdr(rf, x_train, y_train))
print(fdr(rf, x_test, y_test))
print(fdr(rf, x_oot, y_oot))

0.5555676381035233
0.5593145869947276
0.5395412529955494


In [5]:
# use params tuned for decision tree
rf = RandomForestClassifier(n_estimators=100,
                            max_depth=20,
                            max_features=30,
                            min_samples_leaf=128,
                            ccp_alpha=1e-7, 
                            n_jobs=8).fit(x_train, y_train)
print(fdr(rf, x_train, y_train))
print(fdr(rf, x_test, y_test))
print(fdr(rf, x_oot, y_oot))

0.5511091779034363
0.5540421792618629
0.5371448134200616


In [23]:
x_train_40, x_test_40, x_oot_40, y_train_40, y_test_40, y_oot_40 = load_data('var_40.csv')
rf = RandomForestClassifier(n_estimators=300,
                            max_depth=17, 
                            max_features=10,
                            ccp_alpha=1e-6, 
                            n_jobs=8).fit(x_train_40, y_train_40)
print(fdr(rf, x_train_40, y_train_40))
print(fdr(rf, x_test_40, y_test_40))
print(fdr(rf, x_oot_40, y_oot_40))

0.5564560289124959
0.562580093976933
0.5460458747004451
