In [2]:
from collections import ChainMap

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from util import load_data, fdr, plot_report

In [3]:
x_train, x_test, x_oot, y_train, y_test, y_oot = load_data('var_30.csv')
x_train.head()

Unnamed: 0,ssn_name_count_14,ssn_name_dob_count_14,ssn_lastname_count_14,ssn_firstname_count_14,ssn_dob_count_0_by_30,name_dob_count_0_by_30,fulladdress_homephone_count_14,fulladdress_count_1_by_14,fulladdress_homephone_day_since,ssn_dob_day_since,...,fulladdress_count_0,name_dob_count_3,fulladdress_homephone_count_0_by_14,name_day_since,name_count_14,ssn_dob_count_0_by_14,ssn_lastname_count_0_by_14,ssn_firstname_count_0_by_14,ssn_name_day_since,address_count_1_by_7
624857,-0.070962,-0.068281,-0.071051,-0.071088,0.162987,0.163303,-0.070089,0.123861,0.597648,0.590158,...,-0.041351,-0.046415,0.120387,0.808162,-0.159376,0.118004,0.123458,0.123528,0.608866,0.103441
284827,-0.070962,-0.068281,-0.071051,-0.071088,0.162987,0.163303,-0.070089,0.123861,-0.892847,-0.899409,...,-0.041351,-0.046415,0.120387,-0.676001,-0.159376,0.118004,0.123458,0.123528,-0.884078,0.103441
100203,-0.070962,-0.068281,-0.071051,-0.071088,0.162987,0.163303,-0.070089,0.123861,-1.225123,-1.231478,...,-0.041351,-0.046415,0.120387,-1.006865,-0.159376,0.118004,0.123458,0.123528,-1.2169,0.103441
131538,-0.070962,-0.068281,-0.071051,-0.071088,0.162987,0.163303,-0.070089,0.123861,-1.120693,-1.127113,...,-0.041351,-0.046415,0.120387,-0.902879,-0.159376,0.118004,0.123458,0.123528,-1.112299,0.103441
525156,-0.070962,-0.068281,-0.071051,-0.071088,0.162987,0.163303,-0.070089,0.123861,-1.253604,-1.259941,...,-0.041351,-0.046415,0.120387,-1.035225,-0.159376,0.118004,0.123458,0.123528,-1.245428,0.103441


In [3]:
rf = RandomForestClassifier(verbose=1)
params = {'n_estimators': [200, 500, 700, 1000],
          'max_depth': [20, 50, 80, 100],
          'max_features':[5, 8, 10]}
grid = GridSearchCV(rf, params, cv=2, refit='FDR', 
                    verbose=1, n_jobs=8,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 2 folds for each of 48 candidates, totalling 96 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed: 58.2min
[Parallel(n_jobs=8)]: Done  96 out of  96 | elapsed: 190.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed: 15.8min finished


GridSearchCV(cv=2, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [6]:
print(f"The best parameters are {grid.best_params_} "
      f"with a score of {grid.best_score_:0.5f}")
pd.DataFrame(grid.cv_results_).filter(
    items=['param_n_estimators', 'param_max_features',
           'param_max_depth', 'mean_test_FDR',
           'std_test_FDR', 'rank_test_FDR',
           'mean_test_ROC', 'std_test_ROC'])

The best parameters are {'max_depth': 20, 'max_features': 8, 'n_estimators': 700} with a score of 0.52632


Unnamed: 0,param_n_estimators,param_max_features,param_max_depth,mean_test_FDR,std_test_FDR,rank_test_FDR
0,200,5,20,0.525366,0.002528,12
1,500,5,20,0.525628,0.002964,10
2,700,5,20,0.525715,0.003225,8
3,1000,5,20,0.526238,0.002005,4
4,200,8,20,0.52554,0.0034,11
5,500,8,20,0.525889,0.002702,6
6,700,8,20,0.526325,0.002789,1
7,1000,8,20,0.526238,0.002354,4
8,200,10,20,0.526325,0.003138,1
9,500,10,20,0.525715,0.003574,8


In [5]:
rf = RandomForestClassifier(n_jobs=8, **grid.best_params_).fit(x_train, y_train)
print(fdr(rf, x_train, y_train))
print(fdr(rf, x_test, y_test))

0.5530857740585774
0.53098254022595


In [9]:
rf = RandomForestClassifier(n_jobs=2, **grid.best_params_)
params = {'max_depth': [5, 10, 20],
          'ccp_alpha': [0, 1e-7, 1e-6]}
grid = GridSearchCV(rf, params, cv=3, refit='FDR', 
                    verbose=1, n_jobs=4,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  27 out of  27 | elapsed: 44.6min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=20,
                                              max_features=8,
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=700, n_jobs=2,
                                              oob_score=False,
                                              random_state=No

In [12]:
print(f"The best parameters are {grid.best_params_} "
      f"with a score of {grid.best_score_:0.5f}")
pd.DataFrame(grid.cv_results_).filter(
    items=['param_ccp_alpha',
           'param_max_depth', 'mean_test_FDR',
           'std_test_FDR', 'rank_test_FDR',
           'mean_test_ROC', 'std_test_ROC'])

The best parameters are {'ccp_alpha': 1e-06, 'max_depth': 20} with a score of 0.52685


Unnamed: 0,param_ccp_alpha,param_max_depth,mean_test_FDR,std_test_FDR,rank_test_FDR
0,0.0,5,0.507061,0.000565,9
1,0.0,10,0.517521,0.001957,5
2,0.0,20,0.525976,0.002843,3
3,1e-07,5,0.507148,0.000963,8
4,1e-07,10,0.518218,0.001603,4
5,1e-07,20,0.526674,0.003224,2
6,1e-06,5,0.507409,0.001096,7
7,1e-06,10,0.516998,0.002935,6
8,1e-06,20,0.526848,0.002961,1


In [11]:
rf = RandomForestClassifier(n_estimators=700,
                            max_depth=20, 
                            max_features=8,
                            ccp_alpha=1e-6, 
                            n_jobs=8).fit(x_train, y_train)
print(fdr(rf, x_train, y_train))
print(fdr(rf, x_test, y_test))

0.5480299860529986
0.5350907223553577


In [None]:
rf = RandomForestClassifier(n_estimators=700, max_features=8, n_jobs=2)
params = {'max_depth': [10, 20, 40],
          'ccp_alpha': [0, 1e-7, 1e-6, 1e-5, 1e-4]}
grid = GridSearchCV(rf, params, cv=4, refit='FDR', 
                    verbose=1, n_jobs=4,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 4 folds for each of 15 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [11]:
print(f"The best parameters are {grid.best_params_} with a score of {grid.best_score_:0.5f}")
pd.DataFrame(grid.cv_results_).filter(
    items=['param_ccp_alpha', 'param_max_depth', 'mean_test_FDR',
           'std_test_FDR', 'rank_test_FDR', 'mean_test_ROC', 'std_test_ROC'])

The best parameters are {'ccp_alpha': 1e-06, 'max_depth': 20} with a score of 0.52807


Unnamed: 0,param_ccp_alpha,param_max_depth,mean_test_FDR,std_test_FDR,rank_test_FDR
0,0.0,10,0.517172,0.010342,9
1,0.0,20,0.527022,0.010158,3
2,0.0,40,0.518828,0.009291,7
3,1e-07,10,0.51569,0.010382,11
4,1e-07,20,0.527632,0.010235,2
5,1e-07,40,0.518567,0.009835,8
6,1e-06,10,0.516475,0.010737,10
7,1e-06,20,0.528068,0.010961,1
8,1e-06,40,0.521095,0.010844,6
9,1e-05,10,0.514557,0.009934,12


In [12]:
rf = RandomForestClassifier(n_estimators=700,
                            max_depth=20, 
                            max_features=8,
                            ccp_alpha=1e-6, 
                            n_jobs=8).fit(x_train, y_train)
print(fdr(rf, x_train, y_train))
print(fdr(rf, x_test, y_test))

0.5481171548117155
0.5344060253337898


In [13]:
x_train, x_test, y_train, y_test = load_data('var_40.csv')
rf = RandomForestClassifier(n_estimators=700,
                            max_depth=20, 
                            max_features=8,
                            ccp_alpha=1e-6, 
                            n_jobs=8).fit(x_train, y_train)
print(fdr(rf, x_train, y_train))
print(fdr(rf, x_test, y_test))

0.5736576011157601
0.5566586785347484
