In [1]:
from collections import ChainMap
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from util import load_data, fdr, plot_report

pd.set_option('display.max_rows', 999)

In [2]:
x_train, x_test, x_oot, y_train, y_test, y_oot = load_data('var_30.csv')
x_train.head()

Unnamed: 0,fulladdress_count_1_by_7,address_count_0,ssn_count_3,fulladdress_count_0,homephone_count_7,address_count_1_by_7,name_dob_count_3,name_count_7,homephone_count_3,fulladdress_homephone_count_0_by_14,...,ssn_dob_count_7,ssn_name_count_7,name_day_since,ssn_firstname_count_14,ssn_count_7,name_count_14,fulladdress_count_1_by_14,fulladdress_homephone_count_0_by_30,ssn_lastname_count_14,ssn_name_count_14
148362,0.09144,-0.042158,-0.047591,-0.041351,-0.769214,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,-1.83851,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
50873,0.09144,-0.042158,-0.047591,-0.041351,0.055132,0.103441,-0.046415,-0.115648,0.639149,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
331657,0.09144,-0.042158,-0.047591,-0.041351,1.703825,0.103441,-0.046415,-0.115648,1.799751,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
390228,0.09144,-0.042158,-0.047591,-0.041351,-0.769214,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
545652,0.09144,-0.042158,-0.047591,-0.041351,-0.769214,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962


In [3]:
dt = DecisionTreeClassifier()
params = {'min_samples_leaf': [8, 16, 32, 64, 128, 512],
          'max_depth' : [10, 20, 40, 80, 160],
          'ccp_alpha': [0, 1e-7, 5e-7, 1e-6, 5e-6, 1e-5]}
grid = GridSearchCV(dt, params, cv=10, refit='FDR', 
                    verbose=1, n_jobs=8,
                    scoring={'FDR': fdr})
grid.fit(x_train, y_train)

Fitting 10 folds for each of 180 candidates, totalling 1800 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   14.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:  1.7min
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:  4.1min
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:  7.6min
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed: 12.0min
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed: 17.6min
[Parallel(n_jobs=8)]: Done 1800 out of 1800 | elapsed: 17.7min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=8,
             param_grid={'ccp_alpha': [0, 1e-07, 5e-07, 1e-06, 5e-06, 1e-05],
  

In [4]:
print(f"The best parameters are {grid.best_params_} "
      f"with a score of {grid.best_score_:0.5f}")

The best parameters are {'ccp_alpha': 1e-06, 'max_depth': 20, 'min_samples_leaf': 64} with a score of 0.55598


In [5]:
pd.DataFrame(grid.cv_results_).sort_values('rank_test_FDR').filter(
        items=['param_max_depth', 'param_min_samples_leaf',
               'param_ccp_alpha', 'mean_test_FDR', 'std_test_FDR',
               'rank_test_FDR', 'mean_test_ROC', 'std_test_ROC'])

Unnamed: 0,param_max_depth,param_min_samples_leaf,param_ccp_alpha,mean_test_FDR,std_test_FDR,rank_test_FDR
117,160,64,1e-06,0.555979,0.022103,1
111,80,64,1e-06,0.555979,0.022103,1
99,20,64,1e-06,0.555979,0.022103,1
105,40,64,1e-06,0.555871,0.022143,4
116,160,32,1e-06,0.551097,0.017721,5
110,80,32,1e-06,0.551097,0.017721,5
98,20,32,1e-06,0.551097,0.017721,5
104,40,32,1e-06,0.551097,0.017721,5
112,80,128,1e-06,0.55012,0.017048,9
100,20,128,1e-06,0.55012,0.017048,9


In [6]:
dt = DecisionTreeClassifier(ccp_alpha=1e-06, max_depth=20, min_samples_leaf=64)
params = {'class_weight': ['balanced', None],
          'criterion': ['gini', 'entropy']}
grid_2 = GridSearchCV(dt, params, cv=10, refit='FDR', 
                      verbose=1, n_jobs=8,
                      scoring={'FDR': fdr, 'ROC': 'roc_auc'})
grid_2.fit(x_train, y_train)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  40 out of  40 | elapsed:   21.8s finished


GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=1e-06,
                                              class_weight=None,
                                              criterion='gini', max_depth=20,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=64,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=8,
             param_grid={'class_w

In [7]:
print(f"The best parameters are {grid_2.best_params_} "
      f"with a score of {grid_2.best_score_:0.5f}")

The best parameters are {'class_weight': None, 'criterion': 'gini'} with a score of 0.55598


In [8]:
pd.DataFrame(grid_2.cv_results_).filter(
    items=['param_class_weight', 'param_criterion',
           'mean_test_FDR', 'std_test_FDR', 'rank_test_FDR',
           'mean_test_ROC', 'std_test_ROC'])

Unnamed: 0,param_class_weight,param_criterion,mean_test_FDR,std_test_FDR,rank_test_FDR,mean_test_ROC,std_test_ROC
0,balanced,gini,0.538513,0.020518,4,0.766718,0.008331
1,balanced,entropy,0.538514,0.019542,3,0.767333,0.010362
2,,gini,0.555979,0.022103,1,0.764684,0.010983
3,,entropy,0.5409,0.019818,2,0.769617,0.011115


In [9]:
dt = DecisionTreeClassifier(ccp_alpha=1e-06, 
                            max_depth=20,
                            min_samples_leaf=64, 
                            class_weight=None,
                            criterion='gini').fit(x_train, y_train)
print(fdr(dt, x_train, y_train))
print(fdr(dt, x_test, y_test))
print(fdr(dt, x_oot, y_oot))

0.5435018442178347
0.54924578527063
0.5347483738445737


In [12]:
dt = DecisionTreeClassifier(ccp_alpha=1e-06, 
                            max_depth=80,
                            min_samples_leaf=64, 
                            class_weight=None,
                            criterion='gini').fit(x_train, y_train)
print(fdr(dt, x_train, y_train))
print(fdr(dt, x_test, y_test))
print(fdr(dt, x_oot, y_oot))

0.5435018442178347
0.54924578527063
0.5347483738445737


In [11]:
x_train_40, x_test_40, x_oot_40, y_train_40, y_test_40, y_oot_40 = load_data('var_40.csv')
dt = DecisionTreeClassifier(ccp_alpha=1e-06, 
                            max_depth=20, 
                            min_samples_leaf=64, 
                            class_weight=None,
                            criterion='gini').fit(x_train_40, y_train_40)
print(fdr(dt, x_train_40, y_train_40))
print(fdr(dt, x_test_40, y_test_40))
print(fdr(dt, x_oot_40, y_oot_40))

0.5548833189282627
0.5315884476534296
0.5395412529955494
