In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier

from util import load_data, fdr
from ensemble import MakeEnsemble

pd.set_option('display.max_rows', 100)

In [2]:
x_train, x_test, x_oot, y_train, y_test, y_oot = load_data('var_30.csv')
x_train.head()

Unnamed: 0,fulladdress_count_1_by_7,address_count_0,ssn_count_3,fulladdress_count_0,homephone_count_7,address_count_1_by_7,name_dob_count_3,name_count_7,homephone_count_3,fulladdress_homephone_count_0_by_14,...,ssn_dob_count_7,ssn_name_count_7,name_day_since,ssn_firstname_count_14,ssn_count_7,name_count_14,fulladdress_count_1_by_14,fulladdress_homephone_count_0_by_30,ssn_lastname_count_14,ssn_name_count_14
424722,0.09144,-0.042158,-0.047591,-0.041351,0.055132,0.103441,-0.046415,-0.115648,0.639149,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
606998,0.09144,-0.042158,-0.047591,-0.041351,-0.769214,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
639246,0.09144,-0.042158,-0.047591,-0.041351,1.703825,0.103441,-0.046415,-0.115648,1.799751,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
56700,0.09144,-0.042158,-0.047591,-0.041351,-0.769214,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
410754,0.09144,-0.042158,-0.047591,-0.041351,-0.769214,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962


In [3]:
model_results = dict()

In [4]:
def fdr_score(clf, x_train, x_test, x_oot, y_train, y_test, y_oot):
    return {'fdr_train': fdr(clf, x_train, y_train),
            'fdr_test': fdr(clf, x_test, y_test),
            'fdr_oot': fdr(clf, x_oot, y_oot)}

In [5]:
# Logistic regression
lr = LogisticRegression(max_iter=1000, 
                        C=1, 
                        class_weight='balanced').fit(x_train, y_train)
model_results['logistic regression'] = fdr_score(lr, x_train, x_test, x_oot, y_train, y_test, y_oot)
print(model_results['logistic regression'])

{'fdr_train': 0.5279232111692844, 'fdr_test': 0.5373263888888888, 'fdr_oot': 0.5183156453269429}


In [6]:
# Decision tree
dt = DecisionTreeClassifier(ccp_alpha=1e-06, 
                            max_depth=20,
                            min_samples_leaf=64, 
                            class_weight=None,
                            criterion='gini').fit(x_train, y_train)
model_results['decision tree'] = fdr_score(dt, x_train, x_test, x_oot, y_train, y_test, y_oot)
print(model_results['decision tree'])

{'fdr_train': 0.5453752181500873, 'fdr_test': 0.5555555555555556, 'fdr_oot': 0.5368024649092776}


In [7]:
# Random forest
rf = RandomForestClassifier(n_estimators=300,
                            max_depth=17, 
                            max_features=10,
                            ccp_alpha=1e-6, 
                            n_jobs=8).fit(x_train, y_train)
model_results['random forest'] = fdr_score(rf, x_train, x_test, x_oot, y_train, y_test, y_oot)
print(model_results['random forest'])

{'fdr_train': 0.5509380453752182, 'fdr_test': 0.5572916666666666, 'fdr_oot': 0.5398836015063334}


In [8]:
# Boosted tree
bt = XGBClassifier(n_estimators=200,
                   max_depth=5,
                   learning_rate=0.05,
                   subsample=0.5,
                   verbose=3, n_jobs=8).fit(x_train, y_train)
model_results['boosted tree'] = fdr_score(bt, x_train, x_test, x_oot, y_train, y_test, y_oot)
print(model_results['boosted tree'])

{'fdr_train': 0.550174520069808, 'fdr_test': 0.5581597222222222, 'fdr_oot': 0.5398836015063334}


In [9]:
# Random forest with random under sampling
rf = RandomForestClassifier(n_estimators=150,
                            max_depth=17, 
                            max_features=10,
                            ccp_alpha=1e-6, 
                            n_jobs=4)
ee = EasyEnsembleClassifier(n_estimators=50,
                            base_estimator=rf,
                            sampling_strategy='not minority',
                            n_jobs=2,
                            verbose=0).fit(x_train, y_train)
model_results['rf with under sampling'] = fdr_score(ee, x_train, x_test, x_oot, y_train, y_test, y_oot)
print(model_results['rf with under sampling'])

{'fdr_train': 0.5571553228621291, 'fdr_test': 0.5564236111111112, 'fdr_oot': 0.5357754193769257}


In [10]:
# Decision tree with Random undersample to 10% and then SMOTE 
dt = DecisionTreeClassifier(ccp_alpha=1e-06, max_depth=20, min_samples_leaf=64, class_weight=None, criterion='gini')
me = MakeEnsemble(dt, [RandomUnderSampler(sampling_strategy=0.1), SMOTE()],
                  n_estimators=50, n_jobs=8, verbose=0)
me.fit(x_train, y_train)
model_results['dt with 0.1 under and SMOTE'] = fdr_score(me, x_train, x_test, x_oot, y_train, y_test, y_oot)
print(model_results['dt with 0.1 under and SMOTE'])

{'fdr_train': 0.5506108202443281, 'fdr_test': 0.5529513888888888, 'fdr_oot': 0.531324888736734}


In [11]:
# Decision tree with Random undersample to 10% and then SMOTE 
rf = RandomForestClassifier(n_estimators=150, max_depth=20, max_features=10, ccp_alpha=1e-6, n_jobs=4)
me = MakeEnsemble(rf, 
                  [RandomUnderSampler(sampling_strategy=0.1), SMOTE()],
                  n_estimators=50, n_jobs=2, verbose=0)
me.fit(x_train, y_train)
model_results['rf with 0.1 under and SMOTE'] = fdr_score(me, x_train, x_test, x_oot, y_train, y_test, y_oot)
print(model_results['rf with 0.1 under and SMOTE'])

{'fdr_train': 0.5599912739965096, 'fdr_test': 0.5516493055555556, 'fdr_oot': 0.5306401917151661}


In [12]:
pd.DataFrame.from_dict(model_results, orient='index').sort_values('fdr_test', ascending=False)

Unnamed: 0,fdr_train,fdr_test,fdr_oot
boosted tree,0.550175,0.55816,0.539884
random forest,0.550938,0.557292,0.539884
rf with under sampling,0.557155,0.556424,0.535775
decision tree,0.545375,0.555556,0.536802
dt with 0.1 under and SMOTE,0.550611,0.552951,0.531325
rf with 0.1 under and SMOTE,0.559991,0.551649,0.53064
logistic regression,0.527923,0.537326,0.518316
