In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler

from ensemble import MakeEnsemble
from util import load_data, fdr, plot_report

In [2]:
# Unpack the six objects returned by the project helper `load_data` for
# 'var_30.csv': feature frames and label vectors for three splits.
# NOTE(review): "oot" presumably means an out-of-time hold-out — confirm in util.load_data.
(x_train, x_test, x_oot,
 y_train, y_test, y_oot) = load_data('var_30.csv')
# Preview the first five rows of the training features.
x_train.head(5)

Unnamed: 0,fulladdress_count_1_by_7,address_count_0,ssn_count_3,fulladdress_count_0,homephone_count_7,address_count_1_by_7,name_dob_count_3,name_count_7,homephone_count_3,fulladdress_homephone_count_0_by_14,...,ssn_dob_count_7,ssn_name_count_7,name_day_since,ssn_firstname_count_14,ssn_count_7,name_count_14,fulladdress_count_1_by_14,fulladdress_homephone_count_0_by_30,ssn_lastname_count_14,ssn_name_count_14
68337,0.09144,-0.042158,-0.047591,-0.041351,-0.769214,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
630010,0.09144,-0.042158,-0.047591,-0.041351,-0.769214,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
283882,0.09144,-0.042158,-0.047591,-0.041351,0.055132,0.103441,-0.046415,-0.115648,0.639149,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
378960,0.09144,-0.042158,-0.047591,-0.041351,-0.769214,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,-1.682029,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
465063,0.09144,-0.042158,-0.047591,-0.041351,0.055132,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962


In [3]:
# Random forest with random under sampling.
# EasyEnsembleClassifier trains 25 members, each on a randomly under-sampled
# subset of the training data, using the random forest below as member model.
# Both estimators are stochastic, so fixed seeds are set to make the printed
# scores reproducible after Restart Kernel -> Run All.
rf = RandomForestClassifier(n_estimators=300,
                            max_depth=17,
                            max_features=10,
                            ccp_alpha=1e-6,
                            n_jobs=4,
                            random_state=42)
# NOTE(review): newer imbalanced-learn releases rename `base_estimator` to
# `estimator`; keep the keyword that matches the installed version.
ee = EasyEnsembleClassifier(n_estimators=25,
                            base_estimator=rf,
                            sampling_strategy='not minority',
                            n_jobs=2,
                            random_state=42,
                            verbose=0).fit(x_train, y_train)
# `fdr` is a project helper from util; report it on the train and test splits.
# NOTE(review): the saved output of this cell shows three values although only
# two are printed here, so the output is stale — re-run the cell to refresh it.
print(fdr(ee, x_train, y_train))
print(fdr(ee, x_test, y_test))

0.5582875013479995
0.55025011368804
0.5347483738445737


In [4]:
# Random forest with balanced class weight.
# class_weight='balanced' re-weights the classes instead of resampling them.
# The forest is stochastic (bootstrap rows, random feature subsets), so a
# fixed seed is set to make the printed scores reproducible across re-runs.
rf = RandomForestClassifier(n_estimators=300,
                            max_depth=17,
                            max_features=10,
                            ccp_alpha=1e-6,
                            class_weight='balanced',
                            n_jobs=8,
                            random_state=42).fit(x_train, y_train)
# `fdr` is a project helper from util; report it on the train and test splits.
# NOTE(review): the saved output of this cell shows three values although only
# two are printed here, so the output is stale — re-run the cell to refresh it.
print(fdr(rf, x_train, y_train))
print(fdr(rf, x_test, y_test))

0.5621697401056832
0.542519326966803
0.5227661759671346


In [6]:
# Random undersample to 10% and then SMOTE, with a small random forest as the
# member model of the project's MakeEnsemble wrapper.
# sampling_strategy=0.1 asks RandomUnderSampler for a minority/majority ratio
# of 0.1 after under-sampling; SMOTE() then runs with its default settings.
member_forest = RandomForestClassifier(
    n_estimators=20,
    max_depth=20,
    max_features=10,
    ccp_alpha=1e-6,
    n_jobs=8,
)
resamplers = [RandomUnderSampler(sampling_strategy=0.1), SMOTE()]
me = MakeEnsemble(member_forest, resamplers, n_estimators=50)
me.fit(x_train, y_train)
# Report the project's `fdr` metric on the train split, then the test split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(fdr(me, features, labels))

0.5559053408597482
0.5575221238938053


In [4]:
# Decision tree with random undersample to 10% and then SMOTE.
# The tree and the list of resamplers are handed to the project's MakeEnsemble,
# which is asked for 50 members built with 8 parallel jobs.
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=20,
    min_samples_leaf=64,
    ccp_alpha=1e-06,
    class_weight=None,
)
resamplers = [RandomUnderSampler(sampling_strategy=0.1), SMOTE()]
me = MakeEnsemble(dt, resamplers, n_estimators=50, n_jobs=8, verbose=1)
me.fit(x_train, y_train)
# Report the project's `fdr` metric on the train split, then the test split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(fdr(me, features, labels))

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   20.8s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:   28.3s finished


0.5499348675640469
0.5592920353982301


In [5]:
# Decision tree with random undersample to 10% and then ADASYN.
# Same tree settings as the SMOTE variant; only the over-sampler differs.
tree_params = dict(ccp_alpha=1e-06,
                   max_depth=20,
                   min_samples_leaf=64,
                   class_weight=None,
                   criterion='gini')
dt = DecisionTreeClassifier(**tree_params)
under_sampler = RandomUnderSampler(sampling_strategy=0.1)
over_sampler = ADASYN()
me = MakeEnsemble(dt,
                  [under_sampler, over_sampler],
                  n_estimators=50,
                  n_jobs=8,
                  verbose=1)
me.fit(x_train, y_train)
# Train-split score first, test-split score second (project helper `fdr`).
train_fdr = fdr(me, x_train, y_train)
print(train_fdr)
test_fdr = fdr(me, x_test, y_test)
print(test_fdr)

0.5394051237516283
0.5508849557522124


In [None]:
# Decision tree with random undersample to 10% and then SVMSMOTE.
# Same tree settings as the SMOTE and ADASYN variants; only the over-sampler
# differs, so the three runs are directly comparable.
dt = DecisionTreeClassifier(ccp_alpha=1e-06, max_depth=20, min_samples_leaf=64, class_weight=None, criterion='gini')
# n_jobs=8 and verbose=1 match the sibling decision-tree cells, so the 50
# members are built in parallel with progress output instead of falling back
# to MakeEnsemble's defaults.
# NOTE(review): assumes MakeEnsemble's n_jobs only affects speed, not results — confirm.
me = MakeEnsemble(dt, [RandomUnderSampler(sampling_strategy=0.1), SVMSMOTE()],
                  n_estimators=50, n_jobs=8, verbose=1)
me.fit(x_train, y_train)
# Report the project's `fdr` metric on the train split, then the test split.
print(fdr(me, x_train, y_train))
print(fdr(me, x_test, y_test))