In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler

from ensemble import MakeEnsemble
from util import load_data, fdr, plot_report

In [2]:
x_train, x_test, x_oot, y_train, y_test, y_oot = load_data('var_30.csv', test_size=0)
x_train.head()

Unnamed: 0,ssn_count_3,fulladdress_count_0,address_count_0,name_dob_count_3,homephone_count_14,name_dob_count_7,name_day_since,ssn_firstname_count_0_by_14,name_count_7,fulladdress_homephone_count_0_by_14,...,ssn_count_0_by_14,ssn_firstname_count_7,ssn_count_7,fulladdress_count_1_by_14,fulladdress_homephone_count_7,ssn_name_dob_count_7,name_count_14,ssn_firstname_count_0_by_30,ssn_lastname_count_0_by_30,ssn_count_14
533499,-0.047591,-0.041351,-0.042158,-0.046415,0.708788,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
200863,-0.047591,-0.041351,-0.042158,-0.046415,1.282153,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
563773,-0.047591,-0.041351,-0.042158,-0.046415,1.282153,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
692072,-0.047591,-0.041351,-0.042158,-0.046415,-0.437943,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
76469,-0.047591,-0.041351,-0.042158,-0.046415,-0.437943,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421


In [3]:
# Random forest with random under sampling
rf = RandomForestClassifier(n_estimators=300,
                            max_depth=17, 
                            max_features=10,
                            ccp_alpha=1e-6, 
                            n_jobs=4)
ee = EasyEnsembleClassifier(n_estimators=25,
                            base_estimator=rf,
                            sampling_strategy='not minority',
                            n_jobs=2,
                            verbose=0).fit(x_train, y_train)
print(fdr(ee, x_train, y_train))
print(fdr(ee, x_test, y_test))

0.5582875013479995
0.55025011368804
0.5347483738445737


In [4]:
# random forest with balanced class weight
rf = RandomForestClassifier(n_estimators=300,
                            max_depth=17, 
                            max_features=10,
                            ccp_alpha=1e-6, 
                            class_weight='balanced',
                            n_jobs=8).fit(x_train, y_train)
print(fdr(rf, x_train, y_train))
print(fdr(rf, x_test, y_test))

0.5621697401056832
0.542519326966803
0.5227661759671346


In [3]:
# Random forest with SMOTE to 20%
train_score = []
test_score = []
oot_score = []
for _ in range(5):
    x, x_test, y, y_test = train_test_split(x_train, y_train, test_size=0.2)
    x_resample, y_resample = SMOTE(sampling_strategy=0.2).fit_resample(x, y)
    rf = RandomForestClassifier(n_estimators=50,
                            max_depth=20,
                            max_features=10,
                            ccp_alpha=1e-6,
                            n_jobs=8).fit(x_resample, y_resample)
    train_score.append(fdr(rf, x_resample, y_resample))
    test_score.append(fdr(rf, x_test, y_test))
    oot_score.append(fdr(rf, x_oot, y_oot))
print(np.mean(train_score))
print(np.mean(test_score))
print(np.mean(oot_score))

0.17999610052418252
0.5361295537789305
0.5172201300924341


In [4]:
# Decision tree with random undersample to 10% and then SMOTE
dt = DecisionTreeClassifier(ccp_alpha=1e-06,
                            max_depth=20,
                            min_samples_leaf=64,
                            class_weight=None,
                            criterion='gini')
train_score = []
test_score = []
oot_score = []
for _ in range(5):
    x, x_test, y, y_test = train_test_split(x_train, y_train, test_size=0.1)
    me = MakeEnsemble(dt, [RandomUnderSampler(sampling_strategy=0.1), SMOTE()],
                      n_estimators=50, n_jobs=8, verbose=0).fit(x, y)
    train_score.append(fdr(me, x, y))
    test_score.append(fdr(me, x_test, y_test))
    oot_score.append(fdr(me, x_oot, y_oot))
print(np.mean(train_score))
print(np.mean(test_score))
print(np.mean(oot_score))

0.5551655048404479
0.5490871183890038
0.5328996918863402


In [5]:
# Decision tree with random undersample to 10% and then SMOTE to 50%
dt = DecisionTreeClassifier(ccp_alpha=1e-06,
                            max_depth=20,
                            min_samples_leaf=64,
                            class_weight=None,
                            criterion='gini')
train_score = []
test_score = []
oot_score = []
for _ in range(5):
    x, x_test, y, y_test = train_test_split(x_train, y_train, test_size=0.1)
    me = MakeEnsemble(dt, [RandomUnderSampler(sampling_strategy=0.1), SMOTE(sampling_strategy=0.5)],
                      n_estimators=50, n_jobs=8, verbose=0).fit(x, y)
    train_score.append(fdr(me, x, y))
    test_score.append(fdr(me, x_test, y_test))
    oot_score.append(fdr(me, x_oot, y_oot))
print(np.mean(train_score))
print(np.mean(test_score))
print(np.mean(oot_score))

0.5562622805249273
0.541476819136751
0.5334474495035948
