### Model Building

#### Seprate Training Testing

In [187]:
import pandas as pd
import scipy.stats as sps
import numpy as np
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn.metrics import confusion_matrix

In [72]:
df = pd.read_csv('training_testing.csv', index_col=0)
y = df['fraud_label'].to_numpy()

In [73]:
# feature selected after wrapper
x = pd.read_csv('selected features.csv', index_col = 0).to_numpy()

In [74]:
from sklearn.model_selection import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)

#### Oversampling fraud records

In [75]:
from imblearn.over_sampling import SMOTE

In [76]:
os = SMOTE(random_state=0)
x_train_os, y_train_os = os.fit_sample(x_train, y_train)

In [77]:
x_train.shape

(635996, 25)

In [78]:
x_train_os.shape

(1253614, 25)

FDR = pd.read_csv('fdr_rank.csv')

In [81]:
#features selected by just filter, without wrapper
fdr = df.loc[:,FDR.loc[1:25,'Field']]

In [82]:
fdr.shape

(794996, 25)

In [83]:
fdr_train, fdr_test, y_fdr_train, y_fdr_test = train_test_split(fdr, y, test_size=0.2, random_state=0, stratify=y)

In [84]:
fdr_train_os, y_fdr_train_os = os.fit_sample(fdr_train, y_fdr_train)

In [86]:
fdr_train.shape

(635996, 25)

### Logistic Regression

In [41]:
import statsmodels.api as sm

#####       model with features selected only by FDR filter

In [87]:
logit_model2 = sm.Logit(y_fdr_train_os, fdr_train_os)
result2 = logit_model2.fit(maxiter=5000)
result2.summary2()

Optimization terminated successfully.
         Current function value: 0.539910
         Iterations 11


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.221
Dependent Variable:,y,AIC:,1353728.1176
Date:,2020-03-07 21:37,BIC:,1354029.1562
No. Observations:,1253614,Log-Likelihood:,-676840.0
Df Model:,24,LL-Null:,-868940.0
Df Residuals:,1253589,LLR p-value:,0.0
Converged:,1.0000,Scale:,1.0
No. Iterations:,11.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
addr_lag30_count,0.5583,0.0346,16.1230,0.0000,0.4905,0.6262
address_lag30_count,-0.3860,0.0246,-15.7114,0.0000,-0.4341,-0.3378
addr_#days_since,-0.0020,0.0002,-11.1101,0.0000,-0.0023,-0.0016
address_#days_since,-0.0050,0.0001,-39.3055,0.0000,-0.0053,-0.0048
address_lag14_count,0.3231,0.0439,7.3631,0.0000,0.2371,0.4092
addr_lag14_count,0.8304,0.0561,14.7904,0.0000,0.7204,0.9405
address_lag7_count,-0.9523,0.0720,-13.2276,0.0000,-1.0934,-0.8112
addr_lag7_count,1.4539,0.0771,18.8612,0.0000,1.3028,1.6050
address_lag3_count,-0.3772,0.1217,-3.0988,0.0019,-0.6158,-0.1386


###### - FDR 100% on training

In [195]:
result_prob2 = result2.predict(fdr_train_os)
result_pred2 = (result_prob2 > 0.5)
confusion_matrix(y_fdr_train_os, result_pred2)

array([[448251, 178556],
       [216674, 410133]])

In [131]:
410133/(410133+216674)

0.6543210270466029

###### - FDR 100% on testing

In [194]:
result_prob2_test = result2.predict(fdr_test)
result_pred2_test = (result_prob2_test > 0.5)
confusion_matrix(y_fdr_test, result_pred2_test)

array([[112035,  44668],
       [   802,   1495]])

In [150]:
1495/(1495+802)

0.6508489333913801

##### model with oversampled fraud records and features selected by wrapper

In [79]:
logit_model1 = sm.Logit(y_train_os, x_train_os)
result1 = logit_model1.fit(maxiter=5000)
result1.summary2()

Optimization terminated successfully.
         Current function value: 0.484021
         Iterations 12


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.302
Dependent Variable:,y,AIC:,1213601.4698
Date:,2020-03-07 21:26,BIC:,1213902.5083
No. Observations:,1253614,Log-Likelihood:,-606780.0
Df Model:,24,LL-Null:,-868940.0
Df Residuals:,1253589,LLR p-value:,0.0
Converged:,1.0000,Scale:,1.0
No. Iterations:,12.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
x1,0.5927,0.0176,33.6961,0.0000,0.5583,0.6272
x2,-0.4552,0.0368,-12.3625,0.0000,-0.5274,-0.3830
x3,0.2237,0.0630,3.5538,0.0004,0.1003,0.3471
x4,0.1442,0.0649,2.2211,0.0263,0.0169,0.2714
x5,0.5765,0.0359,16.0480,0.0000,0.5061,0.6469
x6,0.4089,0.1028,3.9794,0.0001,0.2075,0.6103
x7,1.2210,0.1087,11.2326,0.0000,1.0079,1.4340
x8,-1.6134,0.2593,-6.2220,0.0000,-2.1217,-1.1052
x9,1.0681,0.1037,10.3040,0.0000,0.8649,1.2713


###### - Fraud Detection Rate 100% on training

In [193]:
result_prob1 = result1.predict(x_train_os)
result_pred1 = (result_prob1 > 0.5)
confusion_matrix(y_train_os, result_pred1)

array([[605660,  21147],
       [277041, 349766]])

In [127]:
349766/(349766+277041)

0.558012274910778

###### - Fraud Detection Rate 100% on testing

In [192]:
result_prob1_test = result1.predict(x_test)
result_pred1_test = (result_prob1_test > 0.5)
confusion_matrix(y_test, result_pred1_test)

array([[151514,   5189],
       [  1007,   1290]])

In [136]:
1290/(1290+1007)

0.5616020896821942

###### - FDR 3% training

In [220]:
ind = np.argsort(result_prob1)
y_train_sorted1 = np.take_along_axis(y_train_os, ind, axis=0)
result_prob1_sorted = np.take_along_axis(result_prob1, ind, axis=0)
y_train_sorted1_3per = y_train_sorted1[-int(len(y_train_sorted1)*0.03):-1]
y_train_sorted1_3per_fraud = y_train_sorted1_3per[y_train_sorted1_3per==1]
y_train_os_fraud = y_train_os[y_train_os==1]
len(y_train_sorted1_3per_fraud)/len(y_train_os_fraud)

0.05999773454986942

###### - FDR 3% testing

In [222]:
ind = np.argsort(result_prob1_test)
y_test_sorted1 = np.take_along_axis(y_test, ind, axis=0)
result_prob1_test_sorted = np.take_along_axis(result_prob1_test, ind, axis=0)
y_test_sorted1_3per = y_test_sorted1[-int(len(y_test_sorted1)*0.03):-1]
y_test_sorted1_3per_fraud = y_test_sorted1_3per[y_test_sorted1_3per==1]
y_test_fraud = y_test[y_test==1]
len(y_test_sorted1_3per_fraud)/len(y_test_fraud)

0.5476708750544188

##### model with features selected by wrapper

In [89]:
logit_model = sm.Logit(y_train, x_train)
result = logit_model.fit(maxiter=5000)
result.summary2()

Optimization terminated successfully.
         Current function value: 0.050833
         Iterations 9


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.327
Dependent Variable:,y,AIC:,64708.5905
Date:,2020-03-07 21:46,BIC:,64992.6642
No. Observations:,635996,Log-Likelihood:,-32329.0
Df Model:,24,LL-Null:,-48058.0
Df Residuals:,635971,LLR p-value:,0.0
Converged:,1.0000,Scale:,1.0
No. Iterations:,9.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
x1,1.2264,0.0558,21.9737,0.0000,1.1170,1.3358
x2,-1.6605,0.0974,-17.0502,0.0000,-1.8513,-1.4696
x3,1.7565,0.2217,7.9221,0.0000,1.3220,2.1911
x4,-1.7642,0.2093,-8.4280,0.0000,-2.1745,-1.3539
x5,0.4007,0.0510,7.8506,0.0000,0.3006,0.5007
x6,0.6325,0.4591,1.3777,0.1683,-0.2673,1.5324
x7,0.9439,0.4619,2.0434,0.0410,0.0386,1.8493
x8,-1.1094,0.4534,-2.4470,0.0144,-1.9979,-0.2208
x9,0.6861,0.1927,3.5598,0.0004,0.3083,1.0638


###### - FDR 100% on training

In [189]:
result_prob = result.predict(x_train)
result_pred = (result_prob > 0.5)
confusion_matrix(y_train, result_pred)

array([[626458,    349],
       [  6040,   3149]])

In [117]:
3149/(3149+6040)

0.3426923495483731

###### - FDR 100% on testing

In [188]:
result_prob_test = result.predict(x_test)
result_pred_test = (result_prob_test > 0.5)
confusion_matrix(y_test, result_pred_test)

array([[156614,     89],
       [  1511,    786]])

In [140]:
786/(786+1511)

0.34218545929473226

###### - FDR 3% training

In [223]:
ind = np.argsort(result_prob)
y_train_sorted = np.take_along_axis(y_train, ind, axis=0)
result_prob_sorted = np.take_along_axis(result_prob, ind, axis=0)
y_train_sorted_3per = y_train_sorted[-int(len(y_train_sorted)*0.03):-1]
y_train_sorted_3per_fraud = y_train_sorted_3per[y_train_sorted_3per==1]
y_train_fraud = y_train[y_train==1]
len(y_train_sorted_3per_fraud)/len(y_train_fraud)

0.517901839155512

###### - FDR 3% testing

In [224]:
ind = np.argsort(result_prob_test)
y_test_sorted = np.take_along_axis(y_test, ind, axis=0)
result_prob_test_sorted = np.take_along_axis(result_prob_test, ind, axis=0)
y_test_sorted_3per = y_test_sorted[-int(len(y_test_sorted)*0.03):-1]
y_test_sorted_3per_fraud = y_test_sorted_3per[y_test_sorted_3per==1]
y_test_fraud = y_test[y_test==1]
len(y_test_sorted_3per_fraud)/len(y_test_fraud)

0.5267740531127557

### Neural Net

In [198]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Training
for n in range(3,11):
    fdr = 0
    for i in range(5):
        NN = MLPRegressor(hidden_layer_sizes=(n,),activation='logistic',solver='adam',
                              learning_rate='adaptive',max_iter=10000,learning_rate_init=.01,alpha=.01)
        NN.fit(x_train,y_train)
        NN_train_pred = NN.predict(x_train)
        NN_test_pred = NN.predict(x_test)
        ind = np.argsort(NN_train_pred)
        NN_y_train_sorted = np.take_along_axis(y_train, ind, axis=0)
        NN_train_pred_sorted = np.take_along_axis(NN_train_pred, ind, axis=0)
        NN_y_train_sorted_3per = NN_y_train_sorted[-int(len(NN_y_train_sorted)*0.3):-1]
        NN_y_train_sorted_3per_fraud = NN_y_train_sorted_3per[NN_y_train_sorted_3per==1]
        y_train_fraud = y_train[y_train==1]
        fdr += len(NN_y_train_sorted_3per_fraud)/len(y_train_fraud)
        i += 1
    avg_fdr=fdr/5
    print(f'With {n} layers, FDR at 3%:', avg_fdr)
    n += 1
    

In [None]:
# Testing
for n in range(3,11):
    fdr = 0
    for i in range(5):
        NN = MLPRegressor(hidden_layer_sizes=(n,),activation='logistic',solver='adam',
                              learning_rate='adaptive',max_iter=10000,learning_rate_init=.01,alpha=.01)
        NN.fit(x_test,y_test)
        NN_test_pred = NN.predict(x_test)
        ind = np.argsort(NN_test_pred)
        NN_y_test_sorted = np.take_along_axis(y_test, ind, axis=0)
        NN_test_pred_sorted = np.take_along_axis(NN_test_pred, ind, axis=0)
        NN_y_test_sorted_3per = NN_y_test_sorted[-int(len(NN_y_test_sorted)*0.3):-1]
        NN_y_test_sorted_3per_fraud = NN_y_test_sorted_3per[NN_y_test_sorted_3per==1]
        y_test_fraud = y_test[y_test==1]
        fdr += len(NN_y_test_sorted_3per_fraud)/len(y_test_fraud)
        i += 1
    avg_fdr=fdr/5
    print(f'With {n} layers, FDR at 3%:', avg_fdr)
    n += 1
    

In [203]:
for n in range(3,8):
    NN = MLPRegressor(hidden_layer_sizes=(n,),activation='logistic',solver='adam',
                          learning_rate='adaptive',max_iter=10000,learning_rate_init=.01,alpha=.01)
    NN.fit(x_train,y_train)
    NN_train_pred = NN.predict(x_train)
    NN_test_pred = NN.predict(x_test)
    ind = np.argsort(NN_train_pred)
    NN_y_train_sorted = np.take_along_axis(y_train, ind, axis=0)
    NN_train_pred_sorted = np.take_along_axis(NN_train_pred, ind, axis=0)
    NN_y_train_sorted_3per = NN_y_train_sorted[-int(len(NN_y_train_sorted)*0.3):-1]
    NN_y_train_sorted_3per_fraud = NN_y_train_sorted_3per[NN_y_train_sorted_3per==1]
    y_train_fraud = y_train[y_train==1]
    print(f'With {n} layers, FDR at 3%:', len(NN_y_train_sorted_3per_fraud)/len(y_train_fraud))

With 3 layers, FDR at 3%: 0.5471759712699967
With 4 layers, FDR at 3%: 0.6750462509522255
With 5 layers, FDR at 3%: 0.6876700402655349
With 6 layers, FDR at 3%: 0.6923495483730547
With 7 layers, FDR at 3%: 0.5254108172815323


In [219]:
#for n in range(8,11):
NN = MLPRegressor(hidden_layer_sizes=(9,),activation='logistic',solver='adam',
                      learning_rate='adaptive',max_iter=10000,learning_rate_init=.01,alpha=.01)
NN.fit(x_train,y_train)
NN_train_pred = NN.predict(x_train)
NN_test_pred = NN.predict(x_test)
ind = np.argsort(NN_train_pred)
NN_y_train_sorted = np.take_along_axis(y_train, ind, axis=0)
NN_train_pred_sorted = np.take_along_axis(NN_train_pred, ind, axis=0)
NN_y_train_sorted_3per = NN_y_train_sorted[-int(len(NN_y_train_sorted)*0.03):-1]
NN_y_train_sorted_3per_fraud = NN_y_train_sorted_3per[NN_y_train_sorted_3per==1]
y_train_fraud = y_train[y_train==1]
print(f'With {n} layers, FDR at 3%:', len(NN_y_train_sorted_3per_fraud)/len(y_train_fraud))

With 250 layers, FDR at 3%: 0.5096310806398955


### Random Forest

In [206]:
from sklearn.ensemble import RandomForestRegressor

In [214]:
# Training
for n in range(20,110,10):
    fdr = 0
    for i in range (5):
        RF = RandomForestRegressor(n_estimators=n)
        RF.fit(x_train, y_train)
        RF_train_pred = RF.predict(x_train)
        RF_test_pred = RF.predict(x_test)
        ind = np.argsort(RF_train_pred)
        RF_y_train_sorted = np.take_along_axis(y_train, ind, axis=0)
        RF_train_pred_sorted = np.take_along_axis(RF_train_pred, ind, axis=0)
        RF_y_train_sorted_3per = RF_y_train_sorted[-int(len(RF_y_train_sorted)*0.3):-1]
        RF_y_train_sorted_3per_fraud = RF_y_train_sorted_3per[RF_y_train_sorted_3per==1]
        y_train_fraud = y_train[y_train==1]
        fdr += len(RF_y_train_sorted_3per_fraud)/len(y_train_fraud)
        print(n, i)
        i += 1
    
    avg_fdr = fdr/5
    print(f'With {n} trees, FDR at 3%:', avg_fdr)
    n += 10
    
    

20 0
20 1
20 2
20 3
20 4
With 20 trees, FDR at 3%: 0.8001088257699424
30 0
30 1
30 2
30 3
30 4
With 30 trees, FDR at 3%: 0.8005005985417346
40 0
40 1
40 2
40 3
40 4
With 40 trees, FDR at 3%: 0.8005005985417346
50 0
50 1
50 2
50 3
50 4
With 50 trees, FDR at 3%: 0.8008706061595385
60 0
60 1
60 2
60 3
60 4
With 60 trees, FDR at 3%: 0.8008488410055502
70 0
70 1
70 2
70 3
70 4
With 70 trees, FDR at 3%: 0.8010011970834693
80 0
80 1
80 2
80 3
80 4
With 80 trees, FDR at 3%: 0.8010011970834693
90 0
90 1
90 2
90 3
90 4
With 90 trees, FDR at 3%: 0.8008706061595385
100 0
100 1
100 2
100 3
100 4
With 100 trees, FDR at 3%: 0.8010011970834693


In [216]:
# Testing
for n in range(20,110,10):
    fdr = 0
    for i in range (5):
        RF = RandomForestRegressor(n_estimators=n)
        RF.fit(x_test, y_test)
        RF_test_pred = RF.predict(x_test)
        ind = np.argsort(RF_test_pred)
        RF_y_test_sorted = np.take_along_axis(y_test, ind, axis=0)
        RF_test_pred_sorted = np.take_along_axis(RF_test_pred, ind, axis=0)
        RF_y_test_sorted_3per = RF_y_test_sorted[-int(len(RF_y_test_sorted)*0.3):-1]
        RF_y_test_sorted_3per_fraud = RF_y_test_sorted_3per[RF_y_test_sorted_3per==1]
        y_test_fraud = y_test[y_test==1]
        fdr += len(RF_y_test_sorted_3per_fraud)/len(y_test_fraud)
        i += 1
    avg_fdr = fdr/5
    print(f'With {n} trees, FDR at 3%:', avg_fdr)
    n += 10
    

With 20 trees, FDR at 3%: 0.8884632128863735
With 30 trees, FDR at 3%: 0.8901175446234217
With 40 trees, FDR at 3%: 0.8901175446234217
With 50 trees, FDR at 3%: 0.8905528950805397
With 60 trees, FDR at 3%: 0.8906399651719635
With 70 trees, FDR at 3%: 0.8907270352633871
With 80 trees, FDR at 3%: 0.8908141053548106
With 90 trees, FDR at 3%: 0.8909011754462343
With 100 trees, FDR at 3%: 0.8908141053548106


In [210]:
# Testing
for n in range(80,100,5):
    RF = RandomForestRegressor(n_estimators=n)
    RF.fit(x_train, y_train)
    RF_train_pred = RF.predict(x_train)
    RF_test_pred = RF.predict(x_test)
    ind = np.argsort(RF_train_pred)
    RF_y_train_sorted = np.take_along_axis(y_train, ind, axis=0)
    RF_train_pred_sorted = np.take_along_axis(RF_train_pred, ind, axis=0)
    RF_y_train_sorted_3per = RF_y_train_sorted[-int(len(RF_y_train_sorted)*0.3):-1]
    RF_y_train_sorted_3per_fraud = RF_y_train_sorted_3per[RF_y_train_sorted_3per==1]
    y_train_fraud = y_train[y_train==1]
    print(f'With {n} trees, FDR at 3%:', len(RF_y_train_sorted_3per_fraud)/len(y_train_fraud))

With 80 trees, FDR at 3%: 0.8010664925454347
With 85 trees, FDR at 3%: 0.8009576667754924
With 90 trees, FDR at 3%: 0.8010664925454347
With 95 trees, FDR at 3%: 0.8008488410055501


### Boosted Tree

In [215]:
from sklearn.ensemble import GradientBoostingRegressor

In [217]:
# Training
for n in range(200,300,10):
    fdr = 0
    for i in range (5):
        params = {'n_estimators':n, 'max_depth':10, 'min_samples_split':10, 'learning_rate':.2, 'loss':'ls'}
        GB = GradientBoostingRegressor(**params)
        GB.fit(x_train, y_train)
        GB_train_pred = GB.predict(x_train)
        GB_test_pred = GB.predict(x_test)
        ind = np.argsort(GB_train_pred)
        GB_y_train_sorted = np.take_along_axis(y_train, ind, axis=0)
        GB_train_pred_sorted = np.take_along_axis(GB_train_pred, ind, axis=0)
        GB_y_train_sorted_3per = GB_y_train_sorted[-int(len(GB_y_train_sorted)*0.3):-1]
        GB_y_train_sorted_3per_fraud = GB_y_train_sorted_3per[GB_y_train_sorted_3per==1]
        y_train_fraud = y_train[y_train==1]
        fdr += len(GB_y_train_sorted_3per_fraud)/len(y_train_fraud)
        i += 1
    
    avg_fdr = fdr/5
    print(f'With {n} trees, FDR at 3%:', avg_fdr)
    n += 10
    
    

With 200 trees, FDR at 3%: 0.7708129285014691
With 210 trees, FDR at 3%: 0.7709652845793885
With 220 trees, FDR at 3%: 0.7732723909021656
With 230 trees, FDR at 3%: 0.7750788986832081
With 240 trees, FDR at 3%: 0.773381216672108


KeyboardInterrupt: 

In [None]:
# Testing
for n in range(200,300,10):
    fdr = 0
    for i in range (5):
        params = {'n_estimators':n, 'max_depth':10, 'min_samples_split':10, 'learning_rate':.2, 'loss':'ls'}
        GB = GradientBoostingRegressor(**params)
        GB.fit(x_test, y_test)
        GB_test_pred = GB.predict(x_test)
        ind = np.argsort(GB_test_pred)
        GB_y_test_sorted = np.take_along_axis(y_test, ind, axis=0)
        GB_test_pred_sorted = np.take_along_axis(GB_test_pred, ind, axis=0)
        GB_y_test_sorted_3per = GB_y_test_sorted[-int(len(GB_y_test_sorted)*0.3):-1]
        GB_y_test_sorted_3per_fraud = GB_y_test_sorted_3per[GB_y_test_sorted_3per==1]
        y_test_fraud = y_test[y_test==1]
        fdr += len(GB_y_test_sorted_3per_fraud)/len(y_test_fraud)
        i += 1
    
    avg_fdr = fdr/5
    print(f'With {n} trees, FDR at 3%:', avg_fdr)
    n += 10