In [1]:
import operator
import random

import numpy as np
import pandas as pd
import sklearn
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import recall_score
from imblearn.over_sampling import SMOTE



Using TensorFlow backend.


In [2]:
df = pd.read_csv('sample.csv')
std_scale = StandardScaler().fit(df)
df_std = pd.DataFrame(std_scale.transform(df), columns=df.columns)

In [None]:
# df = pd.read_csv('vars_308.csv')

In [3]:
with open('ks_stat.csv', 'r') as f:
    lines = f.read().split('\n')
    ks_stat = [i.split(',') for i in lines]

In [4]:
y = df['fraud_label'].to_numpy()
df_std.drop(columns=['record', 'fraud_label'], inplace=True)

In [5]:
ks_col = [i[0] for i in ks_stat[:102]]
ks_df = df_std.filter(items=ks_col)

In [6]:
def fdr(y, cutoff=0.03, *, prob=None, classifier=None, x=None):
    if prob is None:
        assert classifier is not None
        prob = classifier.predict_proba(x)
    if len(prob.shape) == 2:
        prob = prob[:, -1:]
    fraud_num = len(y[y == 1])
    total_num = len(y)
    fraud_prob = [(i, j) for i, j in zip(prob, y)]
    sorted_prob = sorted(fraud_prob, key=lambda x: x[0], reverse=True)
    cutoff_bin = sorted_prob[0:int(total_num * cutoff)]
    return len(cutoff_bin[cutoff_bin == 1]) / fraud_num

In [8]:
# forward selection
best_feature = set()
while len(best_feature) < 30:
    # get candidate variables
    candidate = set(ks_df.columns) - best_feature
    
    # calculate FDR
    scores = dict()
    for col in candidate:
        features = list(best_feature) + [col]
        test_x = ks_df.filter(items=features)
        if len(features) == 1:
            # only one feature, reshape needed
            test_x = test_x.to_numpy().reshape(-1, 1)
        sample_x, sample_y = RandomUnderSampler().fit_resample(test_x, y)
        classifier = LogisticRegression().fit(sample_x, sample_y)
        scores[col] = fdr(sample_y, prob=classifier.predict_proba(sample_x))
        # print(f"FRD for {col}: {scores[col]}")
    
    # get the best feature
    best = max(scores.items(), key=operator.itemgetter(1))
    print(f"FRD for {best[0]}: {best[1]:.3f}")
    best_feature.add(best[0])

FRD for ssn_fulladdress_day_since: 0.021
FRD for name_count_0_by_7: 0.021
FRD for address_count_1_by_14: 0.021
FRD for ssn_lastname_count_0_by_14: 0.021
FRD for homephone_count_1_by_3: 0.021
FRD for homephone_count_0_by_3: 0.021
FRD for dob_count_1_by_14: 0.021
FRD for dob_count_1_by_7: 0.021
FRD for name_dob_count_1_by_30: 0.021
FRD for fulladdress_homephone_day_since: 0.021
FRD for ssn_address_day_since: 0.021
FRD for ssn_name_dob_day_since: 0.021
FRD for name_dob_day_since: 0.021
FRD for fulladdress_count_0_by_30: 0.021
FRD for ssn_name_day_since: 0.021
FRD for name_dob_count_0_by_30: 0.021
FRD for dob_count_14: 0.021


KeyboardInterrupt: 

In [119]:
y = df['fraud_label'].to_numpy()
x = ks_df.iloc[:, 2:5].to_numpy()
# a = a.to_numpy().reshape(-1, 1)

In [11]:
x_f = [i for i, j in zip(x, y) if j == 1]
x_nf = [i for i, j in zip(x, y) if j == 0]
x_sample = random.sample(x_nf, len(x_f)) + x_f
y_sample = [0] * len(x_f) + [1] * len(x_f)

In [44]:
x_sample, y_sample = sklearn.utils.shuffle(x_sample, y_sample)

In [48]:
clf = LogisticRegression().fit(x_sample, y_sample)
fdr(np.asarray(y_sample), classifier=clf, x=x_sample)

0.021052631578947368

In [132]:
x_sample, y_sample = RandomUnderSampler().fit_resample(x, y)
clf = LogisticRegression().fit(x_sample, y_sample)
y_pre = clf.predict(x)
print(balanced_accuracy_score(y, y_pre))
print(recall_score(y, y_pre))

0.5259172666648955
0.3368421052631579


In [125]:
clf = LogisticRegression(class_weight='balanced').fit(x, y)
y_pre = clf.predict(x)
print(balanced_accuracy_score(y, y_pre))
print(recall_score(y, y_pre))

0.5577937777305454
0.11578947368421053


In [139]:
from sklearn.ensemble import RandomForestClassifier
x_sample, y_sample = RandomUnderSampler().fit_resample(x, y)
clf = RandomForestClassifier(class_weight='balanced').fit(x_sample, y_sample)
y_pre = clf.predict(x)
print(balanced_accuracy_score(y, y_pre))
print(recall_score(y, y_pre))

0.5532479608916284
0.2736842105263158


In [146]:
from imblearn.ensemble import BalancedBaggingClassifier
clf = BalancedBaggingClassifier().fit(x, y)
y_pre = clf.predict(x)
print(balanced_accuracy_score(y, y_pre))
print(recall_score(y, y_pre))

0.5859294880310317
0.6105263157894737


In [117]:
x = ks_df.to_numpy()
from imblearn.ensemble import BalancedBaggingClassifier
y_pre = np.zeros((len(y), 2))
for _ in range(10):
    x_sample, y_sample = RandomUnderSampler().fit_resample(x, y)
    clf = LogisticRegression().fit(x_sample, y_sample)
    y_ = clf.predict(x)
    print(_)
    print(balanced_accuracy_score(y, y_))
    print(recall_score(y, y_))
    y_pre += clf.predict_proba(x)
y_pre /= 10

print('fdr', fdr(y, prob=y_pre))

0
0.6505566035229416
0.6210526315789474
1
0.6530061903876299
0.6421052631578947
2
0.6362708892372273
0.42105263157894735
3
0.6322830043306145
0.42105263157894735
4
0.6671803182868833
0.6736842105263158
5
0.6372671962591993
0.4105263157894737
6
0.6375594463189777
0.4421052631578947
7
0.6188527856744335
0.37894736842105264
8
0.632431786179229
0.37894736842105264
9
0.6388187783947501
0.4
fdr 0.021052631578947368


In [150]:
y_pre = np.zeros((len(y), 2))
for _ in range(10):
    x_sample, y_sample = SMOTE(sampling_strategy=0.1).fit_resample(x, y)
    x_sample, y_sample = RandomUnderSampler().fit_resample(x_sample, y_sample)
    clf = LogisticRegression().fit(x_sample, y_sample)
    y_ = clf.predict(x)
    print(_)
    print(balanced_accuracy_score(y, y_))
    print(recall_score(y, y_))
    y_pre += clf.predict_proba(x)
y_pre /= 10

print('fdr', fdr(y, prob=y_pre))


0
0.5577937777305454
0.11578947368421053
1
0.5577937777305454
0.11578947368421053
2
0.5577937777305454
0.11578947368421053
3
0.5577937777305454
0.11578947368421053
4
0.5577937777305454
0.11578947368421053
5
0.5577937777305454
0.11578947368421053
6
0.5577937777305454
0.11578947368421053
7
0.5577937777305454
0.11578947368421053
8
0.5577937777305454
0.11578947368421053
9
0.5577937777305454
0.11578947368421053
fdr 0.021052631578947368
