In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('./data/creditcard.csv')
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [7]:
from sklearn.preprocessing import StandardScaler, RobustScaler

rbs = RobustScaler()
sds = StandardScaler()

time_scaler = rbs.fit_transform(data['Time'].values.reshape(-1, 1))
amount_scaler = rbs.fit_transform(data['Amount'].values.reshape(-1, 1))

data['time_scaler'] = time_scaler
data['amount_scaler'] = amount_scaler

data.drop(['Time', 'Amount'], axis=1, inplace=True)

In [8]:
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,time_scaler,amount_scaler
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,-0.994983,1.783274
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,-0.994983,-0.269825
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,-0.994972,4.983721
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,-0.994972,1.418291
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,-0.99496,0.670579


In [10]:
print('非欺诈占比:', round(data['Class'].value_counts()[0] / len(data) * 100, 2))
print('欺诈占比:', round(data['Class'].value_counts()[1] / len(data) * 100, 2))

originTrainX = data.drop(['Class'], axis=1)
originTrainy = data['Class']

非欺诈占比: 99.83
欺诈占比: 0.17


In [16]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report

print('length of X (train): {} | length of y (train): {}'.format(len(originTrainX), len(originTrainy)))

accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []

log_reg_sm = LogisticRegression()

log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

rand_log_reg = RandomizedSearchCV(log_reg_sm, log_reg_params, n_iter=4)



sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

for train, index in sss.split(originTrainX, originTrainy):
    pipeline = make_pipeline(SMOTE(), rand_log_reg)
    model = pipeline.fit(originTrainX[train].reshape(-1, 1), originTrainy[train].reshape(-1, 1))
    best_est = rand_log_reg.best_estimator_
    prediction = best_est.prediction(originTrainX[test].reshape(-1, 1))
    
    
    accuracy_lst.append(pipeline.score(originTrainX[test], originTrainX[test]))
    precision_lst.append(precision_score(originTrainX[test], prediction))
    recall_lst.append(recall_score(originTrainX[test], prediction))
    f1_lst.append(f1_score(originTrainX[test], prediction))
    auc_lst.append(roc_auc_score(originTrainX[test], prediction))

print('---' * 45)
print('')
print("accuracy: {}".format(np.mean(accuracy_lst)))
print("precision: {}".format(np.mean(precision_lst)))
print("recall: {}".format(np.mean(recall_lst)))
print("f1: {}".format(np.mean(f1_lst)))
print('---' * 45)


length of X (train): 284807 | length of y (train): 284807


KeyError: "None of [Int64Index([ 30473,  30496,  31002,  33276,  39183,  40085,  40525,  41395,\n             41569,  41943,\n            ...\n            284797, 284798, 284799, 284800, 284801, 284802, 284803, 284804,\n            284805, 284806],\n           dtype='int64', length=227845)] are in the [columns]"

In [17]:
sm = SMOTE(random_state=14)
X_res, y_res = sm.fit_resample(originTrainX, originTrainy)

In [18]:
from collections import Counter
print('resampled dataset shape {}'.format(Counter(y_res)))

resampled dataset shape Counter({0: 284315, 1: 284315})


In [32]:
X = new_df.drop(['Class'], axis=1)
y = new_df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

In [22]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras import backend

n_inputs = X_train.shape[1]

sm_model = Sequential([
    Dense(n_inputs, input_shape=(n_inputs, ), activation='relu'),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax')
])

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [23]:
sm_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 30)                930       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                992       
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 66        
Total params: 1,988
Trainable params: 1,988
Non-trainable params: 0
_________________________________________________________________


In [24]:
sm_model.compile(Adam(lr=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [33]:
sm_model.fit(X_train, y_train, validation_split=0.2, batch_size=25, epochs=20, shuffle=True, verbose=2)

Train on 629 samples, validate on 158 samples
Epoch 1/20
629/629 - 0s - loss: 0.1833 - acc: 0.9459 - val_loss: 0.1223 - val_acc: 0.9810
Epoch 2/20
629/629 - 0s - loss: 0.0579 - acc: 0.9762 - val_loss: 0.1373 - val_acc: 0.9873
Epoch 3/20
629/629 - 0s - loss: 0.0424 - acc: 0.9857 - val_loss: 0.1535 - val_acc: 0.9873
Epoch 4/20
629/629 - 0s - loss: 0.0342 - acc: 0.9825 - val_loss: 0.1553 - val_acc: 0.9873
Epoch 5/20
629/629 - 0s - loss: 0.0263 - acc: 0.9905 - val_loss: 0.1678 - val_acc: 0.9810
Epoch 6/20
629/629 - 0s - loss: 0.0225 - acc: 0.9921 - val_loss: 0.1696 - val_acc: 0.9810
Epoch 7/20
629/629 - 0s - loss: 0.0189 - acc: 0.9952 - val_loss: 0.1808 - val_acc: 0.9747
Epoch 8/20
629/629 - 0s - loss: 0.0158 - acc: 0.9968 - val_loss: 0.1868 - val_acc: 0.9747
Epoch 9/20
629/629 - 0s - loss: 0.0137 - acc: 0.9984 - val_loss: 0.1953 - val_acc: 0.9747
Epoch 10/20
629/629 - 0s - loss: 0.0109 - acc: 1.0000 - val_loss: 0.2105 - val_acc: 0.9747
Epoch 11/20
629/629 - 0s - loss: 0.0085 - acc: 1.0000

<tensorflow.python.keras.callbacks.History at 0x7fdbdd3aa8d0>

In [31]:
data = data.sample(frac=1)  ##随机抽样，frac抽样比例

##一比一重采样
fraud_df = data.loc[data['Class'] == 1]
non_fraud_df = data.loc[data['Class'] == 0][:492]

normal_distributed_df = pd.concat([fraud_df, non_fraud_df])

new_df = normal_distributed_df.sample(frac=1, random_state=14)

new_df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,time_scaler,amount_scaler
244333,-5.222968,4.641827,-8.858204,7.723502,-1.507035,-2.159484,-4.205164,0.979334,-1.505637,-2.239066,...,0.624207,0.536429,-0.628334,-0.222651,0.382208,-2.693036,0.407935,1,0.794358,-0.29344
53591,-1.309441,1.786495,-1.37107,1.214335,-0.336642,-1.39012,-1.709109,0.667748,-1.699809,-3.843911,...,-0.02218,-0.299556,-0.226416,0.36436,-0.475102,0.571426,0.293426,1,-0.453894,-0.29344
232747,-0.519155,1.313411,0.198534,-0.984815,1.661996,-0.545091,2.118371,-1.533067,1.056106,1.026054,...,-0.381153,-0.007279,0.585161,-1.242991,-0.23042,-1.489159,-0.812762,0,0.735394,-0.170055
181301,1.450212,-0.765057,-2.048271,1.402407,0.407322,-0.330546,0.82094,-0.220077,-0.023103,0.304317,...,0.241444,-0.216697,0.708418,0.32159,-0.570939,-0.08015,-0.017062,0,0.472491,3.744847
249239,-0.082983,-3.935919,-2.616709,0.16331,-1.400952,-0.809419,1.50158,-0.471,1.519743,-1.134454,...,-0.182305,-0.921017,0.111635,-0.071622,-1.125881,-0.170947,0.126221,1,0.817878,15.02117


In [35]:
# X = new_df.drop(['Class'], axis=1)
# y = new_df['Class']
smX_train, smX_test, smy_train, smy_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

smX_train = smX_train.values
smX_test = smX_test.values
smy_train = smy_train.values
smy_test = smy_test.values

In [36]:
sm_model.fit(smX_train, smy_train, validation_split=0.2, batch_size=25, epochs=20, shuffle=True, verbose=2)

Train on 363923 samples, validate on 90981 samples
Epoch 1/20
363923/363923 - 12s - loss: 0.0132 - acc: 0.9964 - val_loss: 0.0061 - val_acc: 0.9980
Epoch 2/20
363923/363923 - 12s - loss: 0.0052 - acc: 0.9987 - val_loss: 0.0039 - val_acc: 0.9991
Epoch 3/20
363923/363923 - 12s - loss: 0.0038 - acc: 0.9991 - val_loss: 0.0034 - val_acc: 0.9991
Epoch 4/20
363923/363923 - 13s - loss: 0.0031 - acc: 0.9992 - val_loss: 0.0036 - val_acc: 0.9990
Epoch 5/20
363923/363923 - 13s - loss: 0.0029 - acc: 0.9993 - val_loss: 0.0039 - val_acc: 0.9987
Epoch 6/20
363923/363923 - 12s - loss: 0.0025 - acc: 0.9994 - val_loss: 0.0027 - val_acc: 0.9994
Epoch 7/20
363923/363923 - 13s - loss: 0.0025 - acc: 0.9994 - val_loss: 0.0025 - val_acc: 0.9995
Epoch 8/20
363923/363923 - 14s - loss: 0.0022 - acc: 0.9995 - val_loss: 0.0023 - val_acc: 0.9995
Epoch 9/20
363923/363923 - 12s - loss: 0.0022 - acc: 0.9995 - val_loss: 0.0021 - val_acc: 0.9996
Epoch 10/20
363923/363923 - 13s - loss: 0.0019 - acc: 0.9996 - val_loss: 0.0

<tensorflow.python.keras.callbacks.History at 0x7fdbda4f9fd0>