### Objective: Predict whether a given transaction is fraudulent.

In [1]:
import os

import numpy as np
import pandas as pd

#### Load dataset

In [2]:
# Location of the credit-card transactions CSV. The path can be overridden
# through the CREDITCARD_CSV environment variable so the notebook is not tied
# to one machine; the original local path is kept as the fallback.
fname = os.environ.get(
    "CREDITCARD_CSV",
    r'C:\Users\AnitaM\Downloads\creditcard\creditcard.csv',
)
cc = pd.read_csv(fname)
cc.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


#### Check the data

In [3]:
# Show the dtype pandas inferred for each column of the loaded frame.
cc.dtypes

Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class       int64
dtype: object

In [4]:
# Count the missing values in each column.
cc.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

#### Vectorize the data to pass it to a neural network

In [6]:
# Separate the 'Class' label column from the remaining feature columns.
# Both results are independent copies, so `cc` itself is left untouched.
cc_labels = cc['Class'].copy()
cc_features = cc.drop(columns='Class')

print('Shape of features : ', cc_features.shape)
print('Shape of labels : ', cc_labels.shape)

Shape of features :  (284807, 30)
Shape of labels :  (284807,)


In [8]:
# Label column as a standalone int64 NumPy array (astype returns a new array).
targets_testing = cc_labels.values.astype('int64')
targets_testing.shape

(284807,)

In [9]:
# Accumulators for the feature rows and labels parsed from the raw CSV file.
all_features, all_targets = [], []

In [10]:
# Parse the raw CSV by hand into plain Python lists: every column but the last
# is converted to a float feature, the last column to an integer label.
# Start from fresh lists so that re-running this cell does not append a second
# copy of the dataset to whatever a previous run left behind.
all_features = []
all_targets = []

with open(fname) as f:
    for i, line in enumerate(f):
        line = line.strip()
        if i == 0:
            print("HEADER:", line)
            continue  # Skip header
        if not line:
            continue  # Ignore blank lines (e.g. an empty line at end of file)
        # Strip any double quotes around a value before converting it.
        fields = [v.replace('"', "") for v in line.split(",")]
        all_features.append([float(v) for v in fields[:-1]])
        all_targets.append([int(fields[-1])])
        if len(all_features) == 1:
            # Show the first parsed data row as a sanity check.
            print("EXAMPLE FEATURES:", all_features[-1])

HEADER: "Time","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27","V28","Amount","Class"
EXAMPLE FEATURES: [0.0, -1.3598071336738, -0.0727811733098497, 2.53634673796914, 1.37815522427443, -0.338320769942518, 0.462387777762292, 0.239598554061257, 0.0986979012610507, 0.363786969611213, 0.0907941719789316, -0.551599533260813, -0.617800855762348, -0.991389847235408, -0.311169353699879, 1.46817697209427, -0.470400525259478, 0.207971241929242, 0.0257905801985591, 0.403992960255733, 0.251412098239705, -0.018306777944153, 0.277837575558899, -0.110473910188767, 0.0669280749146731, 0.128539358273528, -0.189114843888824, 0.133558376740387, -0.0210530534538215, 149.62]


In [11]:
# Convert the parsed Python lists into NumPy arrays:
# float32 for the feature matrix, uint8 for the label column vector.
features = np.asarray(all_features, dtype="float32")
targets = np.asarray(all_targets, dtype="uint8")
print("features.shape:", features.shape)
print("targets.shape:", targets.shape)

features.shape: (284807, 30)
targets.shape: (284807, 1)


#### Prepare a validation dataset

In [12]:
# Hold out the final 20% of the rows for validation; everything before that
# is used for training. No shuffling is done: the split follows file order.
num_val_samples = int(0.2 * len(features))
train_features, val_features = features[:-num_val_samples], features[-num_val_samples:]
train_targets, val_targets = targets[:-num_val_samples], targets[-num_val_samples:]

print("Number of training samples:", len(train_features))
print("Number of validation samples:", len(val_features))

Number of training samples: 227846
Number of validation samples: 56961


#### Analyze class imbalance in the dataset

In [13]:
# Per-class sample counts in the training labels
# (index 0 -> class 0, index 1 -> class 1).
counts = np.bincount(train_targets[:, 0])
n_positive = counts[1]
positive_pct = 100 * float(n_positive) / len(train_targets)
print(
    "Number of positive samples in training data: {} ({:.2f}% of total)".format(
        n_positive, positive_pct
    )
)

# Inverse-frequency weights: the rarer a class, the larger its weight.
weight_for_0, weight_for_1 = 1.0 / counts[0], 1.0 / counts[1]

Number of positive samples in training data: 417 (0.18% of total)


#### Normalize the data using training set statistics

In [14]:
# Standardize both splits with statistics taken from the training split only,
# so no information from the validation rows leaks into the preprocessing.
#
# New arrays are built instead of using in-place `-=` / `/=`: the split arrays
# are slices (views) of `features`, so in-place arithmetic would silently
# rewrite `features` as well.
mean = np.mean(train_features, axis=0)
train_centered = train_features - mean
std = np.std(train_centered, axis=0)
# A constant column has std == 0; dividing by it would produce NaN/inf.
# Such columns are left centered but unscaled.
std[std == 0] = 1.0
train_features = train_centered / std
val_features = (val_features - mean) / std

#### Build a binary classification model

In [15]:
from tensorflow import keras

In [16]:
# Fully connected binary classifier: three 256-unit ReLU layers (dropout 0.3
# after the second and third) followed by a single sigmoid output unit.
n_inputs = train_features.shape[-1]

model = keras.Sequential()
model.add(keras.layers.Dense(256, activation="relu", input_shape=(n_inputs,)))
model.add(keras.layers.Dense(256, activation="relu"))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(256, activation="relu"))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               7936      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257       
Total params: 139,777
Trainable params: 139,777
Non-trainable params: 0
__________________________________________________

#### Train the model with the `class_weight` argument

In [17]:
# Confusion-matrix counts plus precision and recall, reported for every epoch.
metrics = [
    metric_cls(name=metric_name)
    for metric_cls, metric_name in (
        (keras.metrics.FalseNegatives, "fn"),
        (keras.metrics.FalsePositives, "fp"),
        (keras.metrics.TrueNegatives, "tn"),
        (keras.metrics.TruePositives, "tp"),
        (keras.metrics.Precision, "precision"),
        (keras.metrics.Recall, "recall"),
    )
]

model.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.Adam(1e-2),
    metrics=metrics,
)

# Checkpoint callback; the filename template carries the epoch number.
checkpoint = keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.h5")
callbacks = [checkpoint]

# Per-class loss weights computed from the training label counts.
class_weight = {0: weight_for_0, 1: weight_for_1}

model.fit(
    x=train_features,
    y=train_targets,
    validation_data=(val_features, val_targets),
    class_weight=class_weight,
    callbacks=callbacks,
    epochs=30,
    batch_size=2048,
    verbose=2,
)

Train on 227846 samples, validate on 56961 samples
Epoch 1/30
227846/227846 - 111s - loss: 2.0977e-06 - fn: 45.0000 - fp: 18774.0000 - tn: 208655.0000 - tp: 372.0000 - precision: 0.0194 - recall: 0.8921 - val_loss: 1.2587e-06 - val_fn: 7.0000 - val_fp: 2658.0000 - val_tn: 54228.0000 - val_tp: 68.0000 - val_precision: 0.0249 - val_recall: 0.9067
Epoch 2/30
227846/227846 - 17s - loss: 1.5965e-06 - fn: 37.0000 - fp: 12501.0000 - tn: 214928.0000 - tp: 380.0000 - precision: 0.0295 - recall: 0.9113 - val_loss: 1.0043e-06 - val_fn: 9.0000 - val_fp: 1523.0000 - val_tn: 55363.0000 - val_tp: 66.0000 - val_precision: 0.0415 - val_recall: 0.8800
Epoch 3/30
227846/227846 - 14s - loss: 1.1091e-06 - fn: 27.0000 - fp: 8073.0000 - tn: 219356.0000 - tp: 390.0000 - precision: 0.0461 - recall: 0.9353 - val_loss: 1.0506e-06 - val_fn: 9.0000 - val_fp: 666.0000 - val_tn: 56220.0000 - val_tp: 66.0000 - val_precision: 0.0902 - val_recall: 0.8800
Epoch 4/30
227846/227846 - 14s - loss: 9.9642e-07 - fn: 27.0000 -

Epoch 29/30
227846/227846 - 13s - loss: 2.7340e-07 - fn: 1.0000 - fp: 3349.0000 - tn: 224080.0000 - tp: 416.0000 - precision: 0.1105 - recall: 0.9976 - val_loss: 4.6196e-06 - val_fn: 11.0000 - val_fp: 228.0000 - val_tn: 56658.0000 - val_tp: 64.0000 - val_precision: 0.2192 - val_recall: 0.8533
Epoch 30/30
227846/227846 - 13s - loss: 3.8658e-07 - fn: 2.0000 - fp: 2663.0000 - tn: 224766.0000 - tp: 415.0000 - precision: 0.1348 - recall: 0.9952 - val_loss: 4.4903e-06 - val_fn: 11.0000 - val_fp: 332.0000 - val_tn: 56554.0000 - val_tp: 64.0000 - val_precision: 0.1616 - val_recall: 0.8533


<tensorflow.python.keras.callbacks.History at 0xbec18acf98>

In [None]:
# In the real world, an even higher weight can be assigned to class 1 to reflect that false negatives are more costly than false positives.