**First, vectorize the CSV data**

In [2]:
import csv
import numpy as np


# Load the credit-card CSV into two NumPy arrays: every column except the last
# is parsed as a float feature, and the last column is parsed as the integer
# class label.
fname = "/content/creditcard.csv"

all_features = []
all_targets = []
# newline="" lets the csv module handle line endings itself.
with open(fname, newline="") as f:
    # Echo the header exactly as it appears in the file, then move past it.
    header_line = f.readline()
    if header_line:
        print("HEADER:", header_line.strip())
    # csv.reader splits on commas and removes the surrounding double quotes,
    # replacing the hand-rolled split(",") / replace('"', "") parsing.
    for fields in csv.reader(f):
        if not any(field.strip() for field in fields):
            # Blank line (e.g. a trailing newline at the end of the file):
            # skip it instead of crashing on int("").
            continue
        all_features.append([float(v) for v in fields[:-1]])
        all_targets.append([int(fields[-1])])
        if len(all_features) == 1:
            # Show the first data row as a sanity check of the parsing.
            print("EXAMPLE FEATURES:", all_features[-1])

features = np.array(all_features, dtype="float32")
targets = np.array(all_targets, dtype="uint8")
print("features.shape:", features.shape)
print("targets.shape:", targets.shape)

HEADER: "Time","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27","V28","Amount","Class"
EXAMPLE FEATURES: [0.0, -1.3598071336738, -0.0727811733098497, 2.53634673796914, 1.37815522427443, -0.338320769942518, 0.462387777762292, 0.239598554061257, 0.0986979012610507, 0.363786969611213, 0.0907941719789316, -0.551599533260813, -0.617800855762348, -0.991389847235408, -0.311169353699879, 1.46817697209427, -0.470400525259478, 0.207971241929242, 0.0257905801985591, 0.403992960255733, 0.251412098239705, -0.018306777944153, 0.277837575558899, -0.110473910188767, 0.0669280749146731, 0.128539358273528, -0.189114843888824, 0.133558376740387, -0.0210530534538215, 149.62]
features.shape: (284807, 30)
targets.shape: (284807, 1)


**Prepare a validation set**

In [3]:
# Hold out the last 20% of the rows for validation; everything before that is
# used for training. No shuffling is done, so file order is preserved.
num_val_samples = int(len(features) * 0.2)
# Slice at an explicit split index instead of a negative offset: when
# num_val_samples is 0 (fewer than 5 rows), `features[:-0]` is empty and
# `features[-0:]` is the whole array, which would swap the two sets.
split_index = len(features) - num_val_samples
train_features = features[:split_index]
train_targets = targets[:split_index]
val_features = features[split_index:]
val_targets = targets[split_index:]

print("Number of training samples:", len(train_features))
print("Number of validation samples:", len(val_features))

Number of training samples: 227846
Number of validation samples: 56961


**Analyze class imbalance in the targets**

In [4]:
# Count how many training samples fall in each class
# (index 0 -> class 0, index 1 -> class 1).
counts = np.bincount(train_targets[:, 0])
num_negative, num_positive = counts[0], counts[1]
positive_pct = 100 * float(num_positive) / len(train_targets)
print(
    "Number of positive samples in training data: {} ({:.2f}% of total)".format(
        num_positive, positive_pct
    )
)

# Inverse-frequency weights: the rarer a class is, the larger its weight.
weight_for_0 = 1.0 / num_negative
weight_for_1 = 1.0 / num_positive

Number of positive samples in training data: 417 (0.18% of total)


**Normalize the data using training set statistics**

In [5]:
# Standardize each feature column using statistics computed on the training
# set only, then apply the same shift and scale to the validation set.
# NOTE: these are in-place operations, so any array sharing memory with
# train_features / val_features (such as the array they were sliced from)
# is modified as well.
mean = np.mean(train_features, axis=0)
train_features -= mean
val_features -= mean
std = np.std(train_features, axis=0)
# A column that is constant over the training set has std == 0; dividing by
# it would fill that column with NaN/inf. Use 1 there so the column simply
# keeps its centered values.
std[std == 0] = 1.0
train_features /= std
val_features /= std

**Build a binary classification model**

In [7]:
import keras

# Fully-connected binary classifier, assembled layer by layer:
# three 256-unit ReLU layers (dropout after the second and third)
# followed by a single sigmoid output unit.
model = keras.Sequential()
model.add(keras.Input(shape=train_features.shape[1:]))
model.add(keras.layers.Dense(256, activation="relu"))
model.add(keras.layers.Dense(256, activation="relu"))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(256, activation="relu"))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(1, activation="sigmoid"))
model.summary()

**Train the model with the `class_weight` argument**

In [8]:
# Confusion-matrix counts plus precision and recall, each logged under a
# short name so the per-epoch output stays readable.
metrics = [
    metric_class(name=short_name)
    for metric_class, short_name in (
        (keras.metrics.FalseNegatives, "fn"),
        (keras.metrics.FalsePositives, "fp"),
        (keras.metrics.TrueNegatives, "tn"),
        (keras.metrics.TruePositives, "tp"),
        (keras.metrics.Precision, "precision"),
        (keras.metrics.Recall, "recall"),
    )
]

optimizer = keras.optimizers.Adam(1e-2)
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=metrics)

# The checkpoint file name contains the epoch number, so each saved epoch
# gets its own file.
checkpoint = keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.keras")
callbacks = [checkpoint]
# Per-class loss weights computed from the training class counts.
class_weight = {0: weight_for_0, 1: weight_for_1}

fit_options = dict(
    batch_size=2048,
    epochs=30,
    verbose=2,
    callbacks=callbacks,
    validation_data=(val_features, val_targets),
    class_weight=class_weight,
)
model.fit(train_features, train_targets, **fit_options)

Epoch 1/30
112/112 - 11s - 100ms/step - fn: 46.0000 - fp: 35125.0000 - loss: 2.6337e-06 - precision: 0.0105 - recall: 0.8897 - tn: 192304.0000 - tp: 371.0000 - val_fn: 9.0000 - val_fp: 1183.0000 - val_loss: 0.1244 - val_precision: 0.0528 - val_recall: 0.8800 - val_tn: 55703.0000 - val_tp: 66.0000
Epoch 2/30
112/112 - 8s - 71ms/step - fn: 36.0000 - fp: 6373.0000 - loss: 1.4472e-06 - precision: 0.0564 - recall: 0.9137 - tn: 221056.0000 - tp: 381.0000 - val_fn: 8.0000 - val_fp: 1858.0000 - val_loss: 0.1201 - val_precision: 0.0348 - val_recall: 0.8933 - val_tn: 55028.0000 - val_tp: 67.0000
Epoch 3/30
112/112 - 9s - 82ms/step - fn: 33.0000 - fp: 8625.0000 - loss: 1.4731e-06 - precision: 0.0426 - recall: 0.9209 - tn: 218804.0000 - tp: 384.0000 - val_fn: 6.0000 - val_fp: 1653.0000 - val_loss: 0.1011 - val_precision: 0.0401 - val_recall: 0.9200 - val_tn: 55233.0000 - val_tp: 69.0000
Epoch 4/30
112/112 - 7s - 65ms/step - fn: 25.0000 - fp: 7891.0000 - loss: 1.0994e-06 - precision: 0.0473 - recal

<keras.src.callbacks.history.History at 0x7b0047222c20>

### Conclusions
At the end of training, out of 56,961 validation transactions, we are:

* Correctly identifying 66 of them as fraudulent  
* Missing 9 fraudulent transactions  
* At the cost of incorrectly flagging 441 legitimate transactions  

In the real world, one would put an even higher weight on class 1, so as to reflect that False Negatives are more costly than False Positives.

Next time your credit card gets declined in an online purchase – this is why.

Example available on Hugging Face.