[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/aldomunaretto/immune_deep_learning/blob/main/notebooks/01_intro_DL/05_keras_imbalanced_classification.ipynb)

# Artificial Neural Networks - Imbalanced classification

## Credit Card Fraud Detection

Reference: https://keras.io/examples/structured_data/imbalanced_classification/

### Introduction

This example looks at the
[Kaggle Credit Card Fraud Detection](https://www.kaggle.com/mlg-ulb/creditcardfraud/)
dataset to demonstrate how
to train a classification model on data with highly imbalanced classes.

### Download Dataset from Kaggle

In [135]:
# Optional one-time setup: uncomment the commands below to fetch the dataset
# from Kaggle and unzip it under /content/drive/MyDrive/data.
# NOTE(review): presumably needs Kaggle API credentials and a mounted Google
# Drive before it can run — confirm.

# # Install Kaggle library
# !pip install kaggle

# # Download and unzip the dataset
# !kaggle datasets download -d mlg-ulb/creditcardfraud -p /content/drive/MyDrive/data
# !unzip /content/drive/MyDrive/data/creditcardfraud.zip -d /content/drive/MyDrive/data

# print("Dataset downloaded and unzipped")

### Import Libraries

In [136]:
import csv
import random

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight

# Import the proper libraries for keras
import keras
from tensorflow.keras import regularizers, Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.metrics import TruePositives, Precision, Recall, AUC
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import layers

### Vectorize the CSV data

In [137]:
# Read the CSV file into two NumPy arrays: `features` (every column except the
# last) and `targets` (the last column, the class label).
fname = "/content/drive/MyDrive/data/creditcard.csv"

all_features = []
all_targets = []
# newline="" is the mode the csv module asks for when it is handed a file object.
with open(fname, newline="") as f:
    # Echo the raw header line, then let csv.reader parse the data rows; it
    # handles the double-quoted fields, so no manual quote stripping is needed.
    print("HEADER:", f.readline().strip())
    for row in csv.reader(f):
        if not row:
            continue  # Tolerate blank lines (e.g. a trailing newline at EOF)
        *feature_fields, label_field = row
        all_features.append([float(v) for v in feature_fields])
        all_targets.append([int(label_field)])
        if len(all_features) == 1:
            # Show the first parsed data row as a sanity check
            print("EXAMPLE FEATURES:", all_features[-1])

features = np.array(all_features, dtype="float32")
targets = np.array(all_targets, dtype="uint8")
print("features.shape:", features.shape)
print("targets.shape:", targets.shape)

HEADER: "Time","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27","V28","Amount","Class"
EXAMPLE FEATURES: [0.0, -1.3598071336738, -0.0727811733098497, 2.53634673796914, 1.37815522427443, -0.338320769942518, 0.462387777762292, 0.239598554061257, 0.0986979012610507, 0.363786969611213, 0.0907941719789316, -0.551599533260813, -0.617800855762348, -0.991389847235408, -0.311169353699879, 1.46817697209427, -0.470400525259478, 0.207971241929242, 0.0257905801985591, 0.403992960255733, 0.251412098239705, -0.018306777944153, 0.277837575558899, -0.110473910188767, 0.0669280749146731, 0.128539358273528, -0.189114843888824, 0.133558376740387, -0.0210530534538215, 149.62]
features.shape: (284807, 30)
targets.shape: (284807, 1)


In [138]:
# Load the same CSV into a DataFrame for a quick look at column dtypes and
# non-null counts (pandas is imported in the notebook's import cell).
df = pd.read_csv(fname)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [139]:
# Per-column summary statistics; for the 0/1 `Class` column the mean is the
# fraction of rows labelled 1.
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,...,1.654067e-16,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


### Prepare a validation set

In [229]:
# Hold out the last 20% of the rows; the first 80% are used for training.
# Rows stay in file order (no shuffling is done here).
num_val_samples = int(len(features) * 0.2)

# Slice with an explicit index. Negative slicing (`features[:-num_val_samples]`)
# silently yields an EMPTY training set when num_val_samples is 0, because
# -0 == 0 makes the slice `features[:0]`.
split_index = len(features) - num_val_samples
X_train, X_test = features[:split_index], features[split_index:]
y_train, y_test = targets[:split_index], targets[split_index:]

print("Number of training samples:", len(X_train))
print("Number of validation samples:", len(X_test))

Number of training samples: 227846
Number of validation samples: 56961


In [230]:
# Display the raw (not yet scaled) float32 feature matrix built from the CSV.
features

array([[ 0.0000000e+00, -1.3598071e+00, -7.2781175e-02, ...,
         1.3355838e-01, -2.1053053e-02,  1.4962000e+02],
       [ 0.0000000e+00,  1.1918571e+00,  2.6615071e-01, ...,
        -8.9830989e-03,  1.4724169e-02,  2.6900001e+00],
       [ 1.0000000e+00, -1.3583541e+00, -1.3401631e+00, ...,
        -5.5352796e-02, -5.9751842e-02,  3.7866000e+02],
       ...,
       [ 1.7278800e+05,  1.9195650e+00, -3.0125386e-01, ...,
         4.4547720e-03, -2.6560828e-02,  6.7879997e+01],
       [ 1.7278800e+05, -2.4044006e-01,  5.3048253e-01, ...,
         1.0882074e-01,  1.0453282e-01,  1.0000000e+01],
       [ 1.7279200e+05, -5.3341252e-01, -1.8973334e-01, ...,
        -2.4153087e-03,  1.3648914e-02,  2.1700000e+02]], dtype=float32)

### Normalization + random seed

In [231]:
# Seed every random number generator used by the notebook before the model is
# built. (random, numpy, tensorflow and StandardScaler come from the import cell.)
seed = 42
random.seed(seed)          # Python's built-in random generator
np.random.seed(seed)       # NumPy's global generator
tf.random.set_seed(seed)   # TensorFlow's global seed

# Fit the scaler on the training rows only, then reuse those statistics for the
# hold-out rows so the hold-out data never influences the preprocessing.
scaler = StandardScaler()

X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

# Sanity check: per-column mean / std after scaling. Only the training columns
# are expected to be ~0 / ~1, since the scaler was fitted on them alone.
print('train_features mu, sigma', X_train_norm.mean(0), X_train_norm.std(0))
print('test_features mu, sigma', X_test_norm.mean(0), X_test_norm.std(0))

train_features mu, sigma [ 9.72985254e-06 -4.99238553e-08 -1.49481309e-07  4.70997929e-06
  1.04751805e-07 -3.37099550e-07  2.59797616e-07  6.72313483e-10
 -1.43351585e-07  1.13865312e-07  5.24606918e-08 -1.09009591e-06
  6.24808081e-07 -1.95501570e-07 -1.01077042e-07  2.27438157e-08
  2.55664858e-08 -3.92054773e-07  1.16144641e-07 -9.48189580e-08
  3.63741208e-07  2.80278584e-07  8.40823517e-08 -7.57773151e-08
 -1.55235682e-07  6.83880671e-07  1.08841531e-08 -3.03192600e-08
  1.14463985e-08 -8.06893695e-07] [0.99998254 0.9999752  0.99995184 0.9999744  0.99998105 0.9999654
 0.99997044 0.9999639  0.99991703 0.9999617  0.9999607  0.99996996
 0.99996406 0.99997354 0.9999689  0.99997693 0.99997944 0.9999634
 0.99997306 0.9999727  0.99993575 0.9999414  0.99998456 0.9999309
 0.9999576  0.9999833  0.9999894  0.9999168  0.99992394 0.99987143]
test_features mu, sigma [ 1.9960041   0.17281522  0.04075707 -0.6122728  -0.15515524  0.2320748
 -0.11553436  0.10928366 -0.0164728  -0.00990407  0.00779

### Build a binary classification model

In [232]:
# Check the (rows, columns) shape of the scaled training matrix; the column
# count is the width the model's input layer has to accept.
print(X_train_norm.shape)

(227846, 30)


In [233]:
# Build the binary classifier: four regularised hidden blocks and a sigmoid output.

# Size the input layer from the data instead of hard-coding the column count.
n_features = X_train_norm.shape[1]

# Dropout rate applied after each hidden layer, in order (one entry per layer).
dropout_rates = (0.3, 0.1, 0.1, 0.1)

# Use the `Sequential` class imported from tensorflow.keras so the container
# comes from the same namespace as the Input / Dense / Dropout layers added to it.
model = Sequential()

# Input layer
model.add(Input(shape=(n_features,), name="Input_layer"))

# Hidden layers: 64 ReLU units with L2 kernel regularisation, each followed by
# dropout. Names run hidden_layer_1 .. hidden_layer_4.
for layer_idx, rate in enumerate(dropout_rates, start=1):
    model.add(
        Dense(
            64,
            activation="relu",
            kernel_regularizer=regularizers.l2(0.001),
            name=f"hidden_layer_{layer_idx}",
        )
    )
    model.add(layers.Dropout(rate))

# Output layer: a single sigmoid unit
model.add(Dense(1, activation="sigmoid", name="Output_layer"))

model.summary()

### Practice: detect 90% of the frauds in the test dataset (TP >= 68)

**Tip**: see the `class_weight` parameter of the `fit` method in the Keras documentation: https://keras.io/api/models/model_training_apis/#fit-method

In [234]:
# Metrics reported during training and evaluation. All classes are taken from
# tensorflow.keras, the same namespace as the optimizer and the callback below.
metrics = [
    TruePositives(name="tp"),
    tf.keras.metrics.FalsePositives(name="fp"),
    AUC(name="auc"),
]

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=metrics,
)

# Callbacks: stop once val_loss has not improved for 20 epochs and roll the
# model back to the weights of its best epoch.
callbacks = [
    EarlyStopping(
        monitor="val_loss",
        patience=20,
        restore_best_weights=True,
    )
]


### Class balancing

In [235]:
# Compute "balanced" class weights from the training labels
# (`class_weight` is sklearn.utils.class_weight, imported in the import cell).
class_labels = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=y_train.flatten()
)

# Build the {label: weight} dictionary from the actual labels rather than from
# positional indices, so the mapping stays right even if the labels are not
# exactly 0..k-1. Cast to plain int / float to keep keys and values simple.
class_weights = {
    int(label): float(weight)
    for label, weight in zip(class_labels, balanced_weights)
}
class_weights

{0: 0.500916769629203, 1: 273.1966426858513}

In [236]:
# Training: collect the fit() options in one place, then launch the run.
fit_options = dict(
    batch_size=1000,
    epochs=100,
    validation_split=0.20,       # fraction of the training arrays held back for val_* metrics
    shuffle=True,
    class_weight=class_weights,  # per-class weights computed from the training labels
    callbacks=callbacks,
    verbose=1,
)
model.fit(X_train_norm, y_train, **fit_options)

Epoch 1/100
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 39ms/step - auc: 0.7441 - fp: 21077.5156 - loss: 0.8644 - tp: 141.9402 - val_auc: 0.9286 - val_fp: 1432.0000 - val_loss: 0.4180 - val_tp: 45.0000
Epoch 2/100
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - auc: 0.9781 - fp: 3118.9619 - loss: 0.3850 - tp: 166.7120 - val_auc: 0.9509 - val_fp: 1608.0000 - val_loss: 0.3368 - val_tp: 45.0000
Epoch 3/100
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - auc: 0.9805 - fp: 3149.3586 - loss: 0.3441 - tp: 169.0489 - val_auc: 0.9498 - val_fp: 3514.0000 - val_loss: 0.4316 - val_tp: 47.0000
Epoch 4/100
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - auc: 0.9855 - fp: 4120.4839 - loss: 0.3359 - tp: 169.9946 - val_auc: 0.9500 - val_fp: 1267.0000 - val_loss: 0.2720 - val_tp: 46.0000
Epoch 5/100
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - auc: 0.9890 - fp: 3

<keras.src.callbacks.history.History at 0x7ae37aa19480>

In [238]:
# Score the trained model on the hold-out set. `results` is indexed as
# [loss, tp, fp, auc], matching the order in which the metrics were compiled.
results = model.evaluate(X_test_norm, y_test, verbose=0)

report_templates = (
    'Test Loss: {}',
    'Test TP: {}',
    'Test FP: {}',
    'Test AUC: {}',
)
for template, value in zip(report_templates, results):
    print(template.format(value))

Test Loss: 0.03885570541024208
Test TP: 62.0
Test FP: 93.0
Test AUC: 0.9326913356781006
