# Binary Classification with k-Fold Cross Validation

In [8]:
import tensorflow as tf
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.metrics import classification_report, f1_score
import numpy as np

In [9]:
df = pd.read_csv('datasets/fraud_detection.csv')
X = df.iloc[:, 1:-1]
y = df['targets']

In [10]:
X_main, X_holdout, y_main, y_holdout = train_test_split(X, y, test_size=0.2, random_state=11)
mm  = MinMaxScaler()
X_holdout = mm.fit_transform(X_holdout)
X_main = X_main.to_numpy()
y_main = y_main.to_numpy()
y_holdout = y_holdout.to_numpy()

In [11]:
X_main.shape

(16374, 112)

In [12]:
X_holdout.shape

(4094, 112)

In [16]:
k = 10 # 10 folds
skf = StratifiedKFold(k, shuffle=True, random_state=42)
fold = 0
best_model = None
best_fold = 0
best_model_f1 = 0
for train_idx, test_idx in skf.split(X_main, y_main):
    # In argument of skf.split, along with X_main, you have to give the target variables array y_main which contains classes as [0,1,1,2,3,4,4,5,5].
    # It is useful for even distribution of different class instances among the folds.
    # train_idx and test_idx are the arrays like: [1,2,3,4] which contains indices of the passed array X_main
    fold += 1
    print(f"Fold: {fold}")
    X_train = X_main[train_idx]
    y_train = y_main[train_idx]
    X_val = X_main[test_idx]
    y_val = y_main[test_idx]
    X_train = mm.fit_transform(X_train)
    X_val = mm.fit_transform(X_val)
    model = tf.keras.Sequential([
        tf.keras.Input(shape= (X_train.shape[1],) ),
        tf.keras.layers.Dense(units=1) 
])
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                optimizer=tf.keras.optimizers.SGD(learning_rate=1.))
    model.fit(X_train,y_train,validation_data=(X_val,y_val), batch_size=10000, epochs=100, verbose=0)
    logit = model.predict(X_val, verbose=0) # This is the raw answer of equation wx+b
    y_pred = np.array(tf.nn.sigmoid(logit))
    y_pred_classes = np.where(y_pred > 0.5, 1, 0)
    weighted_f1 = f1_score(y_val, y_pred_classes, average='weighted')
    if weighted_f1 > best_model_f1:
        best_model = model
        best_fold = fold
        best_model_f1 = weighted_f1
    print(f"Fold F1: {weighted_f1}\n")

logit_holdout = best_model.predict(X_holdout, verbose=0)
y_pred_holdout = np.array(tf.nn.sigmoid(logit_holdout))
y_pred_holdout_classes = np.where(y_pred_holdout > 0.5, 1, 0)
f1_holdout = f1_score(y_holdout, y_pred_holdout_classes, average='weighted')
print('Best fold: ', best_fold)
print('Best model params:\n')
print(f'F1 score for cross validation set: {best_model_f1}') 
print(f'F1 score for hold out set: {f1_holdout}') 
cr = classification_report(y_holdout, y_pred_holdout_classes)
print('\nClassification Report of Best Model:\n', cr)


Fold: 1
Fold F1: 0.8403894602890003

Fold: 2
Fold F1: 0.8328204707112381

Fold: 3
Fold F1: 0.8447228429492255

Fold: 4
Fold F1: 0.8437072068550302

Fold: 5
Fold F1: 0.8325217602845573

Fold: 6
Fold F1: 0.8446430580048759

Fold: 7
Fold F1: 0.8382108164743376

Fold: 8
Fold F1: 0.8400022854847489

Fold: 9
Fold F1: 0.8375061499949383

Fold: 10
Fold F1: 0.8487815345943778

Best fold:  10
Best model params:

F1 score for cross validation set: 0.8487815345943778
F1 score for hold out set: 0.847544141677466

Classification Report of Best Model:
               precision    recall  f1-score   support

           0       0.88      0.93      0.90      3034
           1       0.76      0.62      0.69      1060

    accuracy                           0.85      4094
   macro avg       0.82      0.78      0.80      4094
weighted avg       0.85      0.85      0.85      4094

