Step 1: Data Cleaning and Saving Cleaned Versions

In [1]:
import pandas as pd
import numpy as np

creditcard_df = pd.read_csv('../data/creditcard.csv')
fraud_data_df = pd.read_csv('../data/Fraud_Data.csv')

In [2]:
creditcard_df.drop_duplicates(inplace=True)

In [3]:
creditcard_df['Time'] = pd.to_datetime(creditcard_df['Time'], unit='s', origin='2010-01-01')

In [4]:
creditcard_df.to_csv('../data/cleaned_creditcard.csv', index=False)

In [5]:
fraud_data_df.drop_duplicates(inplace=True)

In [6]:
fraud_data_df['signup_time'] = pd.to_datetime(fraud_data_df['signup_time'])
fraud_data_df['purchase_time'] = pd.to_datetime(fraud_data_df['purchase_time'])

In [7]:
fraud_data_df.to_csv('../data/cleaned_fraud_data.csv', index=False)

In [8]:
fraud_data_df['signup_time'] = pd.to_datetime(fraud_data_df['signup_time'])
fraud_data_df['purchase_time'] = pd.to_datetime(fraud_data_df['purchase_time'])
fraud_data_df['signup_month'] = fraud_data_df['signup_time'].dt.month
fraud_data_df['signup_day'] = fraud_data_df['signup_time'].dt.day
fraud_data_df['purchase_month'] = fraud_data_df['purchase_time'].dt.month
fraud_data_df['purchase_day'] = fraud_data_df['purchase_time'].dt.day
fraud_data_df['purchase_hour'] = fraud_data_df['purchase_time'].dt.hour
fraud_data_df['time_diff'] = (fraud_data_df['purchase_time'] - fraud_data_df['signup_time']).dt.total_seconds() / 3600

In [9]:
fraud_data_df.drop(['signup_time', 'purchase_time'], axis=1, inplace=True)

In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

fraud_data_df['device_id_encoded'] = le.fit_transform(fraud_data_df['device_id'])

fraud_data_df = fraud_data_df.drop(['device_id'], axis=1)

In [11]:
creditcard_df['Time'] = pd.to_datetime(creditcard_df['Time'], unit='s', origin='unix')
creditcard_df['transaction_hour'] = creditcard_df['Time'].dt.hour
creditcard_df['transaction_day'] = creditcard_df['Time'].dt.day
creditcard_df['transaction_month'] = creditcard_df['Time'].dt.month

In [12]:
creditcard_df.drop(['Time'], axis=1, inplace=True)

In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
label_encoder = LabelEncoder()
fraud_data_df['source'] = label_encoder.fit_transform(fraud_data_df['source'])
fraud_data_df['browser'] = label_encoder.fit_transform(fraud_data_df['browser'])
fraud_data_df['sex'] = label_encoder.fit_transform(fraud_data_df['sex'])

In [14]:
scaler = StandardScaler()

numerical_columns_fraud = fraud_data_df.select_dtypes(include=['int64', 'float64']).columns.difference(['class'])
fraud_data_df[numerical_columns_fraud] = scaler.fit_transform(fraud_data_df[numerical_columns_fraud])

In [15]:
numerical_columns_credit = creditcard_df.select_dtypes(include=['int64', 'float64']).columns.difference(['Class'])
creditcard_df[numerical_columns_credit] = scaler.fit_transform(creditcard_df[numerical_columns_credit])

Feature and Target Separation

In [16]:
X_fraud = fraud_data_df.drop('class', axis=1)
y_fraud = fraud_data_df['class']

X_credit = creditcard_df.drop('Class', axis=1)
y_credit = creditcard_df['Class']

Train-Test Split

In [17]:
from sklearn.model_selection import train_test_split

X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(X_credit, y_credit, test_size=0.2, random_state=42)

X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)


In [18]:
X_train_fraud.to_csv('../data/X_train_fraud.csv', index=False)
X_test_fraud.to_csv('../data/X_test_fraud.csv', index=False)
y_train_fraud.to_csv('../data/y_train_fraud.csv', index=False)
y_test_fraud.to_csv('../data/y_test_fraud.csv', index=False)

X_train_credit.to_csv('../data/X_train_credit.csv', index=False)
X_test_credit.to_csv('../data/X_test_credit.csv', index=False)
y_train_credit.to_csv('../data/y_train_credit.csv', index=False)
y_test_credit.to_csv('../data/y_test_credit.csv', index=False)

print("Preprocessing complete! Data is ready for training.")

Preprocessing complete! Data is ready for training.


Step 2: Model Selection

In [23]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from keras.models import Sequential
from keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten
from keras.utils import to_categorical

def train_and_evaluate_model(model, model_name, X_train, X_test, y_train, y_test):
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
        
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else float('nan')
        
        mlflow.log_params(model.get_params())
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "roc_auc": roc_auc
        })
        
        mlflow.sklearn.log_model(model, model_name)
        
        print(f"{model_name} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}, ROC-AUC: {roc_auc}")

In [24]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier(),
    "MLP Classifier": MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500)
}

In [25]:
print("\nTraining models on credit card data...")
for model_name, model in models.items():
    train_and_evaluate_model(model, model_name + " - Credit", X_train_credit, X_test_credit, y_train_credit, y_test_credit)


Training models on credit card data...




Logistic Regression - Credit - Accuracy: 0.9992246149508336, Precision: 0.896551724137931, Recall: 0.5777777777777777, F1 Score: 0.7027027027027027, ROC-AUC: 0.9652120791364649




Decision Tree - Credit - Accuracy: 0.9991365030134283, Precision: 0.7252747252747253, Recall: 0.7333333333333333, F1 Score: 0.7292817679558011, ROC-AUC: 0.8664460369010637




Random Forest - Credit - Accuracy: 0.9995418179254926, Precision: 0.9705882352941176, Recall: 0.7333333333333333, F1 Score: 0.8354430379746836, ROC-AUC: 0.9312538242492705




Gradient Boosting - Credit - Accuracy: 0.9993127268882388, Precision: 0.9047619047619048, Recall: 0.6333333333333333, F1 Score: 0.7450980392156863, ROC-AUC: 0.7665101666195988




MLP Classifier - Credit - Accuracy: 0.9993479716632009, Precision: 0.9206349206349206, Recall: 0.6444444444444445, F1 Score: 0.7581699346405228, ROC-AUC: 0.9383744783331764


In [26]:
print("Training models on fraud data...")
for model_name, model in models.items():
    train_and_evaluate_model(model, model_name + " - Fraud", X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud)

Training models on fraud data...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression - Fraud - Accuracy: 0.9331303973794792, Precision: 0.958011049723757, Recall: 0.3042105263157895, F1 Score: 0.4617842876165113, ROC-AUC: 0.7649434165181338




Decision Tree - Fraud - Accuracy: 0.9025576547662376, Precision: 0.4856365285757484, Recall: 0.5635087719298245, F1 Score: 0.5216826376482053, ROC-AUC: 0.7506836228041334




Random Forest - Fraud - Accuracy: 0.9564239155609966, Precision: 1.0, Recall: 0.5378947368421053, F1 Score: 0.6995208761122519, ROC-AUC: 0.7686466495028716




Gradient Boosting - Fraud - Accuracy: 0.9564570029447772, Precision: 1.0, Recall: 0.5382456140350877, F1 Score: 0.6998175182481752, ROC-AUC: 0.7761667438973352




MLP Classifier - Fraud - Accuracy: 0.9237666677695795, Precision: 0.9739583333333334, Recall: 0.1968421052631579, F1 Score: 0.3274956217162872, ROC-AUC: 0.7658989938221875


# ---------------------------
# Deep Learning Models (CNN, RNN, LSTM)
# ---------------------------

In [31]:
from keras.callbacks import EarlyStopping

def build_and_train_keras_model(model, model_name, X_train, X_test, y_train, y_test, input_shape):
    with mlflow.start_run(run_name=model_name):
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
        early_stopping = EarlyStopping(monitor='val_loss', patience=3)
        
        model.fit(X_train, y_train, validation_split=0.2, epochs=20, batch_size=32, callbacks=[early_stopping])
        loss, accuracy = model.evaluate(X_test, y_test)
        
        
        y_proba = model.predict(X_test).ravel()
        y_pred = (y_proba > 0.5).astype(int)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_proba)
        
        
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "roc_auc": roc_auc
        })
        mlflow.keras.log_model(model, model_name)
        
        print(f"{model_name} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}, ROC-AUC: {roc_auc}")

In [32]:
X_train_fraud_dl = np.expand_dims(X_train_fraud.values, axis=2)
X_test_fraud_dl = np.expand_dims(X_test_fraud.values, axis=2)
X_train_credit_dl = np.expand_dims(X_train_credit.values, axis=2)
X_test_credit_dl = np.expand_dims(X_test_credit.values, axis=2)

In [33]:
cnn_model = Sequential([
    Conv1D(32, 2, activation='relu', input_shape=(X_train_fraud_dl.shape[1], 1)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
build_and_train_keras_model(cnn_model, "CNN - Fraud", X_train_fraud_dl, X_test_fraud_dl, y_train_fraud, y_test_fraud, X_train_fraud_dl.shape[1])

Epoch 1/20
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 3ms/step - accuracy: 0.8464 - loss: 51.6931 - val_accuracy: 0.9224 - val_loss: 2.1766
Epoch 2/20
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8654 - loss: 10.2845 - val_accuracy: 0.9076 - val_loss: 29.1280
Epoch 3/20
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8742 - loss: 7.2040 - val_accuracy: 0.9137 - val_loss: 5.2207
Epoch 4/20
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8753 - loss: 4.5591 - val_accuracy: 0.9286 - val_loss: 0.8255
Epoch 5/20
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.8782 - loss: 2.7661 - val_accuracy: 0.3251 - val_loss: 4.8234
Epoch 6/20
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.8944 - loss: 1.2148 - val_accuracy: 0.9391 - val_loss: 0.5652
Epoch 7/20




CNN - Fraud - Accuracy: 0.9245938658714294, Precision: 0.9481946624803768, Recall: 0.2119298245614035, F1 Score: 0.3464295956409521, ROC-AUC: 0.7618238487022364


In [34]:
lstm_model = Sequential([
    LSTM(32, input_shape=(X_train_fraud_dl.shape[1], 1)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
build_and_train_keras_model(lstm_model, "LSTM - Fraud", X_train_fraud_dl, X_test_fraud_dl, y_train_fraud, y_test_fraud, X_train_fraud_dl.shape[1])

  super().__init__(**kwargs)


Epoch 1/20
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 5ms/step - accuracy: 0.9353 - loss: 0.2271 - val_accuracy: 0.9544 - val_loss: 0.1844
Epoch 2/20
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 9ms/step - accuracy: 0.9547 - loss: 0.1842 - val_accuracy: 0.9552 - val_loss: 0.1837
Epoch 3/20
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 9ms/step - accuracy: 0.9550 - loss: 0.1833 - val_accuracy: 0.9555 - val_loss: 0.1811
Epoch 4/20
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 7ms/step - accuracy: 0.9552 - loss: 0.1817 - val_accuracy: 0.9557 - val_loss: 0.1808
Epoch 5/20
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 7ms/step - accuracy: 0.9561 - loss: 0.1789 - val_accuracy: 0.9557 - val_loss: 0.1800
Epoch 6/20
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 9ms/step - accuracy: 0.9568 - loss: 0.1765 - val_accuracy: 0.9556 - val_loss: 0.1805
Epoch 7/20



LSTM - Fraud - Accuracy: 0.9559937715530396, Precision: 0.9896907216494846, Recall: 0.5389473684210526, F1 Score: 0.6978646069968196, ROC-AUC: 0.7644220037545


In [37]:
def create_cnn_model(input_shape):
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model
input_shape_cnn = (X_train_credit_dl.shape[1], X_train_credit_dl.shape[2])
cnn_model = create_cnn_model(input_shape_cnn)

build_and_train_keras_model(cnn_model, "CNN - Credit", X_train_credit_dl, X_test_credit_dl, y_train_credit, y_test_credit, input_shape_cnn)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m5675/5675[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step - accuracy: 0.9976 - loss: 0.0095 - val_accuracy: 0.9992 - val_loss: 0.0039
Epoch 2/20
[1m5675/5675[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 5ms/step - accuracy: 0.9993 - loss: 0.0034 - val_accuracy: 0.9992 - val_loss: 0.0041
Epoch 3/20
[1m5675/5675[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 5ms/step - accuracy: 0.9995 - loss: 0.0030 - val_accuracy: 0.9993 - val_loss: 0.0042
Epoch 4/20
[1m5675/5675[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 5ms/step - accuracy: 0.9994 - loss: 0.0031 - val_accuracy: 0.9992 - val_loss: 0.0038
Epoch 5/20
[1m5675/5675[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 5ms/step - accuracy: 0.9994 - loss: 0.0032 - val_accuracy: 0.9993 - val_loss: 0.0033
Epoch 6/20
[1m5675/5675[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 5ms/step - accuracy: 0.9994 - loss: 0.0031 - val_accuracy: 0.9990 - val_loss: 0.0043
Epoch 7/20



CNN - Credit - Accuracy: 0.99943608045578, Precision: 0.953125, Recall: 0.6777777777777778, F1 Score: 0.7922077922077922, ROC-AUC: 0.9602787975775833
