In [42]:
import pandas as pd
import numpy as np

In [43]:
df_fraud=pd.read_csv('fraud_preprocessed.csv')
df_credit=pd.read_csv('credit_clean.csv')

In [None]:
df_fraud.head()

In [None]:
df_credit.head()

In [None]:
pip install mlflow

In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
import mlflow
import mlflow.sklearn
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, SimpleRNN, LSTM

In [45]:
def preprocess_fraud_data(df):
    df['signup_time'] = pd.to_datetime(df['signup_time'])
    df['purchase_time'] = pd.to_datetime(df['purchase_time'])
    df = df.drop(['signup_time', 'purchase_time', 'user_id', 'device_id'], axis=1)
    return df

df_fraud = preprocess_fraud_data(df_fraud)

In [46]:
# Impute missing values
imputer = SimpleImputer(strategy='mean')
df_fraud = pd.DataFrame(imputer.fit_transform(df_fraud), columns=df_fraud.columns)
df_credit = pd.DataFrame(imputer.fit_transform(df_credit), columns=df_credit.columns)

In [47]:
# Ensure target variables are binary
df_fraud['class'] = df_fraud['class'].astype(int)
df_credit['Class'] = df_credit['Class'].astype(int)

In [48]:
# Separate features and target for Fraud_Data
X_fraud = df_fraud.drop('class', axis=1)
y_fraud = df_fraud['class']

# Separate features and target for creditcard data
X_credit = df_credit.drop('Class', axis=1)
y_credit = df_credit['Class']

# Train-test split for Fraud_Data
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(X_fraud, y_fraud, test_size=0.3, random_state=42)

# Train-test split for creditcard data
X_credit_train, X_credit_test, y_credit_train, y_credit_test = train_test_split(X_credit, y_credit, test_size=0.3, random_state=42)

In [49]:
# Model training and evaluation function
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Performance of {model_name}:")
    print(classification_report(y_test, y_pred))

    # Log model and metrics with MLflow
    with mlflow.start_run():
        mlflow.sklearn.log_model(model, model_name)
        mlflow.log_param("model_type", model_name)
        mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))
        mlflow.log_metric("precision", precision_score(y_test, y_pred))
        mlflow.log_metric("recall", recall_score(y_test, y_pred))
        mlflow.log_metric("f1_score", f1_score(y_test, y_pred))

In [50]:
# Logistic Regression
log_reg = LogisticRegression()
train_and_evaluate_model(log_reg, X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test, "Logistic Regression")



Performance of Logistic Regression:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     16675
           1       0.00      0.00      0.00      1733

    accuracy                           0.91     18408
   macro avg       0.45      0.50      0.48     18408
weighted avg       0.82      0.91      0.86     18408



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [61]:
# Logistic Regression
log_reg = LogisticRegression()
train_and_evaluate_model(log_reg, X_credit_train, y_credit_train, X_credit_test, y_credit_test, "Logistic Regression")



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Performance of Logistic Regression:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30215
           1       0.75      0.55      0.64        65

    accuracy                           1.00     30280
   macro avg       0.87      0.78      0.82     30280
weighted avg       1.00      1.00      1.00     30280



In [51]:
# Decision Tree
dt = DecisionTreeClassifier()
train_and_evaluate_model(dt, X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test, "Decision Tree")



Performance of Decision Tree:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94     16675
           1       0.44      0.56      0.49      1733

    accuracy                           0.89     18408
   macro avg       0.69      0.74      0.71     18408
weighted avg       0.90      0.89      0.90     18408



In [62]:
# Decision Tree
dt = DecisionTreeClassifier()
train_and_evaluate_model(dt, X_credit_train, y_credit_train, X_credit_test, y_credit_test, "Decision Tree")



Performance of Decision Tree:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30215
           1       0.77      0.82      0.79        65

    accuracy                           1.00     30280
   macro avg       0.88      0.91      0.90     30280
weighted avg       1.00      1.00      1.00     30280



In [52]:
# Random Forest
rf = RandomForestClassifier()
train_and_evaluate_model(rf, X_credit_train, y_credit_train, X_credit_test, y_credit_test, "Random Forest")



Performance of Random Forest:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30215
           1       0.98      0.91      0.94        65

    accuracy                           1.00     30280
   macro avg       0.99      0.95      0.97     30280
weighted avg       1.00      1.00      1.00     30280



In [63]:
# Random Forest
rf = RandomForestClassifier()
train_and_evaluate_model(rf, X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test, "Random Forest")



Performance of Random Forest:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     16675
           1       0.99      0.52      0.68      1733

    accuracy                           0.95     18408
   macro avg       0.97      0.76      0.83     18408
weighted avg       0.96      0.95      0.95     18408



In [53]:
# Gradient Boosting
gb = GradientBoostingClassifier()
train_and_evaluate_model(gb, X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test, "Gradient Boosting")



Performance of Gradient Boosting:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     16675
           1       0.77      0.01      0.01      1733

    accuracy                           0.91     18408
   macro avg       0.84      0.50      0.48     18408
weighted avg       0.89      0.91      0.86     18408



In [64]:
# Gradient Boosting
gb = GradientBoostingClassifier()
train_and_evaluate_model(gb,  X_credit_train, y_credit_train, X_credit_test, y_credit_test, "Gradient Boosting")



Performance of Gradient Boosting:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30215
           1       0.93      0.85      0.89        65

    accuracy                           1.00     30280
   macro avg       0.97      0.92      0.94     30280
weighted avg       1.00      1.00      1.00     30280



In [54]:
# Multi-Layer Perceptron (MLP)
mlp = MLPClassifier()
train_and_evaluate_model(mlp, X_credit_train, y_credit_train, X_credit_test, y_credit_test, "MLP")



Performance of MLP:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30215
           1       0.67      0.03      0.06        65

    accuracy                           1.00     30280
   macro avg       0.83      0.52      0.53     30280
weighted avg       1.00      1.00      1.00     30280



In [65]:
# Multi-Layer Perceptron (MLP)
mlp = MLPClassifier()
train_and_evaluate_model(mlp, X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test, "MLP")



Performance of MLP:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     16675
           1       0.09      1.00      0.17      1733

    accuracy                           0.09     18408
   macro avg       0.05      0.50      0.09     18408
weighted avg       0.01      0.09      0.02     18408



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
# Neural Network model architectures and training
def train_nn_model(model, X_train, y_train, X_test, y_test, model_name):
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

    # Log model and metrics with MLflow
    with mlflow.start_run():
        mlflow.keras.log_model(model, model_name)
        loss, accuracy = model.evaluate(X_test, y_test)
        mlflow.log_metric("accuracy", accuracy)



In [56]:
# Reshape data for CNN and RNN models
X_fraud_train_cnn_rnn = X_fraud_train.values.reshape((X_fraud_train.shape[0], X_fraud_train.shape[1], 1))
X_fraud_test_cnn_rnn = X_fraud_test.values.reshape((X_fraud_test.shape[0], X_fraud_test.shape[1], 1))
X_credit_train_cnn_rnn = X_credit_train.values.reshape((X_credit_train.shape[0], X_credit_train.shape[1], 1))
X_credit_test_cnn_rnn = X_credit_test.values.reshape((X_credit_test.shape[0], X_credit_test.shape[1], 1))



In [57]:
# CNN
cnn_model = Sequential([
    Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_fraud_train_cnn_rnn.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid')
])
train_nn_model(cnn_model, X_fraud_train_cnn_rnn, y_fraud_train, X_fraud_test_cnn_rnn, y_fraud_test, "CNN")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10






In [69]:
# CNN for Credit Card Data
cnn_model_credit = Sequential([
    Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_credit_train_cnn_rnn.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid')
])
train_nn_model(cnn_model_credit, X_credit_train_cnn_rnn, y_credit_train, X_credit_test_cnn_rnn, y_credit_test, "CNN_Credit")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10






In [58]:
# RNN
rnn_model = Sequential([
    SimpleRNN(100, activation='relu', input_shape=(X_fraud_train_cnn_rnn.shape[1], 1)),
    Dense(1, activation='sigmoid')
])
train_nn_model(rnn_model, X_fraud_train_cnn_rnn, y_fraud_train, X_fraud_test_cnn_rnn, y_fraud_test, "RNN")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10






In [68]:
# RNN for Credit Card Data
rnn_model_credit = Sequential([
    SimpleRNN(100, activation='relu', input_shape=(X_credit_train_cnn_rnn.shape[1], 1)),
    Dense(1, activation='sigmoid')
])
train_nn_model(rnn_model_credit, X_credit_train_cnn_rnn, y_credit_train, X_credit_test_cnn_rnn, y_credit_test, "RNN_Credit")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10






In [59]:
# LSTM
lstm_model = Sequential([
    LSTM(100, activation='relu', input_shape=(X_credit_train_cnn_rnn.shape[1], 1)),
    Dense(1, activation='sigmoid')
])
train_nn_model(lstm_model, X_credit_train_cnn_rnn, y_credit_train, X_credit_test_cnn_rnn, y_credit_test, "LSTM")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10






In [67]:
# Corrected LSTM Model
lstm_model = Sequential([
    LSTM(100, activation='relu', input_shape=(X_fraud_train_cnn_rnn.shape[1], 1)),
    Dense(1, activation='sigmoid')
])

train_nn_model(lstm_model, X_fraud_train_cnn_rnn, y_fraud_train, X_fraud_test_cnn_rnn, y_fraud_test, "LSTM")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




