# Diyako Gilibagu (2024): This project uses a Kaggle dataset of credit card transactions by European cardholders in 2013 to build machine learning models for fraud detection, focusing on imbalanced data with PCA-transformed features and a binary fraud label.


In [None]:
!pip install pandas numpy scikit-learn tensorflow matplotlib seaborn


# Preprocessing Data for Easier Use Later

In [None]:
# Import RobustScaler from sklearn to scale features while reducing the influence of outliers.
from sklearn.preprocessing import RobustScaler

# Copy the original dataset for processing
new_data = data.copy()

# Scale the 'Amount' feature using RobustScaler
new_data['Amount'] = RobustScaler().fit_transform(new_data['Amount'].to_numpy().reshape(-1, 1))

# Display histogram and stats for 'Amount'
new_data['Amount'].hist()
print(new_data['Amount'].describe())

# Normalize the 'Time' column
time = new_data['Time']
new_data['Time'] = (time - time.min()) / (time.max() - time.min())

# Shuffle the entire dataset randomly
new_data = new_data.sample(frac=1, random_state=1)


# Splitting Data for Training, Testing, and Validation

In [None]:
# Split the shuffled data into training, testing, and validation sets
train, test, val = new_data[:240000], new_data[240000:262000], new_data[262000:]

# Print the class distribution (fraud vs non-fraud) in each set
print(train['Class'].value_counts())
print(test['Class'].value_counts())
print(val['Class'].value_counts())

# Convert the DataFrames into numpy arrays for model compatibility
train_np, test_np, val_np = train.to_numpy(), test.to_numpy(), val.to_numpy()

# Split into input and output
x_train, y_train = train_np[:, :-1], train_np[:, -1]
x_test, y_test = test_np[:, :-1], test_np[:, -1]
x_val, y_val = val_np[:, :-1], val_np[:, -1]

# Print the shapes of datasets
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_val.shape, y_val.shape)

# Logistic Regression Model

In [5]:
from sklearn.linear_model import LogisticRegression

# Train the logistic regression model
logistic_model = LogisticRegression()
logistic_model.fit(x_train, y_train)

# Print training accuracy
print(logistic_model.score(x_train, y_train))

# Evaluate model on validation set
from sklearn.metrics import classification_report

# Define target names for the classification report
target_names = ['Not Fraud', 'Fraud']

# Generate and print the classification report
print(classification_report(y_val, logistic_model.predict(x_val), target_names=target_names))

0.9992375
              precision    recall  f1-score   support

   Not Fraud       1.00      1.00      1.00     22771
       Fraud       0.83      0.56      0.67        36

    accuracy                           1.00     22807
   macro avg       0.92      0.78      0.83     22807
weighted avg       1.00      1.00      1.00     22807



# Shallow Neural Network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint

# Define the neural network
shallow_nn = Sequential([
    InputLayer((x_train.shape[1],)),
    Dense(2, activation='relu'),
    BatchNormalization(),
    Dense(1, activation='sigmoid')  # Sigmoid for binary classification
])

# Define a checkpoint to save the best model
checkpoint = ModelCheckpoint('shallow_nn.keras', save_best_only=True)

# Compile the neural network
shallow_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
#shallow_nn.summary()

# Train the neural network
shallow_nn.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=5, callbacks=[checkpoint])

# Define a function to generate predictions
def neural_net_predictions(model, x):
    return (model.predict(x).flatten() > 0.5).astype(int)

# Generate predictions for the validation set
predictions = neural_net_predictions(shallow_nn, x_val)
print(predictions)

In [8]:
# Evaluate neural network on validation data
print(classification_report(y_val, predictions, target_names=target_names))

              precision    recall  f1-score   support

   Not Fraud       1.00      1.00      1.00     22771
       Fraud       0.67      0.78      0.72        36

    accuracy                           1.00     22807
   macro avg       0.83      0.89      0.86     22807
weighted avg       1.00      1.00      1.00     22807



# Random Forest Classifier

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=2, n_jobs=1)
rf.fit(x_train, y_train)
print(classification_report(y_val, rf.predict(x_val), target_names=target_names))


              precision    recall  f1-score   support

   Not Fraud       1.00      1.00      1.00     22771
       Fraud       0.80      0.44      0.57        36

    accuracy                           1.00     22807
   macro avg       0.90      0.72      0.79     22807
weighted avg       1.00      1.00      1.00     22807



# Gradient Boosting Classifier

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=50, learning_rate=1.0, max_depth=1, random_state=0)
gbc.fit(x_train, y_train)
print(classification_report(y_val, gbc.predict(x_val), target_names=target_names))

              precision    recall  f1-score   support

   Not Fraud       1.00      1.00      1.00     22771
       Fraud       0.67      0.67      0.67        36

    accuracy                           1.00     22807
   macro avg       0.83      0.83      0.83     22807
weighted avg       1.00      1.00      1.00     22807



# Linear Support Vector Classifier

In [13]:
from sklearn.svm import LinearSVC
svc = LinearSVC(class_weight='balanced')
svc.fit(x_train, y_train)
print(classification_report(y_val, svc.predict(x_val), target_names=target_names))

              precision    recall  f1-score   support

   Not Fraud       1.00      0.98      0.99     22771
       Fraud       0.07      0.97      0.14        36

    accuracy                           0.98     22807
   macro avg       0.54      0.98      0.56     22807
weighted avg       1.00      0.98      0.99     22807



# Balancing the Dataset

In [None]:
not_frauds = new_data.query('Class == 0')
frauds = new_data.query('Class == 1')
print(not_frauds['Class'].value_counts(), frauds['Class'].value_counts())

balanced_data = pd.concat([frauds, not_frauds.sample(len(frauds), random_state=1)])
print(balanced_data['Class'].value_counts())

balanced_data = balanced_data.sample(frac=1, random_state=1)
balanced_data_np = balanced_data.to_numpy()

x_train_b, y_train_b = balanced_data_np[:700, :-1], balanced_data_np[:700, -1]
x_test_b, y_test_b = balanced_data_np[700:842, :-1], balanced_data_np[700:842, -1]
x_val_b, y_val_b = balanced_data_np[842:, :-1], balanced_data_np[842:, -1]

print(x_train_b.shape, y_train_b.shape, x_test_b.shape, y_test_b.shape, x_val_b.shape, y_val_b.shape)

# Linear Support Vector Classifier (Balanced Dataset)

In [16]:
svc_b = LinearSVC(class_weight='balanced')
svc_b.fit(x_train_b, y_train_b)
print(classification_report(y_val_b, svc_b.predict(x_val_b), target_names=target_names))

              precision    recall  f1-score   support

   Not Fraud       0.96      0.93      0.94        72
       Fraud       0.93      0.96      0.94        70

    accuracy                           0.94       142
   macro avg       0.94      0.94      0.94       142
weighted avg       0.94      0.94      0.94       142



# Random Forest (Balanced Dataset)

In [15]:
rf_b = RandomForestClassifier(max_depth=2, n_jobs=1)
rf_b.fit(x_train_b, y_train_b)
print(classification_report(y_val_b, rf_b.predict(x_val_b), target_names=target_names))

              precision    recall  f1-score   support

   Not Fraud       0.93      0.97      0.95        72
       Fraud       0.97      0.93      0.95        70

    accuracy                           0.95       142
   macro avg       0.95      0.95      0.95       142
weighted avg       0.95      0.95      0.95       142



# Gradient Boosting Classifier (Balanced Dataset)

In [17]:
gbc_b = GradientBoostingClassifier(n_estimators=50, learning_rate=1.0, max_depth=1, random_state=0)
gbc_b.fit(x_train_b, y_train_b)
print(classification_report(y_val_b, gbc_b.predict(x_val_b), target_names=target_names))

              precision    recall  f1-score   support

   Not Fraud       0.98      0.86      0.92        72
       Fraud       0.87      0.99      0.93        70

    accuracy                           0.92       142
   macro avg       0.93      0.92      0.92       142
weighted avg       0.93      0.92      0.92       142



# Logistic Regression Model (Balanced Data set)

In [None]:
print(classification_report(y_val_b, logistic_model.predict(x_val_b), target_names=target_names))

# Shallow Neural Network (Balanced Data set)

In [None]:

shallow_nn_b = Sequential()
shallow_nn_b.add(InputLayer((x_train.shape[1],)))
shallow_nn_b.add(Dense(2, activation='relu'))
shallow_nn_b.add(BatchNormalization())
shallow_nn_b.add(Dense(1, activation='sigmoid'))

checkpoint = ModelCheckpoint('shallow_nn_b.keras', save_best_only=True)
shallow_nn_b.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
shallow_nn_b.fit(x_train_b, y_train_b, validation_data=(x_val_b, y_val_b), epochs=40, callbacks=[checkpoint])

# Evaluate Neural Network on Validation Data
from tensorflow.keras.models import load_model

shallow_nn_b = load_model('shallow_nn_b.keras')

def neural_net_predictions(model, data):
    return (model.predict(data) > 0.5).astype("int32")

print(classification_report(y_val_b, neural_net_predictions(shallow_nn_b, x_val_b), target_names=target_names))


In [21]:
print(classification_report(y_val_b, neural_net_predictions(shallow_nn_b, x_val_b), target_names=target_names))

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
              precision    recall  f1-score   support

   Not Fraud       0.96      0.90      0.93        72
       Fraud       0.91      0.96      0.93        70

    accuracy                           0.93       142
   macro avg       0.93      0.93      0.93       142
weighted avg       0.93      0.93      0.93       142



# Test Model Performance

In [24]:
print(classification_report(y_test_b, neural_net_predictions(shallow_nn_b, x_test_b), target_names=target_names))

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
              precision    recall  f1-score   support

   Not Fraud       0.91      0.84      0.87        73
       Fraud       0.84      0.91      0.88        69

    accuracy                           0.87       142
   macro avg       0.88      0.87      0.87       142
weighted avg       0.88      0.87      0.87       142

