In [2]:
import pandas as pd
import os

In [3]:
# Locations of the preprocessed dataset and of the saved models, relative to this notebook.
# Forward slashes are used on purpose: the previous backslash-separated, non-raw strings
# contained invalid escape sequences ("\d", "\p", "\m") and only resolved on Windows.
# Forward slashes are accepted on Windows, Linux and macOS alike.
data_path = "../data/preprocessed dataset"
model_path = "../models"

In [4]:
# Load the preprocessed train and test splits (features and labels)
def _load_split(file_name):
    """Read one CSV file from the preprocessed dataset folder into a DataFrame."""
    return pd.read_csv(os.path.join(data_path, file_name))


X_train = _load_split("X_train.csv")
y_train = _load_split("y_train.csv")
X_test = _load_split("X_test.csv")
y_test = _load_split("y_test.csv")

# Report the shapes of the test split as a quick sanity check
print(f"Data loaded successfully! X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

Data loaded successfully! X_test shape: (6000, 18), y_test shape: (6000, 1)


Implementing AI-Driven Fraud Detection
- Applying Anomaly detection to identify fraudulent transactions
- Isolation Forest is an unsupervised ML technique that detects anomalies by isolating rare instances

In [5]:
from sklearn.ensemble import IsolationForest

# Baseline model: 100 trees, 5% of the samples assumed anomalous, fixed seed for reproducibility
iso_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)

# Fit on X_test and score the very same samples (sklearn labels: 1 = inlier, -1 = outlier)
raw_labels = iso_forest.fit(X_test).predict(X_test)

# Map sklearn labels to fraud flags: -1 (anomaly) -> 1 (fraud), 1 (normal) -> 0 (non-fraud)
y_pred_anomaly = (raw_labels == -1).astype(int).tolist()

# Keep the true labels and the model flags side by side
fraud_detection_results = pd.DataFrame({
    "Actual": y_test.to_numpy().flatten(),
    "IsolationForest_Prediction": y_pred_anomaly,
})

# The flags are 0/1, so their sum is the number of flagged transactions
flagged_column = fraud_detection_results["IsolationForest_Prediction"]
num_fraud_detected = flagged_column.sum()

print(f"Isolation Forest Detected {num_fraud_detected} potential fraud cases.")

Isolation Forest Detected 300 potential fraud cases.


Observations:
- Setting `contamination = 0.05` forces the model to flag 5% of the samples as fraud (300 of 6000 here), regardless of how many real fraud cases exist
- Isolation Forest is an unsupervised anomaly detection model: it does not use the labels in `y_test`, so the share of samples it flags is determined entirely by the contamination value

Action:
- `y_test` contains the true fraud labels because it was split from the labeled dataset, so we can compare Isolation Forest's predictions against it to see how well they match

In [6]:
# Evaluate the baseline flags against the true labels
from sklearn.metrics import confusion_matrix, classification_report

# Per-class precision / recall / F1 of the Isolation Forest flags versus y_test
baseline_report = classification_report(y_test, y_pred_anomaly)
print(baseline_report)

              precision    recall  f1-score   support

           0       0.78      0.96      0.86      4673
           1       0.33      0.08      0.12      1327

    accuracy                           0.76      6000
   macro avg       0.56      0.52      0.49      6000
weighted avg       0.68      0.76      0.70      6000



Observations:
- 0 (non-fraud): 4673
- 1 (fraud): 1327
- From the output: There are 1327 actual fraud cases in y_test

In [7]:
# Re-train Isolation Forest with a data-driven contamination rate

# Mean of the first column of y_train, i.e. the fraud ratio when the labels are 0/1
actual_fraud_percentage = y_train.mean().iloc[0]

# Same forest settings as before, but fitted on the training features
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=actual_fraud_percentage,
    random_state=42,
)
iso_forest.fit(X_train)

# Score the held-out test features (sklearn labels: 1 = inlier, -1 = outlier)
raw_labels = iso_forest.predict(X_test)

# Map sklearn labels to fraud flags: -1 (anomaly) -> 1 (fraud), 1 (normal) -> 0 (non-fraud)
y_pred_anomaly = (raw_labels == -1).astype(int).tolist()

# Keep the true labels and the model flags side by side
fraud_detection_results = pd.DataFrame({
    "Actual": y_test.to_numpy().flatten(),
    "IsolationForest_Prediction": y_pred_anomaly,
})

# The flags are 0/1, so their sum is the number of flagged transactions
flagged_column = fraud_detection_results["IsolationForest_Prediction"]
num_fraud_detected = flagged_column.sum()

print(f"Isolation Forest Detection {num_fraud_detected} potential fraud cases based on {actual_fraud_percentage:.2%} contamination rate.")

Isolation Forest Detection 1323 potential fraud cases based on 22.12% contamination rate.


Observations:
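- 1323 cases were flagged as fraud, close to the number of actual fraud cases (1327). This match in count is expected, because the contamination rate fixes the share of samples that get flagged; it does not by itself mean that the flagged cases are the truly fraudulent ones
- The contamination rate (22.12%) was set dynamically from the actual fraud percentage in the training data
- Isolation Forest then flagged approximately the same proportion of cases in `X_test`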

In [8]:
# Re-check model performance to see whether precision, recall, and F1-score improved
from sklearn.metrics import classification_report

# Score the re-trained model's flags against the true labels
retrained_report = classification_report(y_test, y_pred_anomaly)
print(retrained_report)

              precision    recall  f1-score   support

           0       0.80      0.80      0.80      4673
           1       0.30      0.30      0.30      1327

    accuracy                           0.69      6000
   macro avg       0.55      0.55      0.55      6000
weighted avg       0.69      0.69      0.69      6000



Observations:
- Recall for fraud cases increased: the model previously detected only 0.08 of the actual fraud cases and now detects 0.30
- Precision for fraud decreased from 0.33 to 0.30: the model misclassifies more normal transactions as fraud
- Accuracy dropped from 0.76 to 0.69: more fraud cases are detected, but the model also labels more normal transactions as fraudulent

In [9]:
# Compare three contamination rates: 15%, the training fraud ratio, and 30%

from sklearn.metrics import classification_report
import pandas as pd

# Candidate contamination values to sweep over
contamination_values = [0.15, actual_fraud_percentage, 0.30]

# Maps each contamination value to its classification report in dict form
results = {}

for contamination in contamination_values:
    print(f"\nTraining Isolation Forest with contamination = {contamination:.2%}\n")

    # Fit a fresh forest on the training features with the current rate
    iso_forest = IsolationForest(
        n_estimators=100,
        contamination=contamination,
        random_state=42,
    )
    iso_forest.fit(X_train)

    # Score the test features (sklearn labels: 1 = inlier, -1 = outlier)
    raw_labels = iso_forest.predict(X_test)

    # Map sklearn labels to fraud flags: -1 (anomaly) -> 1 (fraud), 1 (normal) -> 0 (non-fraud)
    y_pred_anomaly = (raw_labels == -1).astype(int).tolist()

    # Keep the dict form for the summary table below
    report = classification_report(y_test, y_pred_anomaly, output_dict=True)
    results[contamination] = report

    # Print the human-readable form for this rate
    print(classification_report(y_test, y_pred_anomaly))

# One row per contamination value, holding the metrics reported for class '1' (fraud)
fraud_metrics_by_rate = {rate: rate_report['1'] for rate, rate_report in results.items()}
df_results = pd.DataFrame(fraud_metrics_by_rate).T
df_results[['precision', 'recall', 'f1-score']]


Training Isolation Forest with contamination = 15.00%

              precision    recall  f1-score   support

           0       0.79      0.86      0.83      4673
           1       0.30      0.21      0.25      1327

    accuracy                           0.72      6000
   macro avg       0.55      0.54      0.54      6000
weighted avg       0.69      0.72      0.70      6000


Training Isolation Forest with contamination = 22.12%

              precision    recall  f1-score   support

           0       0.80      0.80      0.80      4673
           1       0.30      0.30      0.30      1327

    accuracy                           0.69      6000
   macro avg       0.55      0.55      0.55      6000
weighted avg       0.69      0.69      0.69      6000


Training Isolation Forest with contamination = 30.00%

              precision    recall  f1-score   support

           0       0.81      0.73      0.77      4673
           1       0.29      0.39      0.34      1327

    accuracy  

Unnamed: 0,precision,recall,f1-score
0.15,0.303456,0.211756,0.249445
0.221218,0.297808,0.29691,0.297358
0.3,0.29322,0.391108,0.335163


Observations:
- Lower Contamination 15%
    - Higher precision of 0.30 but lower recall of 0.21
    - Detects fewer fraud cases but is more precise
- Actual Contamination 22.12%
    - Balanced recall of 0.30 and precision of 0.30
    - It aligns with the actual fraud rate in the dataset
- Higher Contamination 30%
    - Lower precision of 0.29 and higher recall of 0.39
    - Detects more fraud cases

Training and Evaluating Local Outlier Factor (LOF)

In [12]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import classification_report

# Plain NumPy views of the feature frames, used to fit and score the model
X_train_array = X_train.to_numpy()
X_test_array = X_test.to_numpy()

# novelty=True lets the fitted LOF model score samples it was not fitted on
lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination=actual_fraud_percentage,
    novelty=True,
)
lof.fit(X_train_array)

# Score the test features (sklearn labels: 1 = inlier, -1 = outlier)
raw_lof_labels = lof.predict(X_test_array)

# Map sklearn labels to fraud flags: -1 (anomaly) -> 1 (fraud), 1 (normal) -> 0 (non-fraud)
y_pred_lof = (raw_lof_labels == -1).astype(int).tolist()

print(classification_report(y_test, y_pred_lof))

              precision    recall  f1-score   support

           0       0.79      0.79      0.79      4673
           1       0.25      0.24      0.24      1327

    accuracy                           0.67      6000
   macro avg       0.52      0.52      0.52      6000
weighted avg       0.67      0.67      0.67      6000



Comparison of LOF vs Isolation Forest
- Isolation Forest (22% contamination):
    - Precision (Fraud): 0.30
    - Recall (Fraud): 0.30
    - F1-score (Fraud): 0.30
    - Accuracy (overall): 0.69
- Local Outlier Factor (LOF):
    - Precision (Fraud): 0.25
    - Recall (Fraud): 0.24
    - F1-score (Fraud): 0.24
    - Accuracy (overall): 0.67

- LOF has lower precision and recall - means it detects fewer fraud cases correctly
- Accuracy is also slightly lower, and F1-score for fraud is very low in LOF
- This suggests LOF is not as effective in detecting fraud compared to Isolation Forest

Training and Evaluating One-Class SVM

In [13]:
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report

# RBF-kernel One-Class SVM; nu is set to the training fraud ratio
oc_svm = OneClassSVM(
    kernel='rbf',
    gamma='scale',
    nu=actual_fraud_percentage,
)
oc_svm.fit(X_train_array)

# Score the test features (sklearn labels: 1 = inlier, -1 = outlier)
raw_svm_labels = oc_svm.predict(X_test_array)

# Map sklearn labels to fraud flags: -1 (anomaly) -> 1 (fraud), 1 (normal) -> 0 (non-fraud)
y_pred_svm = (raw_svm_labels == -1).astype(int).tolist()

print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       0.79      0.78      0.79      4673
           1       0.26      0.27      0.26      1327

    accuracy                           0.67      6000
   macro avg       0.52      0.53      0.53      6000
weighted avg       0.67      0.67      0.67      6000



Comparison of One-Class SVM vs Isolation Forest
- Isolation Forest (22% contamination):
    - Precision (Fraud): 0.30
    - Recall (Fraud): 0.30
    - F1-score (Fraud): 0.30
    - Accuracy (overall): 0.69
- One-Class SVM:
    - Precision (Fraud): 0.26
    - Recall (Fraud): 0.27
    - F1-score (Fraud): 0.26
    - Accuracy (overall): 0.67

- One-class SVM is slightly better than the LOF but still lower than the Isolation Forest

Conclusion:
- After comparing Isolation Forest with LOF and One-Class SVM: Isolation Forest (with 22% contamination) is the best choice as it provides the best fraud detection balance

In [None]:
# Extract anomaly scores from Isolation Forest and store them for future fraud risk analysis
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

# Re-fit the selected model explicitly. At this point `iso_forest` and `y_pred_anomaly`
# are leftovers from the last iteration of the contamination sweep (contamination = 0.30),
# not the model chosen in the conclusion (contamination = training fraud ratio).
final_iso_forest = IsolationForest(
    n_estimators=100,
    contamination=actual_fraud_percentage,
    random_state=42,
)
final_iso_forest.fit(X_train)

# Test features as a DataFrame carrying the training column names, so the model is
# scored with the same feature names it was fitted on
X_test_df = pd.DataFrame(X_test.to_numpy(), columns=X_train.columns)

# True labels as a 1D array
y_test_1d = np.array(y_test).flatten()

# Fraud flags of the selected model: -1 (anomaly) -> 1 (fraud), 1 (normal) -> 0 (non-fraud)
y_pred_anomaly_1d = np.where(final_iso_forest.predict(X_test_df) == -1, 1, 0)

# Continuous anomaly scores: negative values are outliers, positive values are inliers
anomaly_scores = final_iso_forest.decision_function(X_test_df)

# One row per test sample: true label, predicted flag and the anomaly score itself
anomaly_results = pd.DataFrame({
    "actual_label": y_test_1d,
    "predicted_label": y_pred_anomaly_1d,
    "anomaly_score": anomaly_scores,
})

# Save to CSV for further fraud analysis
anomaly_results.to_csv("../data/anomaly_scores.csv", index=False)

print("Anomaly scores saved successfully!")

Anomaly scores saved successfully!
