In [1]:
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from matplotlib_venn import venn2
import warnings
warnings.filterwarnings('ignore')

### Using the Outliers

In [3]:
file = r"C:\Users\AKIN-JOHNSON\Desktop\DevSeal\Dataset1.xlsx"
outlier_df = pd.read_excel(file)
outlier_df.sample(10)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\AKIN-JOHNSON\\Desktop\\DevSeal\\Dataset1.xlsx'

In [None]:
'''numeric_columns = ["Transaction Amount", "Account Balance Before Transaction", "Account Balance After Transaction"]

for col in numeric_columns:
    df[f"Outlier_{col}_Zscore"] = ((df[col] - df[col].mean()) / df[col].std()).abs() > 3
    Q1, Q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    df[f"Outlier_{col}_IQR"] = (df[col] < lower_bound) | (df[col] > upper_bound)'''

In [None]:
numeric_columns = ["Transaction Amount", "Account Balance Before Transaction", "Account Balance After Transaction"]
for col in numeric_columns:
    plt.figure(figsize=(10, 4))
    sns.boxplot(x=outlier_df[col])
    plt.title(f"Boxplot of {col}")
    plt.show()


In [None]:
# # Define threshold for outliers using IQR (Interquartile Range)
# Q1 = outlier_df["Transaction Amount"].quantile(0.25)
# Q3 = outlier_df["Transaction Amount"].quantile(0.75)
# IQR = Q3 - Q1

# # Define outlier boundaries
# lower_bound = Q1 - (1.5 * IQR)
# upper_bound = Q3 + (1.5 * IQR)

# # Create Target column using for loop and if condition
# target_labels = []
# for amount in outlier_df["Transaction Amount"]:
#     if amount < lower_bound or amount > upper_bound:
#         target_labels.append("FLAG")  # Outlier
#     else:
#         target_labels.append("SAFE")  # Normal transaction

# # Add the new Target column to the DataFrame
# outlier_df["Target"] = target_labels

# # Print flagged transactions
# outlier_df[outlier_df["Target"] == "FLAG"]

In [None]:
columns = ["Transaction Amount", "Account Balance Before Transaction", "Account Balance After Transaction"]

# Calculate IQR for each column
Q1 = outlier_df[columns].quantile(0.25)
Q3 = outlier_df[columns].quantile(0.75)
IQR = Q3 - Q1

# Define outlier boundaries
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)

# Function to flag outliers
def flag_outliers(row):
    for col in columns:
        if row[col] < lower_bound[col] or row[col] > upper_bound[col]:
            return "FLAG"  # Outlier detected
    return "SAFE"  # Normal transaction

# Apply function row-wise
outlier_df["Target"] = outlier_df.apply(flag_outliers, axis=1)

# Print flagged transactions
outlier_df[outlier_df["Target"] == "FLAG"]

In [None]:
outlier_df[outlier_df["Target"] == "FLAG"].count()

### Using Isolation Forest

In [None]:
file = r"C:\Users\AKIN-JOHNSON\Desktop\DevSeal\Dataset1.xlsx"
forest_df = pd.read_excel(file).drop(columns=['Is_Fraudulent'], axis=1)
forest_df.shape

In [None]:
forest_df.head()

In [None]:
# forest_df["Is_Fraudulent"] = forest_df["Is_Fraudulent"].map({1: "FLAG", 0: "SAFE"})
# forest_df.head()

In [None]:
# Encode Transaction Type (Categorical to Numeric)
label_encoder = LabelEncoder()
forest_df["Transaction Type"] = label_encoder.fit_transform(forest_df["Transaction Type"])

# Select numerical features for anomaly detection
features = ["Sender's Account Number", "Recipient's Account Number", "Transaction Type", "Transaction Amount",
            "Account Balance Before Transaction", "Account Balance After Transaction", "Latitude", "Longitude"]
X = forest_df[features]

# Initialize Isolation Forest with higher contamination (5%)
iso_forest = IsolationForest(n_estimators=200, contamination=0.05, random_state=42)

# Fit the model to the dataset
iso_forest.fit(X)

# Predict anomalies (-1 = anomaly, 1 = normal)
forest_df["Anomaly"] = iso_forest.predict(X)

# Map results: -1 -> FLAG, 1 -> SAFE
forest_df["Target"] = forest_df["Anomaly"].map({1: "SAFE", -1: "FLAG"})

# Count flagged transactions
num_flags = forest_df[forest_df["Target"] == "FLAG"].shape[0]

# Display results
print(f"Total Transactions: {len(forest_df)}")
print(f"Flagged Anomalies: {num_flags}")

# Save the flagged transactions
forest_df.to_csv("Dataset1.csv", index=False)

# Display flagged transactions
#print(forest_df[forest_df["Target"] == "FLAG"].head())


In [None]:
forest_df.head(6)

### Comparison

In [None]:
# Count unique FLAGs from both methods
iqr_flags = set(outlier_df[outlier_df["Target"] == "FLAG"].index)
iso_flags = set(forest_df[forest_df["Anomaly"] == -1].index)

# Plot Venn diagram
plt.figure(figsize=(5,5))
venn2([iqr_flags, iso_flags], set_labels=("Outlier Method", "Isolation Forest"))
plt.title("Overlap of Detected Anomalies")
plt.show()

In [None]:
# Create a mapping for "Target" labels
target_mapping = {"SAFE": 0, "FLAG": 1}

# Convert labels to numerical values
iqr_labels = outlier_df["Target"].map(target_mapping)  # From IQR method
iso_labels = forest_df["Anomaly"].replace({1: 0, -1: 1})  # From Isolation Forest

# Compute confusion matrix
cm = confusion_matrix(iqr_labels, iso_labels) 

# Visualize the confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["SAFE", "FLAG"], yticklabels=["SAFE", "FLAG"])
plt.xlabel("Isolation Forest Predictions")
plt.ylabel("Outlier Method Predictions")
plt.title("Confusion Matrix: Outlier vs Isolation Forest")
plt.show()

In [None]:
#forest_df.to_excel('Dataset.xlsx', index=False)

### Model Training

In [3]:
file = r"C:\Users\AKIN-JOHNSON\Desktop\DevSeal\Dataset1.csv"
forest_df = pd.read_csv(file)

In [5]:
forest_df.sample(5)

Unnamed: 0,Transaction ID,Time Stamp,BVN,Sender's Account Number,Recipient's Account Number,Transaction Type,Transaction Amount,Account Balance Before Transaction,Account Balance After Transaction,Latitude,Longitude,NIN,IP Address,Anomaly,Target
75919,446676,2024-03-24 06:27:12.233,75968662636,7877477393,1890995049,2,3278.33,702115.26,698836.93,5.086245,3.663615,26109641488,117.193.131.51,1,SAFE
89818,479392,2024-05-31 18:28:02.233,24281331335,9092873284,2599102871,1,33615.02,204549.83,170934.81,9.825573,8.855472,41502857883,86.161.30.21,1,SAFE
74735,805211,2025-01-16 22:12:19.233,82808882936,7772515054,8262344119,1,8271.51,149440.74,141169.23,12.136645,12.132472,80034789150,124.194.17.56,1,SAFE
54392,198843,2024-06-14 17:36:41.233,58660098795,5926261439,1008178518,1,15402.08,859473.14,844071.06,7.614347,3.930413,90771353372,16.63.108.186,1,SAFE
51917,158122,2025-01-27 20:15:04.233,85659743515,5703965416,9952367627,1,44067.24,808970.3,764903.06,9.162306,11.859844,59409080782,79.136.135.209,1,SAFE


In [7]:
# lets encode transaction type ccolumn
# le = LabelEncoder()
# forest_df["Transaction Type"] = le.fit_transform(forest_df["Transaction Type"])
forest_df["Target"] = forest_df["Target"].map({"SAFE": 0, "FLAG": 1})  # "FLAG" = 1, "SAFE" = 0
forest_df

Unnamed: 0,Transaction ID,Time Stamp,BVN,Sender's Account Number,Recipient's Account Number,Transaction Type,Transaction Amount,Account Balance Before Transaction,Account Balance After Transaction,Latitude,Longitude,NIN,IP Address,Anomaly,Target
0,268828,2024-04-15 12:32:17.233,31696731096,1000067730,5707146526,0,6781.40,907364.57,914145.97,5.091986,7.917284,50864002367,74.19.62.137,1,0
1,473599,2024-06-11 12:09:31.233,31696731096,1000067730,1159521989,0,3727.96,914145.97,917873.93,5.091986,7.917284,50864002367,7.157.249.174,-1,1
2,655839,2024-12-05 19:57:03.233,31696731096,1000067730,5319671498,0,33800.59,917873.93,951674.52,5.091986,7.917284,50864002367,183.40.55.123,1,0
3,871136,2025-01-18 07:58:26.233,31696731096,1000067730,4018587843,1,2632.26,951674.52,949042.26,5.091986,7.917284,50864002367,96.64.85.217,-1,1
4,316514,2025-02-19 15:58:19.233,31696731096,1000067730,3265620170,1,24631.96,949042.26,924410.30,5.091986,7.917284,50864002367,7.193.239.252,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,402079,2024-12-16 23:44:32.233,72862366462,9998386370,3884199506,0,11890.73,1018303.39,1030194.12,5.955972,10.674131,72244118496,24.73.76.53,-1,1
99996,674317,2024-08-26 21:00:38.233,68966196387,9999752077,5145592234,0,6239.84,466701.95,472941.79,6.098263,13.007431,41922925742,120.55.85.57,1,0
99997,966976,2024-09-30 12:31:27.233,68966196387,9999752077,6256528118,0,26477.04,472941.79,499418.83,6.098263,13.007431,41922925742,21.123.201.232,1,0
99998,343762,2024-12-30 00:40:06.233,68966196387,9999752077,7195023223,0,18731.25,499418.83,518150.08,6.098263,13.007431,41922925742,183.111.8.159,1,0


In [9]:
# Standard Scaling for numerical features
scaler = StandardScaler()
numerical_features = ["Transaction Amount", "Account Balance Before Transaction", "Account Balance After Transaction", "Latitude", "Longitude"]
forest_df[numerical_features] = scaler.fit_transform(forest_df[numerical_features])
forest_df.head(2)

Unnamed: 0,Transaction ID,Time Stamp,BVN,Sender's Account Number,Recipient's Account Number,Transaction Type,Transaction Amount,Account Balance Before Transaction,Account Balance After Transaction,Latitude,Longitude,NIN,IP Address,Anomaly,Target
0,268828,2024-04-15 12:32:17.233,31696731096,1000067730,5707146526,0,-0.262573,1.444756,1.489758,-0.885467,0.076286,50864002367,74.19.62.137,1,0
1,473599,2024-06-11 12:09:31.233,31696731096,1000067730,1159521989,0,-0.288178,1.468107,1.502532,-0.885467,0.076286,50864002367,7.157.249.174,-1,1


In [11]:
# split dataset into features and target
X = forest_df.drop(columns=['Transaction ID', 'Time Stamp', 'Anomaly', 'Target', 'IP Address'], axis=1)
y = forest_df['Target']

In [13]:
X.head(2)

Unnamed: 0,BVN,Sender's Account Number,Recipient's Account Number,Transaction Type,Transaction Amount,Account Balance Before Transaction,Account Balance After Transaction,Latitude,Longitude,NIN
0,31696731096,1000067730,5707146526,0,-0.262573,1.444756,1.489758,-0.885467,0.076286,50864002367
1,31696731096,1000067730,1159521989,0,-0.288178,1.468107,1.502532,-0.885467,0.076286,50864002367


In [15]:
y

0        0
1        1
2        0
3        1
4        0
        ..
99995    1
99996    0
99997    0
99998    0
99999    0
Name: Target, Length: 100000, dtype: int64

In [17]:
# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=25, shuffle=True)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(75000, 10) (25000, 10) (75000,) (25000,)


In [19]:
# initialize the model
dtc = DecisionTreeClassifier(random_state=25)
rfc = RandomForestClassifier(random_state=25)
knn = KNeighborsClassifier(n_neighbors=2)
xgb = XGBClassifier(use_label_encoder=False, eval_metric="logloss")

models = [dtc, rfc, knn, xgb]

In [None]:
# Train and evaluate each model in the list
for model in models:
    print(f"Training model: {model.__class__.__name__}")

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on training and test data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Compute metrics for training data
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred, zero_division=1)
    train_recall = recall_score(y_train, y_train_pred, zero_division=1)
    train_f1 = f1_score(y_train, y_train_pred, zero_division=1)

    # Compute metrics for test data
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, zero_division=1)
    test_recall = recall_score(y_test, y_test_pred, zero_division=1)
    test_f1 = f1_score(y_test, y_test_pred, zero_division=1)

    # Print results
    print("\nTraining Set Evaluation:")
    print(f"Accuracy: {train_accuracy:.2f}")
    print(f"Precision: {train_precision:.2f}")
    print(f"Recall: {train_recall:.2f}")
    print(f"F1 Score: {train_f1:.2f}")

    print("\nTest Set Evaluation:")
    print(f"Accuracy: {test_accuracy:.2f}")
    print(f"Precision: {test_precision:.2f}")
    print(f"Recall: {test_recall:.2f}")
    print(f"F1 Score: {test_f1:.2f}")
    
    print("-" * 50)


In [None]:
# Perform 5-Fold Cross-Validation
cv_scores = cross_val_score(xgb, X_train, y_train, cv=5, scoring="f1")

# Print Results
print(f"Cross-Validation f1 Scores: {cv_scores}")
print(f"Mean f1 Score: {np.mean(cv_scores):.4f}")
print(f"Standard Deviation: {np.std(cv_scores):.4f}")

In [None]:
# Perform 5-Fold Cross-Validation
cv_scores = cross_val_score(xgb, X_train, y_train, cv=5, scoring="accuracy")

# Print Results
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy Score: {np.mean(cv_scores):.4f}")
print(f"Standard Deviation: {np.std(cv_scores):.4f}")

In [21]:
# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],              # Number of trees
    'max_depth': [1, 3, 5, 7, 9],                # Maximum depth of trees
    'learning_rate': [0.01, 0.1, 0.2, 0.3],      # Step size shrinkage
    'subsample': [0.8, 1.0, 1.2, 1.4],           # Fraction of samples used
    'colsample_bytree': [0.8, 1.0, 1.2, 1.4],    # Fraction of features used
    'reg_lambda': [0, 1, 5, 10]                  # L2 Regularization
}

# Perform Grid Search with 5-Fold Cross-Validation
grid_search = GridSearchCV(xgb, param_grid, cv=5, scoring="f1", n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Print the Best Parameters and Best Score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best F1 Score: {grid_search.best_score_:.4f}")


Fitting 5 folds for each of 3840 candidates, totalling 19200 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 200, 'reg_lambda': 5, 'subsample': 0.8}
Best F1 Score: 0.9169


In [None]:
# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],              # Number of trees
    'max_depth': [1, 3, 5, 7, 9],                # Maximum depth of trees
    'learning_rate': [0.01, 0.1, 0.2, 0.3],      # Step size shrinkage
    'subsample': [0.8, 1.0, 1.2, 1.4],           # Fraction of samples used
    'colsample_bytree': [0.8, 1.0, 1.2, 1.4],    # Fraction of features used
    'reg_lambda': [0, 1, 5, 10]                  # L2 Regularization
}

# Perform Grid Search with 5-Fold Cross-Validation
grid_search1 = GridSearchCV(xgb, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=2)
grid_search1.fit(X_train, y_train)

# Print the Best Parameters and Best Score
print(f"Best Parameters: {grid_search1.best_params_}")
print(f"Best F1 Score: {grid_search1.best_score_:.4f}")


In [22]:
grid_search.best_estimator_

In [23]:
model = grid_search.best_estimator_
y_test_pred = model.predict(X_test)
y_test_pred

array([0, 0, 0, ..., 0, 0, 0])

In [24]:
y_train_pred = model.predict(X_train)
y_train_pred

array([0, 0, 0, ..., 0, 1, 0])

In [25]:
# Accuracy
y_test_accuracy = accuracy_score(y_test, y_test_pred)
y_train_accuracy = accuracy_score(y_train, y_train_pred)
print('Train Accuracy =', y_train_accuracy)
print('Test Accuracy =', y_test_accuracy)

Train Accuracy = 0.9969066666666667
Test Accuracy = 0.99292


In [26]:
# F1 Score
y_test_f1 = f1_score(y_test, y_test_pred)
y_train_f1 = f1_score(y_train, y_train_pred)
print('Train F1 Score =', y_train_f1)
print('Test F1 Score =', y_test_f1)

Train F1 Score = 0.9688255845202902
Test F1 Score = 0.9251585623678646


In [27]:
# Precision
y_test_precision = precision_score(y_test, y_test_pred)
y_train_precision = precision_score(y_train, y_train_pred)
print('Train precision =', y_train_precision)
print('Test precision =', y_test_precision)

Train precision = 0.9841659841659842
Test precision = 0.9562937062937062


In [28]:
joblib.dump(model, "fraud_detection_model.pkl")

['fraud_detection_model.pkl']