## Financial Institution Fraud Detection Analysis

#### Author: Andrew Tran

## Blog Post Inspiration and Objectives

In this blog post, ...

## Data Preprocessing - Cleaning and Analytics

In [None]:
# Imported needed libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
%matplotlib inline
import seaborn as sns
color = sns.color_palette()
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, \
    adjusted_rand_score, normalized_mutual_info_score, silhouette_score
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import MiniBatchKMeans, KMeans, DBSCAN
from sklearn.neighbors import NearestNeighbors
import kneed
plt.style.use("fivethirtyeight")

In [None]:
# Reading the transactions CSV into a DataFrame (the path is relative to the
# notebook's working directory) and displaying the initial dataset
df = pd.read_csv("datasets/bs140513_032310.csv")
df

In [None]:
# Determining the shape (number of rows, number of columns) of the initial dataset
df.shape

In [None]:
# Getting a sample of the initial dataset by viewing its first 10 entries
df.head(10)

In [None]:
# Listing all of the columns (and their names) available for use in the
# dataset
df.columns

In [None]:
# Getting basic information about the dataset (column dtypes and non-null
# counts)
df.info()

In [None]:
# Counting the fully duplicated rows in the dataset (could be problematic if
# not resolved)
df.duplicated().sum()

In [None]:
# Give the two abbreviated columns clearer names first
df = df.rename(columns={"zipcodeOri": "ZipCodeOrig", "step": "TimeStep"})

# Then upper-case the first character of every column name, leaving the rest
# of each name untouched
cols_rename_dict = {name: name[0].upper() + name[1:] for name in df.columns}

df = df.rename(columns=cols_rename_dict)
df

In [None]:
# Count the 'null'/'NaN' entries per column once, print the raw counts, then
# show them as a percentage of all rows (i.e. whether NaN filling is needed)
null_counts = df.isnull().sum()
print(null_counts)
null_counts / len(df) * 100

In [None]:
# Display the current state of the DataFrame
df

In [None]:
# Columns whose entries are wrapped in single quotation characters
single_quotation_cols: list[str] = [
    "Customer", "Age", "Gender", "ZipCodeOrig", "Merchant", "ZipMerchant", "Category",
]

# Strip the leading/trailing single quotes from every listed column in one pass
df[single_quotation_cols] = df[single_quotation_cols].apply(
    lambda column: column.str.strip("'")
)

df

In [None]:
# Recode the "U" age marker as "-1"; every other entry is kept as-is
df["Age"] = df["Age"].where(df["Age"] != "U", "-1")
df

In [None]:
# Inspect the category labels and how often each one occurs
df["Category"].value_counts()

In [None]:
# Strip the "es_" prefix from the category labels. The pattern is anchored so
# that only a leading "es_" is removed, never one occurring inside a label.
df["Category"] = df["Category"].str.replace(r"^es_", "", regex=True)

# Insert underscores between the words of the multi-word category labels.
# The result is assigned back rather than using replace(..., inplace=True) on
# the column selection: in-place edits of a selected column are deprecated in
# pandas 2.x and leave `df` unchanged once Copy-on-Write is enabled.
df["Category"] = df["Category"].replace({
    "barsandrestaurants": "bars_and_restaurants",
    "hotelservices": "hotel_services",
    "otherservices": "other_services",
    "sportsandtoys": "sports_and_toys",
    "wellnessandbeauty": "wellness_and_beauty",
})

# Fix the capitalization on the entries in the "Category" column for readability
def capitalize_first_letter(entry: str) -> str:
    """Upper-case the first character of each underscore-separated word.

    Only the first character of every word is changed; the remaining
    characters keep their original case. Empty words (from an empty string or
    from leading, trailing or doubled underscores) are passed through
    unchanged instead of raising an IndexError.

    Args:
        entry: Label made of words joined by underscores.

    Returns:
        The label with every word's first character upper-cased.
    """
    # word[:1] is "" for an empty word, whereas word[0] would raise
    return "_".join(word[:1].upper() + word[1:] for word in entry.split("_"))

# Run the capitalization helper over every category label
df["Category"] = df["Category"].map(capitalize_first_letter)
df

In [None]:
# Re-check the category labels and their frequencies
df["Category"].value_counts()

In [None]:
# Expand the single-letter gender codes (M/F/E/U) into readable labels
df["Gender"] = df["Gender"].map({"M": "Male", "F": "Female", "E": "Enterprise", "U": "Unknown"})
df

In [None]:
# Cast the three string-typed code columns to 64-bit integers in one call,
# then confirm the resulting dtypes
df = df.astype({"Age": "int64", "ZipCodeOrig": "int64", "ZipMerchant": "int64"})
df.info()

In [None]:
# Count the transactions per category once so the bar labels and bar heights
# come from the same Series. `unique()` lists labels in order of first
# appearance while `value_counts()` sorts by frequency, so pairing the two
# would put counts under the wrong labels.
category_counts = df["Category"].value_counts()

plt.figure(figsize=(10, 8))
plt.rcParams["font.size"] = 7
ax = sns.barplot(x=category_counts.index, y=category_counts.values, palette=sns.color_palette("husl", 8))
plt.xlabel("Transaction Type")
plt.ylabel("Transaction Type Count")
plt.title("Different Types of Category Transactions in this Fin. Inst. Fraud Detection Analysis Dataset")
# Restyle the tick labels barplot already placed instead of overwriting them
plt.xticks(rotation=30, fontsize=8, horizontalalignment="right")
plt.show()
category_counts

In [None]:
# Count the transactions per gender/entity label once so the bar labels and
# bar heights come from the same Series. `unique()` lists labels in order of
# first appearance while `value_counts()` sorts by frequency, so pairing the
# two would put counts under the wrong labels.
gender_counts = df["Gender"].value_counts()

plt.figure(figsize=(10, 8))
plt.rcParams["font.size"] = 7
ax = sns.barplot(x=gender_counts.index, y=gender_counts.values, palette=sns.color_palette("husl", 8))
# The x-axis shows the values of the "Gender" column, not transaction categories
plt.xlabel("Entity Type")
plt.ylabel("Transaction Count")
plt.title("Different Types of Entities in this Fin. Inst. Fraud Detection Analysis Dataset")
# Restyle the tick labels barplot already placed instead of overwriting them
plt.xticks(rotation=30, fontsize=8, horizontalalignment="right")
plt.show()
gender_counts

In [None]:
# Set the display precision for floating-point numbers to 2 decimal places,
# then summarize the two numeric columns
pd.set_option("display.float_format", "{:.2f}".format)
df.loc[:, ["TimeStep", "Amount"]].describe()

In [None]:
# Step = Map of unit of time in the real world. 1 step = 1 hour
# NOTE(review): "1 step = 1 hour" is the PaySim definition; this notebook's
# sources mark BankSim as the dataset actually used — confirm the step unit
# against the BankSim dataset description.
df["TimeStep"].value_counts()

In [None]:
# Bar chart of the number of transactions recorded at each time step
plt.figure(figsize=(10, 8))
plt.rcParams["font.size"] = 7
plt.title('Distribution of Time Feature')
sns.countplot(x="TimeStep", data=df)

In [None]:
# Frequency of each distinct transaction amount
df["Amount"].value_counts()

In [None]:
# Re-check the dataset dimensions (rows, columns)
df.shape

In [None]:
# Frequency of each distinct transaction amount (repeats the earlier check)
df["Amount"].value_counts()

In [None]:
# Frequency of each distinct transaction amount
amount_counts = df["Amount"].value_counts()

# Only amounts up to this cutoff are plotted; the share of rows above it is
# reported in the printed note
amount_filter_value: float = 250.00
df_amount_dist_display_sample = df[df["Amount"] <= amount_filter_value]
num_entries_above_amount_filter_value = int((df["Amount"] > amount_filter_value).sum())
perc_amount_above_amount_filter_value = (float(num_entries_above_amount_filter_value) / df.shape[0]) * 100
print(
    f"Note: The number of entries above the filtered amount value of {int(amount_filter_value)} is {num_entries_above_amount_filter_value} "
    f"({perc_amount_above_amount_filter_value:.2f}% of total entries)."
)

plt.figure(figsize=(10, 8))
plt.rcParams["font.size"] = 7
plt.title('Distribution of Amount Feature')
# "Amount" is continuous, so bin it with a histogram. A countplot would draw
# one bar per distinct amount, which is slow to render and unreadable.
plt.hist(df_amount_dist_display_sample["Amount"], bins=50, color="red")
plt.xlabel("Amount")
plt.ylabel("Count")
plt.show()

In [None]:
# Tally the two classes of the "Fraud" column (0 = non-fraudulent, 1 = fraudulent)
counts_fraud_col = df["Fraud"].value_counts()
normal_cases = counts_fraud_col[0]
fraud_cases = counts_fraud_col[1]

# Express each class as a percentage of all labelled transactions
total_cases = normal_cases + fraud_cases
percent_normal = (normal_cases / total_cases) * 100
percent_fraud = (fraud_cases / total_cases) * 100

results = (
    f"There were {normal_cases} non-fraudulent transactions ({percent_normal:.3f}%) "
    f"and {fraud_cases} fradulent transactions ({percent_fraud:.3f}%)"
)
results

In [None]:
# Bar chart of the class counts computed in `counts_fraud_col`
# (index = class label, value = number of transactions)
plt.figure(figsize=(10, 8))
plt.rcParams["font.size"] = 7
sns.barplot(x=counts_fraud_col.index, y=counts_fraud_col, palette=sns.color_palette("husl", 8))
plt.title("Comparison of the Number of Fradulent vs. Non-Fraudulent Transactions")
plt.ylabel("Count")
plt.xlabel("Class: (0 - Non-Fraudulent vs. 1 - Fradulent)")
plt.show()

In [None]:
print("Mean Feature Values per Category:")
# Mean transaction amount and mean fraud rate for every category
df_category_grouping_amount_mean = df.groupby("Category")["Amount"].mean()
df_category_grouping_fraud_mean = df.groupby("Category")["Fraud"].mean()

# Put the two Series side by side as the "Amount" and "Fraud" columns, then
# order the rows by each category's first appearance in `df`. This replaces
# an earlier concat whose result was overwritten immediately and a chain of
# merge/rename/set_index calls that built the same table.
df_mean = pd.concat(
    [df_category_grouping_amount_mean, df_category_grouping_fraud_mean], axis=1
).reindex(df["Category"].unique())
df_mean.index.name = "Category"
df_mean

In [None]:
# Split the transactions by their fraud label
df_fraud = df[df["Fraud"] == 1]
df_non_fraud = df[df["Fraud"] == 0]

# Per category: mean amount of fraudulent and of non-fraudulent transactions,
# plus the fraud rate in percent; rows sorted by the non-fraud mean amount
category_summary = pd.concat(
    [
        df_fraud.groupby("Category")["Amount"].mean(),
        df_non_fraud.groupby("Category")["Amount"].mean(),
        df.groupby("Category")["Fraud"].mean() * 100,
    ],
    keys=["Fraudulent", "Non-Fradulent", "Percentage (%)"],
    axis=1,
    sort=False,
)
category_summary.sort_values(by=["Non-Fradulent"])

In [None]:
# Overlay the amount histograms of the fraud and non-fraud transactions
plt.figure(figsize=(10, 8))
plt.rcParams["font.size"] = 7
hist_bins: int = 100
for amounts, series_label in (
    (df_fraud["Amount"], "Fraud"),
    (df_non_fraud["Amount"], "Non-Fraud"),
):
    plt.hist(amounts, alpha=0.5, label=series_label, bins=hist_bins)
plt.title("Histogram Comparing the Distribution of Fraud vs. Non-Fraud Payments")
plt.xlabel("Amount")
plt.ylabel("Count")
# Restrict both axes to the window set below
plt.xlim(0, 1000)
plt.ylim(0, 10000)
plt.legend()
plt.show()

In [None]:
# Remove the customer/merchant identifier and zip-code columns in place
df.drop(columns=["Customer", "ZipCodeOrig", "Merchant", "ZipMerchant"], inplace=True)
df

In [None]:
# Round the transaction amounts to 2 decimal places
df["Amount"] = df["Amount"].round(2)
df

In [None]:
# List the category labels and their frequencies
df["Category"].value_counts()

In [None]:
# Integer codes for the readable "Gender" labels
gender_codes = {"Male": 0, "Female": 1, "Enterprise": 2, "Unknown": 3}
# Integer codes for the cleaned "Category" labels
category_codes = {
    "Transportation": 0, "Food": 1, "Health": 2, "Wellness_And_Beauty": 3,
    "Fashion": 4, "Bars_And_Restaurants": 5, "Hyper": 6, "Sports_And_Toys": 7,
    "Tech": 8, "Home": 9, "Hotel_Services": 10, "Other_Services": 11,
    "Contents": 12, "Travel": 13, "Leisure": 14,
}

# Replace both label columns by their integer codes
df["Gender"] = df["Gender"].map(gender_codes).astype("int64")
df["Category"] = df["Category"].map(category_codes).astype("int64")

# df.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would additionally print "None"
df.info()
df

In [None]:
# Pairwise correlations between all columns, computed once and reused by both
# plots below
corr_matrix = df.corr()

# Correlation heatmap to quantify relationships between the transaction
# features and the fraud label
# NOTE(review): the "(2005-2015)" period in the titles below could not be
# verified from this notebook — confirm it applies to this dataset.
plt.figure(figsize=(10, 8))
plt.rcParams["font.size"] = 7
sns.heatmap(corr_matrix, annot=True, linewidths=0.5)
plt.title("Correlation Heatmap Between All Financial Institution Quantiative Factors (2005-2015)")
plt.show()

# Correlation bar graph between "Fraud" and all other transaction features.
# "Fraud" itself is dropped by label (its self-correlation is not informative)
# rather than by assuming it sorts into the first position.
target_corr = corr_matrix["Fraud"].drop("Fraud").abs().sort_values(ascending=False)
plt.figure(figsize=(10, 8))
plt.rcParams["font.size"] = 7
sns.barplot(x=target_corr.index, y=target_corr.values, palette=sns.color_palette("husl", 8))
plt.xticks(rotation=45, ha="right")
plt.xlabel("Transaction Features")
plt.ylabel("Correlation with Fraud")
plt.title("Correlation between Fraud and Other Features When Comparing Across All Financial Transactions in the Dataset (2005-2015)")
plt.tight_layout()
plt.show()

## Machine Learning - Model Training and Evaluation

Great, now we are onto the Machine Learning part of the blog post!

In [None]:
# Separate the features from the fraud label. "dbscan_labels" is a column a
# later cell adds to `df`; it is excluded (when present) so that re-running
# this cell never leaks cluster labels into the features.
X = df.drop(columns=["Fraud", "dbscan_labels"], errors="ignore")
y = df["Fraud"]

print("X Shape:", X.shape)
print("Y Shape:", y.shape)

pipeline = Pipeline([
    ("std_scaler", StandardScaler()),
    ("min_max_scaler", MinMaxScaler())
])

# Split the RAW features first, fit the scalers on the training split only,
# and reuse those fitted scalers for the test split. Fitting on the test
# split (or on the full data before splitting) leaks test-set statistics and
# puts the two splits on different scales.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1)
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

# The whole dataset on the same (train-fitted) scale, rows in the same order
# as `df`; used by the exploratory clustering cells further down
X_scaled = pipeline.transform(X)

In [None]:
# Elbow plot on the scaled data to help pick the number of k-means clusters;
# the candidate k values are given by k=(1, 11)
# NOTE(review): no random_state is set on this KMeans, so the curve may vary
# between runs — confirm whether that is acceptable.
kmeans_model = KMeans()
kmeans_elbow_visualizer = KElbowVisualizer(kmeans_model, k=(1, 11))
kmeans_elbow_visualizer.fit(X_scaled)
kmeans_elbow_visualizer.show()

In [None]:
# Run the kmeans model on the scaled training data
kmeans_model = KMeans(n_clusters=4, random_state=42).fit(X_train)

# Get the cluster number for each test datapoint
X_test_clusters = kmeans_model.predict(X_test)

# Save the cluster centroids
X_test_clusters_centers = kmeans_model.cluster_centers_

# Euclidean distance from every test point to the centroid of its assigned
# cluster, computed in one vectorized call
kmeans_dist = np.linalg.norm(
    np.asarray(X_test) - X_test_clusters_centers[X_test_clusters], axis=1
)

# Flag the points at or above the 95th distance percentile as fraud (1) and
# all others as non-fraud (0). Building the labels from a single boolean mask
# keeps the two cases mutually exclusive and leaves no raw distances behind.
kmeans_dist_threshold = np.percentile(kmeans_dist, 95)
y_pred = (kmeans_dist >= kmeans_dist_threshold).astype(int)
y_pred

In [None]:
# Compare the predicted flags in `y_pred` with the true labels in `y_test`
clf_report = pd.DataFrame(classification_report(y_true=y_test, y_pred=y_pred, output_dict=True, zero_division=0))
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# roc_auc_score names its second parameter `y_score`; passing `y_pred=` as a
# keyword raises a TypeError
print(f"ROC AUC Score: {roc_auc_score(y_true=y_test, y_score=y_pred) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report}")
print("_______________________________________________")
print(f"Confusion Matrix:\n{conf_matrix}")

# Plot the confusion matrix. sns.heatmap draws its own colorbar, so no
# separate plt.colorbar() call is made (it would have no mappable to use).
plt.figure(figsize=(10, 8))
plt.rcParams["font.size"] = 7
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Set DBSCAN's min_samples to twice the number of feature columns in X
dbscan_min_samples = 2 * (X.shape[1])
dbscan_min_samples

In [None]:
# Fit a nearest-neighbours model on the scaled data, then query it with the
# same data to get, for every point, the distances to and indices of its
# `dbscan_min_samples` nearest neighbours
nn_model = NearestNeighbors(n_neighbors=dbscan_min_samples)
nn_model_fit = nn_model.fit(X_scaled)
nn_distances, nn_indices = nn_model_fit.kneighbors(X_scaled)

In [None]:
# Sort every neighbour-distance column in ascending order, keep column 1 and
# plot it as a sorted distance curve
# NOTE(review): column 1 is presumably the distance to the closest *other*
# point (column 0 being the point itself); k-distance plots for DBSCAN
# usually use the k-th neighbour (the last column) — confirm the intent.
# NOTE(review): this cell overwrites `nn_distances` with a 1-D array, so
# running it a second time without re-running the previous cell will fail
# on the `[:, 1]` index.
nn_distances = np.sort(nn_distances, axis=0)
nn_distances = nn_distances[:, 1]
plt.figure(figsize=(10, 8))
plt.rcParams["font.size"] = 7
plt.plot(nn_distances)
plt.xlabel("Data Points")
plt.ylabel("Distance")
plt.title("Distances Between Neighbors for All Data Points")
plt.show()

In [None]:
# Defining the inflection point (x, y) of the sorted distance curve using the
# Kneed package; x is the data-point position, y the neighbour distance
kneedle_model = kneed.KneeLocator(y=nn_distances, x=np.arange(0, X.shape[0]), 
                                  S=1.0, curve="convex", direction="increasing")
# Stored as [knee x-position, distance at the knee]
nn_model_inflection_point = [kneedle_model.knee, kneedle_model.knee_y]
nn_model_inflection_point

In [None]:
# Visualizing the kneedle model (the distance curve with the detected knee)
kneedle_model.plot_knee()

In [None]:
# Number of non-fraud (0) and fraud (1) transactions, for reference when
# reading the clustering output
df["Fraud"].value_counts()

In [None]:
# Round the knee's distance value (the y coordinate) to 2 decimal places
rounded_nn_model_inflection_point_y_coord = round(nn_model_inflection_point[1], 2)
rounded_nn_model_inflection_point_y_coord

In [None]:
# Sweep eps in steps of 0.005 from 0.005 up to the rounded knee distance and
# report, for each value, how DBSCAN groups all points and the fraud points.
# NOTE: every iteration overwrites the "dbscan_labels" column of `df`, so
# after the loop that column holds the labels of the last eps tried.
dbscan_eps_step = 0.005
for dbscan_eps in np.arange(dbscan_eps_step, rounded_nn_model_inflection_point_y_coord + dbscan_eps_step, dbscan_eps_step):
    print()
    print("            ------------------            ")
    print(f"---------- | For eps = {round(dbscan_eps, 3):.3f}: | ----------")
    print("            ------------------            ")
    # Creating the model:
    dbscan = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples).fit(X_scaled)
    # Integrating the classification made by DBSCAN to the original dataset:
    df["dbscan_labels"] = dbscan.labels_
    # Checking the number of points attributed to each cluster and the number of outliers ('dbscan_labels' == -1) identified by the model:
    print(df["dbscan_labels"].value_counts())
    # Checking to which cluster(s) the model added most cases of fraud ('Fraud' == 1):
    anomalies = df.loc[df["Fraud"] == 1]
    fraud_label_counts = anomalies["dbscan_labels"].value_counts()
    print(fraud_label_counts)
    # .get(-1, 0) reports 0 instead of raising a KeyError when no fraud case
    # was labelled as an outlier (-1) for this eps
    print(f"Number of Clustered Outliers: {fraud_label_counts.get(-1, 0)}")

In [None]:
# DBSCAN has no `predict` for unseen data: `fit_predict` always refits the
# estimator it is called on. Each split is therefore clustered by its own
# estimator, so clustering the test split does not overwrite
# `dbscan_model.labels_`, which keeps describing the training split.
dbscan_model = DBSCAN(eps=nn_model_inflection_point[1], min_samples=dbscan_min_samples)
train_cluster_labels = dbscan_model.fit_predict(X_train)

dbscan_test_model = DBSCAN(eps=nn_model_inflection_point[1], min_samples=dbscan_min_samples)
test_cluster_labels = dbscan_test_model.fit_predict(X_test)

In [None]:
# Agreement between the true fraud labels and the DBSCAN cluster labels:
# Adjusted Rand Index (ARI) for the training and testing data
ari_dbscan_train = adjusted_rand_score(y_train, train_cluster_labels)
ari_dbscan_test = adjusted_rand_score(y_test, test_cluster_labels)

# Normalized Mutual Information (NMI) for the training and testing data
nmi_dbscan_train = normalized_mutual_info_score(y_train, train_cluster_labels)
nmi_dbscan_test = normalized_mutual_info_score(y_test, test_cluster_labels)

# Silhouette score of the clusterings on the training and testing features
silhouette_train = silhouette_score(X_train, train_cluster_labels)
silhouette_test = silhouette_score(X_test, test_cluster_labels)

# Report every metric, training value first
for score_description, score_value in (
    ("ARI for Training Data", ari_dbscan_train),
    ("ARI for Testing Data", ari_dbscan_test),
    ("NMI for Training Data", nmi_dbscan_train),
    ("NMI for Testing Data", nmi_dbscan_test),
    ("Silhouette Score for Training Data", silhouette_train),
    ("Silhouette Score for Testing Data", silhouette_test),
):
    print(f"{score_description}: {score_value}")

In [None]:
# Visualize outputs of DBSCAN on the test split.
# plt.scatter needs x, y and c of equal length, so two scaled feature columns
# of the test split are plotted against each other and coloured by that
# split's cluster labels (passing the whole feature table as x and the fraud
# label as y raises a size-mismatch error).
# NOTE(review): "Amount" vs. "Category" is one possible pair of axes — pick
# whichever two features are most informative.
color_labels = test_cluster_labels
X_test_values = np.asarray(X_test)
amount_col = X.columns.get_loc("Amount")
category_col = X.columns.get_loc("Category")

plt.figure(figsize=(10, 8))
plt.rcParams["font.size"] = 7
plt.title("DBSCAN Performance Visualzation")
plt.scatter(X_test_values[:, amount_col], X_test_values[:, category_col], c=color_labels)
plt.xlabel("Amount (scaled)")
plt.ylabel("Category (scaled)")
plt.show()

In [None]:
# Create and display outliers dataframe from DBSCAN.
# The cluster labels only cover the test split, so they cannot be used as a
# boolean mask over the whole of `df` (the lengths differ). `y_test.index`
# holds the `df` row labels of the test split, in the same order as
# `test_cluster_labels`; it is used to look up the rows labelled noise (-1).
dbscan_outlier_rows = y_test.index[test_cluster_labels == -1]
dbscan_outliers = df.loc[dbscan_outlier_rows].copy()
dbscan_outliers

## TO-DO

- Heatmap
- Drop all needed columns
- train_test_split
- Figure out which ML algorithm to use
- Do DBSCAN (as recommended)
- Report classification statistics if needed
- DONE...

Sources:

- Data: https://www.kaggle.com/datasets/ealaxi/paysim1/data (NOT USED)

- Data: https://www.kaggle.com/datasets/ealaxi/banksim1 (USED)

- KMeans Elbow: https://www.kaggle.com/code/javigallego/outliers-eda-clustering-tutorial

- KMeans: https://www.kaggle.com/code/mohamedisbaine/fraud-detection

- NearestNeighbor, KNeedle, DBSCAN Reference: https://www.kaggle.com/code/rodmnzs/fraud-detection-clustering-with-dbscan

- DBSCAN Reference #2: https://medium.com/@dilip.voleti/dbscan-algorithm-for-fraud-detection-outlier-detection-in-a-data-set-60a10ad06ea8

- Seaborn Color Palette: https://seaborn.pydata.org/tutorial/color_palettes.html

- https://www.kaggle.com/code/mukulkirti/outlier-or-anomalies-detection-and-removal#3.3-DBScane-Anomaly-Detection
