# Introduction

As the world moves towards ubiquitous digitization in the financial sector, the risk of fraud grows faster than ever, posing significant challenges to both financial institutions and customers. As a result, the need for robust fraud detection systems capable of identifying and mitigating fraudulent activities is more important than ever.

## Project Description
This notebook aims to provide a comprehensive exploratory data analysis of the Base variant of the Bank Account Fraud (BAF) dataset suite, published at NeurIPS 2022.

## Dataset Description
The dataset is available at https://www.kaggle.com/datasets/sgpjesus/bank-account-fraud-dataset-neurips-2022/data.

This synthetic tabular dataset comprises 1M instances, where each instance represents a credit card application. The dataset contains 31 features and a corresponding binary target variable indicating whether the application is fraudulent or not. The features cover various information associated with the applicant or the application. The dataset contains a combination of numerical and categorical features. It has no explicit missing values (no NaNs); instead, according to the datasheet, some features use negative values to represent missing entries, which are converted to NaN later in this notebook. The dataset is highly imbalanced, with only ~1% of the instances labeled as fraudulent. The dataset was synthetically generated from real-world data in order to protect the privacy of the applicants.

A detailed description of the dataset can be found on https://github.com/feedzai/bank-account-fraud/blob/main/documents/datasheet.pdf.

# Imports and Data Loading

In [2]:
# Import libraries
import numpy as np

import pandas as pd
# Set the maximum number of columns and rows to display
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

import matplotlib.pyplot as plt
import matplotlib.colors

import seaborn as sns
# Dark-grid plot style, with a colorblind-friendly palette for more accessible visualizations
sns.set_style('darkgrid')
sns.set_palette('colorblind')

from sklearn.model_selection import train_test_split

import statsmodels.api as sm

In [3]:
# Read the full dataset from disk
total_df = pd.read_csv('./Data/Base.csv')

# Separate the binary target column from the predictors
y = total_df['fraud_bool']
X = total_df.drop(columns='fraud_bool')

# Hold out 20% of the rows; stratifying on the target keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Re-attach the target to each partition as standalone frames
train, test = (
    pd.concat([features, target], axis=1).copy()
    for features, target in ((X_train, y_train), (X_test, y_test))
)

# Exploratory Data Analysis

## Target

In [None]:
# Basic facts about the target column of the training split
print("Target: 'fraud_bool'")
print(f"Data type: {y_train.dtype}")
print(f"Unique values: {y_train.dropna().unique()}")
print(f"NaN values: {y_train.isna().sum()}")
print(f"Null values: {y_train.isnull().sum()}")

# Absolute and relative frequency of each target value
count_distribution = y_train.value_counts()
proportion_distribution = y_train.value_counts(normalize=True)

print("\nCount and Distribution of 'fraud_bool':")
for value, count in count_distribution.items():
    proportion = proportion_distribution[value]
    print(f"Value {value}: {count} ({proportion:.2%})")

## Features

In [None]:
# Preview the first 5 observations of the training features
# (head() returns 5 rows when called without arguments)
X_train.head()

In [None]:
# (number of rows, number of feature columns) of the training features
X_train.shape

In [None]:
# Data type of every feature column in the training features
X_train.dtypes

In [None]:
# Split the feature columns by dtype: numeric vs. everything else
num_feats = X_train.select_dtypes(include='number').columns
cat_feats = X_train.select_dtypes(exclude='number').columns

# A numeric feature with at least `thresh` distinct values is treated as
# continuous; with fewer it is treated as discrete
thresh = 13

# Measure cardinality on the training split only, so the held-out test rows
# do not influence any decision made during the analysis.
# dropna=False counts NaN as a value of its own, which keeps the result the
# same if this cell is re-run after sentinel values were replaced by NaN.
n_unique = X_train[num_feats].nunique(dropna=False)

cont_feats = [feat for feat in num_feats if n_unique[feat] >= thresh]
disc_feats = [feat for feat in num_feats if n_unique[feat] < thresh]

print("Total Features:", X_train.shape[1])
print(f"\nContinuous Features ({len(cont_feats)}): {cont_feats}")
print(f"\nDiscrete Features ({len(disc_feats)}): {disc_feats}")
print(f"\nCategorical Features ({len(cat_feats)}): {cat_feats}")

In [None]:
# Per the datasheet, negative values in the following features stand for missing values
cols_missing = [
    'prev_address_months_count',
    'current_address_months_count',
    'bank_months_count',
    'session_length_in_minutes',
    'device_distinct_emails_8w',
    'intended_balcon_amount',
]

# Blank out every negative entry of those features (mask() fills with NaN by default)
is_negative = X_train[cols_missing] < 0
X_train[cols_missing] = X_train[cols_missing].mask(is_negative)

# Percentage of missing entries per feature, keeping only features that have any
missing_values = X_train.isna().sum() / len(X_train) * 100
missing_values = missing_values[missing_values > 0]

# One-column table, sorted from the least to the most affected feature
missing_table = missing_values.to_frame("Missing %").sort_values(by="Missing %")

# Print the missing values table
print("Missing Values Table:\n", missing_table)


## In-depth Descriptive Statistics

In [10]:
def feature_analysis(feat, X_train, y_train, cont_feats, disc_feats, *, sample_size=10000, random_state=42):
    """Display a one-row summary table and diagnostic plots for a single feature.

    The feature is handled as continuous if it is listed in ``cont_feats``, as
    discrete if it is listed in ``disc_feats``, and as categorical otherwise.
    If the feature has at least one missing value, an additional bar plot
    compares the mean of the target (in %, labelled 'Fraud %') between rows
    where the feature is missing and rows where it is present.

    Args:
        feat: Name of the column of ``X_train`` to analyse.
        X_train: DataFrame holding the features.
        y_train: Target Series; its rows are matched to ``X_train`` by index label.
        cont_feats: Names of the features to treat as continuous.
        disc_feats: Names of the features to treat as discrete.
        sample_size: Maximum number of rows drawn for the violin, strip,
            swarm and boxen plots.
        random_state: Seed used when drawing that row sample.

    Returns:
        None. The summary is shown with ``display`` and the figures with
        ``plt.show()``.

    Note:
        ``display`` is neither defined nor imported in this function; it is
        expected to be provided by the notebook environment.
    """
    col = X_train[feat]
    stat_cols = ['count', 'mean', 'std', 'min', 'median', 'max']
    int_cols = ['count', 'unique', 'missing_count']

    # One row sample shared by all resource-intensive plots
    n_sample = min(sample_size, len(X_train))
    X_train_sample = X_train.sample(n=n_sample, random_state=random_state)
    y_train_sample = y_train.loc[X_train_sample.index]

    # Create the numeric one-row summary and fill the fields common to all feature types
    def start_summary(columns, with_stats=True):
        summary = pd.DataFrame(index=[feat], columns=columns, dtype=float)
        if with_stats:
            # describe(percentiles=[.5]) yields count, mean, std, min, 50%, max -- in that order
            summary.loc[feat, stat_cols] = col.describe(percentiles=[.5]).to_numpy()
        else:
            summary.loc[feat, 'count'] = col.count()
        n_missing = col.isnull().sum()
        summary.loc[feat, 'unique'] = col.nunique()
        summary.loc[feat, 'missing_count'] = n_missing
        summary.loc[feat, 'missing_percentage'] = n_missing / len(col) * 100
        return summary

    # Convert the numeric summary to display strings and record the column dtype
    def finish_summary(summary):
        summary[int_cols] = summary[int_cols].astype(int)
        summary = summary.round(2).astype(str)
        # Stored as text because the frame only holds strings at this point
        summary.loc[feat, 'dtype'] = str(col.dtype)
        return summary

    # Summary for continuous features
    def cont_summary():
        columns = ['dtype', 'count', 'unique', 'top_value_counts', 'missing_count',
                   'missing_percentage', 'mean', 'std', 'min', 'median', 'max']
        summary = finish_summary(start_summary(columns))
        # Three most frequent values, with the values themselves rounded to two decimals
        top_values = col.value_counts().head(3)
        top_values.index = top_values.index.astype(float).to_numpy().round(2)
        summary.loc[feat, 'top_value_counts'] = str(top_values.to_dict())
        display(summary)

    # Plots for continuous features
    def cont_plots(bins='auto'):
        n_cols = 7
        fig, axes = plt.subplots(1, n_cols, figsize=(6.4 * n_cols, 4.8))
        # The first three plots use all rows, the remaining ones the row sample
        sns.histplot(data=X_train, x=feat, bins=bins, ax=axes[0])
        sns.boxplot(data=X_train, x=feat, y=y_train, ax=axes[1], orient='h')
        sns.kdeplot(data=X_train, x=feat, hue=y_train, fill=True, common_norm=False, ax=axes[2])
        sns.violinplot(data=X_train_sample, x=feat, y=y_train_sample, ax=axes[3], orient='h')
        sns.stripplot(data=X_train_sample, x=feat, y=y_train_sample, ax=axes[4], orient='h', jitter=True)
        # NOTE(review): swarm plots can be slow with thousands of points; lower sample_size if needed
        sns.swarmplot(data=X_train_sample, x=feat, y=y_train_sample, ax=axes[5], orient='h')
        sns.boxenplot(data=X_train_sample, x=feat, y=y_train_sample, ax=axes[6], orient='h')
        titles = ['Histogram', 'Box Plots', 'KDE Plots', 'Violin Plot',
                  'Strip Plot', 'Swarm Plot', 'Boxen Plot']
        for ax, title in zip(axes, titles):
            ax.title.set_text(title)
        fig.tight_layout()
        plt.show()

    # Summary for discrete features
    def disc_summary():
        columns = ['dtype', 'count', 'unique', 'missing_count', 'missing_percentage',
                   'mean', 'std', 'min', 'median', 'max', 'cv']
        summary = start_summary(columns)
        # Coefficient of variation; left undefined when the mean is zero
        mean = col.mean()
        summary.loc[feat, 'cv'] = np.nan if not mean else col.std() / mean
        display(finish_summary(summary))

    # Summary for categorical features
    def cat_summary():
        columns = ['dtype', 'count', 'unique', 'missing_count', 'missing_percentage']
        display(finish_summary(start_summary(columns, with_stats=False)))

    # Write each bar's share on top of it; for rare levels also write the raw count in red
    def annotate_bars(ax, level_counts, level_pcts, rare_below=1):
        for bar, count, pct in zip(ax.patches, level_counts, level_pcts):
            anchor = (bar.get_x() + bar.get_width() / 2., bar.get_height())
            ax.annotate(f'{pct:.2f}%', anchor, ha='center', va='bottom',
                        xytext=(0, 0), textcoords='offset points')
            if pct < rare_below:
                ax.annotate(count, anchor, ha='center', va='bottom',
                            xytext=(0, 10), textcoords='offset points', color='red')

    # Plots shared by discrete and categorical features:
    # level counts, target mean per level, and three distribution plots on the row sample
    def level_plots(level_order, level_counts, explicit_bar_order):
        n_cols = 5
        fig, axes = plt.subplots(1, n_cols, figsize=(6.4 * n_cols, 4.8))

        sns.countplot(x=col, order=level_order, ax=axes[0])
        axes[0].xaxis.grid(False)
        # Percentages are relative to all rows, including rows where the feature is missing
        annotate_bars(axes[0], level_counts, level_counts / len(col) * 100)

        # Mean of the target per level, in percent (only the two needed columns are combined)
        fraud_pct = pd.concat([col, y_train], axis=1).groupby(feat)[y_train.name].mean() * 100
        if explicit_bar_order:
            sns.barplot(x=fraud_pct.index, y=fraud_pct.values, order=level_order, ax=axes[1])
        else:
            fraud_pct = fraud_pct.reindex(level_order)
            sns.barplot(x=fraud_pct.index, y=fraud_pct.values, ax=axes[1])
        axes[1].set_ylabel('Fraud %')
        axes[1].xaxis.grid(False)

        sns.violinplot(data=X_train_sample, x=feat, y=y_train_sample, ax=axes[2])
        # NOTE(review): swarm plots can be slow with thousands of points; lower sample_size if needed
        sns.swarmplot(data=X_train_sample, x=feat, y=y_train_sample, ax=axes[3])
        sns.boxenplot(data=X_train_sample, x=feat, y=y_train_sample, ax=axes[4])
        axes[2].title.set_text('Violin Plot')
        axes[3].title.set_text('Swarm Plot')
        axes[4].title.set_text('Boxen Plot')
        fig.tight_layout()
        plt.show()

    # Plots for discrete features: levels in ascending order of their value
    def disc_plots():
        levels = np.sort(col.dropna().unique())
        level_plots(levels, col.value_counts().reindex(levels), explicit_bar_order=False)

    # Plots for categorical features: levels from the most to the least frequent
    def cat_plots():
        counts = col.value_counts()
        level_plots(counts.index, counts, explicit_bar_order=True)

    # Plot for missing flag associated with a feature
    def missing_flag_plot():
        missing_flag = col.isnull().astype(int)
        if not missing_flag.sum():
            # Nothing is missing, so there is nothing to compare
            return
        df = (pd.concat([missing_flag, y_train], axis=1).groupby(feat).mean() * 100).reset_index()
        cols = [f'MISSING_{feat}', 'Fraud %']
        df.columns = cols
        fig = plt.figure(figsize=(6.4, 4.8))
        sns.barplot(data=df, x=cols[0], y=cols[1])
        fig.tight_layout()
        plt.show()

    # Pick the summary and plots that match the feature type, then add the missing-flag plot
    if feat in cont_feats:
        cont_summary()
        cont_plots()
    elif feat in disc_feats:
        disc_summary()
        disc_plots()
    else:
        cat_summary()
        cat_plots()
    missing_flag_plot()
    return


In [11]:
# for feat in cont_feats:
#     print(f"\033[1m Feature:\033[0m '{feat}'\n")
#     feature_analysis(feat, X_train, y_train, cont_feats, disc_feats)
#     print('-'*45, '\n')


In [12]:
# for feat in disc_feats:
#     print(f"\033[1m Feature:\033[0m '{feat}'\n")
#     feature_analysis(feat, X_train, y_train, cont_feats, disc_feats)
#     print('-'*45, '\n')

In [13]:
# for feat in cat_feats:
#     print(f"\033[1m Feature:\033[0m '{feat}'\n")
#     feature_analysis(feat, X_train, y_train, cont_feats, disc_feats)
#     print('-'*45, '\n')
