# Exploratory Data Analysis on the Bank Fraud Detection Base Dataset

In [10]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# Load the dataset
total_df = pd.read_csv('./Data/Base.csv')

# Split the DataFrame into training and test sets using stratified sampling to maintain anomaly distribution
train_df, test_df = train_test_split(total_df, test_size=0.2, stratify=total_df['fraud_bool'], random_state=42)

In [11]:
# Shows the first 5 observations of the training data
train_df.head()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
39111,0,0.7,0.229712,-1,63,50,0.02472,50.674001,AA,1305,...,1,1500.0,0,INTERNET,3.58055,linux,0,1,0,0
822700,0,0.2,0.928428,199,24,70,0.014153,15.631407,AA,833,...,0,500.0,0,INTERNET,7.087779,other,1,1,0,6
914415,0,0.1,0.65863,95,2,40,0.045801,-1.410133,AB,237,...,0,200.0,0,INTERNET,0.547804,other,1,1,0,7
581307,0,0.8,0.774858,-1,122,30,0.005569,-0.539938,AB,895,...,1,500.0,0,INTERNET,4.671407,other,1,1,0,4
603136,0,0.9,0.99346,103,9,20,0.010832,-0.501067,AB,4105,...,1,200.0,0,INTERNET,9.293206,linux,0,1,0,4


In [12]:
# Shows a summary of the training dataset
print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 800000 entries, 39111 to 228494
Data columns (total 32 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   fraud_bool                        800000 non-null  int64  
 1   income                            800000 non-null  float64
 2   name_email_similarity             800000 non-null  float64
 3   prev_address_months_count         800000 non-null  int64  
 4   current_address_months_count      800000 non-null  int64  
 5   customer_age                      800000 non-null  int64  
 6   days_since_request                800000 non-null  float64
 7   intended_balcon_amount            800000 non-null  float64
 8   payment_type                      800000 non-null  object 
 9   zip_count_4w                      800000 non-null  int64  
 10  velocity_6h                       800000 non-null  float64
 11  velocity_24h                      800000 non-null  fl

In [13]:
# Shows summary statistics for numerical columns
print(train_df.describe())

          fraud_bool         income  name_email_similarity  \
count  800000.000000  800000.000000          800000.000000   
mean        0.011029       0.562860               0.493798   
std         0.104437       0.290343               0.289099   
min         0.000000       0.100000               0.000001   
25%         0.000000       0.300000               0.225325   
50%         0.000000       0.600000               0.492314   
75%         0.000000       0.800000               0.755595   
max         1.000000       0.900000               0.999999   

       prev_address_months_count  current_address_months_count   customer_age  \
count              800000.000000                 800000.000000  800000.000000   
mean                   16.700988                     86.614125      33.700075   
std                    44.017921                     88.391093      12.028264   
min                    -1.000000                     -1.000000      10.000000   
25%                    -1.000000    

In [19]:
# Display missing values for columns
print(train_df.isnull().sum())

fraud_bool                          0
income                              0
name_email_similarity               0
prev_address_months_count           0
current_address_months_count        0
customer_age                        0
days_since_request                  0
intended_balcon_amount              0
payment_type                        0
zip_count_4w                        0
velocity_6h                         0
velocity_24h                        0
velocity_4w                         0
bank_branch_count_8w                0
date_of_birth_distinct_emails_4w    0
employment_status                   0
credit_risk_score                   0
email_is_free                       0
housing_status                      0
phone_home_valid                    0
phone_mobile_valid                  0
bank_months_count                   0
has_other_cards                     0
proposed_credit_limit               0
foreign_request                     0
source                              0
session_leng

In [21]:
# Get the distribution of the fraud_bool
distribution = train_df['fraud_bool'].value_counts(normalize=True)
print("\nDistribution of the the fraud bool:")
for value, proportion in distribution.items():
    print(f"Value {value}: {proportion:.2%}")


Distribution of the the fraud bool:
Value 0: 98.90%
Value 1: 1.10%


In [22]:
# Number of unique values of the float data
train_df.select_dtypes(include=['float64']).nunique()  

income                            9
name_email_similarity        799289
days_since_request           793121
intended_balcon_amount       796805
velocity_6h                  799150
velocity_24h                 799310
velocity_4w                  798908
proposed_credit_limit            12
session_length_in_minutes    796391
dtype: int64

In [23]:
# Create numerical dataframe
num_df = train_df.select_dtypes(include=['float64']).drop(columns=['income', 'proposed_credit_limit'])

# Create categorical dataframe
cat_df = train_df.select_dtypes(include=['int64', 'object']).copy()
cat_df[['income', 'proposed_credit_limit']] = train_df[['income', 'proposed_credit_limit']]
cat_df['income'] = cat_df['income'].round(1)
cat_df['proposed_credit_limit'] = cat_df['proposed_credit_limit'].round(0).astype('int64')


In [24]:
# Display missing values for columns
print(train_df.isnull().sum())

fraud_bool                          0
income                              0
name_email_similarity               0
prev_address_months_count           0
current_address_months_count        0
customer_age                        0
days_since_request                  0
intended_balcon_amount              0
payment_type                        0
zip_count_4w                        0
velocity_6h                         0
velocity_24h                        0
velocity_4w                         0
bank_branch_count_8w                0
date_of_birth_distinct_emails_4w    0
employment_status                   0
credit_risk_score                   0
email_is_free                       0
housing_status                      0
phone_home_valid                    0
phone_mobile_valid                  0
bank_months_count                   0
has_other_cards                     0
proposed_credit_limit               0
foreign_request                     0
source                              0
session_leng