# Detecting Fraud in Bank Transactions

Creating a predictive system to detect fraudulent transactions in bank datasets.

## 1. Importing the libraries

In [47]:
# Basic packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

# Data pre-processing packages
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Machine learning packages
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier


# 2. Creating fake data

# 3. Preprocessing

Data normalization, outliers, missing values, feature selection.

# 4. Loading data

In [48]:
# Read the bank-transactions CSV into a DataFrame and preview it.
transactions_csv = 'data/bank_transactions.csv'
df = pd.read_csv(transactions_csv)
df

Unnamed: 0,Timestamp,country,city,district,postal_code,ip_address,day,hour,minute,operating_system,amount,background,complaints,transaction_count,credit,global_limit,credit_type,merchant,accounts,loans,browser,android,ios,purchases,browsing_history,relationship,security_index,transaction_time,credit_limit,balance_history,Target
0,0.0000,-1.3598,-0.0728,2.5363,1.3782,-0.3383,0.4624,0.2396,0.0987,0.3638,0.0908,-0.5516,-0.6178,-0.9914,-0.3112,1.4682,-0.4704,0.2080,0.0258,0.4040,0.2514,-0.0183,0.2778,-0.1105,0.0669,0.1285,-0.1891,0.1336,-0.0211,149.6200,0
1,0.0000,1.1919,0.2662,0.1665,0.4482,0.0600,-0.0824,-0.0788,0.0851,-0.2554,-0.1670,1.6127,1.0652,0.4891,-0.1438,0.6356,0.4639,-0.1148,-0.1834,-0.1458,-0.0691,-0.2258,-0.6387,0.1013,-0.3398,0.1672,0.1259,-0.0090,0.0147,2.6900,0
2,1.0000,-1.3584,-1.3402,1.7732,0.3798,-0.5032,1.8005,0.7915,0.2477,-1.5147,0.2076,0.6245,0.0661,0.7173,-0.1659,2.3459,-2.8901,1.1100,-0.1214,-2.2619,0.5250,0.2480,0.7717,0.9094,-0.6893,-0.3276,-0.1391,-0.0554,-0.0598,378.6600,0
3,1.0000,-0.9663,-0.1852,1.7930,-0.8633,-0.0103,1.2472,0.2376,0.3774,-1.3870,-0.0550,-0.2265,0.1782,0.5078,-0.2879,-0.6314,-1.0596,-0.6841,1.9658,-1.2326,-0.2080,-0.1083,0.0053,-0.1903,-1.1756,0.6474,-0.2219,0.0627,0.0615,123.5000,0
4,2.0000,-1.1582,0.8777,1.5487,0.4030,-0.4072,0.0959,0.5929,-0.2705,0.8177,0.7531,-0.8228,0.5382,1.3459,-1.1197,0.1751,-0.4514,-0.2370,-0.0382,0.8035,0.4085,-0.0094,0.7983,-0.1375,0.1413,-0.2060,0.5023,0.2194,0.2152,69.9900,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0000,-11.8811,10.0718,-9.8348,-2.0667,-5.3645,-2.6068,-4.9182,7.3053,1.9144,4.3562,-1.5931,2.7119,-0.6893,4.6269,-0.9245,1.1076,1.9917,0.5106,-0.6829,1.4758,0.2135,0.1119,1.0145,-0.5093,1.4368,0.2500,0.9437,0.8237,0.7700,0
284803,172787.0000,-0.7328,-0.0551,2.0350,-0.7386,0.8682,1.0584,0.0243,0.2949,0.5848,-0.9759,-0.1502,0.9158,1.2148,-0.6751,1.1649,-0.7118,-0.0257,-1.2212,-1.5456,0.0596,0.2142,0.9244,0.0125,-1.0162,-0.6066,-0.3953,0.0685,-0.0535,24.7900,0
284804,172788.0000,1.9196,-0.3013,-3.2496,-0.5578,2.6305,3.0313,-0.2968,0.7084,0.4325,-0.4848,0.4116,0.0631,-0.1837,-0.5106,1.3293,0.1407,0.3135,0.3957,-0.5773,0.0014,0.2320,0.5782,-0.0375,0.6401,0.2657,-0.0874,0.0045,-0.0266,67.8800,0
284805,172788.0000,-0.2404,0.5305,0.7025,0.6898,-0.3780,0.6237,-0.6862,0.6791,0.3921,-0.3991,-1.9338,-0.9629,-1.0421,0.4496,1.9626,-0.6086,0.5099,1.1140,2.8978,0.1274,0.2652,0.8000,-0.1633,0.1232,-0.5692,0.5467,0.1088,0.1045,10.0000,0


# 5. Exploratory Data Analysis

First, let's check for missing values in the dataset.

In [49]:
# Count the NaN entries in every column and print the per-column totals.
missing_per_column = df.isna().sum()
print(missing_per_column)

Timestamp            0
country              0
city                 0
district             0
postal_code          0
ip_address           0
day                  0
hour                 0
minute               0
operating_system     0
amount               0
background           0
complaints           0
transaction_count    0
credit               0
global_limit         0
credit_type          0
merchant             0
accounts             0
loans                0
browser              0
android              0
ios                  0
purchases            0
browsing_history     0
relationship         0
security_index       0
transaction_time     0
credit_limit         0
balance_history      0
Target               0
dtype: int64


Great! There are no missing values in the dataset.

Now, let's check the distribution of the target variable.

In [50]:
# Class frequencies of the label column: shown as a table, then as a bar chart.
target_counts = df['Target'].value_counts()
display(target_counts)
px.bar(target_counts)

Target
0    284315
1       492
Name: count, dtype: int64

In [51]:
# Share of rows labelled 1 (fraud), expressed as a percentage of all rows.
n_fraud = int((df['Target'] == 1).sum())
n_total = len(df)
n_fraud / n_total * 100

0.1727485630620034

As we can see, the dataset is extremely imbalanced, with only 0.17% of the transactions being frauds.

So, further on we'll need to perform a resampling to balance the dataset.

We also need to know how each variable correlates with the target variable:

In [52]:
# Pairwise correlation of every column with the label, largest value first.
target_corr = df.corr()['Target']
target_corr.sort_values(ascending=False)

Target               1.0000
background           0.1549
postal_code          0.1334
city                 0.0913
android              0.0404
loans                0.0348
browser              0.0201
minute               0.0199
transaction_time     0.0176
credit_limit         0.0095
balance_history      0.0056
security_index       0.0045
relationship         0.0033
ios                  0.0008
purchases           -0.0027
global_limit        -0.0042
transaction_count   -0.0046
browsing_history    -0.0072
Timestamp           -0.0123
day                 -0.0436
ip_address          -0.0950
operating_system    -0.0977
country             -0.1013
accounts            -0.1115
hour                -0.1873
district            -0.1930
credit_type         -0.1965
amount              -0.2169
complaints          -0.2606
credit              -0.3025
merchant            -0.3265
Name: Target, dtype: float64

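Notice that "background" and "postal_code" have the strongest positive correlation with the target, while "merchant", "credit" and "complaints" have the strongest correlations overall (all of them negative). Ranked by absolute value, these negatively correlated features are the strongest linear signals for fraud detection.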

Let's take a more holistic look at the relationships between the features of the dataset.

In [53]:
# Full correlation matrix, rendered as a table with a colour gradient per cell.
corr_matrix = df.corr()
corr_matrix.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Timestamp,country,city,district,postal_code,ip_address,day,hour,minute,operating_system,amount,background,complaints,transaction_count,credit,global_limit,credit_type,merchant,accounts,loans,browser,android,ios,purchases,browsing_history,relationship,security_index,transaction_time,credit_limit,balance_history,Target
Timestamp,1.0,0.117396,-0.010593,-0.419618,-0.10526,0.173072,-0.063016,0.084714,-0.036949,-0.00866,0.030617,-0.247689,0.124348,-0.065902,-0.098757,-0.183453,0.011903,-0.073297,0.090438,0.028975,-0.050866,0.044736,0.144059,0.051142,-0.016182,-0.233083,-0.041407,-0.005135,-0.009413,-0.010596,-0.012323
country,0.117396,1.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.227709,-0.101347
city,-0.010593,0.0,1.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.531409,0.091289
district,-0.419618,-0.0,0.0,1.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.21088,-0.192961
postal_code,-0.10526,-0.0,-0.0,0.0,1.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.098732,0.133447
ip_address,0.173072,0.0,0.0,-0.0,-0.0,1.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,-0.386356,-0.094974
day,-0.063016,-0.0,0.0,0.0,-0.0,0.0,1.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,0.215981,-0.043643
hour,0.084714,-0.0,0.0,0.0,-0.0,0.0,0.0,1.0,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.397311,-0.187257
minute,-0.036949,-0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,1.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.103079,0.019875
operating_system,-0.00866,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.044246,-0.097733


This overview is important to detect interdependencies between variables and understand the data distribution. In this case, we can see that our most interesting variables do not have a strong correlation with each other or with other variables, so multicollinearity should not be a problem.

# 6. Preparing the data for the models

As we saw above, the dataset is imbalanced, with only about 0.17% of the transactions being fraudulent. This is a common issue in fraud detection, and it requires special attention when building the model.

To address this issue, we can use a variety of techniques, such as oversampling the minority class, undersampling the majority class, or using a combination of both. In this notebook, we will use oversampling to balance the dataset.

We will use the SMOTE (Synthetic Minority Over-sampling Technique) algorithm to oversample the minority class. SMOTE works by creating synthetic examples of the minority class by interpolating between existing examples. This can help to balance the dataset and improve the performance of the model.

First, we will split the dataset into training and testing sets. We will use 70% of the data for training and 30% for testing.

### Splitting data into train and test
Setting our explanatory variables:

In [54]:
# Explanatory variables: every column of df except the label.
X = df.drop(columns='Target')
X

Unnamed: 0,Timestamp,country,city,district,postal_code,ip_address,day,hour,minute,operating_system,amount,background,complaints,transaction_count,credit,global_limit,credit_type,merchant,accounts,loans,browser,android,ios,purchases,browsing_history,relationship,security_index,transaction_time,credit_limit,balance_history
0,0.0000,-1.3598,-0.0728,2.5363,1.3782,-0.3383,0.4624,0.2396,0.0987,0.3638,0.0908,-0.5516,-0.6178,-0.9914,-0.3112,1.4682,-0.4704,0.2080,0.0258,0.4040,0.2514,-0.0183,0.2778,-0.1105,0.0669,0.1285,-0.1891,0.1336,-0.0211,149.6200
1,0.0000,1.1919,0.2662,0.1665,0.4482,0.0600,-0.0824,-0.0788,0.0851,-0.2554,-0.1670,1.6127,1.0652,0.4891,-0.1438,0.6356,0.4639,-0.1148,-0.1834,-0.1458,-0.0691,-0.2258,-0.6387,0.1013,-0.3398,0.1672,0.1259,-0.0090,0.0147,2.6900
2,1.0000,-1.3584,-1.3402,1.7732,0.3798,-0.5032,1.8005,0.7915,0.2477,-1.5147,0.2076,0.6245,0.0661,0.7173,-0.1659,2.3459,-2.8901,1.1100,-0.1214,-2.2619,0.5250,0.2480,0.7717,0.9094,-0.6893,-0.3276,-0.1391,-0.0554,-0.0598,378.6600
3,1.0000,-0.9663,-0.1852,1.7930,-0.8633,-0.0103,1.2472,0.2376,0.3774,-1.3870,-0.0550,-0.2265,0.1782,0.5078,-0.2879,-0.6314,-1.0596,-0.6841,1.9658,-1.2326,-0.2080,-0.1083,0.0053,-0.1903,-1.1756,0.6474,-0.2219,0.0627,0.0615,123.5000
4,2.0000,-1.1582,0.8777,1.5487,0.4030,-0.4072,0.0959,0.5929,-0.2705,0.8177,0.7531,-0.8228,0.5382,1.3459,-1.1197,0.1751,-0.4514,-0.2370,-0.0382,0.8035,0.4085,-0.0094,0.7983,-0.1375,0.1413,-0.2060,0.5023,0.2194,0.2152,69.9900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0000,-11.8811,10.0718,-9.8348,-2.0667,-5.3645,-2.6068,-4.9182,7.3053,1.9144,4.3562,-1.5931,2.7119,-0.6893,4.6269,-0.9245,1.1076,1.9917,0.5106,-0.6829,1.4758,0.2135,0.1119,1.0145,-0.5093,1.4368,0.2500,0.9437,0.8237,0.7700
284803,172787.0000,-0.7328,-0.0551,2.0350,-0.7386,0.8682,1.0584,0.0243,0.2949,0.5848,-0.9759,-0.1502,0.9158,1.2148,-0.6751,1.1649,-0.7118,-0.0257,-1.2212,-1.5456,0.0596,0.2142,0.9244,0.0125,-1.0162,-0.6066,-0.3953,0.0685,-0.0535,24.7900
284804,172788.0000,1.9196,-0.3013,-3.2496,-0.5578,2.6305,3.0313,-0.2968,0.7084,0.4325,-0.4848,0.4116,0.0631,-0.1837,-0.5106,1.3293,0.1407,0.3135,0.3957,-0.5773,0.0014,0.2320,0.5782,-0.0375,0.6401,0.2657,-0.0874,0.0045,-0.0266,67.8800
284805,172788.0000,-0.2404,0.5305,0.7025,0.6898,-0.3780,0.6237,-0.6862,0.6791,0.3921,-0.3991,-1.9338,-0.9629,-1.0421,0.4496,1.9626,-0.6086,0.5099,1.1140,2.8978,0.1274,0.2652,0.8000,-0.1633,0.1232,-0.5692,0.5467,0.1088,0.1045,10.0000


Setting our response variable:

In [55]:
# Response variable: the label column on its own.
y = df.loc[:, 'Target']

Running the resampling (oversampling) method:

In [59]:
smt = SMOTE()
X, y = smt.fit_resample(X, y)

Now let's check the results of resampling.

In [None]:
px.bar(y.value_counts(), color=y.value_counts().index, labels={"value": "Count", "index": "Class"})

Spliting into train and test

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

# Creating predictive models for fraud detection in bank transactions

We'll create three predictive machine learning models to predict bank transaction fraud:

1. XGBoost
2. LightGBM
3. Random Forest