In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
financial_data = pd.read_csv('./dataset/raw.csv')
financial_data

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.00,0.00,0.0,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.00,0.00,0.0,0.0
2,1,TRANSFER,181.00,C1305486145,181.0,0.00,C553264065,0.00,0.00,1.0,0.0
3,1,CASH_OUT,181.00,C840083671,181.0,0.00,C38997010,21182.00,0.00,1.0,0.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1307253,136,TRANSFER,426455.18,C379739604,0.0,0.00,C2142703715,705902.24,1132357.42,0.0,0.0
1307254,136,TRANSFER,464803.33,C1631993596,0.0,0.00,C1171040194,1011116.65,1475919.98,0.0,0.0
1307255,136,PAYMENT,15614.51,C1056703656,20063.0,4448.49,M915607287,0.00,0.00,0.0,0.0
1307256,136,PAYMENT,9973.69,C844867852,2693.0,0.00,M1648017509,0.00,0.00,0.0,0.0


In [8]:
# Information about the dataset
financial_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1307258 entries, 0 to 1307257
Data columns (total 11 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   step            1307258 non-null  int64  
 1   type            1307257 non-null  object 
 2   amount          1307257 non-null  float64
 3   nameOrig        1307257 non-null  object 
 4   oldbalanceOrg   1307257 non-null  float64
 5   newbalanceOrig  1307257 non-null  float64
 6   nameDest        1307257 non-null  object 
 7   oldbalanceDest  1307257 non-null  float64
 8   newbalanceDest  1307257 non-null  float64
 9   isFraud         1307257 non-null  float64
 10  isFlaggedFraud  1307257 non-null  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 109.7+ MB


In [10]:
# Checking the number of missing values in each column
financial_data.isnull().sum()

step              0
type              1
amount            1
nameOrig          1
oldbalanceOrg     1
newbalanceOrig    1
nameDest          1
oldbalanceDest    1
newbalanceDest    1
isFraud           1
isFlaggedFraud    1
dtype: int64

In [13]:
# Since there's only one row with null values, we drop the row
financial_data_cleaned = financial_data.dropna()
financial_data_cleaned.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [16]:
# Distribution of normal tranasactions and fraudulent transactions
financial_data_cleaned['isFraud'].value_counts()

0.0    1305709
1.0       1548
Name: isFraud, dtype: int64

This dataset is highly skewed. Here:

0 -> Normal Transaction
1 -> Fraudulent Transaction


In [17]:
# Separating the data for analysis
legit = financial_data_cleaned[financial_data_cleaned['isFraud'] == 0]
fraud = financial_data_cleaned[financial_data_cleaned['isFraud'] == 1]

In [20]:
print(legit.shape)
print(fraud.shape)

(1305709, 11)
(1548, 11)


In [21]:
# Statistical measures for each dataset
legit.amount.describe()

count    1.305709e+06
mean     1.619467e+05
std      2.611653e+05
min      1.000000e-01
25%      1.288144e+04
50%      7.866133e+04
75%      2.187456e+05
max      6.419835e+06
Name: amount, dtype: float64

In [22]:
fraud.amount.describe()

count    1.548000e+03
mean     1.244297e+06
std      2.055966e+06
min      1.190000e+02
25%      9.963024e+04
50%      3.786887e+05
75%      1.399119e+06
max      1.000000e+07
Name: amount, dtype: float64

The mean amount that has been transacted for left transactions amount to 1,61,946.7 and that of fradulent transactions amount to 12,44,297

=> The average of the fradulent transactions are significantly higher than the average of non-fraudulent transactions

In [28]:
financial_data_cleaned.groupby('isFraud').mean()

  financial_data_cleaned.groupby('isFraud').mean()


Unnamed: 0_level_0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,47.379274,161946.7,876808.8,899000.53481,987959.234527,1118891.0,0.0
1.0,65.761628,1244297.0,1277354.0,38897.428101,461944.908766,1105808.0,0.0


The mean of difference between oldbalanceOrg and newbalanceOrig both cases are noticable: In case of fradulent transactions, large sum of money is transacted which lead top lower remaining balance in the account

UNDERSAMPLING

Build a sample dataset containing similar distribution of non-fraudulent transactions and fraudulent transactions

number of fraudulent transactions = 1548

In [34]:
# Uniform distribution
legit_sample = legit.sample(n=1548)
legit_sample.shape

(1548, 11)

Concatenating both dataframes

In [43]:
uniform_df = pd.concat([legit_sample,fraud],axis=0)
uniform_df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
440393,18,CASH_OUT,439052.63,C620383910,0.00,0.00,C1101967436,1390516.64,1829569.27,0.0,0.0
774526,39,PAYMENT,7388.18,C199562463,88950.00,81561.82,M392038901,0.00,0.00,0.0,0.0
416515,18,PAYMENT,25385.88,C248710725,7555.96,0.00,M928698803,0.00,0.00,0.0,0.0
924498,43,CASH_OUT,232395.18,C1277788282,0.00,0.00,C383435873,528683.74,761078.92,0.0,0.0
758194,38,CASH_IN,594997.18,C1987226157,81057.00,676054.18,C1450224214,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1291695,135,CASH_OUT,128417.96,C1400591529,128417.96,0.00,C1469656123,99429.91,227847.87,1.0,0.0
1291987,135,TRANSFER,7927.06,C3592918,7927.06,0.00,C956442492,0.00,0.00,1.0,0.0
1291988,135,CASH_OUT,7927.06,C2071408487,7927.06,0.00,C1313763244,0.00,7927.06,1.0,0.0
1293870,136,TRANSFER,3263443.15,C1406501579,3263443.15,0.00,C106008304,0.00,0.00,1.0,0.0


In [44]:
uniform_df['isFraud'].value_counts()

0.0    1548
1.0    1548
Name: isFraud, dtype: int64

In [47]:
uniform_df.groupby('isFraud').mean()

  uniform_df.groupby('isFraud').mean()


Unnamed: 0_level_0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,48.175711,167057.7,874290.2,899469.658456,978285.910601,1104010.0,0.0
1.0,65.761628,1244297.0,1277354.0,38897.428101,461944.908766,1105808.0,0.0


In [54]:
# Splitting the dataset
# X: Features
# Y: Targets

X = uniform_df.drop(columns=['isFraud','type','nameOrig','nameDest','isFlaggedFraud'],axis=1)
y = uniform_df['isFraud']

In [56]:
X

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
440393,18,439052.63,0.00,0.00,1390516.64,1829569.27
774526,39,7388.18,88950.00,81561.82,0.00,0.00
416515,18,25385.88,7555.96,0.00,0.00,0.00
924498,43,232395.18,0.00,0.00,528683.74,761078.92
758194,38,594997.18,81057.00,676054.18,0.00,0.00
...,...,...,...,...,...,...
1291695,135,128417.96,128417.96,0.00,99429.91,227847.87
1291987,135,7927.06,7927.06,0.00,0.00,0.00
1291988,135,7927.06,7927.06,0.00,0.00,7927.06
1293870,136,3263443.15,3263443.15,0.00,0.00,0.00


In [64]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=2)

MODEL TRAINING: Logistic Regression

In [65]:
model = LogisticRegression()

In [66]:
model.fit(X_train,y_train)

MODEL EVALUATION: Accuracy score

In [68]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction,y_train)

In [69]:
training_data_accuracy

0.9422455573505655

In [70]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction,y_test)

In [71]:
test_data_accuracy

0.9451612903225807