In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import joblib

import warnings
warnings.filterwarnings('ignore')

In [75]:
df = pd.read_csv('Desktop/Fraud_Detection.csv')

In [76]:
df.shape

(636262, 10)

In [79]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,283,CASH_IN,210329.84,C1159819632,3778062.79,3988392.64,C1218876138,1519266.6,1308936.76,0
1,132,CASH_OUT,215489.19,C1372369468,21518.0,0.0,C467105520,6345756.55,6794954.89,0
2,355,DEBIT,4431.05,C1059822709,20674.0,16242.95,C76588246,80876.56,85307.61,0
3,135,CASH_OUT,214026.2,C1464960643,46909.73,0.0,C1059379810,13467450.36,13681476.56,0
4,381,CASH_OUT,8858.45,C831134427,0.0,0.0,C579876929,1667180.58,1676039.03,0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 636262 entries, 0 to 636261
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   step            636262 non-null  int64  
 1   type            636251 non-null  object 
 2   amount          636262 non-null  float64
 3   nameOrig        636262 non-null  object 
 4   oldbalanceOrg   636261 non-null  float64
 5   newbalanceOrig  636262 non-null  float64
 6   nameDest        636262 non-null  object 
 7   oldbalanceDest  636262 non-null  float64
 8   newbalanceDest  636252 non-null  float64
 9   isFraud         636262 non-null  int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 48.5+ MB


In [19]:
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud'],
      dtype='object')

In [21]:
col_names = ['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud']
for col in col_names:
    
    print(df[col].value_counts()) 

step
19     5206
187    5039
18     4926
235    4652
163    4626
       ... 
56        1
414       1
224       1
510       1
319       1
Name: count, Length: 630, dtype: int64
type
CASH_OUT    224212
PAYMENT     214965
CASH_IN     139611
TRANSFER     53293
DEBIT         4170
Name: count, dtype: int64
amount
10000000.00    314
15000.00        10
10000.00         9
500.00           7
120000.00        5
              ... 
55223.70         1
30255.37         1
171121.82        1
177699.92        1
95142.89         1
Name: count, Length: 622502, dtype: int64
nameOrig
C334643493     2
C288001979     2
C21821244      2
C1612765417    2
C1942864508    2
              ..
C1972843152    1
C322189211     1
C934806957     1
C2065879391    1
C874575079     1
Name: count, Length: 636171, dtype: int64
oldbalanceOrg
0.00          210268
164.00           111
137.00           104
157.00           103
115.00            99
               ...  
81368.65           1
342837.00          1
1243597.88         1

In [23]:
df.isnull().sum()

step               0
type              11
amount             0
nameOrig           0
oldbalanceOrg      1
newbalanceOrig     0
nameDest           0
oldbalanceDest     0
newbalanceDest    10
isFraud            0
dtype: int64

In [None]:
# Handaling Null Values

In [25]:
df['newbalanceDest'].value_counts()

newbalanceDest
0.00          243934
4692432.30         5
641979.14          5
971418.91          5
4416613.81         5
               ...  
532509.32          1
3481910.06         1
1456608.43         1
654158.77          1
526522.96          1
Name: count, Length: 387354, dtype: int64

In [27]:
df['newbalanceDest'].fillna(df['newbalanceDest'].mode()[0], inplace=True)

In [29]:
df['newbalanceDest'].isnull().sum()

0

In [31]:
df['oldbalanceOrg'].value_counts()

oldbalanceOrg
0.00          210268
164.00           111
137.00           104
157.00           103
115.00            99
               ...  
81368.65           1
342837.00          1
1243597.88         1
1199384.74         1
83669.00           1
Name: count, Length: 258156, dtype: int64

In [33]:
df['oldbalanceOrg'].fillna(df['oldbalanceOrg'].mode()[0], inplace=True)

In [35]:
df['oldbalanceOrg'].isnull().sum()

0

In [37]:
null_rows = df[df['type'].isnull()]
null_rows

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
151,307,,342616.22,C1755798883,77195.0,0.0,C166847480,52234.79,394851.02,0
302,158,,7732.21,C1234527342,1342324.35,1350056.56,C1377667801,1824833.18,1817100.97,0
585,16,,112729.09,C1364384249,17605333.9,17718062.98,C1360951522,435841.14,323112.06,0
586,358,,27110.8,C425055824,1418.0,0.0,M212550022,0.0,0.0,0
587,158,,1472.69,C1577593539,0.0,0.0,M1478160182,0.0,0.0,0
588,258,,172742.22,C488703547,29902.0,0.0,C1217210790,0.0,172742.22,0
589,252,,269887.0,C1109038861,51581.0,0.0,C492589157,0.0,269887.0,0
590,179,,237127.22,C1902249193,0.0,0.0,C1294113408,1339533.7,1941840.53,0
591,334,,85152.95,C275461737,9052573.26,9137726.22,C860780151,831650.75,746497.8,0
592,179,,754.21,C814909892,30499.0,29744.79,M1164217842,0.0,0.0,0


In [39]:
df['isFraud'].value_counts()

isFraud
0    635441
1       821
Name: count, dtype: int64

In [41]:
df = df.dropna(subset=['type'])
df.reset_index(drop=True, inplace=True)

In [43]:
df['type'].isnull().sum()

0

In [47]:
legitimate = df[df['isFraud'] == 0]
fraudulent = df[df['isFraud'] == 1]
# len(fraudulent)
legitimate_downsampled = legitimate.sample(len(fraudulent))
balanced_data = pd.concat([legitimate_downsampled, fraudulent])

In [49]:
legitimate_downsampled

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
524173,186,TRANSFER,91924.23,C966991707,2184.00,0.00,C346482466,2740467.85,2832392.08,0
111323,656,PAYMENT,4047.19,C460393288,0.00,0.00,M1451648327,0.00,0.00,0
469595,304,CASH_IN,44521.56,C1884442480,8121687.76,8166209.32,C2076929098,100663.96,56142.40,0
616468,187,CASH_OUT,255830.78,C206334815,0.00,0.00,C1320482982,334775.79,590606.57,0
367859,393,CASH_OUT,371685.54,C1231348788,0.00,0.00,C746338703,3909803.21,4281488.74,0
...,...,...,...,...,...,...,...,...,...,...
17652,19,PAYMENT,5977.14,C301450905,0.00,0.00,M1603945063,0.00,0.00,0
315776,12,PAYMENT,5831.89,C872509690,0.00,0.00,M72548139,0.00,0.00,0
444379,135,CASH_IN,644436.01,C1639373533,7040.00,651476.01,C769726649,409.18,0.00,0
541811,157,PAYMENT,4403.09,C1174103161,0.00,0.00,M1841519426,0.00,0.00,0


In [51]:
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [53]:
df


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,283,CASH_IN,210329.84,C1159819632,3778062.79,3988392.64,C1218876138,1519266.60,1308936.76,0
1,132,CASH_OUT,215489.19,C1372369468,21518.00,0.00,C467105520,6345756.55,6794954.89,0
2,355,DEBIT,4431.05,C1059822709,20674.00,16242.95,C76588246,80876.56,85307.61,0
3,135,CASH_OUT,214026.20,C1464960643,46909.73,0.00,C1059379810,13467450.36,13681476.56,0
4,381,CASH_OUT,8858.45,C831134427,0.00,0.00,C579876929,1667180.58,1676039.03,0
...,...,...,...,...,...,...,...,...,...,...
636246,351,CASH_OUT,28761.10,C742050657,0.00,0.00,C568407561,328534.52,357295.62,0
636247,184,CASH_OUT,167820.71,C561181412,62265.00,0.00,C1852953580,106429.48,274250.18,0
636248,35,PAYMENT,8898.12,C1773417333,30808.00,21909.88,M445701551,0.00,0.00,0
636249,277,CASH_OUT,176147.90,C1423233247,83669.00,0.00,C1328739120,0.00,176147.90,0


In [55]:
label_encoder = LabelEncoder()

In [57]:
balanced_data['type']=label_encoder.fit_transform(balanced_data['type'])
balanced_data['type'].unique()

array([4, 1, 3, 0, 2])

In [59]:
mapping_dict = dict(zip(range(len(label_encoder.classes_)), label_encoder.classes_))
mapping_dict

{0: 'CASH_IN', 1: 'CASH_OUT', 2: 'DEBIT', 3: 'PAYMENT', 4: 'TRANSFER'}

In [61]:
x = balanced_data.drop(['isFraud'], axis=1)
y = balanced_data['isFraud']

In [63]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42, stratify = y)

In [65]:
x_train.shape, x_test.shape

((1313, 9), (329, 9))

In [67]:
y_train.shape, y_test.shape

((1313,), (329,))

In [69]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.4


In [81]:
encoder = ce.OrdinalEncoder(cols=['nameOrig', 'nameDest'])

In [83]:
encoder = ce.OrdinalEncoder(cols=['nameOrig', 'nameDest'])

x_train = encoder.fit_transform(x_train)
x_test = encoder.transform(x_test)

In [85]:

clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)


# fit the model
clf_gini.fit(x_train, y_train)

In [87]:
y_pred_gini = clf_gini.predict(x_test)

In [89]:
print('Model accuracy score with criterion gini index: {0:0.4f}'. format(accuracy_score(y_test, y_pred_gini)))

Model accuracy score with criterion gini index: 0.9149


In [91]:
y_pred_train_gini = clf_gini.predict(x_train)
y_pred_train_gini

array([0, 1, 0, ..., 1, 0, 1])

In [93]:
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train_gini)))

Training-set accuracy score: 0.9406


In [95]:
print('Training set score: {:.4f}'.format(clf_gini.score(x_train, y_train)))
print('Test set score: {:.4f}'.format(clf_gini.score(x_test, y_test)))

Training set score: 0.9406
Test set score: 0.9149


In [97]:
clf_en = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)


# fit the model
clf_en.fit(x_train, y_train)

In [99]:
y_pred_en = clf_en.predict(x_test)

In [101]:
print('Model accuracy score with criterion entropy: {0:0.4f}'. format(accuracy_score(y_test, y_pred_en)))

Model accuracy score with criterion entropy: 0.9210


In [103]:
y_pred_train_en = clf_en.predict(x_train)

y_pred_train_en

array([1, 1, 0, ..., 1, 0, 1])

In [105]:
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train_en)))

Training-set accuracy score: 0.9452


In [107]:
print('Training set score: {:.4f}'.format(clf_en.score(x_train, y_train)))

print('Test set score: {:.4f}'.format(clf_en.score(x_test, y_test)))

Training set score: 0.9452
Test set score: 0.9210


In [109]:
cm = confusion_matrix(y_test, y_pred_en)

print('Confusion matrix\n\n', cm)

Confusion matrix

 [[158   7]
 [ 19 145]]


In [111]:
print(classification_report(y_test, y_pred_en))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92       165
           1       0.95      0.88      0.92       164

    accuracy                           0.92       329
   macro avg       0.92      0.92      0.92       329
weighted avg       0.92      0.92      0.92       329

