In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns
%matplotlib inline

In [2]:
data = pd.read_csv("Fraud.csv")

In [3]:
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [5]:
data.isnull().sum()                                                                                        #No null observations

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [6]:
data.shape

(6362620, 11)

In [7]:
data.drop_duplicates()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


In [8]:
data.shape                                                                     #shape same before and after, hence no duplicates

(6362620, 11)

In [9]:
data["isFraud"].value_counts()                                                                                 #imbalanced Data

0    6354407
1       8213
Name: isFraud, dtype: int64

In [10]:
data1 = data.drop(['nameOrig', 'nameDest','isFlaggedFraud','step'], axis=1)

In [11]:
data1

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,PAYMENT,9839.64,170136.00,160296.36,0.00,0.00,0
1,PAYMENT,1864.28,21249.00,19384.72,0.00,0.00,0
2,TRANSFER,181.00,181.00,0.00,0.00,0.00,1
3,CASH_OUT,181.00,181.00,0.00,21182.00,0.00,1
4,PAYMENT,11668.14,41554.00,29885.86,0.00,0.00,0
...,...,...,...,...,...,...,...
6362615,CASH_OUT,339682.13,339682.13,0.00,0.00,339682.13,1
6362616,TRANSFER,6311409.28,6311409.28,0.00,0.00,0.00,1
6362617,CASH_OUT,6311409.28,6311409.28,0.00,68488.84,6379898.11,1
6362618,TRANSFER,850002.52,850002.52,0.00,0.00,0.00,1


In [12]:
one_hot = pd.get_dummies(data = data1, columns = ['type'], drop_first = True)

In [13]:
one_hot

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,9839.64,170136.00,160296.36,0.00,0.00,0,0,0,1,0
1,1864.28,21249.00,19384.72,0.00,0.00,0,0,0,1,0
2,181.00,181.00,0.00,0.00,0.00,1,0,0,0,1
3,181.00,181.00,0.00,21182.00,0.00,1,1,0,0,0
4,11668.14,41554.00,29885.86,0.00,0.00,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
6362615,339682.13,339682.13,0.00,0.00,339682.13,1,1,0,0,0
6362616,6311409.28,6311409.28,0.00,0.00,0.00,1,0,0,0,1
6362617,6311409.28,6311409.28,0.00,68488.84,6379898.11,1,1,0,0,0
6362618,850002.52,850002.52,0.00,0.00,0.00,1,0,0,0,1





# Scaling Data

In [16]:
rScaler = RobustScaler()

In [19]:
One_Hot_Scaled = rScaler.fit_transform(one_hot)

In [27]:
One_Hot_Scaled=pd.DataFrame(One_Hot_Scaled, columns =one_hot.columns)

In [28]:
One_Hot_Scaled

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,-0.332932,1.452991,1.111175,-0.140722,-0.193057,0.0,0.0,0.0,1.0,0.0
1,-0.373762,0.065610,0.134375,-0.140722,-0.193057,0.0,0.0,0.0,1.0,0.0
2,-0.382380,-0.130708,0.000000,-0.140722,-0.193057,1.0,0.0,0.0,0.0,1.0
3,-0.382380,-0.130708,0.000000,-0.118260,-0.193057,1.0,1.0,0.0,0.0,0.0
4,-0.323571,0.254820,0.207169,-0.140722,-0.193057,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
6362615,1.355693,3.032881,0.000000,-0.140722,0.112438,1.0,1.0,0.0,0.0,0.0
6362616,31.927899,58.679504,0.000000,-0.140722,-0.193057,1.0,0.0,0.0,0.0,1.0
6362617,31.927899,58.679504,0.000000,-0.068096,5.544730,1.0,1.0,0.0,0.0,0.0
6362618,3.968274,7.788223,0.000000,-0.140722,-0.193057,1.0,0.0,0.0,0.0,1.0


# Balancing the Data

In [32]:
fraud_data = One_Hot_Scaled[One_Hot_Scaled['isFraud']==1.0]

not_fraud = One_Hot_Scaled[One_Hot_Scaled['isFraud']==0.0]

In [34]:
fraud_data.shape

(8213, 10)

In [35]:
not_fraud.shape

(6354407, 10)

In [39]:
not_fraud_balanced = not_fraud.sample(n=8213, random_state=42 )         #taking equal samples from not_fraud as that of fraud

In [41]:
not_fraud_balanced.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
1777056,0.557689,0.048297,0.0,0.264959,0.316318,0.0,1.0,0.0,0.0,0.0
1350600,-0.380637,-0.132395,0.0,-0.140722,-0.193057,0.0,0.0,0.0,1.0,0.0
1991933,-0.3655,0.052602,0.11351,-0.140722,-0.193057,0.0,0.0,0.0,1.0,0.0
5092368,-0.374521,-0.078636,0.028096,-0.140722,-0.193057,0.0,0.0,0.0,1.0,0.0
5066515,0.91259,12.247024,10.963863,2.736389,2.019436,0.0,0.0,0.0,0.0,0.0


In [42]:
not_fraud_balanced.shape

(8213, 10)

In [44]:
balanced_data = pd.merge(fraud_data, not_fraud_balanced, how="outer")

In [45]:
balanced_data.shape

(16426, 10)

In [48]:
final_data = balanced_data.sample(frac=1.0, random_state=42, replace =False)                            #Shuffling the dataframe

In [50]:
final_data.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
13724,-0.337698,0.511083,0.416935,-0.140722,-0.193057,0.0,0.0,0.0,1.0,0.0
12284,-0.318713,3.028219,2.263749,-0.140722,-0.193057,0.0,0.0,0.0,1.0,0.0
1163,16.032974,29.748033,0.0,3.517473,6.071399,1.0,1.0,0.0,0.0,0.0
6057,-0.256477,0.098456,0.0,-0.140722,-0.193057,1.0,0.0,0.0,0.0,1.0
6492,10.621852,19.898868,0.0,-0.140722,1.740247,1.0,1.0,0.0,0.0,0.0


In [54]:
X = final_data.drop(['isFraud'], axis=1)
Y = final_data['isFraud']

In [57]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size =.3, random_state=42)

In [61]:
print("X_Train Shape: ", X_train.shape)
print("X_Test Shape: ", X_test.shape)
print("Y_Train Shape: ", Y_train.shape)
print("Y_test Shape: ", Y_test.shape)

X_Train Shape:  (11498, 9)
X_Test Shape:  (4928, 9)
Y_Train Shape:  (11498,)
Y_test Shape:  (4928,)





# Model Training

### Logistic Regression

In [63]:
from sklearn.linear_model import LogisticRegression

logReg = LogisticRegression()

logReg.fit(X_train,Y_train)

LR_pred = logReg.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree

In [64]:
from sklearn.tree import DecisionTreeClassifier

DT = DecisionTreeClassifier()

DT.fit(X_train,Y_train)

DT_pred = DT.predict(X_test)

### Random Forest

In [66]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier()

RF.fit(X_train,Y_train)

RF_pred = RF.predict(X_test)

### Gradient Boosting

In [67]:
from sklearn.ensemble import GradientBoostingClassifier

GB = GradientBoostingClassifier()

GB.fit(X_train, Y_train)

GB_pred = GB.predict(X_test)

# Reports & Metrics

In [74]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [69]:
print("Logistic Regression Report: \n\n", classification_report(Y_test,LR_pred))

Logistic Regression Report: 

               precision    recall  f1-score   support

         0.0       0.93      0.94      0.93      2444
         1.0       0.94      0.93      0.93      2484

    accuracy                           0.93      4928
   macro avg       0.93      0.93      0.93      4928
weighted avg       0.93      0.93      0.93      4928



In [70]:
print("Decision Tree Report: \n\n", classification_report(Y_test,DT_pred))

Decision Tree Report: 

               precision    recall  f1-score   support

         0.0       1.00      0.99      0.99      2444
         1.0       0.99      1.00      0.99      2484

    accuracy                           0.99      4928
   macro avg       0.99      0.99      0.99      4928
weighted avg       0.99      0.99      0.99      4928



In [71]:
print("Random Forest Report: \n\n", classification_report(Y_test,RF_pred))

Random Forest Report: 

               precision    recall  f1-score   support

         0.0       1.00      0.99      0.99      2444
         1.0       0.99      1.00      0.99      2484

    accuracy                           0.99      4928
   macro avg       0.99      0.99      0.99      4928
weighted avg       0.99      0.99      0.99      4928



In [72]:
print("Gradient Boosting Report: \n\n", classification_report(Y_test,GB_pred))

Gradient Boosting Report: 

               precision    recall  f1-score   support

         0.0       1.00      0.98      0.99      2444
         1.0       0.98      1.00      0.99      2484

    accuracy                           0.99      4928
   macro avg       0.99      0.99      0.99      4928
weighted avg       0.99      0.99      0.99      4928



In [75]:
print("Accuracy Score for Logisitic Regression: ", accuracy_score(Y_test, LR_pred))
print("Accuracy Score for Decision Tree: ", accuracy_score(Y_test, DT_pred))
print("Accuracy Score for Random Forest: ", accuracy_score(Y_test, RF_pred))
print("Accuracy Score for Gradient Boosting: ", accuracy_score(Y_test, GB_pred))

Accuracy Score for Logisitic Regression:  0.9330357142857143
Accuracy Score for Decision Tree:  0.9926948051948052
Accuracy Score for Random Forest:  0.9933035714285714
Accuracy Score for Gradient Boosting:  0.9880275974025974
