In [2]:

import warnings
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.cm as cm
from random import seed,sample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, roc_curve, auc,\
precision_score
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor



In [3]:
# loading data

data = pd.read_csv("../input/PS_20174392719_1491204439457_log.csv")

In [4]:
data.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


In [5]:
data.shape

(6362620, 11)

In [6]:
data.head(7)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0
5,1,PAYMENT,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,0,0
6,1,PAYMENT,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,0,0


In [7]:

data_new = data.copy() 
data_new["type1"] = np.nan


data_new.loc[data.nameOrig.str.contains('C') & data.nameDest.str.contains('C'),"type1"] = "CC" 
data_new.loc[data.nameOrig.str.contains('C') & data.nameDest.str.contains('M'),"type1"] = "CM"
data_new.loc[data.nameOrig.str.contains('M') & data.nameDest.str.contains('C'),"type1"] = "MC"
data_new.loc[data.nameOrig.str.contains('M') & data.nameDest.str.contains('M'),"type1"] = "MM"

    

In [8]:
data_new = data_new.drop('type1',1)

In [9]:

data_new = data_new[(data_new["type"] == "CASH_OUT") | (data_new["type"] == "TRANSFER")]

In [10]:

data_new["errorBalanceOrg"] = data_new.newbalanceOrig + data_new.amount - data_new.oldbalanceOrg
data_new["errorBalanceDest"] = data_new.oldbalanceDest + data_new.amount - data_new.newbalanceDest



In [11]:
# getting rid of nameOrig and nameDest column.
names = ["nameOrig","nameDest"]
data_new = data_new.drop(names,1)

In [12]:
# dropping isFlaggedFraud column from the fraud,valid, and new_data datasets

data_new = data_new.drop("isFlaggedFraud",1)

In [13]:
dataset1 = data_new.copy()


# adding feature HourOfDay to Dataset1 
dataset1["HourOfDay"] = np.nan 
dataset1.HourOfDay = data_new.step % 24


print("Head of dataset1: \n", pd.DataFrame.head(dataset1))


Head of dataset1: 
     step      type     amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
2      1  TRANSFER     181.00          181.0             0.0             0.0   
3      1  CASH_OUT     181.00          181.0             0.0         21182.0   
15     1  CASH_OUT  229133.94        15325.0             0.0          5083.0   
19     1  TRANSFER  215310.30          705.0             0.0         22425.0   
24     1  TRANSFER  311685.89        10835.0             0.0          6267.0   

    newbalanceDest  isFraud  errorBalanceOrg  errorBalanceDest  HourOfDay  
2             0.00        1             0.00             181.0          1  
3             0.00        1             0.00           21363.0          1  
15        51513.44        0        213808.94          182703.5          1  
19            0.00        0        214605.30          237735.3          1  
24      2719172.89        0        300850.89        -2401220.0          1  


In [14]:
# finalizing dataset
dataset = dataset1.copy() # unchanged dataset1

In [15]:
# getting one-hot encoding of the 'type' variable

dataset = pd.get_dummies(dataset,prefix=['type'])

In [16]:
pd.DataFrame.head(dataset)

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,errorBalanceOrg,errorBalanceDest,HourOfDay,type_CASH_OUT,type_TRANSFER
2,1,181.0,181.0,0.0,0.0,0.0,1,0.0,181.0,1,0,1
3,1,181.0,181.0,0.0,21182.0,0.0,1,0.0,21363.0,1,1,0
15,1,229133.94,15325.0,0.0,5083.0,51513.44,0,213808.94,182703.5,1,1,0
19,1,215310.3,705.0,0.0,22425.0,0.0,0,214605.3,237735.3,1,0,1
24,1,311685.89,10835.0,0.0,6267.0,2719172.89,0,300850.89,-2401220.0,1,0,1


In [17]:

RandomState = 42
seed(21)
X = dataset.drop("isFraud",1)
y = dataset.isFraud
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.45, random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [47]:

parametersRF = {'n_estimators':15,'oob_score':True,'class_weight': "balanced",'n_jobs':-1,\
                 'random_state':RandomState}
RF = RandomForestClassifier(**parametersRF)
fitted_vals = RF.fit(X_train, y_train)
 

predictionsRF = RF.predict(X_test)
 
     

CM_RF = confusion_matrix(y_test,predictionsRF)
CR_RF = classification_report(y_test,predictionsRF)
fprRF, recallRF, thresholdsRF = roc_curve(y_test, predictionsRF)
AUC_RF = auc(fprRF, recallRF)

resultsRF = {"Confusion Matrix":CM_RF,"Classification Report":CR_RF,"Area Under Curve":AUC_RF}

In [48]:


for measure in resultsRF:
    print(measure,": \n",resultsRF[measure])

Confusion Matrix : 
 [[1242960       1]
 [     17    3707]]
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   1242961
           1       1.00      1.00      1.00      3724

    accuracy                           1.00   1246685
   macro avg       1.00      1.00      1.00   1246685
weighted avg       1.00      1.00      1.00   1246685

Area Under Curve : 
 0.9977171057906179


In [49]:

parametersRFR = {'n_estimators':15,'oob_score':True,'n_jobs':-1,\
                 'random_state':RandomState}
RFR = RandomForestRegressor(**parametersRFR)
fitted_vals = RFR.fit(X_train, y_train)
 

predictionsRFR = RFR.predict(X_test)


In [50]:
predictionsRFR=np.array(predictionsRFR,dtype='int') 
     

CM_RFR = confusion_matrix(y_test,predictionsRFR)
CR_RFR = classification_report(y_test,predictionsRFR)
fprRFR, recallRFR, thresholdsRFR = roc_curve(y_test, predictionsRF)
AUC_RFR = auc(fprRFR, recallRFR)

resultsRFR = {"Confusion Matrix":CM_RFR,"Classification Report":CR_RFR,"Area Under Curve":AUC_RFR}
for measure in resultsRFR:
    print(measure,": \n",resultsRFR[measure])

Confusion Matrix : 
 [[1242959       2]
 [     44    3680]]
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   1242961
           1       1.00      0.99      0.99      3724

    accuracy                           1.00   1246685
   macro avg       1.00      0.99      1.00   1246685
weighted avg       1.00      1.00      1.00   1246685

Area Under Curve : 
 0.9977171057906179


In [44]:
from sklearn import svm
linear = svm.SVC( kernel='poly',degree=17,cache_size=450,tol=0.001,max_iter=1200,random_state=RandomState)
linear.fit(X_train, y_train)

predictionsSVMlin = linear.predict(X_test)

In [45]:
predictionsRFR=np.array(predictionsSVMlin,dtype='int') 
     

CM_RFR = confusion_matrix(y_test,predictionsRFR)
CR_RFR = classification_report(y_test,predictionsRFR)
fprRFR, recallRFR, thresholdsRFR = roc_curve(y_test, predictionsRFR)
AUC_RFR = auc(fprRFR, recallRFR)

resultsRFR = {"Confusion Matrix":CM_RFR,"Classification Report":CR_RFR,"Area Under Curve":AUC_RFR}
for measure in resultsRFR:
    print(measure,": \n",resultsRFR[measure])

Confusion Matrix : 
 [[1219838   23123]
 [   2419    1305]]
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      0.98      0.99   1242961
           1       0.05      0.35      0.09      3724

    accuracy                           0.98   1246685
   macro avg       0.53      0.67      0.54   1246685
weighted avg       1.00      0.98      0.99   1246685

Area Under Curve : 
 0.6659132437192564
