In [4]:
import sklearn as sk
import pandas as pd
import numpy as np
import imblearn as im
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics
from sklearn import tree
from sklearn import neighbors
from sklearn import discriminant_analysis
from sklearn import linear_model
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import shuffle
#from xgboost import XGBClassifier

In [5]:
def normalize_confusion_matrix(matrix):
    """Row-normalise a confusion matrix so that every row sums to 1.

    Each entry is divided by the total of its row. Works for any 2-D
    shape, not only 2x2.

    Parameters
    ----------
    matrix : array-like of shape (n_rows, n_cols)
        Raw counts, e.g. a nested list or a numpy array.

    Returns
    -------
    numpy.ndarray of float, shape (n_rows, n_cols)
        The row-normalised matrix. A row whose total is 0 is returned as
        all zeros instead of NaN.

    Raises
    ------
    ValueError
        If ``matrix`` is not two-dimensional.
    """
    counts = np.asarray(matrix, dtype=float)
    if counts.ndim != 2:
        raise ValueError("matrix must be two-dimensional")

    # keepdims=True keeps the totals as a column vector so the division
    # broadcasts across each row.
    row_totals = counts.sum(axis=1, keepdims=True)

    # Pre-fill with zeros and divide only where the row total is non-zero,
    # so empty rows never trigger a division by zero.
    normalized = np.zeros_like(counts)
    np.divide(counts, row_totals, out=normalized, where=row_totals != 0)
    return normalized

In [6]:
# Read the raw transaction table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="fraud.csv")

In [7]:

# Keep only the transaction types that are susceptible to fraud
# (cash-outs and transfers); every other type is discarded.
df = df.loc[
    df['type'].isin(['CASH_OUT', 'TRANSFER'])
]
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
15,1,CASH_OUT,229133.94,C905080434,15325.0,0.0,C476402209,5083.0,51513.44,0,0
19,1,TRANSFER,215310.3,C1670993182,705.0,0.0,C1100439041,22425.0,0.0,0,0
24,1,TRANSFER,311685.89,C1984094095,10835.0,0.0,C932583850,6267.0,2719172.89,0,0


In [10]:
# Share of fraudulent transactions: the mean of the 0/1 isFraud column.
# Series.mean() is vectorised, whereas the built-in sum() walks the Series
# one element at a time in Python.
df['isFraud'].mean()

0.002964544224336551

Although we have refined our data set, the proportion of Fraud is still almost negligible compared to the total number of transactions. If we built a model that systematically predicted non-Fraud, it would be right 99.7% of the time. Such accuracy looks good, but the model would be useless because it would never detect a single Fraud. Thus we decided to undersample the data.

In [11]:
# Undersample the majority class with a fixed seed.
# The full frame (including the isFraud column) is passed as the feature
# matrix, so the label travels with each resampled row; both results are
# wrapped in DataFrames.
sampler = RandomUnderSampler(random_state=997, sampling_strategy="majority")
undersample_x, undersample_y = (
    pd.DataFrame(part) for part in sampler.fit_resample(df, df["isFraud"])
)

In [12]:

# Column labels assigned to the resampled frame, in positional order.
# NOTE(review): 'oldBalanceOrg' and 'NewBalanceOrig' are capitalised
# differently from the CSV header ('oldbalanceOrg', 'newbalanceOrig');
# code that reads these columns must use the spelling below.
colnames = [
    'step',
    'type',
    'amount',
    'nameOrig',
    'oldBalanceOrg',
    'NewBalanceOrig',
    'nameDest',
    'oldbalanceDest',
    'newbalanceDest',
    'isFraud',
    'isFlaggedFraud',
]
undersample_x.columns = colnames

In [13]:
# Preview the first five rows of the resampled frame.
undersample_x.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldBalanceOrg,NewBalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,234,TRANSFER,743926,C782754095,8083.0,0,C1091730919,0.0,743926.0,0,0
1,159,CASH_OUT,190219,C556384959,14923.0,0,C1619565877,0.0,190219.0,0,0
2,161,CASH_OUT,112818,C1202686488,0.0,0,C367226885,1897530.0,2010350.0,0,0
3,684,CASH_OUT,65506,C968140940,38522.4,0,C161404711,227155.0,292661.0,0,0
4,585,TRANSFER,270413,C257544981,0.0,0,C1024411975,433591.0,704003.0,0,0


In [14]:

# Feature engineering on the undersampled frame.
# Vectorised column operations are used instead of a row-by-row loop: they
# are faster and, unlike label lookups such as undersample_x['type'][i],
# do not require the index to be exactly 0..n-1.

# Binary encoding of the transaction type: 1 for TRANSFER, 0 otherwise.
category = (undersample_x['type'] == 'TRANSFER').astype('int64')

# Flag rows where the destination's new balance equals the origin's old
# balance and the origin account ends at zero.
# NOTE(review): confirm that comparing newbalanceDest (rather than amount)
# with oldBalanceOrg is the intended definition of "suspect".
suspect = (
    (undersample_x['newbalanceDest'] == undersample_x['oldBalanceOrg'])
    & (undersample_x['NewBalanceOrig'] == 0)
).astype('int64')

undersample_x['type'] = category
undersample_x['suspect'] = suspect

# Drop everything except the modelling columns
# (type, amount, isFraud, suspect) in a single call.
undersample_x = undersample_x.drop(columns=[
    'nameOrig',
    'oldBalanceOrg',
    'oldbalanceDest',
    'NewBalanceOrig',
    'nameDest',
    'newbalanceDest',
    'isFlaggedFraud',
    'step',
])
     
      

In [25]:
# Preview the first five rows of the engineered feature frame.
undersample_x.head(n=5)

Unnamed: 0,type,amount,isFraud,suspect
0,1,743926,0,0
1,0,190219,0,0
2,0,112818,0,0
3,0,65506,0,0
4,1,270413,0,0


In [24]:
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn import preprocessing



# Fit a logistic regression of isFraud on the min-max scaled features.

# Every column except the target is a feature.
X = undersample_x.loc[:, undersample_x.columns != 'isFraud']

n = X.values  # feature values as a plain numpy array
min_max_scaler = preprocessing.MinMaxScaler()
n_scaled = min_max_scaler.fit_transform(n)
# Rebuild the frame with the original feature names and row index, so the
# regression summary labels each coefficient by name (instead of 0, 1, 2)
# and X stays aligned with y.
X = pd.DataFrame(n_scaled, columns=X.columns, index=X.index)

y = undersample_x['isFraud']

y = y.astype('int')
X = X.astype('float')

# NOTE(review): no constant column is added to X, so the model is fitted
# without an intercept. If an intercept is wanted, pass sm.add_constant(X)
# instead -- confirm which is intended.
logit_model = sm.Logit(y, X)
result = logit_model.fit()
print(result.summary2())



Optimization terminated successfully.
         Current function value: 0.591655
         Iterations 9
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.146     
Dependent Variable: isFraud          AIC:              19443.0410
Date:               2019-02-11 12:42 BIC:              19466.1609
No. Observations:   16426            Log-Likelihood:   -9718.5   
Df Model:           2                LL-Null:          -11386.   
Df Residuals:       16423            LLR p-value:      0.0000    
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     9.0000                                       
--------------------------------------------------------------------
       Coef.     Std.Err.       z       P>|z|      [0.025     0.975]
--------------------------------------------------------------------
0      0.6658      0.0327    20.3668    0.0000     0.6018     0.7299
1     12.7416      0.7017    18.1571    0.0000    11.3

