In [1]:
# Efficacy: https://www.hindawi.com/journals/scn/2018/5483472/#conclusions 
# Source: https://www.kaggle.com/anishpai/intro-to-credit-card-fraud-detection 
# Data Source: https://www.kaggle.com/mlg-ulb/creditcardfraud 

# Library Loads
import numpy as np
import matplotlib.pyplot as plt
import keras
import pandas as pd
data = pd.read_csv('https://sabwoody.blob.core.windows.net/backups/creditcard.csv')

# Data Descriptions
data.head()
data.dtypes
data.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.16598e-15,3.416908e-16,-1.37315e-15,2.086869e-15,9.604066e-16,1.490107e-15,-5.556467e-16,1.177556e-16,-2.406455e-15,...,1.656562e-16,-3.44485e-16,2.578648e-16,4.471968e-15,5.340915e-16,1.687098e-15,-3.666453e-16,-1.220404e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [26]:
# Deal with missing data
data.isnull().sum()
data.describe()

# Show percentages of + versus - in training set
print('No Fraud detected is ', round(data['Class'].value_counts()[0]/len(data) * 100,2), '% of the dataset')
print('Fraud detected is  ', round(data['Class'].value_counts()[1]/len(data) * 100,2), '% of the dataset')
print('(Fraudulent Transactions: ', data['Class'].value_counts()[1], ' out of ', data['Class'].value_counts()[0], ' records)')

No Fraud detected is  99.83 % of the dataset
Fraud detected is   0.17 % of the dataset
(Fraudulent Transactions:  492  out of  284315  records)


In [27]:
# Machine Learning approach - data prep

## Let's deal with outliers first
from sklearn.preprocessing import StandardScaler, RobustScaler
std_scaler = StandardScaler()
rob_scaler = RobustScaler()
data['scaled_amount'] = rob_scaler.fit_transform(data['Amount'].values.reshape(-1,1))
data['scaled_time'] = rob_scaler.fit_transform(data['Time'].values.reshape(-1,1))

In [28]:
## More data cleaning, prep for graphics if desired
data.drop(['Time','Amount'], axis=1, inplace=True)
amount = data['scaled_amount']
time = data['scaled_time']
data.drop(['scaled_amount', 'scaled_time'], axis=1, inplace=True)
data.insert(0, 'amount', amount)
data.insert(1, 'time', time)

# Examine data cleaned so far:
data.head()

Unnamed: 0,amount,time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,1.783274,-0.994983,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0
1,-0.269825,-0.994983,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0
2,4.983721,-0.994972,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0
3,1.418291,-0.994972,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0
4,0.670579,-0.99496,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0


In [30]:
data = data.sample(frac=1)

fraud_data = data.loc[data['Class']==1]
nfraud_data = data.loc[data['Class']==0][:492]

normal_distributed_df = pd.concat([fraud_data, nfraud_data])

# Shuffle dataframe rows
ndata = normal_distributed_df.sample(frac=1, random_state=42)
ndata.head()

print('Fraudulent Transactions', data['Class'].value_counts()[1])
print('Non-Fraudulent Transactions', data['Class'].value_counts()[0])

Fraudulent Transactions 492
Non-Fraudulent Transactions 284315


In [31]:
X= data.iloc[:, data.columns != 'Class']
y = data.iloc[:, data.columns == 'Class']

dataframe = pd.DataFrame(data=ndata)

dataframe

X= dataframe.iloc[:, ndata.columns != 'Class']
y = dataframe.iloc[:, ndata.columns == 'Class']

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.25, random_state=1)

X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)



In [32]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'gini', random_state= 0 )
classifier.fit(X_train, y_train.ravel())
from sklearn.metrics import classification_report, confusion_matrix

cm_grid = confusion_matrix(y_test,y_pre)
y_pre = classifier.predict(X_test)
print(classification_report(y_test,y_pre))

classifier.score(X_test,y_test)


              precision    recall  f1-score   support

           0       0.90      0.93      0.91       121
           1       0.93      0.90      0.91       125

    accuracy                           0.91       246
   macro avg       0.91      0.91      0.91       246
weighted avg       0.91      0.91      0.91       246



0.9146341463414634

In [33]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout


#Initializing ANN
classifier = Sequential()

#Input Layer
classifier.add(Dense(30, activation='relu'))
#2nd layer
classifier.add(Dense(16, activation='relu'))
classifier.add(Dense(16, activation='relu'))

#Output layer
classifier.add(Dense(1, activation='sigmoid'))

#Compling the ANN
classifier.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])

#Fitting the dataset into ANN
classifier.fit(X_train, y_train, batch_size=100, epochs=100, verbose = 0)

score = classifier.evaluate(X_test, y_test, verbose = 0) 
print('Test loss:', score[0]) 
print('Test accuracy:', score[1])

#Predicting the results
y_pred = classifier.predict(X_test)


Test loss: 0.33069172501564026
Test accuracy: 0.9268292784690857
