In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.svm import LinearSVC
from sklearn import linear_model

import helpers
import models

import datetime
import pandas

In [2]:
train_path = r"C:\Users\bruno\Downloads\PS_20174392719_1491204439457_log.csv"

In [3]:
df = pandas.read_csv(train_path).drop(['isFlaggedFraud', 'nameOrig', 'nameDest'], axis=1)

In [4]:
df.columns

Index(['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
       'oldbalanceDest', 'newbalanceDest', 'isFraud'],
      dtype='object')

In [5]:
df.type.value_counts()

CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: type, dtype: int64

In [None]:
df.isFraud.value_counts()

In [6]:
df.loc[df['type'] == 'CASH_IN', 'type'] = 0
df.loc[df['type'] == 'CASH_OUT', 'type'] = 1
df.loc[df['type'] == 'DEBIT', 'type'] = 2
df.loc[df['type'] == 'PAYMENT', 'type'] = 3
df.loc[df['type'] == 'TRANSFER', 'type'] = 4

In [7]:
len(df[df['isFraud'] == 1] )/len(df) * 100

0.12908204481801522

In [8]:
df_non_fraud = df[df['isFraud'] == 0]
df_fraud = df[df['isFraud'] == 1]

In [9]:
df_fraud, fraud_validation = helpers.data_separation(df_fraud)
# df_non_fraud = helpers.data_separation(df_non_fraud, 0.05)[1].reset_index(drop=True)
# df = df_non_fraud.append(df_fraud).reset_index(drop=True)

In [10]:
train, test = helpers.data_separation(df)
test, validation = helpers.data_separation(test, 0.5)

In [11]:
x_train, y_train = helpers.x_and_y_separation(train)
x_test, y_test = helpers.x_and_y_separation(test)
x_validation, y_validation = helpers.x_and_y_separation(validation)

y_train = pandas.DataFrame(y_train)
y_test = pandas.DataFrame(y_test)
y_validation = pandas.DataFrame(y_validation)

In [12]:
x_train, y_train = SMOTE().fit_sample(x_train, y_train)

In [13]:
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
y_validation.reset_index(drop=True, inplace=True)

In [14]:
d = datetime.datetime.now()
model = linear_model.LinearRegression().fit(x_train, y_train)
helpers.time_screening(d)

0:00:05.100708


In [17]:
d = datetime.datetime.now()
model = DecisionTreeClassifier().fit(x_train, y_train)
helpers.time_screening(d)

0:02:33.888025


In [38]:
d = datetime.datetime.now()
model = models.RandomForest(x_train, y_train.isFraud).fit()
helpers.time_screening(d)

0:51:12.724637


In [44]:
d = datetime.datetime.now()
model = LinearSVC(C=0.01, penalty="l1", dual=False, max_iter=10000).fit(x_train, y_train.isFraud.values)
helpers.time_screening(d)

0:38:12.734978


In [45]:
y_train['Predict'] = pandas.DataFrame(model.predict(x_train))
y_train['Predict'] = y_train['Predict'].astype(int)

helpers.confusion_matrix(y_train, ['isFraud', 'Predict'], ['Non-Fraud', 'Fraud'])

Unnamed: 0,Non-Fraud,Fraud
Non-Fraud,4910515,172951
Fraud,678349,4405117


In [46]:
y_test['Predict'] = pandas.DataFrame(model.predict(x_test))[0]
y_test['Predict'] = y_test['Predict'].astype(int)

helpers.confusion_matrix(y_test, ['isFraud', 'Predict'], ['Non-Fraud', 'Fraud'])

Unnamed: 0,Non-Fraud,Fraud
Non-Fraud,613734,21748
Fraud,103,677


In [47]:
y_validation['Predict'] = pandas.DataFrame(model.predict(x_validation))[0]
y_validation['Predict'] = y_validation['Predict'].astype(int)

helpers.confusion_matrix(y_validation, ['isFraud', 'Predict'],['Non-Fraud', 'Fraud'])

Unnamed: 0,Non-Fraud,Fraud
Non-Fraud,613671,21788
Fraud,127,676


In [48]:
x_fraud, y_fraud = helpers.x_and_y_separation(fraud_validation.reset_index(drop=True))
y_fraud = pandas.DataFrame(y_fraud)

y_fraud['Predict'] = pandas.DataFrame(model.predict(x_fraud))[0]
y_fraud['Predict'] = y_fraud['Predict'].astype(int)

helpers.confusion_matrix(y_fraud, ['isFraud', 'Predict'], ['Non-Fraud', 'Fraud'])

Unnamed: 0,Non-Fraud,Fraud
Non-Fraud,0,0
Fraud,245,1398


In [49]:
print('{:0.3f}'.format(accuracy_score(y_train['isFraud'], y_train['Predict'])))
print('{:0.3f}'.format(accuracy_score(y_test['isFraud'], y_test['Predict'])))
print('{:0.3f}'.format(accuracy_score(y_validation['isFraud'], y_validation['Predict'])))
print('{:0.3f}'.format(accuracy_score(y_fraud['isFraud'], y_fraud['Predict'])))

0.916
0.966
0.966
0.851


In [51]:
acc = {
    '': ['tree classifier', 'random forest', 'svm'],
    'train': [1.000, 1.000, 0.916],
    'test': [1.000, 0.999, 0.966],
    'validation': [0.999, 0.999,0.966],
    'training time': ['0:02:33.888025', '0:31:12.724637', '0:28:47.734978']
}

pandas.DataFrame(acc).set_index('')

Unnamed: 0,train,test,validation,training time
,,,,
tree classifier,1.0,1.0,0.999,0:02:33.888025
random forest,1.0,0.999,0.999,0:31:12.724637
svm,0.916,0.966,0.966,0:28:47.734978


In [52]:
acc = {
    '': ['tree classifier', 'random forest', 'svm'],
    'fraud': [0.821, 0.847, 0.834],
    'training time': ['0:02:33.888025', '0:31:12.724637', '0:28:47.734978']
}

pandas.DataFrame(acc).set_index('')

Unnamed: 0,fraud,training time
,,
tree classifier,0.821,0:02:33.888025
random forest,0.847,0:31:12.724637
svm,0.834,0:28:47.734978


In [None]:
In [4]:ros = RandomOverSampler(random_state=0)
x_train, y_train = ros.fit_resample(x_train, y_train)