In [None]:
import datetime
import os

import pandas
from imblearn.over_sampling import SMOTE
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

import helpers
import models

In [None]:
# Location of the input CSV. The MOONCAKE_DATA_PATH environment variable, when
# set, overrides the location so the notebook can run on machines other than
# the original author's; the default keeps the original hard-coded path.
train_path = os.environ.get(
    "MOONCAKE_DATA_PATH",
    r"C:\Users\bruno\PycharmProjects\mooncake\data\datasource.csv",
)

In [None]:
# Load the CSV and discard the three columns that are not used anywhere in the
# rest of the notebook.
unused_columns = ['isFlaggedFraud', 'nameOrig', 'nameDest']
df = pandas.read_csv(train_path).drop(columns=unused_columns)

In [None]:
# Show the column labels that remain after the drop above.
df.columns

In [None]:
# Number of rows per value of the `type` column.
df['type'].value_counts()

In [None]:
# Number of rows per value of the `isFraud` label (class balance).
df['isFraud'].value_counts()

In [None]:
# Replace each known `type` string with an integer code, in place.
# The position in this list is the code: CASH_IN -> 0 ... TRANSFER -> 4.
# Values not listed here are left untouched.
for code, label in enumerate(['CASH_IN', 'CASH_OUT', 'DEBIT', 'PAYMENT', 'TRANSFER']):
    df.loc[df['type'] == label, 'type'] = code

In [None]:
# Percentage of rows whose isFraud label equals 1.
int((df['isFraud'] == 1).sum()) / len(df) * 100

In [None]:
# Split the frame by label: rows with isFraud == 0 and rows with isFraud == 1.
non_fraud_mask = df['isFraud'] == 0
fraud_mask = df['isFraud'] == 1
df_non_fraud = df[non_fraud_mask]
df_fraud = df[fraud_mask]

In [None]:
# Carve a fraud-only subset (`fraud_validation`) out of the fraud rows; it is
# scored separately near the end of the notebook.
# NOTE: this cell rebinds df_fraud and df, so it is not idempotent — re-run it
# only after re-running the cells that create df, df_non_fraud and df_fraud.
df_fraud, fraud_validation = helpers.data_separation(df_fraud)

# Rebuild df from the non-fraud rows plus the fraud rows that did NOT go into
# fraud_validation. Without this, the train/test/validation split below is
# taken from a df that still contains the fraud_validation rows, so the model
# can be trained on the very rows it is later "validated" on.
# sort_index restores the original row order when data_separation keeps the
# original index labels (assumed — confirm in helpers); reset_index then gives
# df a clean 0..n-1 index again.
df = (
    pandas.concat([df_non_fraud, df_fraud])
    .sort_index(kind='stable')
    .reset_index(drop=True)
)

In [None]:
# First split df into a training part and a hold-out part, then split the
# hold-out part again (second argument 0.5) into test and validation.
train, holdout = helpers.data_separation(df)
test, validation = helpers.data_separation(holdout, 0.5)

In [None]:
# Separate each split into features (x_*) and labels (y_*).
x_train, y_train = helpers.x_and_y_separation(train)
x_test, y_test = helpers.x_and_y_separation(test)
x_validation, y_validation = helpers.x_and_y_separation(validation)

# Wrap every label object in a DataFrame so a 'Predict' column can be added
# next to the labels later on.
y_train, y_test, y_validation = (
    pandas.DataFrame(labels) for labels in (y_train, y_test, y_validation)
)

In [None]:
# Oversample the training split with SMOTE (test and validation are left as-is).
# `fit_resample` is the supported method name; `fit_sample` was a deprecated
# alias that newer imbalanced-learn releases no longer provide.
# A fixed random_state makes the synthetic samples reproducible across runs.
# NOTE: this cell overwrites its own inputs, so re-running it resamples the
# already-resampled data.
x_train, y_train = SMOTE(random_state=42).fit_resample(x_train, y_train)

In [None]:
# Give every label frame a fresh 0..n-1 index, in place, discarding the old one.
for labels in (y_train, y_test, y_validation):
    labels.reset_index(drop=True, inplace=True)

In [None]:
# Fit a linear regression model and report how long the fit took.
d = datetime.datetime.now()
# Fit on the isFraud column only, like the other model cells do. Passing the
# whole y_train frame would also train on the 'Predict' column once an
# evaluation cell has added it to y_train.
# NOTE(review): LinearRegression returns continuous scores, and the evaluation
# cells cast predictions with astype(int), which truncates (e.g. 0.9 -> 0).
# Consider a classifier or an explicit threshold — confirm intent.
model = linear_model.LinearRegression().fit(x_train, y_train.isFraud)
# presumably prints/returns the time elapsed since `d` — confirm in helpers
helpers.time_screening(d)

In [None]:
# Fit the project's RandomForest wrapper on the training split and time it.
d = datetime.datetime.now()
forest = models.RandomForest(x_train, y_train['isFraud'])
# The value returned by .fit() is what the evaluation cells call .predict on.
model = forest.fit()
# presumably prints/returns the time elapsed since `d` — confirm in helpers
helpers.time_screening(d)

In [None]:
# Fit an L1-penalised linear SVM on the training split and time it.
d = datetime.datetime.now()
svc = LinearSVC(
    C=0.01,
    penalty="l1",
    dual=False,
    max_iter=10000,
)
model = svc.fit(x_train, y_train['isFraud'].values)
# presumably prints/returns the time elapsed since `d` — confirm in helpers
helpers.time_screening(d)

In [None]:
# Score the training split with whichever `model` was fit most recently.
# The predictions are flattened to a plain array and assigned positionally
# (row i of x_train -> row i of y_train) rather than by index alignment, so
# the result does not depend on y_train having a 0..n-1 index.
# NOTE: astype(int) truncates toward zero, so this yields class labels only
# when model.predict already returns 0/1 values.
train_pred = pandas.DataFrame(model.predict(x_train)).to_numpy().ravel()
y_train['Predict'] = train_pred.astype(int)

helpers.confusion_matrix(y_train, ['isFraud', 'Predict'], ['Non-Fraud', 'Fraud'])

In [None]:
# Score the test split with whichever `model` was fit most recently.
# The predictions are flattened to a plain array and assigned positionally
# (row i of x_test -> row i of y_test) rather than by index alignment, so
# the result does not depend on y_test having a 0..n-1 index.
# NOTE: astype(int) truncates toward zero, so this yields class labels only
# when model.predict already returns 0/1 values.
test_pred = pandas.DataFrame(model.predict(x_test)).to_numpy().ravel()
y_test['Predict'] = test_pred.astype(int)

helpers.confusion_matrix(y_test, ['isFraud', 'Predict'], ['Non-Fraud', 'Fraud'])

In [None]:
# Score the validation split with whichever `model` was fit most recently.
# The predictions are flattened to a plain array and assigned positionally
# (row i of x_validation -> row i of y_validation) rather than by index
# alignment, so the result does not depend on y_validation having a 0..n-1 index.
# NOTE: astype(int) truncates toward zero, so this yields class labels only
# when model.predict already returns 0/1 values.
validation_pred = pandas.DataFrame(model.predict(x_validation)).to_numpy().ravel()
y_validation['Predict'] = validation_pred.astype(int)

helpers.confusion_matrix(y_validation, ['isFraud', 'Predict'],['Non-Fraud', 'Fraud'])

In [None]:
# Score the fraud-only subset produced by the earlier data_separation call on
# df_fraud, using whichever `model` was fit most recently.
x_fraud, y_fraud = helpers.x_and_y_separation(fraud_validation.reset_index(drop=True))
y_fraud = pandas.DataFrame(y_fraud)

# Flatten the predictions and assign them positionally (row i of x_fraud ->
# row i of y_fraud) rather than relying on index alignment.
# NOTE: astype(int) truncates toward zero, so this yields class labels only
# when model.predict already returns 0/1 values.
fraud_pred = pandas.DataFrame(model.predict(x_fraud)).to_numpy().ravel()
y_fraud['Predict'] = fraud_pred.astype(int)

helpers.confusion_matrix(y_fraud, ['isFraud', 'Predict'], ['Non-Fraud', 'Fraud'])

In [None]:
# Print the accuracy (3 decimals) for the train, test, validation and
# fraud-only label frames, in that order, one value per line.
for scored in (y_train, y_test, y_validation, y_fraud):
    print('{:0.3f}'.format(accuracy_score(scored['isFraud'], scored['Predict'])))

In [4]:
# Summary table of accuracy per split and training time per model.
# The numbers are typed in by hand here; they are not computed from the
# variables in this notebook.
model_names = ['tree classifier', 'random forest', 'svm']
training_times = ['0:00:05.459905', '0:07:41.670989', '0:03:38.943760']

acc = {
    '': model_names,
    'train': [0.999, 1.000, 0.999],
    'test': [0.999, 1.000, 0.999],
    'validation': [0.999, 1.000, 0.999],
    'training time': training_times,
}

# Use the unnamed model-name column as the row index for display.
summary = pandas.DataFrame(acc)
summary.set_index('')

Unnamed: 0,train,test,validation,training time
,,,,
tree classifier,0.999,0.999,0.999,0:00:05.459905
random forest,1.0,1.0,1.0,0:07:41.670989
svm,0.999,0.999,0.999,0:03:38.943760


In [None]:
# Summary table of accuracy on the fraud-only subset and training time per
# model. The numbers are typed in by hand here; they are not computed from the
# variables in this notebook.
model_names = ['tree classifier', 'random forest', 'svm']
training_times = ['0:00:05.459905', '0:07:41.670989', '0:03:38.943760']

acc = {
    '': model_names,
    'fraud': [0.041, 0.595, 0.365],
    'training time': training_times,
}

# Use the unnamed model-name column as the row index for display.
summary = pandas.DataFrame(acc)
summary.set_index('')

In [5]:
# Summary table of accuracy on the fraud-only subset and training time per
# model. The numbers are typed in by hand here; they are not computed from the
# variables in this notebook.
model_names = ['tree classifier', 'random forest', 'svm']
training_times = ['0:00:05.459905', '0:07:41.670989', '0:03:38.943760']

acc = {
    '': model_names,
    'fraud': [0.041, 0.595, 0.365],
    'training time': training_times,
}

# Use the unnamed model-name column as the row index for display.
summary = pandas.DataFrame(acc)
summary.set_index('')

Unnamed: 0,fraud,training time
,,
tree classifier,0.041,0:00:05.459905
random forest,0.595,0:07:41.670989
svm,0.365,0:03:38.943760
