In [26]:
import os

import pandas
from imblearn.over_sampling import SMOTE
from sklearn import linear_model
from sklearn.metrics import accuracy_score

import helpers
import models

In [4]:
# Location of the PaySim CSV. Set the PAYSIM_CSV environment variable to point
# the notebook at another copy of the data; when it is unset the original
# machine-specific location is used, so existing runs behave as before.
train_path = os.environ.get(
    "PAYSIM_CSV",
    r"C:\Users\bruno\Downloads\paysim1\datasource.csv",
)

In [5]:
# Read the transaction log and discard the three columns that are not kept
# for the rest of the notebook.
dropped_columns = ['isFlaggedFraud', 'nameOrig', 'nameDest']
df = pandas.read_csv(train_path).drop(columns=dropped_columns)

In [6]:
# Show which columns the loaded frame contains.
df.columns

Index(['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
       'oldbalanceDest', 'newbalanceDest', 'isFraud'],
      dtype='object')

In [7]:
# Number of rows per transaction type.
df['type'].value_counts()

CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: type, dtype: int64

In [8]:
# Number of rows per value of the `isFraud` label.
df['isFraud'].value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

In [9]:
# Encode the transaction type as an integer code using a single lookup table.
# Overwriting string values one at a time leaves the column with dtype
# `object`; mapping and then casting gives a genuine integer column.
# NOTE(review): the codes are arbitrary labels, not an ordering; a model that
# treats them as magnitudes (e.g. a linear one) may want one-hot encoding.
type_codes = {
    'CASH_IN': 0,
    'CASH_OUT': 1,
    'DEBIT': 2,
    'PAYMENT': 3,
    'TRANSFER': 4,
}
# `.get(value, value)` passes already-encoded values through unchanged, so the
# cell can be re-run safely; `astype(int)` then raises on any label missing
# from the table instead of silently leaving a string in the column.
df['type'] = df['type'].map(lambda value: type_codes.get(value, value)).astype(int)

In [10]:
# Percentage of rows whose `isFraud` label is 1.
fraud_rows = df[df['isFraud'] == 1]
len(fraud_rows) / len(df) * 100

0.12908204481801522

In [11]:
# Partition the rows by their `isFraud` label.
fraud_label = df['isFraud']
df_non_fraud = df.loc[fraud_label == 0]
df_fraud = df.loc[fraud_label == 1]

In [12]:
# Split the fraud rows in two; the second part (`fraud_validation`) is scored
# on its own near the end of the notebook.
# NOTE(review): `helpers.data_separation` is defined outside this notebook;
# presumably it returns (remaining rows, held-out rows) and the optional second
# argument is the held-out fraction -- confirm against helpers.py.
df_fraud, fraud_validation = helpers.data_separation(df_fraud)

# Keep only the second piece of the non-fraud split made with 0.05.
df_non_fraud = helpers.data_separation(df_non_fraud, 0.05)[1].reset_index(drop=True)

# Stack the two classes back into one frame with a fresh 0..n-1 index.
# `pandas.concat` is used because `DataFrame.append` was removed in pandas 2.0.
df = pandas.concat([df_non_fraud, df_fraud], ignore_index=True)

In [13]:
# First split: training rows versus everything held out.
# NOTE(review): split proportions are decided inside `helpers.data_separation`
# (not visible here) -- confirm the default and the meaning of 0.4.
train, holdout = helpers.data_separation(df)

# Second split: divide the held-out rows into test and validation sets.
test, validation = helpers.data_separation(holdout, 0.4)

In [14]:
# Separate each split into features (x_*) and labels (y_*) via the project helper.
x_train, y_train = helpers.x_and_y_separation(train)
x_test, y_test = helpers.x_and_y_separation(test)
x_validation, y_validation = helpers.x_and_y_separation(validation)

# Wrap every label set in a DataFrame; later cells add a 'Predict' column
# next to the label.
y_train, y_test, y_validation = (
    pandas.DataFrame(labels) for labels in (y_train, y_test, y_validation)
)

In [15]:
# Oversample the training split only, leaving test/validation rows untouched.
# `fit_resample` replaces `fit_sample`, a deprecated alias that newer
# imbalanced-learn releases no longer provide.
# A fixed `random_state` makes the synthetic samples, and therefore every
# metric below, repeatable across runs.
x_train, y_train = SMOTE(random_state=42).fit_resample(x_train, y_train)

In [16]:
# Give each label frame a fresh 0..n-1 index. The prediction frames assigned
# to them later are built with a default index, and the assignment aligns on it.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
y_validation = y_validation.reset_index(drop=True)

In [None]:
# Linear baseline. `isFraud` is a binary class label, so fit a classifier
# (logistic regression) rather than a least-squares regressor, whose
# continuous predictions are not class labels.
# Only the label column is used as the target, matching the random-forest
# cell, so a 'Predict' column added to `y_train` by a later cell can never
# leak into the fit when cells are re-run.
# NOTE(review): `model` is rebound by the random-forest cell that follows, so
# the evaluation cells do not score this baseline.
model = linear_model.LogisticRegression(max_iter=1000).fit(x_train, y_train['isFraud'])

In [18]:
# Train the project's RandomForest wrapper on the training features and the
# `isFraud` label column.
# NOTE(review): `models.RandomForest` lives outside this notebook; `.fit()`
# presumably returns the fitted object, since `model.predict` is called on
# the result below -- confirm in models.py.
fraud_labels = y_train['isFraud']
model = models.RandomForest(x_train, fraud_labels).fit()

In [19]:
# Predict on the training features and store the result next to the true label.
train_predictions = pandas.DataFrame(model.predict(x_train))
y_train['Predict'] = train_predictions
y_train['Predict'] = y_train['Predict'].astype(int)

# Actual-versus-predicted table for the training rows.
helpers.confusion_matrix(y_train, ['isFraud', 'Predict'])

Unnamed: 0,0,1
0,254203,0
1,0,254203


In [20]:
# Predict on the test features and store the result next to the true label.
test_predictions = pandas.DataFrame(model.predict(x_test))
y_test['Predict'] = test_predictions
y_test['Predict'] = y_test['Predict'].astype(int)

# Actual-versus-predicted table for the test rows.
helpers.confusion_matrix(y_test, ['isFraud', 'Predict'])

Unnamed: 0,0,1
0,37987,104
1,19,805


In [21]:
# Predict on the validation features and store the result next to the true label.
validation_predictions = pandas.DataFrame(model.predict(x_validation))
y_validation['Predict'] = validation_predictions
y_validation['Predict'] = y_validation['Predict'].astype(int)

# Actual-versus-predicted table for the validation rows.
helpers.confusion_matrix(y_validation, ['isFraud', 'Predict'])

Unnamed: 0,0,1
0,25357,70
1,8,509


In [22]:
# Evaluate on the fraud rows that were set aside before the train/test split.
fraud_holdout = fraud_validation.reset_index(drop=True)
x_fraud, y_fraud = helpers.x_and_y_separation(fraud_holdout)
y_fraud = pandas.DataFrame(y_fraud)

# Store the predictions next to the true label.
fraud_predictions = pandas.DataFrame(model.predict(x_fraud))
y_fraud['Predict'] = fraud_predictions
y_fraud['Predict'] = y_fraud['Predict'].astype(int)

# Actual-versus-predicted table for the held-out fraud rows.
helpers.confusion_matrix(y_fraud, ['isFraud', 'Predict'])

Unnamed: 0,0,1
0,0,0
1,18,1625


In [68]:
# Accuracy to three decimals for each scored frame, in the order:
# train, test, validation, held-out fraud.
for scored in (y_train, y_test, y_validation, y_fraud):
    accuracy = accuracy_score(scored['isFraud'], scored['Predict'])
    print('{:0.3f}'.format(accuracy))

1.000
0.997
0.997
0.989
