In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction import DictVectorizer

In [3]:
!pip install scikit-learn



In [2]:
# Location of the zipped transactions CSV, relative to the notebook's working directory.
data = './online_payments_fraud.zip'
# Read the archive directly; read_csv infers the compression from the file
# extension by default, so no manual unzip step is needed.
df = pd.read_csv(data)

Column reference:
------------------------------------
- step: represents a unit of time where 1 step equals 1 hour
- type: type of online transaction
- amount: the amount of the transaction
- nameOrig: customer starting the transaction
- oldbalanceOrg: originating customer's balance before the transaction
- newbalanceOrig: originating customer's balance after the transaction
- nameDest: recipient of the transaction
- oldbalanceDest: initial balance of recipient before the transaction
- newbalanceDest: the new balance of recipient after the transaction
- isFraud: 1 if the transaction is fraudulent, 0 otherwise

In [3]:
# Preview the first five rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
# Lookup from a numeric payment-type code to the transaction type label.
payment_type_map = {
    1: 'CASH_OUT',
    2: 'PAYMENT',
    3: 'CASH_IN',
    4: 'TRANSFER',
    5: 'DEBIT'
}

# Frequency of each transaction type. This must be the cell's last expression:
# a notebook only renders the value of the final expression, so placing it
# before the dict assignment silently discarded the result.
df['type'].value_counts()

## 1. EDA
- check max values
- check data values ranges
- check value counts

In [5]:
# Summary statistics for the numeric columns, rounded to whole numbers for readability.
df.describe().round()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.0,179862.0,833883.0,855114.0,1100702.0,1224996.0,0.0,0.0
std,142.0,603858.0,2888243.0,2924049.0,3399180.0,3674129.0,0.0,0.0
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13390.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74872.0,14208.0,0.0,132706.0,214661.0,0.0,0.0
75%,335.0,208721.0,107315.0,144258.0,943037.0,1111909.0,0.0,0.0
max,743.0,92445517.0,59585040.0,49585040.0,356015889.0,356179279.0,1.0,1.0


## 2. Data Preprocessing
- check missing values
- map categorical values if any

In [6]:
# Normalise every column name to lower case so later cells can use one spelling.
df.columns = df.columns.str.lower()

# Class balance of the target column.
df['isfraud'].value_counts()

0    6354407
1       8213
Name: isfraud, dtype: int64

In [7]:
# How often the isflaggedfraud flag is set.
df['isflaggedfraud'].value_counts()

0    6362604
1         16
Name: isflaggedfraud, dtype: int64

In [8]:
# Transactions whose amount is exactly zero.
df.loc[df['amount'].eq(0)]

Unnamed: 0,step,type,amount,nameorig,oldbalanceorg,newbalanceorig,namedest,oldbalancedest,newbalancedest,isfraud,isflaggedfraud
2736447,212,CASH_OUT,0.0,C1510987794,0.0,0.0,C1696624817,0.0,0.0,1,0
3247298,250,CASH_OUT,0.0,C521393327,0.0,0.0,C480398193,0.0,0.0,1,0
3760289,279,CASH_OUT,0.0,C539112012,0.0,0.0,C1106468520,538547.63,538547.63,1,0
5563714,387,CASH_OUT,0.0,C1294472700,0.0,0.0,C1325541393,7970766.57,7970766.57,1,0
5996408,425,CASH_OUT,0.0,C832555372,0.0,0.0,C1462759334,76759.9,76759.9,1,0
5996410,425,CASH_OUT,0.0,C69493310,0.0,0.0,C719711728,2921531.34,2921531.34,1,0
6168500,554,CASH_OUT,0.0,C10965156,0.0,0.0,C1493336195,230289.66,230289.66,1,0
6205440,586,CASH_OUT,0.0,C1303719003,0.0,0.0,C900608348,1328472.86,1328472.86,1,0
6266414,617,CASH_OUT,0.0,C1971175979,0.0,0.0,C1352345416,0.0,0.0,1,0
6281483,646,CASH_OUT,0.0,C2060908932,0.0,0.0,C1587892888,0.0,0.0,1,0


### 2.2 Data Splitting

In [9]:
# Two-stage split with a fixed seed: first hold out 20% as the test set, then
# take 25% of the remaining 80% as validation, giving 60/20/20 train/val/test.
# NOTE(review): the split is not stratified on the rare positive class —
# consider passing stratify= if per-split fraud rates need to match.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=11,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=11,
)

In [10]:
# The shuffled splits keep their original row labels; replace them with a
# fresh 0..n-1 index on each split and drop the old labels.
df_train, df_val, df_test = (
    part.reset_index(drop=True)
    for part in (df_train, df_val, df_test)
)

In [11]:
# Pull the target column out of each split as an integer NumPy array.
y_train, y_val, y_test = (
    part['isfraud'].astype('int').to_numpy()
    for part in (df_train, df_val, df_test)
)

In [12]:
# Display the training split; pandas elides the middle rows of a large frame.
df_train

Unnamed: 0,step,type,amount,nameorig,oldbalanceorg,newbalanceorig,namedest,oldbalancedest,newbalancedest,isfraud,isflaggedfraud
0,259,CASH_OUT,97180.10,C188026739,46412.00,0.00,C2065455874,1741414.84,1838594.95,0,0
1,139,PAYMENT,16171.32,C1126105242,0.00,0.00,M1024588184,0.00,0.00,0,0
2,306,CASH_OUT,36897.81,C1472855255,0.00,0.00,C958504659,854406.27,891304.08,0,0
3,374,PAYMENT,24056.40,C1759596511,0.00,0.00,M1655702403,0.00,0.00,0,0
4,376,CASH_OUT,206626.50,C1815277057,159.00,0.00,C2041931405,18631.49,225257.99,0,0
...,...,...,...,...,...,...,...,...,...,...,...
3817567,131,CASH_OUT,440806.92,C1652309632,17873.00,0.00,C269134108,44061.05,484867.97,0,0
3817568,375,PAYMENT,2926.82,C807741221,30591.00,27664.18,M286836507,0.00,0.00,0,0
3817569,393,CASH_IN,332714.50,C1445844044,13502634.94,13835349.44,C1279633304,976226.88,643512.37,0,0
3817570,134,CASH_IN,413330.79,C1030090774,8166966.64,8580297.43,C44501267,10212270.82,9798940.04,0,0


## 3. Training

In [13]:
### 3.1 Decision Tree