In [5]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

from sklearn.metrics import confusion_matrix, classification_report

In [6]:
# Load only the first 50,000 rows of the payments CSV.
data = pd.read_csv("FRAUDULANT_PAYMENT.csv", nrows=50000)

In [3]:
# List the distinct transaction types present in the loaded rows.
data["type"].unique()

array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'],
      dtype=object)

In [4]:
# Preview the one-hot encoding of the transaction type (result is not stored).
pd.get_dummies(data["type"], prefix="tp")

Unnamed: 0,tp_CASH_IN,tp_CASH_OUT,tp_DEBIT,tp_PAYMENT,tp_TRANSFER
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,0,1
3,0,1,0,0,0
4,0,0,0,1,0
...,...,...,...,...,...
49995,0,0,0,1,0
49996,0,0,0,1,0
49997,0,0,0,1,0
49998,0,0,0,1,0


In [7]:
# Display the loaded frame (the notebook renders a truncated head/tail preview).
data

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
49995,9,PAYMENT,1322.91,C1402545110,1208457.61,1207134.71,M578074960,0.0,0.0,0,0
49996,9,PAYMENT,6900.48,C517372485,1207134.71,1200234.22,M1854114037,0.0,0.0,0,0
49997,9,PAYMENT,17399.12,C1948160352,1200234.22,1182835.10,M1940330634,0.0,0.0,0,0
49998,9,PAYMENT,5485.20,C1586381033,1182835.10,1177349.90,M741912557,0.0,0.0,0,0


In [8]:
# Print column dtypes and non-null counts for the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            50000 non-null  int64  
 1   type            50000 non-null  object 
 2   amount          50000 non-null  float64
 3   nameOrig        50000 non-null  object 
 4   oldbalanceOrg   50000 non-null  float64
 5   newbalanceOrig  50000 non-null  float64
 6   nameDest        50000 non-null  object 
 7   oldbalanceDest  50000 non-null  float64
 8   newbalanceDest  50000 non-null  float64
 9   isFraud         50000 non-null  int64  
 10  isFlaggedFraud  50000 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 4.2+ MB


# PREPROCESSING

In [9]:
# Count distinct values per column; helps spot per-row identifiers and
# constant columns before deciding what to drop.
{col: len(data[col].unique()) for col in data.columns}

{'step': 9,
 'type': 5,
 'amount': 49786,
 'nameOrig': 50000,
 'oldbalanceOrg': 30572,
 'newbalanceOrig': 24328,
 'nameDest': 28499,
 'oldbalanceDest': 25369,
 'newbalanceDest': 9367,
 'isFraud': 2,
 'isFlaggedFraud': 1}

In [10]:
def onehot_encode(df, column, prefix):
    """Replace a categorical column with its one-hot (dummy) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied, so the caller's frame is left untouched.
    column : str
        Name of the column to encode.
    prefix : str
        Prefix for the generated dummy columns (``<prefix>_<value>``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``column`` removed and the dummy columns
        appended on the right.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    df = df.copy()

    dummies = pd.get_dummies(df[column], prefix=prefix)
    df = pd.concat([df, dummies], axis=1)
    # Drop the original categorical column now that it is encoded.
    # (The previous code referenced an undefined name `columns` here,
    # which raised NameError on every call.)
    df = df.drop(columns=[column])
    return df

In [11]:
def preprocessing_input(df):
    """Split the raw frame into a feature frame and the fraud label.

    The "step", "nameOrig" and "isFlaggedFraud" columns are removed first;
    "isFraud" is then separated out as the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the columns named above.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is the remaining feature columns and ``y`` is
        the "isFraud" Series. The input frame is not modified.
    """
    unused_columns = ["step", "nameOrig", "isFlaggedFraud"]
    trimmed = df.copy().drop(unused_columns, axis=1)

    target = trimmed["isFraud"].copy()
    features = trimmed.drop("isFraud", axis=1).copy()
    return features, target


#nameOrig values are unique for every row, so the column is dropped

In [12]:
# Separate the feature frame (x) from the isFraud labels (y).
x, y = preprocessing_input(data)

In [13]:
# Display the feature frame returned by preprocessing_input.
x

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest
0,PAYMENT,9839.64,170136.00,160296.36,M1979787155,0.0,0.0
1,PAYMENT,1864.28,21249.00,19384.72,M2044282225,0.0,0.0
2,TRANSFER,181.00,181.00,0.00,C553264065,0.0,0.0
3,CASH_OUT,181.00,181.00,0.00,C38997010,21182.0,0.0
4,PAYMENT,11668.14,41554.00,29885.86,M1230701703,0.0,0.0
...,...,...,...,...,...,...,...
49995,PAYMENT,1322.91,1208457.61,1207134.71,M578074960,0.0,0.0
49996,PAYMENT,6900.48,1207134.71,1200234.22,M1854114037,0.0,0.0
49997,PAYMENT,17399.12,1200234.22,1182835.10,M1940330634,0.0,0.0
49998,PAYMENT,5485.20,1182835.10,1177349.90,M741912557,0.0,0.0


In [14]:
# Display the isFraud label Series returned by preprocessing_input.
y

0        0
1        0
2        1
3        1
4        0
        ..
49995    0
49996    0
49997    0
49998    0
49999    0
Name: isFraud, Length: 50000, dtype: int64