In [9]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
df  = pd.read_csv("../data/data-with-features.csv")

In [3]:
df.shape
df.head()

(6362620, 9)

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,oldbalanceOrg,newbalanceOrig,isFraud
0,0.0,0.0,0.0,1.0,0.0,9839.64,170136.0,160296.36,0
1,0.0,0.0,0.0,1.0,0.0,1864.28,21249.0,19384.72,0
2,0.0,0.0,0.0,0.0,1.0,181.0,181.0,0.0,1
3,0.0,1.0,0.0,0.0,0.0,181.0,181.0,0.0,1
4,0.0,0.0,0.0,1.0,0.0,11668.14,41554.0,29885.86,0


# Imbalanced Classes

In [4]:
train,test = train_test_split(df, test_size=0.2)

In [5]:
train.shape
train.head()
train['isFraud'].value_counts()

(5090096, 9)

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,oldbalanceOrg,newbalanceOrig,isFraud
4371513,0.0,0.0,0.0,1.0,0.0,34443.41,20306.0,0.0,0
3824543,0.0,1.0,0.0,0.0,0.0,166517.47,472.0,0.0,0
1657672,0.0,0.0,0.0,1.0,0.0,39657.79,0.0,0.0,0
1487618,0.0,0.0,0.0,1.0,0.0,15897.21,199746.22,183849.01,0
2488203,1.0,0.0,0.0,0.0,0.0,179204.08,24590.0,203794.08,0


0    5083540
1       6556
Name: isFraud, dtype: int64

In [6]:
test.shape
test.head()
test['isFraud'].value_counts()

(1272524, 9)

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,oldbalanceOrg,newbalanceOrig,isFraud
2411968,1.0,0.0,0.0,0.0,0.0,398781.39,6461163.05,6859944.44,0
4049468,1.0,0.0,0.0,0.0,0.0,45876.81,3762250.16,3808126.97,0
4961065,0.0,0.0,1.0,0.0,0.0,1194.45,0.0,0.0,0
3737065,0.0,0.0,0.0,1.0,0.0,17405.75,130341.39,112935.63,0
1560148,0.0,0.0,0.0,1.0,0.0,683.33,0.0,0.0,0


0    1270867
1       1657
Name: isFraud, dtype: int64

In [10]:
os.mkdir('../data/imbalanced-classes')
train.to_csv('../data/imbalanced-classes/train.csv', index=False)
test.to_csv('../data/imbalanced-classes/test.csv', index=False)


# Balanced Classes

In [11]:
df_balanced_1 = df[df['isFraud'] == 1]

In [12]:
df_balanced_1.shape
df_balanced_1.head()

(8213, 9)

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,oldbalanceOrg,newbalanceOrig,isFraud
2,0.0,0.0,0.0,0.0,1.0,181.0,181.0,0.0,1
3,0.0,1.0,0.0,0.0,0.0,181.0,181.0,0.0,1
251,0.0,0.0,0.0,0.0,1.0,2806.0,2806.0,0.0,1
252,0.0,1.0,0.0,0.0,0.0,2806.0,2806.0,0.0,1
680,0.0,0.0,0.0,0.0,1.0,20128.0,20128.0,0.0,1


In [13]:
df_balanced_0 = df[df['isFraud'] == 0].sample(n = 8213)

In [14]:
df_balanced_0.shape
df_balanced_0.head()

(8213, 9)

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,oldbalanceOrg,newbalanceOrig,isFraud
1679399,0.0,0.0,0.0,0.0,1.0,133514.6,0.0,0.0,0
2527173,1.0,0.0,0.0,0.0,0.0,95054.62,1955467.41,2050522.03,0
392767,0.0,1.0,0.0,0.0,0.0,187352.74,20058.0,0.0,0
5111433,0.0,0.0,0.0,0.0,1.0,785139.65,97029.67,0.0,0
1939360,0.0,0.0,0.0,0.0,1.0,805799.68,11041.0,0.0,0


In [15]:
df_balanced = df_balanced_1.append(df_balanced_0, ignore_index=True)

In [16]:
df_balanced.shape
df_balanced.head()

(16426, 9)

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,oldbalanceOrg,newbalanceOrig,isFraud
0,0.0,0.0,0.0,0.0,1.0,181.0,181.0,0.0,1
1,0.0,1.0,0.0,0.0,0.0,181.0,181.0,0.0,1
2,0.0,0.0,0.0,0.0,1.0,2806.0,2806.0,0.0,1
3,0.0,1.0,0.0,0.0,0.0,2806.0,2806.0,0.0,1
4,0.0,0.0,0.0,0.0,1.0,20128.0,20128.0,0.0,1


In [17]:
train_balanced,test_balanced = train_test_split(df_balanced, test_size=0.2)

In [18]:
train_balanced.shape
train_balanced.head()
train_balanced['isFraud'].value_counts()

(13140, 9)

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,oldbalanceOrg,newbalanceOrig,isFraud
4,0.0,0.0,0.0,0.0,1.0,20128.0,20128.0,0.0,1
12285,0.0,0.0,0.0,1.0,0.0,28937.49,61542.0,32604.51,0
9037,0.0,0.0,0.0,0.0,1.0,345973.45,42314.0,0.0,0
13423,1.0,0.0,0.0,0.0,0.0,255234.69,5291.0,260525.69,0
1133,0.0,1.0,0.0,0.0,0.0,363378.75,363378.75,0.0,1


0    6626
1    6514
Name: isFraud, dtype: int64

In [21]:
test_balanced.shape
test_balanced.head()
test_balanced['isFraud'].value_counts()

(3286, 9)

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,oldbalanceOrg,newbalanceOrig,isFraud
8373,0.0,0.0,0.0,1.0,0.0,44078.2,23637.0,0.0,0
4716,0.0,1.0,0.0,0.0,0.0,892376.71,892376.71,0.0,1
386,0.0,1.0,0.0,0.0,0.0,122032.62,122032.62,0.0,1
509,0.0,0.0,0.0,0.0,1.0,362152.77,362152.77,0.0,1
14468,0.0,1.0,0.0,0.0,0.0,199024.12,0.0,0.0,0


1    1699
0    1587
Name: isFraud, dtype: int64

In [23]:
os.mkdir('../data/balanced-classes')
train_balanced.to_csv('../data/balanced-classes/train.csv', index=False)
test_balanced.to_csv('../data/balanced-classes/test.csv', index=False)