In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
df  = pd.read_csv("data-with-features.csv")

In [3]:
df.shape
df.head()

(6362620, 9)

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,oldbalanceOrg,newbalanceOrig,isFraud
0,0.0,0.0,0.0,1.0,0.0,9839.64,170136.0,160296.36,0
1,0.0,0.0,0.0,1.0,0.0,1864.28,21249.0,19384.72,0
2,0.0,0.0,0.0,0.0,1.0,181.0,181.0,0.0,1
3,0.0,1.0,0.0,0.0,0.0,181.0,181.0,0.0,1
4,0.0,0.0,0.0,1.0,0.0,11668.14,41554.0,29885.86,0


# Imbalanced Classes

In [4]:
train,test = train_test_split(df, test_size=0.2)

In [5]:
train.shape
train.head()
train['isFraud'].value_counts()

(5090096, 9)

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,oldbalanceOrg,newbalanceOrig,isFraud
2314783,1.0,0.0,0.0,0.0,0.0,150255.19,5551.0,155806.19,0
4023142,1.0,0.0,0.0,0.0,0.0,74939.55,13378804.32,13453743.87,0
3032966,0.0,0.0,0.0,0.0,1.0,239494.42,6524.42,0.0,0
2186097,1.0,0.0,0.0,0.0,0.0,58610.47,14448986.61,14507597.08,0
4923083,0.0,0.0,0.0,1.0,0.0,8090.81,1561.3,0.0,0


0    5083574
1       6522
Name: isFraud, dtype: int64

In [6]:
test.shape
test.head()
test['isFraud'].value_counts()

(1272524, 9)

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,oldbalanceOrg,newbalanceOrig,isFraud
2259236,0.0,1.0,0.0,0.0,0.0,174722.86,47761.0,0.0,0
3934611,0.0,1.0,0.0,0.0,0.0,157063.75,0.0,0.0,0
6098708,0.0,0.0,0.0,0.0,1.0,579838.23,0.0,0.0,0
5263523,1.0,0.0,0.0,0.0,0.0,106130.04,3641962.85,3748092.89,0
1851524,0.0,1.0,0.0,0.0,0.0,244323.99,0.0,0.0,0


0    1270833
1       1691
Name: isFraud, dtype: int64

In [7]:
train.to_csv('imbalanced-classes/train.csv', index=False)
test.to_csv('imbalanced-classes/test.csv', index=False)


# Balanced Classes

In [6]:
df_balanced_1 = df[df['isFraud'] == 1]

In [7]:
df_balanced_1.shape
df_balanced_1.head()

(8213, 9)

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,oldbalanceOrg,newbalanceOrig,isFraud
2,0.0,0.0,0.0,0.0,1.0,181.0,181.0,0.0,1
3,0.0,1.0,0.0,0.0,0.0,181.0,181.0,0.0,1
251,0.0,0.0,0.0,0.0,1.0,2806.0,2806.0,0.0,1
252,0.0,1.0,0.0,0.0,0.0,2806.0,2806.0,0.0,1
680,0.0,0.0,0.0,0.0,1.0,20128.0,20128.0,0.0,1


In [8]:
df_balanced_0 = df[df['isFraud'] == 0].sample(n = 8213)

In [9]:
df_balanced_0.shape
df_balanced_0.head()

(8213, 9)

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,oldbalanceOrg,newbalanceOrig,isFraud
713808,0.0,1.0,0.0,0.0,0.0,142778.43,0.0,0.0,0
3059488,0.0,1.0,0.0,0.0,0.0,5830.55,0.0,0.0,0
5633416,1.0,0.0,0.0,0.0,0.0,30322.17,730004.21,760326.37,0
1942303,0.0,0.0,0.0,1.0,0.0,4715.34,0.0,0.0,0
6298272,0.0,0.0,0.0,1.0,0.0,9065.09,0.0,0.0,0


In [10]:
df_balanced = df_balanced_1.append(df_balanced_0, ignore_index=True)

In [11]:
df_balanced.shape
df_balanced.head()

(16426, 9)

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,oldbalanceOrg,newbalanceOrig,isFraud
0,0.0,0.0,0.0,0.0,1.0,181.0,181.0,0.0,1
1,0.0,1.0,0.0,0.0,0.0,181.0,181.0,0.0,1
2,0.0,0.0,0.0,0.0,1.0,2806.0,2806.0,0.0,1
3,0.0,1.0,0.0,0.0,0.0,2806.0,2806.0,0.0,1
4,0.0,0.0,0.0,0.0,1.0,20128.0,20128.0,0.0,1


In [12]:
train_balanced,test_balanced = train_test_split(df_balanced, test_size=0.2)

In [13]:
train_balanced.shape
train_balanced.head()
train_balanced['isFraud'].value_counts()

(13140, 9)

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,oldbalanceOrg,newbalanceOrig,isFraud
10094,0.0,0.0,0.0,1.0,0.0,1837.29,312574.95,310737.67,0
15940,0.0,1.0,0.0,0.0,0.0,352282.54,33.0,0.0,0
6533,0.0,0.0,0.0,0.0,1.0,1777758.65,1777758.65,0.0,1
9669,0.0,0.0,0.0,1.0,0.0,2463.98,735807.26,733343.28,0
245,0.0,1.0,0.0,0.0,0.0,37100.29,37100.29,0.0,1


1    6579
0    6561
Name: isFraud, dtype: int64

In [14]:
test_balanced.shape
test_balanced.head()
test_balanced['isFraud'].value_counts()

(3286, 9)

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,amount,oldbalanceOrg,newbalanceOrig,isFraud
16185,0.0,0.0,0.0,1.0,0.0,5919.05,22686.0,16766.95,0
12705,0.0,0.0,0.0,1.0,0.0,3492.66,1084.0,0.0,0
12812,0.0,0.0,0.0,1.0,0.0,15078.39,0.0,0.0,0
10151,0.0,0.0,0.0,1.0,0.0,11585.29,0.0,0.0,0
14617,0.0,1.0,0.0,0.0,0.0,114940.19,15336.0,0.0,0


0    1652
1    1634
Name: isFraud, dtype: int64

In [15]:
train_balanced.to_csv('balanced-classes/train.csv', index=False)
test_balanced.to_csv('balanced-classes/test.csv', index=False)