In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

import os
import random
import warnings
warnings.filterwarnings("ignore")

In [2]:
np.random.seed(2022)
random.seed(2022)
os.environ['PYTHONHASHSEED'] = '0'

In [3]:
data = pd.read_csv('./creditcard.csv')

In [4]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
data.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.918649e-15,5.682686e-16,-8.761736e-15,2.811118e-15,-1.552103e-15,2.04013e-15,-1.698953e-15,-1.893285e-16,-3.14764e-15,...,1.47312e-16,8.042109e-16,5.282512e-16,4.456271e-15,1.426896e-15,1.70164e-15,-3.662252e-16,-1.217809e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [6]:
data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [7]:
print('Class 0:', data['Class'].value_counts()[0]) # No Frauds
print('Class 1:', data['Class'].value_counts()[1]) # Frauds

Class 0: 284315
Class 1: 492


In [9]:
print("No frauds rate : {}".format((284315 / (284315 + 492)) * 100))

No frauds rate : 99.82725143693798


In [10]:
# Scaling
from sklearn.preprocessing import RobustScaler

rob_scaler = RobustScaler()

data['amount'] = rob_scaler.fit_transform(data['Amount'].values.reshape(-1,1))
data['time'] = rob_scaler.fit_transform(data['Time'].values.reshape(-1,1))

data.drop(['Time', 'Amount'], axis=1, inplace=True)

In [11]:
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,amount,time
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,1.783274,-0.994983
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,-0.269825,-0.994983
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,4.983721,-0.994972
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,1.418291,-0.994972
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,0.670579,-0.99496


In [12]:
# Undersampling

In [14]:
data1 = data.sample(frac=1) # data random shuffle

fraud_data = data1.loc[data1['Class'] == 1]
non_fraud_data = data1.loc[data1['Class'] == 0][:492]

In [15]:
print(len(fraud_data))
print(len(non_fraud_data))

492
492


In [16]:
balanced_data = pd.concat([fraud_data, non_fraud_data])
balanced_data = balanced_data.sample(frac=1)

In [17]:
balanced_data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,amount,time
213005,1.787155,-1.617071,-1.140874,-0.937252,-0.791878,0.097439,-0.749371,0.025273,-0.215018,0.822725,...,0.550282,-0.024122,0.307207,-0.203156,-0.223969,-0.046904,-0.023155,0,2.640956,0.638906
10484,1.088375,0.898474,0.394684,3.170258,0.175739,-0.221981,-0.022989,-0.010874,0.860044,-0.592473,...,-0.800852,0.077614,0.167608,0.350182,-0.118941,0.012948,0.054254,1,-0.254454,-0.793066
215678,2.148323,0.540882,-3.527269,0.564761,1.453125,-1.212508,0.672031,-0.311852,0.3485,-0.999488,...,-0.24941,-0.100089,-0.195055,0.417648,0.733449,-0.091336,-0.024408,0,-0.296793,0.65177
243547,-6.618211,3.835943,-6.316453,1.844111,-2.476892,-1.886718,-3.817495,0.61347,-1.482121,-4.868747,...,0.038727,0.278218,0.78667,0.063895,0.154707,-2.042403,1.405141,1,0.499266,0.790423
154693,-2.488363,4.359019,-7.77641,5.364027,-1.823877,-2.44514,-4.964221,1.48489,-2.947899,-7.17535,...,1.021226,-0.266476,-0.37088,0.365535,0.081372,0.184983,-0.211582,1,-0.29344,0.210611


In [18]:
X = balanced_data.drop('Class', axis=1)
y = balanced_data['Class']

In [19]:
# Data Split
from sklearn.model_selection import train_test_split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

In [21]:
logistic = LogisticRegression()
logistic.fit(X_train, y_train)

LogisticRegression()

In [22]:
from sklearn.model_selection import cross_val_score

training_score = cross_val_score(logistic, X_train, y_train, cv=5)

In [23]:
training_score

array([0.94936709, 0.93670886, 0.92993631, 0.93630573, 0.91082803])

In [25]:
# 평가
y_pred = logistic.predict(X_test)

from sklearn.metrics import accuracy_score
test_accuracy = accuracy_score(y_test, y_pred)
test_accuracy

0.949238578680203

In [26]:
# Oversampling - SMOTE

In [27]:
X = data.drop('Class', axis=1)
y = data['Class']

In [28]:
X.shape

(284807, 30)

In [29]:
kfold = StratifiedKFold(n_splits=5)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

In [30]:
print(X_train.shape)
print(X_test.shape)

(227845, 30)
(56962, 30)


In [31]:
from imblearn.over_sampling import SMOTE

In [32]:
logistic = LogisticRegression()

In [34]:
for train, test in kfold.split(X_train, y_train):
    smote = SMOTE(sampling_strategy='minority')
    x_oversample, y_oversample = smote.fit_resample(X_train[train], y_train[train])
    
    logistic.fit(x_oversample, y_oversample)

In [35]:
y_pred = logistic.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.9752817667918964
