# Data Sampling

## Import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Default size (in inches) for every figure drawn in this notebook.
plt.rcParams['figure.figsize'] = (10, 6)

from matplotlib import rcParams

# Global text settings: serif font at 10pt, and render negative tick labels
# with an ASCII hyphen instead of the Unicode minus glyph.
rcParams.update({
    'font.family': 'DejaVu Serif',
    'font.size': 10,
    'axes.unicode_minus': False,
})

In [None]:
!pip install mglearn

## Contents
- Under Sampling: 데이터의 일부를 Sampling (Skipped - shuffle과 stratify 활용)
- Over Sampling - SMOTE: 소수 클래스의 데이터가 부족한 경우, 기존 샘플을 바탕으로 합성(가상) 샘플을 생성하여 채우는 Sampling

## Data Import

In [None]:
# Read the credit-card transactions CSV and discard the 'Time' column in a
# single chained expression, so `df` is bound only once per run of this cell.
df = pd.read_csv(
    './drive/MyDrive/Colab Notebooks/Machine Learning/creditcard.csv'
).drop(columns='Time')

# Display the loaded frame (pandas truncates the rendering automatically).
df

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [None]:
# Label meaning — 0: legitimate transaction, 1: fraudulent transaction.
# Count the rows per label to show how imbalanced the target is.
df.loc[:, 'Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

## 데이터 분할
1. train / test split
2. train data로 fit하여 over sampling - model: SMOTE
3. 생성된 데이터와 train data로 모델 학습 (ex. Random Forest)
4. test data로 평가 (test data에는 over sampling을 적용하지 않고 원본 그대로 사용)

In [None]:
from sklearn.model_selection import train_test_split

# Target vector is the 'Class' label; the feature matrix is every other column.
y = df['Class']
X = df.drop(columns='Class')

# Stratify on the label so the train and test splits keep the same class ratio;
# the fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=123,
)

In [None]:
# 0: 정상 거래, 1: 사기 거래
print(f'X_train: {X_train.shape}, y_train: {y_train.shape}')
pd.Series(y_train).value_counts()

X_train: (213605, 29), y_train: (213605,)


Class
0    213236
1       369
Name: count, dtype: int64

## Over Sampling 적용 - SMOTE

In [None]:
!pip install imbalanced-learn

#### SMOTE 모델 생성

In [None]:
from imblearn.over_sampling import SMOTE
# Create the SMOTE over-sampler; random_state pins the sampler's random seed.
smote = SMOTE(random_state=123)

#### 데이터 생성

In [None]:
# Over-sample the training split only; X_test / y_test are left untouched
# so evaluation happens on the original class distribution.
X_train_over, y_train_over = smote.fit_resample(X=X_train, y=y_train)

#### 데이터 확인

In [None]:
# 0: 정상 거래, 1: 사기 거래
print(f'X_train_over: {X_train_over.shape}, y_train_over: {y_train_over.shape}')
pd.Series(y_train_over).value_counts()

X_train_over: (426472, 29), y_train_over: (426472,)


Class
0    213236
1    213236
Name: count, dtype: int64

## Over Sampling 적용 후 분류 모델 평가

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

# 모델 생성
rf = RandomForestClassifier(n_estimators=5, random_state=123)

# 모델 학습
rf.fit(X_train_over, y_train_over)

# 모델 평가
rf.score(X_test, y_test)

CPU times: user 21.3 s, sys: 53.3 ms, total: 21.3 s
Wall time: 21.4 s


0.9994241734782731