In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import mglearn
%matplotlib inline
import seaborn as sns
import platform
from matplotlib import font_manager , rc

if platform.system() == 'Darwin':
  rc('font' , family = 'AppleGothic')
elif platform.system() == 'Windows':
  path = 'C:/Windows/Fonts/malgun.ttf'
  font_name = font_manager.FontProperties(fname = path).get_name()
  rc('font' , family = font_name)
else:
  print('모름')
plt.rcParams['axes.unicode_minus'] = False
import warnings
warnings.filterwarnings('ignore')

#### 불균형 데이터를 처리하기 위한 샘플링 기법
**불균형 데이터**
- 정상 범주의 관측치 수의 이상 범주의 관측치 수가 현저이 차이가 나는 데이터(편향이 심한거)
- 문제점 : 정상을 정확히 분류하는 것과 이상을 정확히 분류하는 것 중 일반적으로 이상을 정확히 분류하는 것이 중요하다.
- 불균형한 데이터 세트는 이상 데이터를 정확히 찾아내지 못할 수 있다는 문제점이 존재

**기법들**
1. 언더 샘플링 : 다수 범주의 데이터를 소수 범주의 데이터 수에 맞게 줄이는 방식
 - Random Sampleing : 다수의 범주에서 무작위로 샘플링하는 것
 - Tomek Links : 두 범주 사이를 탐지하고 정리를 통해 부정확한 분류경계선을 방지하는 방법
 - CNN Rule : 합성곱신경망
 - One Sided Selection : Tomek Links + CNN RUle
2. 오버 샘플링 : 소수 범주의 데이터를 다수 범주의 데이터 수에 맞게 늘이는 방식
 - Resampling : 소수 범주의 데이터를 다수 범주의 데이터 수에 맞게 늘이는 방식
 - SMOTE : 소수 범주에서 가상의 데이터를 생성하는 방법 , knn으로 데이터를 증식시킴
 - GAN
 
 
**피처 엔지니어링(특성공학)**
- log변환
- IQR(Inter QUantile Range)
 - Q3 + 1.5*IQR보다 크면 이상치
 - Q1 - 1.5*IQR보다 작으면 이상치

In [2]:
card_df = pd.read_csv('creditcard.csv')

- 이상 거래 판단할 관련 데이터 셋
- 이상 거래는 카드값을 지불하지 않을 의도를 가지고서 결제를 하거나 , 도난된 카드를 가지고 결제를 하는 거래 등을 말한다.
- 종속변수 : 이상거래 여부
- 알고리즘 종류 : 분류
- 평가지표 : 정확도 , 혼동 행렬 , 분류 리포트 등

In [3]:
card_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
data = card_df.iloc[:,1:-1]
target = card_df.iloc[:,-1]

In [5]:
from sklearn.model_selection import train_test_split

train_input , test_input , train_target , test_target = train_test_split(data , target , test_size = 0.2 , random_state = 42 , stratify = target)

In [6]:
print('train : \n' , train_target.value_counts()/train_target.shape[0])
print('test : \n' , test_target.value_counts()/test_target.shape[0])

train : 
 0    0.998271
1    0.001729
Name: Class, dtype: float64
test : 
 0    0.99828
1    0.00172
Name: Class, dtype: float64


In [45]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score , classification_report
from sklearn.metrics import f1_score, confusion_matrix, roc_curve, precision_recall_curve

In [8]:
def get(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    #roc_auc = roc_auc_score(y_test, pred_proba)

    print('오차 행렬(혼돈 행렬)')
    print(confusion)

    print(f'정확도:{accuracy:.4f}, 정밀도:{precision:.4f}, 재현율:{recall:.4f}, F1:{f1:.4f}')

def model_fit(model):
    model.fit(train_input , train_target)
    pred = model.predict(test_input)
    return get(test_target , pred)

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_jobs = -1 , random_state = 0)

In [10]:
def model_fit(model):
    model.fit(train_input , train_target)
    pred = model.predict(test_input)
    return get(test_target , pred)

In [11]:
model_fit(rf)

오차 행렬(혼돈 행렬)
[[56859     5]
 [   18    80]]
정확도:0.9996, 정밀도:0.9412, 재현율:0.8163, F1:0.8743


In [12]:
from lightgbm import LGBMClassifier
lgb = LGBMClassifier(random_state = 0)
model_fit(lgb)

오차 행렬(혼돈 행렬)
[[56596   268]
 [   33    65]]
정확도:0.9947, 정밀도:0.1952, 재현율:0.6633, F1:0.3016


In [13]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
model_fit(lr)

오차 행렬(혼돈 행렬)
[[56850    14]
 [   36    62]]
정확도:0.9991, 정밀도:0.8158, 재현율:0.6327, F1:0.7126


In [14]:
card_df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [15]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [16]:
amount = ss.fit_transform(card_df.Amount.values.reshape(-1,1))

In [17]:
card_df.Amount = amount

In [18]:
from sklearn.model_selection import train_test_split

train_input , test_input , train_target , test_target = train_test_split(data , target , test_size = 0.2 , random_state = 42 , stratify = target)

In [19]:
data = card_df.iloc[:,1:-1]
target = card_df.iloc[:,-1]

In [20]:
lr = LogisticRegression()
model_fit(lr)

오차 행렬(혼돈 행렬)
[[56850    14]
 [   36    62]]
정확도:0.9991, 정밀도:0.8158, 재현율:0.6327, F1:0.7126


In [21]:
card = pd.read_csv('creditcard.csv')

In [22]:
amount_n = np.log1p(card['Amount'])
card.drop(['Time','Amount'] , axis = 1 , inplace = True)
card.insert(0 , 'AmountScaled' , amount_n)
card.head()

Unnamed: 0,AmountScaled,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,5.01476,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0
1,1.305626,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0
2,5.939276,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0
3,4.824306,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0
4,4.262539,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0


In [23]:
data = card.iloc[:,:-1]
target = card.iloc[:,-1]

In [24]:
from sklearn.model_selection import train_test_split

train_input , test_input , train_target , test_target = train_test_split(data , target , test_size = 0.2 , random_state = 42 , stratify = target)

In [25]:
lr = LogisticRegression()
model_fit(lr)

오차 행렬(혼돈 행렬)
[[56851    13]
 [   34    64]]
정확도:0.9992, 정밀도:0.8312, 재현율:0.6531, F1:0.7314


In [26]:
def get_outlier(df = None , column = None , weight = 1.5):
    fraud = df[df['Class'] == 1][column]
    quantile_25 = np.percentile(fraud.values , 25)
    quantile_75 = np.percentile(fraud.values , 75)
    iqr = quantile_75 - quantile_25
    iqr_weight = iqr * weight
    lowest_val = quantile_25 - iqr_weight
    highest_val = quantile_75 + iqr_weight
    outlier_index = fraud[(fraud<lowest_val)|(fraud>highest_val)].index
    return outlier_index

In [27]:
outlier_index = get_outlier(df = card , column = 'V14' , weight = 1.5)

In [28]:
card.drop(outlier_index , axis = 0 , inplace = True)

In [29]:
data = card.iloc[:,:-1]
target = card.iloc[:,-1]
from sklearn.model_selection import train_test_split

train_input , test_input , train_target , test_target = train_test_split(data , target , test_size = 0.2 , random_state = 42 , stratify = target)
lr = LogisticRegression()
model_fit(lr)

오차 행렬(혼돈 행렬)
[[56851    12]
 [   33    65]]
정확도:0.9992, 정밀도:0.8442, 재현율:0.6633, F1:0.7429


In [30]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state = 0)
train_input_over , train_target_over = smote.fit_resample(train_input , train_target)

In [31]:
lr = LogisticRegression()
lr.fit(train_input_over , train_target_over)

In [32]:
get(test_target , lr.predict(test_input))

오차 행렬(혼돈 행렬)
[[55307  1556]
 [    6    92]]
정확도:0.9726, 정밀도:0.0558, 재현율:0.9388, F1:0.1054


In [33]:
lgb = LGBMClassifier()
lgb.fit(train_input_over , train_target_over)
get(test_target , lgb.predict(test_input))

오차 행렬(혼돈 행렬)
[[56787    76]
 [   12    86]]
정확도:0.9985, 정밀도:0.5309, 재현율:0.8776, F1:0.6615


In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [40]:
train_input.shape

(227842, 29)

In [52]:
model = Sequential()
model.add(Dense(30,  input_dim=29, activation='softplus'))
model.add(Dense(12, activation='softplus'))
model.add(Dense(1, activation='sigmoid'))
model.summary()



Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_11 (Dense)            (None, 30)                900       
                                                                 
 dense_12 (Dense)            (None, 12)                372       
                                                                 
 dense_13 (Dense)            (None, 1)                 13        
                                                                 
Total params: 1285 (5.02 KB)
Trainable params: 1285 (5.02 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [53]:
# 모델을 컴파일합니다.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 모델을 실행합니다.
history=model.fit(train_input , train_target , epochs=20) # 0.8 x 0.25 = 0.2

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [54]:
model.evaluate(test_input , test_target)



[0.0023415624164044857, 0.9995259642601013]

In [47]:
rf.fit(train_input , train_target)

In [48]:
print(classification_report(test_target , rf.predict(test_input)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       0.94      0.82      0.87        98

    accuracy                           1.00     56961
   macro avg       0.97      0.91      0.94     56961
weighted avg       1.00      1.00      1.00     56961



In [62]:
print(classification_report(test_target , np.where(model.predict(test_input)>0.7 , 1 , 0)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       0.96      0.80      0.87        98

    accuracy                           1.00     56961
   macro avg       0.98      0.90      0.94     56961
weighted avg       1.00      1.00      1.00     56961



In [50]:
confusion_matrix(test_target , rf.predict(test_input))

array([[56858,     5],
       [   18,    80]], dtype=int64)

In [61]:
confusion_matrix(test_target , np.where(model.predict(test_input)>=0.7 , 1 , 0))



array([[56860,     3],
       [   20,    78]], dtype=int64)