Import

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

#### Data Load
Train set에는 사기 여부 X

Validation set에는 사기 여부 O (정상 : 0, 사기 :1 )

In [2]:
train_df = pd.read_csv('data/train.csv') # Train : 113842 rows × 31 columns
val_df = pd.read_csv('data/val.csv') # Validation : 28462 rows × 32 columns
test_df = pd.read_csv('data/test.csv') # Test : 142503 rows × 31 columns

#### 결측치 확인

In [None]:
train_df.isnull().sum()

In [None]:
val_df.isnull().sum()

## 1) EDA

In [None]:
val_df.describe()

##### Train & Validation Feature 분포 확인

In [None]:
train = train_df.drop(['ID'],axis=1)
val = val_df.drop(['ID'],axis=1)
test = test_df.drop(['ID'],axis=1)

In [None]:
train.hist(bins = 50, figsize = (20,20))
plt.show()

In [None]:
val.hist(bins = 50, figsize = (20,20))
plt.show()

In [None]:
val_class0 = val[val['Class']==0]
val_class1 = val[val['Class']==1]
columns = val.drop('Class', axis=1).columns

import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt 
import seaborn as sns 

grid = gridspec.GridSpec(6, 6)
plt.figure(figsize=(20, 20))

for n, col in enumerate(val_df[columns]):
    ax = plt.subplot(grid[n])
    sns.histplot(val_class0[col], bins=50, color='dodgerblue', stat='density', kde=True)
    sns.histplot(val_class1[col], bins=50, color='orangered', stat='density', kde=True)
    ax.set_title(str(col))
    ax.set_xlabel('')
    
plt.show()

Features correlation

In [None]:
plt.figure(figsize = (14,14))
plt.title('features correlation plot (Pearson)')
corr = train.corr()
sns.heatmap(corr,xticklabels=corr.columns,yticklabels=corr.columns,linewidths=.1,cmap="Reds")
plt.show()

validation set에서의 사기 비율

In [12]:
val_normal, val_fraud = val_df['Class'].value_counts()
val_contamination = val_fraud / val_normal
print(f'사기 비율 : {val_contamination}')

사기 비율 : 0.0010551491277433877


## 2) 베이스라인 모델

#### 알고리즘 1 : k-means

In [3]:
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, f1_score
from sklearn import model_selection
from sklearn.cluster import KMeans
from sklearn.covariance import EllipticEnvelope

k_X_train : train_df에서 ID를 제외한 V1~V30

k_X_val : val_df에서 ID와 Class를 제외한 V1~V30

In [4]:
k_X_train = train_df.drop('ID', axis=1)

k_X_val = val_df.drop(['ID', 'Class'], axis=1)
k_y_val = val_df['Class']

모델 학습

In [5]:
kmeans = KMeans(n_clusters=2, init='k-means++', max_iter=300, random_state=42)
kmeans.fit(k_X_train)
kmeans_val_pred = kmeans.predict(k_X_val.to_numpy())

모델 결과 확인

In [6]:
unique, counts = np.unique(kmeans_val_pred, return_counts=True)
kmeans_contamination = counts[1] / counts[0]
print(f'클래스 : {unique}, 클래스별 개수 : {counts}')
print(f'정상 데이터 : {counts[0]}, 사기 데이터 : {counts[1]}')
print(f'검증 데이터의 사기 비율 : [{kmeans_contamination}]')

클래스 : [0 1], 클래스별 개수 : [27911   551]
정상 데이터 : 27911, 사기 데이터 : 551
검증 데이터의 사기 비율 : [0.019741320626276378]


모델 성능 확인 - classification report

In [7]:
k_report = classification_report(k_y_val, kmeans_val_pred)
print(k_report)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     28432
           1       0.00      0.00      0.00        30

    accuracy                           0.98     28462
   macro avg       0.50      0.49      0.49     28462
weighted avg       1.00      0.98      0.99     28462



모델 성능 확인 - f1 score

In [9]:
k_f1_score = f1_score(k_y_val, kmeans_val_pred, average = "macro")
print(f'Validation F1 Score : [{k_f1_score}]')

Validation F1 Score : [0.4948440800099391]


## 3) 성능개선 과정

#### 알고리즘 2 : EllipticEnvelope

In [10]:
ell_X_train = train_df.drop('ID', axis=1)

ell_X_val = val_df.drop(['ID', 'Class'], axis=1)
ell_y_val = val_df['Class']

모델 학습

(*) Validation set의 사기 거래 비율이 다른 데이터 셋에서도 비슷하게 발생할 것이라고 가정
    
    -> model parameter : contamination=val_contamination(=0.001055) 적용

In [13]:
ell = EllipticEnvelope(contamination=val_contamination)
ell.fit(k_X_train)
ell_val_pred = ell.predict(k_X_val)

Class 컬럼
- 0: 정상 거래
- 1: 사기 거래

ell_val_pred
- 1: 정상 거래
- -1 : 이상거래

In [14]:
def get_pred_label(model_pred):
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

In [15]:
ell_val_pred = get_pred_label(ell_val_pred)

모델 결과 확인

In [16]:
unique, counts = np.unique(ell_val_pred, return_counts=True)
ell_contamination = counts[1] / counts[0]
print(f'클래스 : {unique}, 클래스별 개수 : {counts}')
print(f'정상 데이터 : {counts[0]}, 사기 데이터 : {counts[1]}')
print(f'검증 데이터의 사기 비율 : [{ell_contamination}]')

클래스 : [0 1], 클래스별 개수 : [28433    29]
정상 데이터 : 28433, 사기 데이터 : 29
검증 데이터의 사기 비율 : [0.0010199416171350192]


모델 성능 확인 - classification_report

In [17]:
ell_report = classification_report(ell_y_val, ell_val_pred)
print(ell_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.00      0.00      0.00        30

    accuracy                           1.00     28462
   macro avg       0.50      0.50      0.50     28462
weighted avg       1.00      1.00      1.00     28462



모델 성능 확인 - f1 score

In [18]:
ell_f1_score = f1_score(ell_y_val, ell_val_pred, average = "macro")
print(f'Validation F1 Score : [{ell_f1_score}]')

Validation F1 Score : [0.49948122746856594]


#### 알고리즘 3 : IsolationForest
밀도 기반으로 이상치를 탐지하는 알고리즘<br>
Random Forest의 변형 알고리즘

데이터의 이상치가 속한 구역을 찾아내기 위해 무작위로 분할을 수행하는 결정트리 이용 (균등하게 분할되지 X)

--> 무작위 분할을 통해 각 분할 구간의 경계선을 찾아내며, 이를 이용하여 이상치 구별

각 결정트리에서 이상치로 판단되는 데이터는 더 적은 분할 횟수를 거침

장점
- 이상치 탐지에 있어 성능이 높다
- 학습 데이터의 크기에 비해 빠른 속도로 탐지
- 상대적으로 적은 메모리 필요

단점
- 특정 데이터에 대해 이상치를 정확하게 판단하지 못함

In [19]:
IF_X_train = train_df.drop('ID', axis=1)

IF_X_val = val_df.drop(['ID', 'Class'], axis=1)
IF_y_val = val_df['Class']

(*) Validation set의 사기 거래 비율이 다른 데이터 셋에서도 비슷하게 발생할 것이라고 가정
    
    -> model parameter : contamination=val_contamination(=0.001055) 적용

In [20]:
from sklearn.ensemble import IsolationForest 

IFmodel=IsolationForest(n_estimators=125, 
                      max_samples=len(train_df), 
                      contamination=val_contamination, 
                      random_state=42, 
                      verbose=0)
IFmodel.fit(IF_X_train)

decision_function() 메서드와 predict() 메서드를 사용하여 train_df 데이터프레임의 이상치를 탐지

- decision_function() 메서드는 Isolation Forest 모델에서 데이터 포인트의 이상치 점수를 계산합니다. 계산된 이상치 점수는 해당 데이터 포인트가 이상치일 확률을 나타냅니다. 이상치일 가능성이 높을수록 점수가 더 낮아집니다.

- predict() 메서드는 decision_function() 메서드에서 계산된 이상치 점수를 바탕으로 데이터 포인트가 이상치인지 여부를 결정합니다. 이상치인 경우 1을, 정상적인 데이터 포인트인 경우 0을 반환합니다.


In [22]:
score=IFmodel.decision_function(IF_X_train.to_numpy())
anomaly=IFmodel.predict(IF_X_train.to_numpy())

In [23]:
IF_X_train['scores']=score
IF_X_train['anomaly']=anomaly

In [24]:
IF_X_train

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V23,V24,V25,V26,V27,V28,V29,V30,scores,anomaly
0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,4.983721,-0.994972,0.204528,1
1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,1.418291,-0.994972,0.228848,1
2,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,-0.371407,...,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.081080,-0.256131,-0.994960,0.245011,1
3,-0.644269,1.417964,1.074380,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,1.249376,...,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,0.262698,-0.994901,0.218990,1
4,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,-0.410430,...,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,0.994900,-0.994901,0.239372,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113837,-12.516732,10.187818,-8.476671,-2.510473,-4.586669,-1.394465,-3.632516,5.498583,4.893089,8.655320,...,0.890675,-1.253276,1.786717,0.320763,2.090712,1.232864,-0.169496,1.034857,0.082280,1
113838,1.884849,-0.143540,-0.999943,1.506772,-0.035300,-0.613638,0.190241,-0.249058,0.666458,0.120908,...,-0.042114,-0.053206,0.316403,-0.461441,0.018265,-0.041068,0.530986,1.034881,0.246407,1
113839,-0.241923,0.712247,0.399806,-0.463406,0.244531,-1.343668,0.929369,-0.206210,0.106234,-0.284708,...,0.279598,0.371441,-0.559238,0.113144,0.131507,0.081265,-0.230699,1.034904,0.251669,1
113840,0.120316,0.931005,-0.546012,-0.745097,1.130314,-0.235973,0.812722,0.115093,-0.204064,-0.657422,...,0.050343,0.102800,-0.435870,0.124079,0.217940,0.068803,-0.269825,1.034939,0.252301,1


val_df 로 모델 성능 확인 = 검증

In [25]:
score_val=IFmodel.decision_function(IF_X_val.to_numpy())
anomaly_val=IFmodel.predict(IF_X_val.to_numpy())

In [27]:
IF_X_val['scores']=score_val
IF_X_val['anomaly']=anomaly_val

In [29]:
IF_X_val

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V23,V24,V25,V26,V27,V28,V29,V30,scores,anomaly
0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,-0.366846,...,-0.120794,-0.385050,-0.069733,0.094199,0.246219,0.083076,-0.255991,-0.994878,0.241016,1
1,0.962496,0.328461,-0.171479,2.109204,1.129566,1.696038,0.107712,0.521502,-1.191311,0.724396,...,-0.048508,-1.371866,0.390814,0.199964,0.016371,-0.014605,0.168937,-0.994784,0.219359,1
2,1.145524,0.575068,0.194008,2.598192,-0.092210,-1.044430,0.531588,-0.241888,-0.896287,0.757952,...,-0.076510,0.691320,0.633984,0.048741,-0.053192,0.016251,0.169496,-0.994502,0.234993,1
3,0.927060,-0.323684,0.387585,0.544474,0.246787,1.650358,-0.427576,0.615371,0.226278,-0.225495,...,0.096632,-0.992569,0.085096,0.377447,0.036096,-0.005960,0.331307,-0.994467,0.226669,1
4,-3.005237,2.600138,1.483691,-2.418473,0.306326,-0.824575,2.065426,-1.829347,4.009259,6.051521,...,-0.163747,0.515821,0.136318,0.460054,-0.251259,-1.105751,-0.287012,-0.994373,0.170190,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28457,-0.546378,1.433992,-0.313252,0.926044,0.522388,-0.565669,1.066075,0.269799,-1.099446,-0.077753,...,-0.175451,0.030397,0.018381,-0.395994,0.301655,0.173585,0.202753,1.034622,0.243429,1
28458,-0.764523,0.588379,-0.907599,-0.418847,0.901528,-0.760802,0.758545,0.414698,-0.730854,-1.245088,...,0.141759,0.587119,-0.200998,0.267337,-0.152951,-0.065285,0.810452,1.034693,0.247480,1
28459,-0.446951,1.302212,-0.168583,0.981577,0.578957,-0.605641,1.253430,-1.042610,-0.417116,0.076605,...,-0.148093,-0.038712,0.010209,-0.362666,0.503092,0.229921,0.537972,1.034763,0.237981,1
28460,2.039560,-0.175233,-1.196825,0.234580,-0.008713,-0.726571,0.017050,-0.118228,0.435402,0.267772,...,0.297930,-0.359769,-0.315610,0.201114,-0.080826,-0.075071,-0.269964,1.034928,0.248362,1


In [31]:
print(IF_y_val.value_counts())
print(IF_X_val['anomaly'].value_counts())

0    28432
1       30
Name: Class, dtype: int64
 1    28433
-1       29
Name: anomaly, dtype: int64


In [33]:
IF_X_val['anomaly'] = get_pred_label(IF_X_val['anomaly'])

모델 성능 확인 - classification_report

In [34]:
IF_val_pred = IF_X_val['anomaly']
IF_report = classification_report(IF_y_val, IF_val_pred)
print(IF_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.41      0.40      0.41        30

    accuracy                           1.00     28462
   macro avg       0.71      0.70      0.70     28462
weighted avg       1.00      1.00      1.00     28462



모델 성능 확인 - f1 score

In [35]:
IF_f1_score = f1_score(IF_y_val, IF_val_pred, average = "macro")
print(f'Validation F1 Score : [{IF_f1_score}]')

Validation F1 Score : [0.7030820840915222]


## 4) 최종 모델