## Import

In [None]:
import torch

import pandas as pd
import numpy as np

from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Data Load

In [None]:
# Root folder of the competition files; the CSV splits are read from its data/ subfolder.
directory = './DACON_신용카드 사기 거래 탐지 AI 경진대회'

# Load the train, validation and test splits in one pass.
train_df, val_df, test_df = (
    pd.read_csv(f'{directory}/data/{split}.csv')
    for split in ('train', 'val', 'test')
)

In [None]:
## Scaling functions
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
def train_df_scaler(df):
    """Standardize every column of ``df`` with a freshly fitted StandardScaler.

    The scaler is fitted on ``df`` itself, so the statistics used are those of
    the frame that is passed in.

    Args:
        df: DataFrame containing only the numeric columns to scale.

    Returns:
        A new DataFrame with the same column labels and the same index as
        ``df``, holding the standardized values.
    """
    scaler = StandardScaler()  # MinMaxScaler() was the alternative tried here
    # Keep the caller's index: callers concatenate the result with other
    # columns of the source frame along axis=1, which aligns on index labels.
    return pd.DataFrame(
        scaler.fit_transform(df),
        columns=df.columns,
        index=df.index,
    )

def vaild_df_scaler(df, label: str):
    """Standardize the feature columns of ``df`` and re-attach the label column.

    The scaler is fitted on the feature columns of ``df`` itself.

    Args:
        df: DataFrame holding the feature columns plus the label column.
        label: Name of the label column. It is excluded from scaling and
            appended unchanged as the last column of the result.

    Returns:
        A new DataFrame with the standardized features followed by the
        untouched ``label`` column, indexed like ``df``.
    """
    class_df = df[[label]]
    # Drop the column named by `label` (previously 'Class' was hard-coded, so
    # any other label name was scaled as a feature or raised a KeyError).
    features = df.drop(columns=[label])
    scaler = StandardScaler()  # MinMaxScaler() was the alternative tried here
    # Preserve the index so the axis=1 concat below pairs each scaled row with
    # its own label even when `df` does not carry a default RangeIndex.
    scaled = pd.DataFrame(
        scaler.fit_transform(features),
        columns=features.columns,
        index=features.index,
    )
    return pd.concat([scaled, class_df], axis=1)

In [None]:
# Standardize the feature columns of each split; ID is an identifier, not a feature.
# NOTE(review): every call below fits its own scaler, so train, validation and
# test are each standardized with their own statistics — confirm this is intended.
train_data = train_df_scaler(train_df.drop(columns=['ID']))

valid_data = vaild_df_scaler(val_df.drop(columns=['ID']), 'Class')

# Scale the test features, then put the ID column back in front of them.
test_data = pd.concat(
    [test_df[['ID']], train_df_scaler(test_df.drop(columns=['ID']))],
    axis=1,
)

## Model Define & Fit

In [None]:
# The train dataset has no labels.
# NOTE(review): the features come from the raw train_df here, not from the
# standardized train_data built earlier — confirm this is intended.
train_x = train_df.drop(columns=['ID']) # Input Data

In [None]:
# Hypothesis: fraud occurs in the train dataset at the same rate as in the
# validation dataset, so the validation contamination rate is used as the
# model's `contamination` parameter.
# NOTE(review): the original note quotes the validation rate as 0.001055, but
# the value assigned below is 0.001 — confirm which one is intended.
#val_contamination = 0.00
val_contamination = 0.001
model = EllipticEnvelope(support_fraction = 0.994, contamination = val_contamination, random_state = 42)
model.fit(train_x)

EllipticEnvelope(contamination=0.001, random_state=42, support_fraction=0.994)

# val_contamination = 0.005, support_fraction - 0.994, k =

In [None]:
def get_pred_label(model, x, k):
    """Flag the ``k`` lowest-scoring rows of ``x`` as anomalies.

    Args:
        model: Fitted estimator exposing ``score_samples(x)``, where a lower
            score means a more abnormal sample (for ``EllipticEnvelope`` the
            score is the negative Mahalanobis distance).
        x: Samples to score; must support ``len()``.
        k: Number of samples to label as anomalies. It is clamped to
            ``[0, len(x)]`` so an oversized ``k`` flags every row instead of
            making ``torch.topk`` raise.

    Returns:
        A ``(pred, prob)`` tuple of plain lists: ``pred`` holds 1 for the
        flagged rows and 0 elsewhere, ``prob`` holds the per-row scores
        (rounded to float32 by the tensor conversion).
    """
    prob = torch.tensor(model.score_samples(x), dtype=torch.float)
    n_flag = max(0, min(int(k), len(x)))
    pred = torch.zeros(len(x), dtype=torch.long)
    if n_flag:
        # largest=False: the *smallest* scores are the most abnormal samples.
        lowest = torch.topk(prob, k=n_flag, largest=False).indices
        pred[lowest] = 1
    return pred.tolist(), prob.tolist()

# Split the validation frame into labels and model inputs
# (ID is an identifier, Class is the target).
val_y = val_df['Class']
val_x = val_df.drop(columns=['ID', 'Class'])

# The validation set contains 30 fraud rows; the original code flags 29.
val_pred, val_prob = get_pred_label(model, val_x, 29)

val_score = f1_score(val_y, val_pred, average='macro')
print(f'Validation F1 Score : [{val_score}]')
print(classification_report(val_y, val_pred))

# Rows of the confusion matrix are the true classes, columns the predictions.
(tn, fp), (fn, tp) = confusion_matrix(val_y, val_pred)
print('tp : ', tp, ', fp : ', fp, ', tn : ', tn, ', fn : ', fn)

Validation F1 Score : [0.9236496787663914]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.86      0.83      0.85        30

    accuracy                           1.00     28462
   macro avg       0.93      0.92      0.92     28462
weighted avg       1.00      1.00      1.00     28462

tp :  25 , fp :  4 , tn :  28428 , fn :  5


## Inference : Test set

In [None]:
# The model is fitted on the raw (unscaled) columns of train_df and validated
# on the raw columns of val_df, so the test features must come from the same
# raw feature space. Scoring the separately standardized test_data would
# evaluate the fitted covariance in a different coordinate system.
test_x = test_df.drop(columns=['ID'])

In [None]:
# Flag the 318 lowest-scoring test rows as fraud and print how many were flagged.
# NOTE(review): 318 is hard-coded and its origin is not shown here — confirm.
test_pred, _ = get_pred_label(model, test_x, 318)
print('n_fraud : ', sum(test_pred))

n_fraud :  318


In [None]:
# Compare the new predictions with the submission produced before scaling.
output = pd.read_csv('./DACON_신용카드 사기 거래 탐지 AI 경진대회/submit/EllipticEnvelope_0.005.csv')
output['new_class'] = test_pred
# Show only the rows whose predicted class changed.
output.loc[output['Class'].ne(output['new_class'])]

Unnamed: 0,ID,Class,new_class
24797,A0xc271,0,1
29109,A0xe462,0,1
110069,0x35c35,1,0
112266,0x36d3a,1,0


In [None]:
## Compare all EllipticEnvelope submissions side by side
submit = pd.read_csv(f'{directory}/data/sample_submission.csv')

# Earlier submissions, one file per contamination setting.
submit_dir = '/content/drive/MyDrive/DACON_신용카드 사기 거래 탐지 AI 경진대회/submit'
output1 = pd.read_csv(f'{submit_dir}/EllipticEnvelope_0.001055_by_sh.csv')  # 0.9305289388 (presumably its score — confirm)
output2 = pd.read_csv(f'{submit_dir}/EllipticEnvelope_0.002_by_sh.csv')
output3 = pd.read_csv(f'{submit_dir}/EllipticEnvelope_0.003_by_sh.csv')
output5 = pd.read_csv(f'{submit_dir}/EllipticEnvelope_0.005_by_sh.csv')  # 0.9305289388 (presumably its score — confirm)

# One column per earlier submission, then the predictions from this notebook.
for tag, earlier in (('1', output1), ('2', output2), ('3', output3), ('5', output5)):
    submit[f'class_{tag}'] = earlier['Class']
submit['class_mn_5'] = test_pred

In [None]:
# Rows where the 0.001055 and 0.005 submissions disagree.
submit.loc[submit['class_1'].ne(submit['class_5'])]

Unnamed: 0,ID,Class,class_1,class_2,class_3,class_5,class_mn_5


In [None]:
# Rows where the 0.001055 submission and the new predictions disagree.
submit.loc[submit['class_1'].ne(submit['class_mn_5']), ['ID', 'class_1', 'class_mn_5']]

Unnamed: 0,ID,class_1,class_mn_5
24797,A0xc271,0,1
29109,A0xe462,0,1
110069,0x35c35,1,0
112266,0x36d3a,1,0


## Submission

In [None]:
# Reload the sample submission; rebinding `submit` discards the comparison
# columns added to the previous frame.
# NOTE(review): an earlier cell reads this file from
# f'{directory}/data/sample_submission.csv'; the 'data/' segment is absent
# here — confirm which path is correct.
submit = pd.read_csv(f'{directory}/sample_submission.csv')
submit.head()

Unnamed: 0,ID,Class
0,AAAA0x1,1
1,AAAA0x2,1
2,AAAA0x5,1
3,AAAA0x7,1
4,AAAA0xc,1


In [None]:
# The output file name records the contamination value used for this run.
submission_path = f'{directory}/EllipticEnvelope_{val_contamination}_by_young3.csv'

# Overwrite the template labels with the test predictions and save without the index.
submit['Class'] = test_pred
submit.to_csv(submission_path, index=False)