- 사기거래탐지

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the three CSV tables: cards, transactions and users.
card_df = pd.read_csv('../data/ftd/cards_data.csv')
trans_df = pd.read_csv('../data/ftd/transactions_data.csv')
user_df = pd.read_csv('../data/ftd/users_data.csv')

# MCC lookup: orient='index' turns each top-level JSON key into a row label.
# The resulting value column (labelled 0) is renamed to 'category'.
mcc_df = pd.read_json('../data/ftd/mcc_codes.json', orient='index')
mcc_df = mcc_df.rename(columns={0: 'category'})

# Fraud labels for the training portion of the transactions.
fraud_df = pd.read_json('../data/ftd/train_fraud_labels.json')

- card_df의 id는 카드 id = card_id
- user_df의 id는 고객 id = client_id

In [3]:
# Attach the owning customer's attributes to every card (cards are the left table).
# Both tables have an 'id' column, so the merge suffixes them: 'id_x' is the
# card's id and 'id_y' is the user's id.
card_user_df = (
    card_df
    .merge(user_df, left_on='client_id', right_on='id')
    .drop(columns=['card_number', 'cvv'])  # remove the card number and CVV (security code)
    .rename(columns={'id_x': 'card_id'})   # the card table's id becomes card_id
)

In [4]:
# Move the row labels of both lookup tables into regular columns and give the
# columns names that match the join keys used below.
fraud_df = fraud_df.reset_index().rename(columns={'index': 'id', 'target': 'fraud_label'})
mcc_df = mcc_df.reset_index().rename(columns={'index': 'mcc', 'category': 'mcc_label'})

# Left joins keep every transaction, including those without a fraud label
# or a matching MCC entry.
trans_df = (
    trans_df
    .merge(fraud_df, on='id', how='left')
    .merge(mcc_df, on='mcc', how='left')
)

# When merchant_city is ONLINE, set merchant_state to ONLINE as well.
trans_df.loc[trans_df['merchant_city'].eq('ONLINE'), 'merchant_state'] = 'ONLINE'

# Drop the merchant zip code, transaction id, merchant id and the raw MCC code
# (the MCC description is kept in mcc_label).
trans_df = trans_df.drop(columns=['zip', 'id', 'merchant_id', 'mcc'])

In [5]:
# Enrich each transaction with the card and customer attributes, matching on
# both keys; how='left' keeps transactions that have no matching card row.
trans_df = pd.merge(
    trans_df,
    card_user_df,
    how='left',
    on=['card_id', 'client_id'],
)

In [6]:
# Preview the merged transaction table (rendered as the cell output below).
trans_df

Unnamed: 0,date,client_id,card_id,amount,use_chip,merchant_city,merchant_state,errors,fraud_label,mcc_label,...,birth_month,gender,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards
0,2010-01-01 00:01:00,1556,2972,$-77.00,Swipe Transaction,Beulah,ND,,No,Miscellaneous Food Stores,...,7,Female,594 Mountain View Street,46.80,-100.76,$23679,$48277,$110153,740,4
1,2010-01-01 00:02:00,561,4575,$14.57,Swipe Transaction,Bettendorf,IA,,No,Department Stores,...,6,Male,604 Pine Street,40.80,-91.12,$18076,$36853,$112139,834,5
2,2010-01-01 00:02:00,1129,102,$80.00,Swipe Transaction,Vista,CA,,No,Money Transfer,...,4,Male,2379 Forest Lane,33.18,-117.29,$16894,$34449,$36540,686,3
3,2010-01-01 00:05:00,430,2860,$200.00,Swipe Transaction,Crown Point,IN,,,Money Transfer,...,5,Female,903 Hill Boulevard,41.42,-87.35,$26168,$53350,$128676,685,5
4,2010-01-01 00:06:00,848,3915,$46.41,Swipe Transaction,Harwood,MD,,No,Drinking Places (Alcoholic Beverages),...,5,Male,166 River Drive,38.86,-76.60,$33529,$68362,$96182,711,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13305910,2019-10-31 23:56:00,1718,2379,$1.11,Chip Transaction,West Covina,CA,,No,Miscellaneous Food Stores,...,11,Female,766 Third Drive,34.02,-117.89,$22681,$33483,$196,698,5
13305911,2019-10-31 23:56:00,1766,2066,$12.80,Online Transaction,ONLINE,ONLINE,,No,"Digital Goods - Media, Books, Apps",...,9,Male,6076 Bayview Boulevard,43.06,-87.96,$9995,$20377,$12092,789,4
13305912,2019-10-31 23:57:00,199,1031,$40.44,Swipe Transaction,Allen,TX,,No,"Utilities - Electric, Gas, Water, Sanitary",...,4,Female,7927 Plum Lane,33.10,-96.66,$32580,$78329,$40161,720,3
13305913,2019-10-31 23:58:00,1986,5443,$4.00,Chip Transaction,Daly City,CA,,,"Grocery Stores, Supermarkets",...,12,Female,5887 Seventh Lane,37.68,-122.43,$23752,$48430,$62384,716,2


In [8]:
# A missing 'errors' entry is treated as a normal transaction and encoded as 0.
# The identifier and location columns are then dropped as low-importance
# features for now (this selection may change later).
trans_df = (
    trans_df
    .fillna({'errors': 0})
    .drop(columns=['client_id', 'card_id', 'id_y', 'address', 'latitude', 'longitude'])
)

In [20]:
# Split the transactions by whether a fraud label is available.
# .copy() makes each subset an independent frame, so the column assignment
# below is not a chained assignment on a slice of trans_df (which raises
# SettingWithCopyWarning; the notebook's blanket warnings filter hides it).
train_trans_df = trans_df[trans_df['fraud_label'].notnull()].copy()  # labelled rows (~9 million per the original note)
test_trans_df = trans_df[trans_df['fraud_label'].isnull()].copy()  # unlabelled rows (~4 million per the original note)

# Encode the target as integers: "Yes" (fraud) -> 1, "No" (normal) -> 0.
YN_dict = {"Yes": 1, "No": 0}
train_trans_df['fraud_label'] = train_trans_df['fraud_label'].map(YN_dict)

In [10]:
# List the columns that remain in the labelled training frame.
train_trans_df.columns

Index(['date', 'amount', 'use_chip', 'merchant_city', 'merchant_state',
       'errors', 'fraud_label', 'mcc_label', 'card_brand', 'card_type',
       'expires', 'has_chip', 'num_cards_issued', 'credit_limit',
       'acct_open_date', 'year_pin_last_changed', 'card_on_dark_web',
       'current_age', 'retirement_age', 'birth_year', 'birth_month', 'gender',
       'per_capita_income', 'yearly_income', 'total_debt', 'credit_score',
       'num_credit_cards'],
      dtype='object')

- target은 fraud_label
- fraud_label 값이 0: 정상 거래 1: 사기 거래

- 학습할 데이터: train_trans_df (X,y 분리 안되어 있음)