In [1]:
import matplotlib.pyplot as plt
import statsmodels.api as sm
import os
import pandas as pd
from sklearn.metrics import average_precision_score
from scipy.stats import ks_2samp
from scipy.stats import mannwhitneyu
import numpy as np
import seaborn as sns

In [2]:
# Load the raw test split and the training split from the project's data folder.
TEST_PATH = "data/online/test_raw.parquet"
TRAIN_PATH = "data/online/train_oss.parquet"

# The relative paths above resolve from the parent of the notebook's directory.
# An unconditional os.chdir("..") climbs one more level every time this cell is
# re-run, so only move up while the data is not reachable from the current
# working directory.
if not os.path.exists(TEST_PATH):
    os.chdir("..")

test = pd.read_parquet(TEST_PATH)
train = pd.read_parquet(TRAIN_PATH)

In [3]:
# Stack train and test row-wise into one working frame.
# A plain concat keeps each input's own index labels; the summary printed by
# the original cell ("886255 entries, 0 to 178153") indicates the labels
# overlap. ignore_index=True assigns a fresh, unique 0..n-1 index so later
# label-based operations cannot hit duplicate rows.
df = pd.concat([train, test], axis=0, ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 886255 entries, 0 to 178153
Data columns (total 58 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   client_id                        886255 non-null  int64  
 1   card_id                          886255 non-null  int64  
 2   amount                           886255 non-null  float32
 3   merchant_id                      886255 non-null  int64  
 4   current_age                      886255 non-null  int64  
 5   retirement_age                   886255 non-null  int64  
 6   birth_year                       886255 non-null  int64  
 7   birth_month                      886255 non-null  int8   
 8   latitude                         886255 non-null  float64
 9   longitude                        886255 non-null  float64
 10  per_capita_income                886255 non-null  float32
 11  yearly_income                    886255 non-null  float32
 12  total_d

In [4]:
# Display (rows, columns) of the combined train + test frame.
df.shape

(886255, 58)

In [5]:
# Exclude the transaction years that, per the original author's note, contain
# no fraud rows at all.
excluded_years = [2017, 2018]
df = df.loc[~df["tx_year"].isin(excluded_years)].copy()

In [6]:
# Display (rows, columns) after removing the 2017/2018 transactions.
df.shape

(719351, 58)

In [7]:
# Keep only rows whose amount is non-negative, then add log_amount = log1p(amount).
# Note that this cell removes rows as well as adding the new column.
df = df.loc[df["amount"].ge(0)].copy()
df["log_amount"] = np.log1p(df["amount"])
df.shape

(715565, 59)

In [8]:
# Remove the num_cards_issued column and show the resulting shape.
df = df.drop(columns=["num_cards_issued"])
df.shape

(715565, 58)

In [9]:
# Cyclic encoding of the transaction hour: map tx_hour onto a 24-step circle
# and store its sine and cosine.
hour_angle = 2 * np.pi * df["tx_hour"] / 24
df["hour_sin"] = np.sin(hour_angle)
df["hour_cos"] = np.cos(hour_angle)

In [10]:
# Cyclic encoding of the transaction month: map tx_month onto a 12-step circle
# and store its sine and cosine.
month_angle = 2 * np.pi * df["tx_month"] / 12
df["month_sin"] = np.sin(month_angle)
df["month_cos"] = np.cos(month_angle)

In [11]:
# Remove the tx_day column.
df = df.drop(columns=["tx_day"])

In [12]:
# Remove three of the err_* indicator columns; the remaining err_* columns are kept.
df = df.drop(
    columns=[
        "err_bad_pin",
        "err_bad_zipcode",
        "err_technical_glitch",
    ]
)

In [13]:
# months_to_expire_calc: months between the transaction and the card expiry.
# Both dates are turned into an absolute month count (year * 12 + month) and
# the transaction count is subtracted from the expiry count.
# NOTE(review): this column is dropped again further down the notebook —
# confirm whether the calculation is still needed.
tx_total_month = df["tx_year"].mul(12).add(df["tx_month"])
expire_total_month = df["expires_year"].mul(12).add(df["expires_month"])
df["months_to_expire_calc"] = expire_total_month.sub(tx_total_month)

In [14]:
# Remove the mccg_Medical indicator column.
df = df.drop(columns=["mccg_Medical"])

In [15]:
# Display (rows, columns) after the feature additions and drops above.
df.shape

(715565, 58)

In [16]:
# Remove the birth_year column.
df = df.drop(columns=["birth_year"])

In [17]:
# Remove the merchant_id column.
df = df.drop(columns=["merchant_id"])

In [18]:
# Remove the birth_month column.
df = df.drop(columns=["birth_month"])

In [19]:
# Remove the retirement_age column.
df = df.drop(columns=["retirement_age"])

In [20]:
# Remove the is_online column.
df = df.drop(columns=["is_online"])

In [21]:
# Remove the raw account-opening and card-expiry date parts together with the
# derived months_to_expire_calc column computed earlier in the notebook.
df = df.drop(
    columns=[
        "acct_open_year",
        "acct_open_month",
        "expires_year",
        "expires_month",
        "months_to_expire_calc",
    ]
)

In [22]:
# Remove the raw amount column; log_amount was derived from it earlier.
df = df.drop(columns=["amount"])

In [23]:
# Remove the has_error and distance_imputed columns.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
df = df.drop(columns=["has_error", "distance_imputed"])

In [25]:
# Remove the latitude and longitude columns.
df = df.drop(columns=["latitude", "longitude"])

In [28]:
# List the columns that remain after the drops above.
df.columns

Index(['client_id', 'card_id', 'current_age', 'per_capita_income',
       'yearly_income', 'total_debt', 'credit_score', 'num_credit_cards',
       'has_chip', 'credit_limit', 'year_pin_last_changed',
       'err_bad_card_number', 'err_bad_expiration', 'err_bad_cvv',
       'err_insufficient_balance', 'months_to_expire', 'tx_year', 'tx_month',
       'tx_hour', 'is_weekend', 'is_credit', 'is_prepaid', 'male',
       'mccg_Food_Daily', 'mccg_Transport_Travel', 'mccg_Digital_Online',
       'mccg_Financial', 'mccg_Retail', 'mccg_Entertainment',
       'mccg_Automotive_Home', 'mccg_Utilities_Government',
       'mccg_Professional_Services', 'mccg_Industrial_/_Manufacturing',
       'cb_Visa', 'cb_Mastercard', 'cb_Amex', 'cb_Discover', 'fraud',
       'log_amount', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'],
      dtype='object')

In [None]:
# Remove the raw time parts. tx_hour and tx_month already have sin/cos
# encodings from earlier cells; tx_year has no replacement column.
df = df.drop(columns=["tx_year", "tx_month", "tx_hour"])

Unnamed: 0,tx_year,tx_month,tx_hour
0,2010,1,0
1,2010,1,1
2,2010,1,1
3,2010,1,1
4,2010,1,1
5,2010,1,1
6,2010,1,2
7,2010,1,2
8,2010,1,2
9,2010,1,3
