In [1]:
import joblib

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [27]:
def cat_encoding(df):
    """Turn the text columns of ``df`` into numeric columns.

    A text column (object dtype or a pandas string dtype) is handled
    according to its number of distinct values, where NaN counts as a value:

    * two or fewer: the column is replaced by integer codes produced by
      ``LabelEncoder`` (the column keeps its name);
    * three or more: the column is replaced by indicator columns from
      ``pd.get_dummies(..., dummy_na=False)``.

    All other columns are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is never modified; a copy is made only when a
        column has to be label-encoded.

    Returns
    -------
    tuple of (pandas.DataFrame, list)
        The encoded frame, and the names of the columns that exist in the
        result but not in the input. Label-encoded columns keep their
        original name, so they are not part of that list.
    """
    original_columns = list(df.columns)

    label_columns = []
    one_hot_columns = []
    for col in original_columns:
        dtype = df[col].dtype
        # Text may be stored as plain object or as a dedicated string dtype;
        # checking only for 'object' would silently skip the latter.
        if not (dtype == 'object' or isinstance(dtype, pd.StringDtype)):
            continue

        # Count once; NaN is included, matching Series.unique().
        n_values = len(df[col].unique())
        if n_values <= 2:
            label_columns.append(col)
        else:
            one_hot_columns.append(col)

    if label_columns:
        # Work on a copy so the caller's frame is not altered as a side effect.
        df = df.copy()
        for col in label_columns:
            df[col] = LabelEncoder().fit_transform(df[col])

    if one_hot_columns:
        # One call for all columns: the source columns are dropped and the
        # indicators are appended in the order the columns were listed.
        df = pd.get_dummies(df, columns=one_hot_columns, dummy_na=False)

    # Names of the columns that were created by the encoding.
    known = set(original_columns)
    new_columns = [c for c in df.columns if c not in known]

    # Return the numeric frame together with the list of created columns.
    return df, new_columns

In [20]:
# Load the raw credit-card balance table (path is relative to the notebook's working directory).
df = pd.read_csv('data/credit_card_balance.csv')

In [21]:
# Display the (rows, columns) size of the loaded table.
df.shape

(3840312, 23)

In [22]:
# Derived ratio features, as (new column, numerator, denominator).
# Kept in a table so every ratio gets the same treatment and the column
# order stays exactly as listed.
RATIO_FEATURES = [
    # Balance relative to the credit limit.
    ('AMT_BALANCE_RATIO', 'AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL'),

    # Amount per drawing, by drawing type.
    ('ONCE_DRAWINGS_ATM_CURRENT', 'AMT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_ATM_CURRENT'),
    ('ONCE_DRAWINGS_CURRENT', 'AMT_DRAWINGS_CURRENT', 'CNT_DRAWINGS_CURRENT'),
    ('ONCE_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_OTHER_CURRENT'),
    ('ONCE_DRAWINGS_POS_CURRENT', 'AMT_DRAWINGS_POS_CURRENT', 'CNT_DRAWINGS_POS_CURRENT'),

    # Share of the total drawn amount, by drawing type.
    ('AMT_DRAWINGS_ATM_CURRENT_RATIO', 'AMT_DRAWINGS_ATM_CURRENT', 'AMT_DRAWINGS_CURRENT'),
    ('AMT_DRAWINGS_OTHER_CURRENT_RATIO', 'AMT_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_CURRENT'),
    ('AMT_DRAWINGS_POS_CURRENT_RATIO', 'AMT_DRAWINGS_POS_CURRENT', 'AMT_DRAWINGS_CURRENT'),

    # Share of the total number of drawings, by drawing type.
    ('CNT_DRAWINGS_ATM_CURRENT_RATIO', 'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT'),
    ('CNT_DRAWINGS_OTHER_CURRENT_RATIO', 'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_CURRENT'),
    ('CNT_DRAWINGS_POS_CURRENT_RATIO', 'CNT_DRAWINGS_POS_CURRENT', 'CNT_DRAWINGS_CURRENT'),
]

for feature, numerator, denominator in RATIO_FEATURES:
    # A zero denominator makes pandas return +/-inf (or NaN for 0/0).
    # An infinite value would turn the later min/max/mean/sum aggregates
    # infinite as well, so treat it as missing instead.
    df[feature] = (df[numerator] / df[denominator]).replace([np.inf, -np.inf], np.nan)

# Plain differences between related columns.
df['AMT_RECIVABLE_DIFF'] = df['AMT_RECIVABLE'] - df['AMT_TOTAL_RECEIVABLE']
df['SK_DPD_LOW_LOAN'] = df['SK_DPD'] - df['SK_DPD_DEF']

In [28]:
# Replace df with its numerically encoded version; df_new_columns receives the
# names of the columns that cat_encoding created.
# NOTE(review): not idempotent - once df no longer has text columns, running
# this cell again returns an empty df_new_columns. Re-run from the load cell.
df, df_new_columns = cat_encoding(df)

In [36]:
# Aggregation spec: every column created by the categorical encoding is
# summarised by its mean only.
ccb_aggregations = {dummy: ['mean'] for dummy in df_new_columns}

In [37]:
# Columns that get the full set of statistics: everything that was not created
# by the encoding, minus the two identifier columns used as grouping keys.
encoded_names = set(df_new_columns)
ccb_columns = [name for name in df.columns if name not in encoded_names]
for id_column in ('SK_ID_PREV', 'SK_ID_CURR'):
    ccb_columns.remove(id_column)
ccb_columns

['MONTHS_BALANCE',
 'AMT_BALANCE',
 'AMT_CREDIT_LIMIT_ACTUAL',
 'AMT_DRAWINGS_ATM_CURRENT',
 'AMT_DRAWINGS_CURRENT',
 'AMT_DRAWINGS_OTHER_CURRENT',
 'AMT_DRAWINGS_POS_CURRENT',
 'AMT_INST_MIN_REGULARITY',
 'AMT_PAYMENT_CURRENT',
 'AMT_PAYMENT_TOTAL_CURRENT',
 'AMT_RECEIVABLE_PRINCIPAL',
 'AMT_RECIVABLE',
 'AMT_TOTAL_RECEIVABLE',
 'CNT_DRAWINGS_ATM_CURRENT',
 'CNT_DRAWINGS_CURRENT',
 'CNT_DRAWINGS_OTHER_CURRENT',
 'CNT_DRAWINGS_POS_CURRENT',
 'CNT_INSTALMENT_MATURE_CUM',
 'SK_DPD',
 'SK_DPD_DEF',
 'AMT_BALANCE_RATIO',
 'ONCE_DRAWINGS_ATM_CURRENT',
 'ONCE_DRAWINGS_CURRENT',
 'ONCE_DRAWINGS_OTHER_CURRENT',
 'ONCE_DRAWINGS_POS_CURRENT',
 'AMT_DRAWINGS_ATM_CURRENT_RATIO',
 'AMT_DRAWINGS_OTHER_CURRENT_RATIO',
 'AMT_DRAWINGS_POS_CURRENT_RATIO',
 'CNT_DRAWINGS_ATM_CURRENT_RATIO',
 'CNT_DRAWINGS_OTHER_CURRENT_RATIO',
 'CNT_DRAWINGS_POS_CURRENT_RATIO',
 'AMT_RECIVABLE_DIFF',
 'SK_DPD_LOW_LOAN']

In [38]:
# Statistics computed for every column in ccb_columns. Each column gets its
# own list object, so later edits to one entry cannot leak into another.
numeric_stats = ['min', 'max', 'mean', 'median', 'sum', 'size']
ccb_aggregations.update({name: list(numeric_stats) for name in ccb_columns})
ccb_aggregations

{'NAME_CONTRACT_STATUS_Active': ['mean'],
 'NAME_CONTRACT_STATUS_Approved': ['mean'],
 'NAME_CONTRACT_STATUS_Completed': ['mean'],
 'NAME_CONTRACT_STATUS_Demand': ['mean'],
 'NAME_CONTRACT_STATUS_Refused': ['mean'],
 'NAME_CONTRACT_STATUS_Sent proposal': ['mean'],
 'NAME_CONTRACT_STATUS_Signed': ['mean'],
 'MONTHS_BALANCE': ['min', 'max', 'mean', 'median', 'sum', 'size'],
 'AMT_BALANCE': ['min', 'max', 'mean', 'median', 'sum', 'size'],
 'AMT_CREDIT_LIMIT_ACTUAL': ['min', 'max', 'mean', 'median', 'sum', 'size'],
 'AMT_DRAWINGS_ATM_CURRENT': ['min', 'max', 'mean', 'median', 'sum', 'size'],
 'AMT_DRAWINGS_CURRENT': ['min', 'max', 'mean', 'median', 'sum', 'size'],
 'AMT_DRAWINGS_OTHER_CURRENT': ['min', 'max', 'mean', 'median', 'sum', 'size'],
 'AMT_DRAWINGS_POS_CURRENT': ['min', 'max', 'mean', 'median', 'sum', 'size'],
 'AMT_INST_MIN_REGULARITY': ['min', 'max', 'mean', 'median', 'sum', 'size'],
 'AMT_PAYMENT_CURRENT': ['min', 'max', 'mean', 'median', 'sum', 'size'],
 'AMT_PAYMENT_TOTAL_CUR

In [40]:
# First aggregation level: one row per (SK_ID_PREV, SK_ID_CURR) pair, with the
# statistics requested in ccb_aggregations.
group_keys = ['SK_ID_PREV', 'SK_ID_CURR']
ccb = df.groupby(group_keys).agg(ccb_aggregations)

In [41]:
# Flatten the two-level (source column, statistic) header into single names of
# the form CCB_<column>_<STATISTIC>.
# Guarded so that running the cell a second time does nothing: on already
# flattened string labels the renaming would pick individual characters and
# turn every column name into 'CCB_C_C'.
if isinstance(ccb.columns, pd.MultiIndex):
    ccb.columns = pd.Index(
        ['CCB_' + source + "_" + stat.upper() for source, stat in ccb.columns.tolist()]
    )

In [42]:
# Second aggregation level: average the per-(SK_ID_PREV, SK_ID_CURR) statistics
# so that one row per SK_ID_CURR remains.
ccb = ccb.groupby(['SK_ID_CURR']).mean()
ccb

Unnamed: 0_level_0,CCB_NAME_CONTRACT_STATUS_Active_MEAN,CCB_NAME_CONTRACT_STATUS_Approved_MEAN,CCB_NAME_CONTRACT_STATUS_Completed_MEAN,CCB_NAME_CONTRACT_STATUS_Demand_MEAN,CCB_NAME_CONTRACT_STATUS_Refused_MEAN,CCB_NAME_CONTRACT_STATUS_Sent proposal_MEAN,CCB_NAME_CONTRACT_STATUS_Signed_MEAN,CCB_MONTHS_BALANCE_MIN,CCB_MONTHS_BALANCE_MAX,CCB_MONTHS_BALANCE_MEAN,...,CCB_AMT_RECIVABLE_DIFF_MEAN,CCB_AMT_RECIVABLE_DIFF_MEDIAN,CCB_AMT_RECIVABLE_DIFF_SUM,CCB_AMT_RECIVABLE_DIFF_SIZE,CCB_SK_DPD_LOW_LOAN_MIN,CCB_SK_DPD_LOW_LOAN_MAX,CCB_SK_DPD_LOW_LOAN_MEAN,CCB_SK_DPD_LOW_LOAN_MEDIAN,CCB_SK_DPD_LOW_LOAN_SUM,CCB_SK_DPD_LOW_LOAN_SIZE
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100006,1.000000,0.0,0.000000,0.0,0.0,0.0,0.0,-6.0,-1.0,-3.5,...,0.0,0.0,0.0,6.0,0.0,0.0,0.000000,0.0,0.0,6.0
100011,1.000000,0.0,0.000000,0.0,0.0,0.0,0.0,-75.0,-2.0,-38.5,...,0.0,0.0,0.0,74.0,0.0,0.0,0.000000,0.0,0.0,74.0
100013,1.000000,0.0,0.000000,0.0,0.0,0.0,0.0,-96.0,-1.0,-48.5,...,0.0,0.0,0.0,96.0,0.0,0.0,0.000000,0.0,0.0,96.0
100021,0.411765,0.0,0.588235,0.0,0.0,0.0,0.0,-18.0,-2.0,-10.0,...,0.0,0.0,0.0,17.0,0.0,0.0,0.000000,0.0,0.0,17.0
100023,1.000000,0.0,0.000000,0.0,0.0,0.0,0.0,-11.0,-4.0,-7.5,...,0.0,0.0,0.0,8.0,0.0,0.0,0.000000,0.0,0.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456244,0.878049,0.0,0.121951,0.0,0.0,0.0,0.0,-41.0,-1.0,-21.0,...,0.0,0.0,0.0,41.0,0.0,0.0,0.000000,0.0,0.0,41.0
456246,1.000000,0.0,0.000000,0.0,0.0,0.0,0.0,-9.0,-2.0,-5.5,...,0.0,0.0,0.0,8.0,0.0,0.0,0.000000,0.0,0.0,8.0
456247,1.000000,0.0,0.000000,0.0,0.0,0.0,0.0,-96.0,-2.0,-49.0,...,0.0,0.0,0.0,95.0,0.0,1.0,0.010526,0.0,1.0,95.0
456248,1.000000,0.0,0.000000,0.0,0.0,0.0,0.0,-24.0,-2.0,-13.0,...,0.0,0.0,0.0,23.0,0.0,0.0,0.000000,0.0,0.0,23.0


In [43]:
# Display the (rows, columns) size of the aggregated table.
ccb.shape

(103558, 205)