In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import LabelEncoder

In [2]:
# Helper that converts the categorical (object-dtype) columns to numeric ones.
def cat_encoding(df):
    """Return a numerically encoded copy of ``df`` and the names of the added columns.

    Only columns whose dtype is ``object`` are touched. The encoding depends on
    the number of distinct values reported by ``Series.unique()`` (NaN counts
    as a distinct value there):

    * two or fewer distinct values: the column is label-encoded with
      ``LabelEncoder`` and keeps its name and position;
    * three or more distinct values: the column is one-hot encoded with
      ``pd.get_dummies(..., dummy_na=False)``; the original column is removed
      and the ``<column>_<value>`` dummy columns are appended at the end.

    The input frame is left untouched. (The previous implementation wrote the
    label-encoded values into the caller's frame until the first one-hot
    encoding rebound the local name, so the caller saw a partially encoded
    frame.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    tuple
        ``(encoded_df, new_columns)`` where ``new_columns`` lists, in column
        order, the names present in ``encoded_df`` but not in ``df``.
        Label-encoded columns keep their name, so they are not listed.
    """
    original_columns = list(df.columns)
    known_names = set(original_columns)

    # Work on a copy so the caller's frame is never modified.
    encoded = df.copy()
    one_hot_columns = []

    for col in original_columns:
        # Only object-dtype columns are treated as categorical.
        if encoded[col].dtype != 'object':
            continue

        if len(encoded[col].unique()) <= 2:
            # Binary column: replace the values with integer labels.
            # A fresh encoder per column avoids carrying fitted state around.
            encoded[col] = LabelEncoder().fit_transform(encoded[col])
        else:
            # Three or more categories: one-hot encode later, all at once.
            one_hot_columns.append(col)

    if one_hot_columns:
        # A single call yields the same columns in the same order as encoding
        # the columns one by one, without copying the whole frame every time.
        encoded = pd.get_dummies(encoded, columns=one_hot_columns, dummy_na=False)

    # Names of the columns created by the encoding.
    new_columns = [c for c in encoded.columns if c not in known_names]

    return encoded, new_columns

In [3]:
# Load the CSV at ../../data/previous_application.csv into a DataFrame.
df = pd.read_csv('../../data/previous_application.csv')

In [4]:
# Work on a copy so that the frame loaded from the CSV stays as it was read.
prev_df = df.copy()

In [5]:
# Show the (rows, columns) size of the working copy.
prev_df.shape

(1670214, 37)

In [11]:
# Count the missing values in every column.
prev_df.isna().sum()

SK_ID_PREV                           0
SK_ID_CURR                           0
NAME_CONTRACT_TYPE                   0
AMT_ANNUITY                     372235
AMT_APPLICATION                      0
AMT_CREDIT                           1
AMT_DOWN_PAYMENT                895844
AMT_GOODS_PRICE                 385515
WEEKDAY_APPR_PROCESS_START           0
HOUR_APPR_PROCESS_START              0
FLAG_LAST_APPL_PER_CONTRACT          0
NFLAG_LAST_APPL_IN_DAY               0
RATE_DOWN_PAYMENT               895844
RATE_INTEREST_PRIMARY          1664263
RATE_INTEREST_PRIVILEGED       1664263
NAME_CASH_LOAN_PURPOSE               0
NAME_CONTRACT_STATUS                 0
DAYS_DECISION                        0
NAME_PAYMENT_TYPE                    0
CODE_REJECT_REASON                   0
NAME_TYPE_SUITE                 820405
NAME_CLIENT_TYPE                     0
NAME_GOODS_CATEGORY                  0
NAME_PORTFOLIO                       0
NAME_PRODUCT_TYPE                    0
CHANNEL_TYPE             

데이터프레임 결측치 처리

In [12]:
# For features with only a few missing values, drop the rows that contain them.
prev_df = prev_df.dropna(subset=['PRODUCT_COMBINATION', 'AMT_CREDIT'])

# Fill the remaining gaps with a per-feature default value.
# A single fillna with a column -> value mapping replaces the former
# `prev_df[col].fillna(..., inplace=True)` calls: in-place methods on a selected
# column are chained assignment, which raises a warning in recent pandas
# versions and does not update the frame at all under Copy-on-Write.
prev_df = prev_df.fillna({
    'NAME_TYPE_SUITE': "XNA",
    'NFLAG_INSURED_ON_APPROVAL': 0,
    'AMT_DOWN_PAYMENT': 0,
    'AMT_ANNUITY': 0,
    'AMT_GOODS_PRICE': 0,
})

이상치 처리

In [13]:
# For features with only a few outliers, drop the rows that contain them.
# NOTE(review): the row labels are hard-coded; the criterion that selected them
# is not visible in this notebook - TODO document or derive them in code.
prev_df = prev_df.drop([191373, 251875, 287370, 627069, 648826, 772107, 1346611, 1521472])

# For features with many outliers, turn the outlier value 365243 into a missing value.
# The replacement is assigned back instead of using
# `prev_df[col].replace(..., inplace=True)`: in-place methods on a selected
# column are chained assignment, which raises a warning in recent pandas
# versions and does not update the frame at all under Copy-on-Write.
days_columns = [
    'DAYS_FIRST_DRAWING',
    'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_1ST_VERSION',
    'DAYS_LAST_DUE',
    'DAYS_TERMINATION',
]
prev_df[days_columns] = prev_df[days_columns].replace(365243, np.nan)

In [14]:
# Rebind `df` (until now the frame read from the CSV) to a copy of the cleaned prev_df.
df = prev_df.copy()

피쳐와 피쳐를 결합하여 새로운 피쳐 생성

In [15]:
# Derive new features by combining existing ones. The keyword order below is the
# order in which the columns are appended to the frame.
df = df.assign(
    # Differences between pairs of DAYS_* columns.
    DAYS_FIRST_OVERDUE=df['DAYS_FIRST_DUE'] - df['DAYS_FIRST_DRAWING'],
    DAYS_PAYMENT_PERIOD=df['DAYS_LAST_DUE'] - df['DAYS_FIRST_DUE'],
    DAYS_EXPECT_LAST_DUE_GAP=df['DAYS_TERMINATION'] - df['DAYS_LAST_DUE'],
    # Applied amount versus granted credit.
    AMT_CREDIT_DIFF=df['AMT_APPLICATION'] - df['AMT_CREDIT'],
    AMT_CREDIT_RATIO=df['AMT_CREDIT'] / df['AMT_APPLICATION'],
    # Applied amount versus down payment.
    AMT_DOWN_PAYMENT_DIFF=df['AMT_APPLICATION'] - df['AMT_DOWN_PAYMENT'],
    AMT_DOWN_PAYMENT_RATIO=df['AMT_DOWN_PAYMENT'] / df['AMT_APPLICATION'],
)

새로운 피쳐를 생성할 때 생긴 inf값 처리 

In [76]:
# Both ratio features divide by AMT_APPLICATION, so a zero application amount
# can produce +/-inf in either of them; set those values to 0 in both columns.
# The result is assigned back instead of using `df[col].replace(..., inplace=True)`:
# in-place methods on a selected column are chained assignment, which raises a
# warning in recent pandas versions and does not update the frame at all under
# Copy-on-Write.
ratio_columns = ['AMT_CREDIT_RATIO', 'AMT_DOWN_PAYMENT_RATIO']
df[ratio_columns] = df[ratio_columns].replace([np.inf, -np.inf], 0)

In [84]:
# Encode the object-dtype columns of df; prev_n_new_columns receives the names
# of the columns that the encoding added.
prev_n_df, prev_n_new_columns = cat_encoding(df)

NAME_CONTRACT_STATUS 값이 Approved인 행과 Refused인 행을 각각 담는 두 개의 새로운 데이터프레임을 생성 (원-핫 인코딩된 NAME_CONTRACT_STATUS_Approved / NAME_CONTRACT_STATUS_Refused 컬럼 기준)

In [87]:
# Row masks built from the one-hot encoded contract-status columns.
is_approved = prev_n_df['NAME_CONTRACT_STATUS_Approved'].eq(1)
is_refused = prev_n_df['NAME_CONTRACT_STATUS_Refused'].eq(1)

approved = prev_n_df.loc[is_approved]
refused = prev_n_df.loc[is_refused]

카테고리형 피쳐들과 수치형 피쳐들에 대해 각각 groupby시 다른 agg 조건 설정

In [85]:
# Aggregations for the columns created from the original categorical features:
# every such column is summarised by its mean and its sum.
cat_aggregations = {new_col: ['mean', 'sum'] for new_col in prev_n_new_columns}

In [88]:
# Collect the columns that were already present before the encoding step,
# i.e. every column that is not one of the newly created ones.
prevn_columns = [name for name in prev_n_df.columns if name not in prev_n_new_columns]

# Exclude the SK_ID_PREV and SK_ID_CURR id columns from the aggregation list.
for id_column in ('SK_ID_PREV', 'SK_ID_CURR'):
    prevn_columns.remove(id_column)

prevn_columns

['AMT_ANNUITY',
 'AMT_APPLICATION',
 'AMT_CREDIT',
 'AMT_DOWN_PAYMENT',
 'AMT_GOODS_PRICE',
 'HOUR_APPR_PROCESS_START',
 'FLAG_LAST_APPL_PER_CONTRACT',
 'NFLAG_LAST_APPL_IN_DAY',
 'RATE_DOWN_PAYMENT',
 'RATE_INTEREST_PRIMARY',
 'RATE_INTEREST_PRIVILEGED',
 'DAYS_DECISION',
 'SELLERPLACE_AREA',
 'CNT_PAYMENT',
 'DAYS_FIRST_DRAWING',
 'DAYS_FIRST_DUE',
 'DAYS_LAST_DUE_1ST_VERSION',
 'DAYS_LAST_DUE',
 'DAYS_TERMINATION',
 'NFLAG_INSURED_ON_APPROVAL',
 'DAYS_FIRST_OVERDUE',
 'DAYS_PAYMENT_PERIOD',
 'DAYS_EXPECT_LAST_DUE_GAP',
 'AMT_CREDIT_DIFF',
 'AMT_CREDIT_RATIO',
 'AMT_DOWN_PAYMENT_DIFF',
 'AMT_DOWN_PAYMENT_RATIO']

In [89]:
# Aggregations for the numeric features: each column gets the same six statistics.
num_aggregations = {
    num_col: ['min', 'max', 'mean', 'median', 'sum', 'size']
    for num_col in prevn_columns
}

In [91]:
# Aggregate with the settings defined above, grouping by SK_ID_PREV and SK_ID_CURR.
# SK_ID_CURR has to be part of the group keys: it is not in either aggregation
# dict, so grouping by SK_ID_PREV alone would drop it and the later
# `prev_app.groupby(['SK_ID_CURR'])` would fail. The two-level index in the
# displayed output also corresponds to grouping by both ids.
# NOTE(review): the displayed rows all have SIZE == 1 and identical
# MIN/MAX/MEAN values, which suggests one row per SK_ID_PREV - confirm whether
# this first aggregation level is needed.
prev_app = prev_n_df.groupby(['SK_ID_PREV', 'SK_ID_CURR']).agg({**cat_aggregations, **num_aggregations})

In [93]:
# Flatten the (feature, statistic) column pairs into single names with a PREV_ prefix.
flat_names = [f"PREV_{feature}_{stat.upper()}" for feature, stat in prev_app.columns.tolist()]
prev_app.columns = pd.Index(flat_names)

# Inspect the result.
prev_app

Unnamed: 0_level_0,Unnamed: 1_level_0,PREV_NAME_CONTRACT_TYPE_Cash loans_MEAN,PREV_NAME_CONTRACT_TYPE_Cash loans_SUM,PREV_NAME_CONTRACT_TYPE_Consumer loans_MEAN,PREV_NAME_CONTRACT_TYPE_Consumer loans_SUM,PREV_NAME_CONTRACT_TYPE_Revolving loans_MEAN,PREV_NAME_CONTRACT_TYPE_Revolving loans_SUM,PREV_WEEKDAY_APPR_PROCESS_START_FRIDAY_MEAN,PREV_WEEKDAY_APPR_PROCESS_START_FRIDAY_SUM,PREV_WEEKDAY_APPR_PROCESS_START_MONDAY_MEAN,PREV_WEEKDAY_APPR_PROCESS_START_MONDAY_SUM,...,PREV_AMT_DOWN_PAYMENT_DIFF_MEAN,PREV_AMT_DOWN_PAYMENT_DIFF_MEDIAN,PREV_AMT_DOWN_PAYMENT_DIFF_SUM,PREV_AMT_DOWN_PAYMENT_DIFF_SIZE,PREV_AMT_DOWN_PAYMENT_RATIO_MIN,PREV_AMT_DOWN_PAYMENT_RATIO_MAX,PREV_AMT_DOWN_PAYMENT_RATIO_MEAN,PREV_AMT_DOWN_PAYMENT_RATIO_MEDIAN,PREV_AMT_DOWN_PAYMENT_RATIO_SUM,PREV_AMT_DOWN_PAYMENT_RATIO_SIZE
SK_ID_PREV,SK_ID_CURR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1000001,158271,0,0,1,1,0,0,0,0,0,0,...,58905.00,58905.00,58905.00,1,0.000000,0.000000,0.000000,0.000000,0.000000,1
1000002,101962,0,0,1,1,0,0,0,0,0,0,...,35230.50,35230.50,35230.50,1,0.100011,0.100011,0.100011,0.100011,0.100011,1
1000003,252457,0,0,1,1,0,0,0,0,0,0,...,47052.00,47052.00,47052.00,1,0.000091,0.000091,0.000091,0.000091,0.000091,1
1000004,260094,0,0,1,1,0,0,0,0,0,0,...,28111.50,28111.50,28111.50,1,0.200114,0.200114,0.200114,0.200114,0.200114,1
1000005,176456,0,0,1,1,0,0,0,0,0,0,...,111136.50,111136.50,111136.50,1,0.100008,0.100008,0.100008,0.100008,0.100008,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2845377,250234,0,0,1,1,0,0,0,0,0,0,...,108180.00,108180.00,108180.00,1,0.000000,0.000000,0.000000,0.000000,0.000000,1
2845378,100125,0,0,1,1,0,0,1,1,0,0,...,56695.50,56695.50,56695.50,1,0.300014,0.300014,0.300014,0.300014,0.300014,1
2845379,237546,0,0,1,1,0,0,0,0,0,0,...,56916.00,56916.00,56916.00,1,0.283326,0.283326,0.283326,0.283326,0.283326,1
2845381,140761,0,0,1,1,0,0,0,0,0,0,...,41499.00,41499.00,41499.00,1,0.000000,0.000000,0.000000,0.000000,0.000000,1


SK_ID_PREV를 기준으로 그룹화한 데이터프레임을 SK_ID_CURR 기준으로 다시 한 번 그룹화 (agg 조건은 mean)

In [94]:
# Average every PREV_* column over the rows that share the same SK_ID_CURR.
prev_agg = prev_app.groupby('SK_ID_CURR').mean()

approved와 refused로 생성된 데이터 프레임에 대해서도 수치형 피쳐들에 대해서만 agg기준 적용 및 피쳐 구분

이후 위에서 만든 prev_agg 데이터프레임에 SK_ID_CURR 기준으로 left join

In [95]:
# Numeric aggregations over the approved rows only, one row per SK_ID_CURR.
approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)

# Flatten the (feature, statistic) pairs and mark them with an APPROVED_ prefix.
approved_names = [f"APPROVED_{feature}_{stat.upper()}" for feature, stat in approved_agg.columns.tolist()]
approved_agg.columns = pd.Index(approved_names)

# Attach the approved-only features; ids without approved rows get NaN.
prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')

In [96]:
# Numeric aggregations over the refused rows only, one row per SK_ID_CURR.
refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)

# Flatten the (feature, statistic) pairs and mark them with a REFUSED_ prefix.
refused_names = [f"REFUSED_{feature}_{stat.upper()}" for feature, stat in refused_agg.columns.tolist()]
refused_agg.columns = pd.Index(refused_names)

# Attach the refused-only features; ids without refused rows get NaN.
prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')

지금까지의 작업으로 만들어진 데이터프레임을 joblib 파일로 저장

In [102]:
# Persist the aggregated frame to a joblib file in the working directory.
joblib.dump(prev_agg, "prev_agg_n_2.joblib")

['prev_agg_n_2.joblib']