In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import joblib

# Helper that converts categorical (object-dtype) columns to numeric columns.
def cat_encoding(df):
    """Encode every object-dtype column of ``df`` as numbers.

    * Object columns with at most two distinct values (NaN is counted as a
      value) are replaced, under the same column name, by the integer codes
      produced by ``LabelEncoder``.
    * Object columns with three or more distinct values are expanded into
      one-hot indicator columns with ``pd.get_dummies`` (no indicator column
      is created for NaN).

    The caller's DataFrame is never modified. When no column needs encoding
    the input object itself is returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should be encoded.

    Returns
    -------
    tuple
        ``(encoded_df, new_columns)``. ``new_columns`` lists the column names
        that exist in ``encoded_df`` but not in ``df``, i.e. the one-hot
        indicator columns. Label-encoded columns keep their original name and
        therefore do not appear in this list.
    """
    original_columns = list(df.columns)
    encoded = df

    for col in original_columns:
        # Only object-dtype columns are treated as categorical.
        if df[col].dtype != 'object':
            continue

        # dropna=False keeps NaN in the count, matching len(Series.unique()).
        if df[col].nunique(dropna=False) <= 2:
            # Column assignment would write into the caller's frame, so take a
            # private copy first. After a get_dummies call `encoded` is already
            # a new object and no further copy is needed.
            if encoded is df:
                encoded = df.copy()
            encoded[col] = LabelEncoder().fit_transform(encoded[col])
        else:
            # get_dummies returns a new frame; the source column is replaced by
            # its indicator columns, appended at the end.
            encoded = pd.get_dummies(encoded, columns=[col], dummy_na=False)

    # Names that were not present in the input are the newly created columns.
    new_columns = [c for c in encoded.columns if c not in original_columns]

    return encoded, new_columns

# Load POS_CASH_balance.csv into a DataFrame (path is relative to the notebook's
# working directory).
pos_cash = pd.read_csv('../data/POS_CASH_balance.csv')

데이터의 전반적인 특성 파악

In [4]:
# Print the column names, dtypes and memory footprint of the loaded frame.
pos_cash.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001358 entries, 0 to 10001357
Data columns (total 8 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   SK_ID_PREV             int64  
 1   SK_ID_CURR             int64  
 2   MONTHS_BALANCE         int64  
 3   CNT_INSTALMENT         float64
 4   CNT_INSTALMENT_FUTURE  float64
 5   NAME_CONTRACT_STATUS   object 
 6   SK_DPD                 int64  
 7   SK_DPD_DEF             int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 610.4+ MB


피쳐와 피쳐를 결합하여 새로운 피쳐 생성

In [5]:
# SK_DPD - SK_DPD_DEF (author's note: overdue days with the tolerated days
# subtracted out), stored as a new column.
pos_cash['DPD_MIN_DPDDEF'] = pos_cash['SK_DPD'].sub(pos_cash['SK_DPD_DEF'])

# MONTHS_BALANCE + CNT_INSTALMENT (author's hypothesis: creditworthiness is
# expected to be higher as this value moves from negative toward positive).
pos_cash['MONTHS_PLUS_CNT'] = pos_cash['MONTHS_BALANCE'].add(pos_cash['CNT_INSTALMENT'])

# MONTHS_BALANCE + CNT_INSTALMENT_FUTURE (author's hypothesis: creditworthiness
# is expected to be higher the closer this value is to 0).
pos_cash['MONTHS_PLUS_CNTFUT'] = pos_cash['MONTHS_BALANCE'].add(pos_cash['CNT_INSTALMENT_FUTURE'])

In [7]:
# Inspect the frame, including the three derived columns.
pos_cash

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF,DPD_MIN_DPDDEF,MONTHS_PLUS_CNT,MONTHS_PLUS_CNTFUT
0,1803195,182943,-31,48.0,45.0,Active,0,0,0,17.0,14.0
1,1715348,367990,-33,36.0,35.0,Active,0,0,0,3.0,2.0
2,1784872,397406,-32,12.0,9.0,Active,0,0,0,-20.0,-23.0
3,1903291,269225,-35,48.0,42.0,Active,0,0,0,13.0,7.0
4,2341044,334279,-35,36.0,35.0,Active,0,0,0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
10001353,2448283,226558,-20,6.0,0.0,Active,843,0,843,-14.0,-20.0
10001354,1717234,141565,-19,12.0,0.0,Active,602,0,602,-7.0,-19.0
10001355,1283126,315695,-21,10.0,0.0,Active,609,0,609,-11.0,-21.0
10001356,1082516,450255,-22,12.0,0.0,Active,614,0,614,-10.0,-22.0


카테고리형 피쳐들을 수치형 피쳐들로 변환

In [8]:
# Encode the object-dtype columns; pos_new_columns receives the names of the
# columns that cat_encoding created.
pos_cash_df, pos_new_columns = cat_encoding(pos_cash)

In [13]:
# List the column names created by the encoding step.
pos_new_columns

['NAME_CONTRACT_STATUS_Active',
 'NAME_CONTRACT_STATUS_Amortized debt',
 'NAME_CONTRACT_STATUS_Approved',
 'NAME_CONTRACT_STATUS_Canceled',
 'NAME_CONTRACT_STATUS_Completed',
 'NAME_CONTRACT_STATUS_Demand',
 'NAME_CONTRACT_STATUS_Returned to the store',
 'NAME_CONTRACT_STATUS_Signed',
 'NAME_CONTRACT_STATUS_XNA']

카테고리형 피쳐들과 수치형 피쳐들에 대해 그룹화 agg 조건 설정 및 그룹화

SK_ID_PREV, SK_ID_CURR을 기준으로 먼저 그룹화한 후, SK_ID_CURR을 기준으로 다시 한 번 그룹화

In [9]:
# Aggregation spec for the columns created while encoding the categorical
# features: each one is averaged, which for a 0/1 indicator gives the share of
# rows in the group that carry that category.
pos_aggregations = {dummy_col: ['mean'] for dummy_col in pos_new_columns}

In [14]:
# Original numeric features only: every column that was not created by the
# encoding step, minus the two ID columns that serve as grouping keys.
pos_columns = [c for c in pos_cash_df.columns if c not in pos_new_columns]
for id_col in ('SK_ID_PREV', 'SK_ID_CURR'):
    pos_columns.remove(id_col)
pos_columns

['MONTHS_BALANCE',
 'CNT_INSTALMENT',
 'CNT_INSTALMENT_FUTURE',
 'SK_DPD',
 'SK_DPD_DEF',
 'DPD_MIN_DPDDEF',
 'MONTHS_PLUS_CNT',
 'MONTHS_PLUS_CNTFUT']

In [15]:
# Aggregation spec for the original numeric features: six statistics per column.
# NOTE(review): 'size' is the row count of the group and does not depend on the
# column, so every *_SIZE output carries the same values — consider keeping one.
pos_aggregations.update(
    {num_col: ['min', 'max', 'mean', 'median', 'sum', 'size'] for num_col in pos_columns}
)

In [16]:
# Inspect the complete aggregation spec.
pos_aggregations

{'NAME_CONTRACT_STATUS_Active': ['mean'],
 'NAME_CONTRACT_STATUS_Amortized debt': ['mean'],
 'NAME_CONTRACT_STATUS_Approved': ['mean'],
 'NAME_CONTRACT_STATUS_Canceled': ['mean'],
 'NAME_CONTRACT_STATUS_Completed': ['mean'],
 'NAME_CONTRACT_STATUS_Demand': ['mean'],
 'NAME_CONTRACT_STATUS_Returned to the store': ['mean'],
 'NAME_CONTRACT_STATUS_Signed': ['mean'],
 'NAME_CONTRACT_STATUS_XNA': ['mean'],
 'MONTHS_BALANCE': ['min', 'max', 'mean', 'median', 'sum', 'size'],
 'CNT_INSTALMENT': ['min', 'max', 'mean', 'median', 'sum', 'size'],
 'CNT_INSTALMENT_FUTURE': ['min', 'max', 'mean', 'median', 'sum', 'size'],
 'SK_DPD': ['min', 'max', 'mean', 'median', 'sum', 'size'],
 'SK_DPD_DEF': ['min', 'max', 'mean', 'median', 'sum', 'size'],
 'DPD_MIN_DPDDEF': ['min', 'max', 'mean', 'median', 'sum', 'size'],
 'MONTHS_PLUS_CNT': ['min', 'max', 'mean', 'median', 'sum', 'size'],
 'MONTHS_PLUS_CNTFUT': ['min', 'max', 'mean', 'median', 'sum', 'size']}

In [32]:
# Group by (SK_ID_PREV, SK_ID_CURR) and apply the aggregation spec built above.
# The result has two-level column labels: (feature, statistic).
pos_cash1 = pos_cash_df.groupby(['SK_ID_PREV', 'SK_ID_CURR']).agg(pos_aggregations)

In [18]:
# Inspect the grouped aggregates.
pos_cash1

Unnamed: 0_level_0,Unnamed: 1_level_0,NAME_CONTRACT_STATUS_Active,NAME_CONTRACT_STATUS_Amortized debt,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Canceled,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Returned to the store,NAME_CONTRACT_STATUS_Signed,NAME_CONTRACT_STATUS_XNA,MONTHS_BALANCE,...,MONTHS_PLUS_CNT,MONTHS_PLUS_CNT,MONTHS_PLUS_CNT,MONTHS_PLUS_CNT,MONTHS_PLUS_CNTFUT,MONTHS_PLUS_CNTFUT,MONTHS_PLUS_CNTFUT,MONTHS_PLUS_CNTFUT,MONTHS_PLUS_CNTFUT,MONTHS_PLUS_CNTFUT
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,min,...,mean,median,sum,size,min,max,mean,median,sum,size
SK_ID_PREV,SK_ID_CURR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1000001,158271,0.666667,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,-10,...,-0.333333,2.0,-1.0,3,-8.0,2.0,-1.333333,2.0,-4.0,3
1000002,101962,0.800000,0.0,0.0,0.0,0.200000,0.0,0.0,0.0,0.0,-54,...,-46.800000,-46.0,-234.0,5,-50.0,-50.0,-50.000000,-50.0,-250.0,5
1000003,252457,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,-4,...,9.500000,9.5,38.0,4,8.0,8.0,8.000000,8.0,32.0,4
1000004,260094,0.875000,0.0,0.0,0.0,0.125000,0.0,0.0,0.0,0.0,-29,...,-15.875000,-15.5,-127.0,8,-22.0,-19.0,-19.375000,-19.0,-155.0,8
1000005,176456,0.909091,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,-56,...,-41.000000,-41.0,-451.0,11,-46.0,-46.0,-46.000000,-46.0,-506.0,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2843494,292375,0.666667,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,-26,...,7.666667,22.0,23.0,3,-24.0,22.0,6.666667,22.0,20.0,3
2843495,260963,0.875000,0.0,0.0,0.0,0.125000,0.0,0.0,0.0,0.0,-16,...,40.875000,46.5,327.0,8,-9.0,44.0,37.375000,44.0,299.0,8
2843497,451578,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,-21,...,13.000000,13.0,273.0,21,3.0,3.0,3.000000,3.0,63.0,21
2843498,393881,0.857143,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,-48,...,-17.571429,-11.0,-123.0,7,-43.0,-12.0,-20.714286,-12.0,-145.0,7


pos_cash에서 최종 할부 개월수(CNT_INSTALMENT의 최종 값)를 구하기 위해 SK_ID_PREV, MONTHS_BALANCE에 대해 정렬한 후, 각 그룹에서 CNT_INSTALMENT의 마지막 값을 추출

In [19]:
# Sort by SK_ID_PREV, then MONTHS_BALANCE (both ascending), so that within each
# SK_ID_PREV the rows run from the smallest to the largest MONTHS_BALANCE.
pos_cash2 = pos_cash.sort_values(['SK_ID_PREV', 'MONTHS_BALANCE'])

In [20]:
# Final instalment count per (SK_ID_PREV, SK_ID_CURR): pos_cash2 is ordered by
# MONTHS_BALANCE within each SK_ID_PREV, so 'last' picks the CNT_INSTALMENT of the
# row with the largest MONTHS_BALANCE (GroupBy.last skips missing values by
# default, so it is the latest non-null one).
# Only CNT_INSTALMENT is aggregated; nothing else from this frame is used by the
# merge that follows.
a = pos_cash2.groupby(['SK_ID_PREV', 'SK_ID_CURR']).agg({'CNT_INSTALMENT': ['last']})

In [21]:
# Inspect the final instalment counts.
a

Unnamed: 0_level_0,Unnamed: 1_level_0,CNT_INSTALMENT
Unnamed: 0_level_1,Unnamed: 1_level_1,last
SK_ID_PREV,SK_ID_CURR,Unnamed: 2_level_2
1000001,158271,2.0
1000002,101962,4.0
1000003,252457,12.0
1000004,260094,7.0
1000005,176456,10.0
...,...,...
2843494,292375,2.0
2843495,260963,7.0
2843497,451578,24.0
2843498,393881,6.0


In [22]:
# Attach the final instalment count computed above to the grouped aggregates
# (pos_cash1), matching on both ID index levels. merge() defaults to an inner join.
pos_agg = pos_cash1.merge(a, on=['SK_ID_PREV', 'SK_ID_CURR'])

In [23]:
# Inspect the merged frame.
pos_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,NAME_CONTRACT_STATUS_Active,NAME_CONTRACT_STATUS_Amortized debt,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Canceled,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Returned to the store,NAME_CONTRACT_STATUS_Signed,NAME_CONTRACT_STATUS_XNA,MONTHS_BALANCE,...,MONTHS_PLUS_CNT,MONTHS_PLUS_CNT,MONTHS_PLUS_CNT,MONTHS_PLUS_CNTFUT,MONTHS_PLUS_CNTFUT,MONTHS_PLUS_CNTFUT,MONTHS_PLUS_CNTFUT,MONTHS_PLUS_CNTFUT,MONTHS_PLUS_CNTFUT,CNT_INSTALMENT
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,min,...,median,sum,size,min,max,mean,median,sum,size,last
SK_ID_PREV,SK_ID_CURR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1000001,158271,0.666667,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,-10,...,2.0,-1.0,3,-8.0,2.0,-1.333333,2.0,-4.0,3,2.0
1000002,101962,0.800000,0.0,0.0,0.0,0.200000,0.0,0.0,0.0,0.0,-54,...,-46.0,-234.0,5,-50.0,-50.0,-50.000000,-50.0,-250.0,5,4.0
1000003,252457,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,-4,...,9.5,38.0,4,8.0,8.0,8.000000,8.0,32.0,4,12.0
1000004,260094,0.875000,0.0,0.0,0.0,0.125000,0.0,0.0,0.0,0.0,-29,...,-15.5,-127.0,8,-22.0,-19.0,-19.375000,-19.0,-155.0,8,7.0
1000005,176456,0.909091,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,-56,...,-41.0,-451.0,11,-46.0,-46.0,-46.000000,-46.0,-506.0,11,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2843494,292375,0.666667,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,-26,...,22.0,23.0,3,-24.0,22.0,6.666667,22.0,20.0,3,2.0
2843495,260963,0.875000,0.0,0.0,0.0,0.125000,0.0,0.0,0.0,0.0,-16,...,46.5,327.0,8,-9.0,44.0,37.375000,44.0,299.0,8,7.0
2843497,451578,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,-21,...,13.0,273.0,21,3.0,3.0,3.000000,3.0,63.0,21,24.0
2843498,393881,0.857143,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,-48,...,-11.0,-123.0,7,-43.0,-12.0,-20.714286,-12.0,-145.0,7,6.0


In [24]:
# Flatten the two-level (feature, statistic) column labels into single names of
# the form POS_<feature>_<STATISTIC>.
# The MultiIndex check makes this cell safe to re-run: once the labels are flat
# strings, indexing a label would pick individual characters and silently
# produce wrong names.
if isinstance(pos_agg.columns, pd.MultiIndex):
    pos_agg.columns = pd.Index(
        ['POS_' + feature + "_" + stat.upper() for feature, stat in pos_agg.columns.tolist()]
    )
pos_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,POS_NAME_CONTRACT_STATUS_Active_MEAN,POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN,POS_NAME_CONTRACT_STATUS_Approved_MEAN,POS_NAME_CONTRACT_STATUS_Canceled_MEAN,POS_NAME_CONTRACT_STATUS_Completed_MEAN,POS_NAME_CONTRACT_STATUS_Demand_MEAN,POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN,POS_NAME_CONTRACT_STATUS_Signed_MEAN,POS_NAME_CONTRACT_STATUS_XNA_MEAN,POS_MONTHS_BALANCE_MIN,...,POS_MONTHS_PLUS_CNT_MEDIAN,POS_MONTHS_PLUS_CNT_SUM,POS_MONTHS_PLUS_CNT_SIZE,POS_MONTHS_PLUS_CNTFUT_MIN,POS_MONTHS_PLUS_CNTFUT_MAX,POS_MONTHS_PLUS_CNTFUT_MEAN,POS_MONTHS_PLUS_CNTFUT_MEDIAN,POS_MONTHS_PLUS_CNTFUT_SUM,POS_MONTHS_PLUS_CNTFUT_SIZE,POS_CNT_INSTALMENT_LAST
SK_ID_PREV,SK_ID_CURR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1000001,158271,0.666667,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,-10,...,2.0,-1.0,3,-8.0,2.0,-1.333333,2.0,-4.0,3,2.0
1000002,101962,0.800000,0.0,0.0,0.0,0.200000,0.0,0.0,0.0,0.0,-54,...,-46.0,-234.0,5,-50.0,-50.0,-50.000000,-50.0,-250.0,5,4.0
1000003,252457,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,-4,...,9.5,38.0,4,8.0,8.0,8.000000,8.0,32.0,4,12.0
1000004,260094,0.875000,0.0,0.0,0.0,0.125000,0.0,0.0,0.0,0.0,-29,...,-15.5,-127.0,8,-22.0,-19.0,-19.375000,-19.0,-155.0,8,7.0
1000005,176456,0.909091,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,-56,...,-41.0,-451.0,11,-46.0,-46.0,-46.000000,-46.0,-506.0,11,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2843494,292375,0.666667,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,-26,...,22.0,23.0,3,-24.0,22.0,6.666667,22.0,20.0,3,2.0
2843495,260963,0.875000,0.0,0.0,0.0,0.125000,0.0,0.0,0.0,0.0,-16,...,46.5,327.0,8,-9.0,44.0,37.375000,44.0,299.0,8,7.0
2843497,451578,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,-21,...,13.0,273.0,21,3.0,3.0,3.000000,3.0,63.0,21,24.0
2843498,393881,0.857143,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,-48,...,-11.0,-123.0,7,-43.0,-12.0,-20.714286,-12.0,-145.0,7,6.0


SK_ID_PREV를 기준으로 그룹화한 데이터프레임을 SK_ID_CURR을 기준으로, agg 조건은 mean으로 하여 다시 한 번 그룹화

In [25]:
# Collapse the (SK_ID_PREV, SK_ID_CURR) rows to one row per SK_ID_CURR by taking
# the mean of every column.
pos_agg = pos_agg.groupby('SK_ID_CURR').mean()
pos_agg

Unnamed: 0_level_0,POS_NAME_CONTRACT_STATUS_Active_MEAN,POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN,POS_NAME_CONTRACT_STATUS_Approved_MEAN,POS_NAME_CONTRACT_STATUS_Canceled_MEAN,POS_NAME_CONTRACT_STATUS_Completed_MEAN,POS_NAME_CONTRACT_STATUS_Demand_MEAN,POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN,POS_NAME_CONTRACT_STATUS_Signed_MEAN,POS_NAME_CONTRACT_STATUS_XNA_MEAN,POS_MONTHS_BALANCE_MIN,...,POS_MONTHS_PLUS_CNT_MEDIAN,POS_MONTHS_PLUS_CNT_SUM,POS_MONTHS_PLUS_CNT_SIZE,POS_MONTHS_PLUS_CNTFUT_MIN,POS_MONTHS_PLUS_CNTFUT_MAX,POS_MONTHS_PLUS_CNTFUT_MEAN,POS_MONTHS_PLUS_CNTFUT_MEDIAN,POS_MONTHS_PLUS_CNTFUT_SUM,POS_MONTHS_PLUS_CNTFUT_SIZE,POS_CNT_INSTALMENT_LAST
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,0.775000,0.0,0.0,0.0,0.225000,0.0,0.0,0.000000,0.0,-76.500000,...,-70.750000,-308.500000,4.500000,-73.500000,-73.000000,-73.375000,-73.500000,-320.000000,4.500000,4.000000
100002,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,-19.000000,...,14.000000,266.000000,19.000000,5.000000,5.000000,5.000000,5.000000,95.000000,19.000000,24.000000
100003,0.916667,0.0,0.0,0.0,0.083333,0.0,0.0,0.000000,0.0,-43.333333,...,-29.500000,-314.333333,9.333333,-35.000000,-33.000000,-33.500000,-33.333333,-354.666667,9.333333,8.333333
100004,0.750000,0.0,0.0,0.0,0.250000,0.0,0.0,0.000000,0.0,-27.000000,...,-21.500000,-87.000000,4.000000,-24.000000,-23.000000,-23.250000,-23.000000,-93.000000,4.000000,3.000000
100005,0.818182,0.0,0.0,0.0,0.090909,0.0,0.0,0.090909,0.0,-25.000000,...,-7.500000,-78.000000,11.000000,-15.000000,-12.000000,-12.300000,-12.000000,-123.000000,11.000000,9.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456251,0.777778,0.0,0.0,0.0,0.111111,0.0,0.0,0.111111,0.0,-9.000000,...,3.500000,27.000000,9.000000,-1.000000,0.000000,-0.125000,0.000000,-1.000000,9.000000,7.000000
456252,0.857143,0.0,0.0,0.0,0.142857,0.0,0.0,0.000000,0.0,-82.000000,...,-73.000000,-511.000000,7.000000,-76.000000,-76.000000,-76.000000,-76.000000,-532.000000,7.000000,6.000000
456253,0.861111,0.0,0.0,0.0,0.138889,0.0,0.0,0.000000,0.0,-84.333333,...,-74.666667,-411.000000,5.666667,-80.333333,-79.666667,-80.194444,-80.333333,-437.666667,5.666667,7.333333
456254,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,-10.000000,...,9.500000,93.500000,10.000000,5.000000,5.000000,5.000000,5.000000,48.000000,10.000000,15.000000


결측치 확인 및 처리

In [27]:
# Count the missing values in each column.
pos_agg.isnull().sum()

POS_NAME_CONTRACT_STATUS_Active_MEAN                    0
POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN            0
POS_NAME_CONTRACT_STATUS_Approved_MEAN                  0
POS_NAME_CONTRACT_STATUS_Canceled_MEAN                  0
POS_NAME_CONTRACT_STATUS_Completed_MEAN                 0
POS_NAME_CONTRACT_STATUS_Demand_MEAN                    0
POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN     0
POS_NAME_CONTRACT_STATUS_Signed_MEAN                    0
POS_NAME_CONTRACT_STATUS_XNA_MEAN                       0
POS_MONTHS_BALANCE_MIN                                  0
POS_MONTHS_BALANCE_MAX                                  0
POS_MONTHS_BALANCE_MEAN                                 0
POS_MONTHS_BALANCE_MEDIAN                               0
POS_MONTHS_BALANCE_SUM                                  0
POS_MONTHS_BALANCE_SIZE                                 0
POS_CNT_INSTALMENT_MIN                                 28
POS_CNT_INSTALMENT_MAX                                 28
POS_CNT_INSTAL

In [28]:
# Drop every row that contains at least one missing value (author's note: only a
# small number of rows are affected).
pos_agg = pos_agg.dropna(axis=0)

In [30]:
# Inspect the frame after removing rows with missing values.
pos_agg

Unnamed: 0_level_0,POS_NAME_CONTRACT_STATUS_Active_MEAN,POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN,POS_NAME_CONTRACT_STATUS_Approved_MEAN,POS_NAME_CONTRACT_STATUS_Canceled_MEAN,POS_NAME_CONTRACT_STATUS_Completed_MEAN,POS_NAME_CONTRACT_STATUS_Demand_MEAN,POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN,POS_NAME_CONTRACT_STATUS_Signed_MEAN,POS_NAME_CONTRACT_STATUS_XNA_MEAN,POS_MONTHS_BALANCE_MIN,...,POS_MONTHS_PLUS_CNT_MEDIAN,POS_MONTHS_PLUS_CNT_SUM,POS_MONTHS_PLUS_CNT_SIZE,POS_MONTHS_PLUS_CNTFUT_MIN,POS_MONTHS_PLUS_CNTFUT_MAX,POS_MONTHS_PLUS_CNTFUT_MEAN,POS_MONTHS_PLUS_CNTFUT_MEDIAN,POS_MONTHS_PLUS_CNTFUT_SUM,POS_MONTHS_PLUS_CNTFUT_SIZE,POS_CNT_INSTALMENT_LAST
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,0.775000,0.0,0.0,0.0,0.225000,0.0,0.0,0.000000,0.0,-76.500000,...,-70.750000,-308.500000,4.500000,-73.500000,-73.000000,-73.375000,-73.500000,-320.000000,4.500000,4.000000
100002,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,-19.000000,...,14.000000,266.000000,19.000000,5.000000,5.000000,5.000000,5.000000,95.000000,19.000000,24.000000
100003,0.916667,0.0,0.0,0.0,0.083333,0.0,0.0,0.000000,0.0,-43.333333,...,-29.500000,-314.333333,9.333333,-35.000000,-33.000000,-33.500000,-33.333333,-354.666667,9.333333,8.333333
100004,0.750000,0.0,0.0,0.0,0.250000,0.0,0.0,0.000000,0.0,-27.000000,...,-21.500000,-87.000000,4.000000,-24.000000,-23.000000,-23.250000,-23.000000,-93.000000,4.000000,3.000000
100005,0.818182,0.0,0.0,0.0,0.090909,0.0,0.0,0.090909,0.0,-25.000000,...,-7.500000,-78.000000,11.000000,-15.000000,-12.000000,-12.300000,-12.000000,-123.000000,11.000000,9.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456251,0.777778,0.0,0.0,0.0,0.111111,0.0,0.0,0.111111,0.0,-9.000000,...,3.500000,27.000000,9.000000,-1.000000,0.000000,-0.125000,0.000000,-1.000000,9.000000,7.000000
456252,0.857143,0.0,0.0,0.0,0.142857,0.0,0.0,0.000000,0.0,-82.000000,...,-73.000000,-511.000000,7.000000,-76.000000,-76.000000,-76.000000,-76.000000,-532.000000,7.000000,6.000000
456253,0.861111,0.0,0.0,0.0,0.138889,0.0,0.0,0.000000,0.0,-84.333333,...,-74.666667,-411.000000,5.666667,-80.333333,-79.666667,-80.194444,-80.333333,-437.666667,5.666667,7.333333
456254,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,-10.000000,...,9.500000,93.500000,10.000000,5.000000,5.000000,5.000000,5.000000,48.000000,10.000000,15.000000


지금까지의 작업으로 만들어진 데이터프레임을 joblib 파일로 저장

In [31]:
# Persist the final feature table with joblib; the relative filename means the
# file is written to the current working directory.
joblib.dump(filename="pos_cash_balance_df.joblib", value=pos_agg)

['pos_cash_balance_df.joblib']