In [1]:
import joblib

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier 
from sklearn.model_selection import train_test_split

### LGBMClassifier 모델에 넣기 위해 최종적인 데이터프레임을 생성

1. 전처리를 끝낸 각각의 데이터프레임 로드


2. 기준이 되는 feature를 정해 데이터 프레임을 결합


3. 데이터프레임을 모델에 넣기 적합하게 수정

In [None]:
# Load the already-preprocessed tables. The unpacking order matches the file
# order in the tuple below:
#   df      - train and test application rows combined
#   bur_df  - bureau merged with bureau_balance
#   ccb_df  - credit_card_balance
#   prev_df - previous_application
#   inst_df - installments_payments
#   pos_df  - POS_CASH_balance
# NOTE(review): joblib.load unpickles its input, so only load trusted files.
df, bur_df, ccb_df, prev_df, inst_df, pos_df = (
    joblib.load(artifact)
    for artifact in (
        'df_final_2.joblib',
        'bur_df_3_nodrop.joblib',
        'ccb_df_final_2.joblib',
        'prev_df_n_2.joblib',
        'ins_df.joblib',
        'POS_CASH_balance_df_final2.joblib',
    )
)

In [2]:
# Left-join every auxiliary table onto the application rows. With `on=`,
# DataFrame.join matches df's SK_ID_CURR column against the *index* of the
# other frame, so each auxiliary frame is presumably indexed by SK_ID_CURR
# (TODO confirm). Applicants without a match keep NaN in the joined columns.
df = (
    df
    .join(bur_df, how='left', on='SK_ID_CURR')
    .join(prev_df, how='left', on='SK_ID_CURR')
    .join(pos_df, how='left', on='SK_ID_CURR')
    .join(inst_df, how='left', on='SK_ID_CURR')
    .join(ccb_df, how='left', on='SK_ID_CURR')
)

In [3]:
# Rows with a known TARGET form the training set; rows whose TARGET is
# missing form the test set.
train = df.loc[df['TARGET'].notna()]
test = df.loc[df['TARGET'].isna()]

# Keep the labels aside before they are removed from the feature matrix.
train_labels = train['TARGET']

# TARGET must not be fed to the model as a feature, so drop it from both sets.
train = train.drop('TARGET', axis=1)
test = test.drop('TARGET', axis=1)

In [4]:
def _alnum_only(name):
    """Return str(name) with every non-alphanumeric character replaced by '_'."""
    return "".join(ch if ch.isalnum() else "_" for ch in str(name))


# Rewrite the column names of both frames so they can be passed to
# LGBMClassifier: anything that is not a letter or digit becomes '_'.
train.columns = [_alnum_only(col) for col in train.columns]
test.columns = [_alnum_only(col) for col in test.columns]

### 모든 데이터프레임을 결합하여 생성된 데이터프레임은 피쳐의 개수가 너무 많기 때문에

### 모델에 fitting시키고 결과를 도출해내는데 굉장히 많은 시간이 소요됨. 피쳐의 개수를 줄여줄 필요가 있음.

1. feature_importances, 즉 피쳐의 중요도가 0인 피쳐들을 제외

    - LGBMClassifier의 feature_importances_ 속성을 이용
    
    
2. 피쳐들 간의 상관계수를 도출하여 0.9(90%) 이상의 상관계수를 갖는 피쳐들에 대해 두 피쳐 중 하나를 데이터프레임에서 제외시킴
    
    - corr() 함수 사용

In [5]:
# Number of train/validation splits whose importances are averaged. The loop
# bound and the divisor below both read this value, so they cannot drift apart.
n_importance_runs = 2

# Running sum of the per-run importances, one slot per column of train.
feature_importances = np.zeros(train.shape[1])

# n_estimators is only an upper bound; early stopping ends each fit sooner.
model = LGBMClassifier(objective='binary', n_estimators = 10000)

for seed in range(n_importance_runs):
    # Hold out 25% of the labelled rows for validation. The run index doubles
    # as the random seed, so every run sees a different split.
    train_features, valid_features, train_y, valid_y = train_test_split(
        train, train_labels, test_size = 0.25, random_state = seed)

    # Fit with AUC on the held-out rows as the metric and stop once it has not
    # improved for 100 rounds; progress is logged every 100 rounds.
    # NOTE(review): LightGBM >= 4.0 removed `early_stopping_rounds` and
    # `verbose` from fit() in favour of callbacks (lightgbm.early_stopping /
    # lightgbm.log_evaluation). Switch to callbacks when upgrading.
    model.fit(train_features, train_y, early_stopping_rounds=100,
              eval_set = [(valid_features, valid_y)],
              eval_metric = 'auc', verbose = 100)

    # Accumulate this run's importances.
    feature_importances += model.feature_importances_

# Turn the accumulated sum into the mean importance over all runs.
feature_importances = feature_importances / n_importance_runs

# Pair every column name with its mean importance, most important first.
feature_importances = pd.DataFrame(
    {'feature': list(train.columns), 'importance': feature_importances}
).sort_values('importance', ascending = False)

# Names of the columns whose mean importance is exactly zero.
zero_features = feature_importances.loc[
    feature_importances['importance'] == 0.0, 'feature'
].tolist()

# Report how many features had zero importance.
print('There are %d features with 0.0 importance' % len(zero_features))

Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.778847	valid_0's binary_logloss: 0.241911
[200]	valid_0's auc: 0.778394	valid_0's binary_logloss: 0.242094
Early stopping, best iteration is:
[141]	valid_0's auc: 0.779138	valid_0's binary_logloss: 0.241835
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.775815	valid_0's binary_logloss: 0.240037
[200]	valid_0's auc: 0.776491	valid_0's binary_logloss: 0.239978
Early stopping, best iteration is:
[179]	valid_0's auc: 0.776661	valid_0's binary_logloss: 0.239866
There are 679 features with 0.0 importance


In [6]:
# Show the (rows, columns) of both frames before any feature is removed.
for caption, frame in (('Training shape: ', train), ('Testing shape: ', test)):
    print(caption, frame.shape)

Training shape:  (306187, 1520)
Testing shape:  (48744, 1520)


In [7]:
# Remove every zero-importance feature from both frames, so train and test
# keep identical columns.
train, test = (
    frame.drop(zero_features, axis=1) for frame in (train, test)
)

In [8]:
# Show the (rows, columns) of both frames after the zero-importance features
# were removed.
for caption, frame in (('Training shape: ', train), ('Testing shape: ', test)):
    print(caption, frame.shape)

Training shape:  (306187, 841)
Testing shape:  (48744, 841)


In [11]:
# Put the TARGET column back on train. train_labels was taken from train
# before TARGET was dropped, and pandas aligns the assigned Series on the row
# index, so every row gets its own label back.
train['TARGET'] = train_labels

In [14]:
# Stack train and test back into one frame. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat gives the same result: row indexes
# are kept, and test rows get NaN for TARGET, the column only train has.
df2 = pd.concat([train, test])

In [17]:
# Inspect the combined frame (pandas truncates the rendered rows and columns).
df2

Unnamed: 0_level_0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,CCB_CNT_DRAWINGS_POS_CURRENT_RATIO_MEAN,CCB_CNT_DRAWINGS_POS_CURRENT_RATIO_MEDIAN,CCB_CNT_DRAWINGS_POS_CURRENT_RATIO_SUM,CCB_AMT_RECIVABLE_DIFF_MEAN,CCB_SK_DPD_LOW_LOAN_MAX,CCB_NAME_CONTRACT_STATUS_Active_SUM,CCB_NAME_CONTRACT_STATUS_Completed_MEAN,CCB_NAME_CONTRACT_STATUS_Completed_SUM,CCB_NAME_CONTRACT_STATUS_Signed_MEAN,TARGET
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,100002,0,1,0,1,202500.0,406597.5,24700.5,351000.0,0.018801,...,,,,,,,,,,1.0
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,,,,,,,,,,0.0
2,100004,1,1,1,1,67500.0,135000.0,6750.0,135000.0,0.010032,...,,,,,,,,,,0.0
3,100006,0,0,0,1,135000.0,312682.5,29686.5,297000.0,0.008019,...,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0
4,100007,0,1,0,1,121500.0,513000.0,21865.5,513000.0,0.028663,...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,0,0,0,1,121500.0,412560.0,17473.5,270000.0,0.002042,...,,,,,,,,,,
48740,456222,0,0,0,0,157500.0,622413.0,31909.5,495000.0,0.035792,...,,,,,,,,,,
48741,456223,0,0,1,1,202500.0,315000.0,33205.5,315000.0,0.026392,...,,,,,,,,,,
48742,456224,0,1,0,0,225000.0,450000.0,25128.0,450000.0,0.018850,...,,,,,,,,,,


In [19]:
# Persist the importance-filtered frame as a joblib file so later steps can
# start from it.
joblib.dump(df2, 'df_fi_n_2.joblib')

['df_fi_n_2.joblib']

In [4]:
# Reload the importance-filtered frame written by the earlier joblib.dump cell.
# The execution counters restart here, so this is the entry point after a
# kernel restart.
df = joblib.load(filename='df_fi_n_2.joblib')

In [5]:
# Correlation cut-off: column pairs above this absolute value count as
# redundant.
threshold = 0.9

# Absolute pairwise correlations between all columns of df (DataFrame.corr
# defaults to Pearson); the sign is discarded because only the strength matters.
corr_matrix = np.abs(df.corr())
corr_matrix.iloc[:5]

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,CCB_CNT_DRAWINGS_POS_CURRENT_RATIO_MEAN,CCB_CNT_DRAWINGS_POS_CURRENT_RATIO_MEDIAN,CCB_CNT_DRAWINGS_POS_CURRENT_RATIO_SUM,CCB_AMT_RECIVABLE_DIFF_MEAN,CCB_SK_DPD_LOW_LOAN_MAX,CCB_NAME_CONTRACT_STATUS_Active_SUM,CCB_NAME_CONTRACT_STATUS_Completed_MEAN,CCB_NAME_CONTRACT_STATUS_Completed_SUM,CCB_NAME_CONTRACT_STATUS_Signed_MEAN,TARGET
SK_ID_CURR,1.0,0.001293,0.000896,0.001472,0.000283,0.001562,0.000503,0.000752,0.000501,0.001176,...,0.002765,0.003598,0.00152,0.002478,0.002748,0.000885,0.001411,0.00446,0.007065,0.002079
NAME_CONTRACT_TYPE,0.001293,1.0,0.006977,0.005874,0.063175,0.012775,0.199356,0.228087,0.167613,0.020265,...,0.021024,0.023767,0.013254,0.000421,0.007507,0.009614,0.175065,0.136193,0.002311,0.029765
CODE_GENDER,0.000896,0.006977,1.0,0.343874,0.041553,0.164441,0.016382,0.076275,0.017703,0.013194,...,0.024633,0.016484,0.023302,0.002193,0.002908,0.002498,0.010384,0.010275,0.002854,0.054992
FLAG_OWN_CAR,0.001472,0.005874,0.343874,1.0,0.002031,0.186674,0.116867,0.143483,0.121483,0.038988,...,0.007414,0.007834,0.012716,4e-06,0.003645,0.043133,0.007449,0.000126,0.002166,0.021908
FLAG_OWN_REALTY,0.000283,0.063175,0.041553,0.002031,1.0,0.006965,0.034521,0.001228,0.042273,0.018172,...,0.006091,0.006891,0.002599,0.001445,0.011076,0.052261,0.01056,0.003755,0.002926,0.006221


In [6]:
# A column's correlation with itself is always 1 and the matrix is symmetric,
# so keep only the strict upper triangle (k=1 excludes the diagonal) and turn
# the diagonal and everything below it into NaN.
# The mask is built as a boolean array directly: the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24, so the builtin `bool` is used.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
upper.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,CCB_CNT_DRAWINGS_POS_CURRENT_RATIO_MEAN,CCB_CNT_DRAWINGS_POS_CURRENT_RATIO_MEDIAN,CCB_CNT_DRAWINGS_POS_CURRENT_RATIO_SUM,CCB_AMT_RECIVABLE_DIFF_MEAN,CCB_SK_DPD_LOW_LOAN_MAX,CCB_NAME_CONTRACT_STATUS_Active_SUM,CCB_NAME_CONTRACT_STATUS_Completed_MEAN,CCB_NAME_CONTRACT_STATUS_Completed_SUM,CCB_NAME_CONTRACT_STATUS_Signed_MEAN,TARGET
SK_ID_CURR,,0.001293,0.000896,0.001472,0.000283,0.001562,0.000503,0.000752,0.000501,0.001176,...,0.002765,0.003598,0.00152,0.002478,0.002748,0.000885,0.001411,0.00446,0.007065,0.002079
NAME_CONTRACT_TYPE,,,0.006977,0.005874,0.063175,0.012775,0.199356,0.228087,0.167613,0.020265,...,0.021024,0.023767,0.013254,0.000421,0.007507,0.009614,0.175065,0.136193,0.002311,0.029765
CODE_GENDER,,,,0.343874,0.041553,0.164441,0.016382,0.076275,0.017703,0.013194,...,0.024633,0.016484,0.023302,0.002193,0.002908,0.002498,0.010384,0.010275,0.002854,0.054992
FLAG_OWN_CAR,,,,,0.002031,0.186674,0.116867,0.143483,0.121483,0.038988,...,0.007414,0.007834,0.012716,4e-06,0.003645,0.043133,0.007449,0.000126,0.002166,0.021908
FLAG_OWN_REALTY,,,,,,0.006965,0.034521,0.001228,0.042273,0.018172,...,0.006091,0.006891,0.002599,0.001445,0.011076,0.052261,0.01056,0.003755,0.002926,0.006221


In [7]:
# Columns that must survive the pruning whatever they correlate with: the row
# identifier and the label. Both take part in the correlation matrix, so
# without this guard a feature correlating above the threshold with TARGET
# would cause TARGET itself to be dropped.
protected_columns = {'SK_ID_CURR', 'TARGET'}

# For each column, whether any correlation in the upper triangle exceeds the
# threshold (NaN compares as False). Only the later column of each correlated
# pair is flagged, so one member of the pair is always kept.
exceeds_threshold = (upper > threshold).any()

# Build the list of columns to remove, skipping the protected ones.
to_drop = [
    column
    for column, flagged in zip(upper.columns, exceeds_threshold)
    if flagged and column not in protected_columns
]

# Report how many columns will be removed.
print('There are %d columns to remove.' % (len(to_drop)))

There are 312 columns to remove.


In [8]:
# Remove the columns selected by the correlation filter.
df = df.drop(to_drop, axis=1)

# Check the resulting (rows, columns).
df.shape

(354931, 530)

In [9]:
# Persist the final frame (importance- and correlation-filtered) as a joblib
# file.
joblib.dump(df, "df_fi_corr_n_2.joblib")

['df_fi_corr_n_2.joblib']