In [3]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation warnings.
import warnings
warnings.filterwarnings('ignore')

# Apply seaborn's default theme (done before the rcParams below so they are not overwritten).
sns.set()

# Matplotlib defaults: font family, figure size, font size.
plt.rcParams['font.family'] = 'Malgun Gothic'
# Alternative font family, kept for switching by hand:
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 학습 모델 저장을 위한 라이브러리
import pickle

import matplotlib as mpl
import matplotlib.font_manager as fm

In [38]:
# Load the training features (df1) and the Segment labels (segment_df) from parquet files.
df1 = pd.read_parquet('train_all.parquet')
segment_df = pd.read_parquet('train_segment.parquet')

In [40]:
# Display the label frame (rich repr; pandas truncates the middle rows).
segment_df

Unnamed: 0,ID,Segment
0,TRAIN_000000,D
1,TRAIN_000001,E
2,TRAIN_000002,C
3,TRAIN_000003,D
4,TRAIN_000004,E
...,...,...
2399995,TRAIN_399995,E
2399996,TRAIN_399996,D
2399997,TRAIN_399997,C
2399998,TRAIN_399998,E


In [7]:
# Display the raw feature frame (rich repr; pandas truncates rows and columns).
df1

Unnamed: 0,기준년월,ID,이용금액_R3M_신용체크,입회경과개월수_신용,_1순위카드이용금액,회원여부_이용가능_카드론,이용거절여부_카드론,최종카드발급경과월,이용금액_R3M_신용,_1순위카드이용건수,...,변동률_잔액_일시불_B1M,변동률_잔액_CA_B1M,혜택수혜율_R3M,혜택수혜율_B0M,잔액_한도소진율,증감율_카드론_분기,증감율_일시불_분기,증감율_체크_분기,증감율_카드론_전월,변동률_RV평잔
0,201807,TRAIN_000000,196,67,3681,0,0,22,196,26,...,0.270752,0.000000,1.044401,1.280543,4.013659,0.143423,-0.787082,-7.880561,0.048726,0.321733
1,201807,TRAIN_000001,13475,12,13323,1,0,18,13475,46,...,-0.670348,0.000000,0.000000,0.000000,7.318237,0.143423,1.121097,0.086480,0.048726,0.606623
2,201807,TRAIN_000002,23988,124,24493,0,0,20,23988,28,...,0.058114,-0.014191,0.524159,1.208420,4.426060,0.143423,0.128676,0.086480,0.048726,0.340561
3,201807,TRAIN_000003,3904,27,5933,0,0,17,3904,1,...,0.258943,0.000000,0.880925,1.657124,6.573680,0.143423,0.429497,0.086480,0.048726,0.321733
4,201807,TRAIN_000004,1190,2,0,1,0,15,0,-2,...,0.000000,0.000000,0.762016,0.986860,-2.044904,0.143423,-4.566167,-1.513947,0.048726,0.321733
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,201812,TRAIN_399995,10755,209,5640,1,0,39,7267,3,...,0.000000,0.000000,0.762016,0.986860,-1.890821,0.143423,0.199105,-0.212206,0.048726,0.321733
2399996,201812,TRAIN_399996,27636,17,26357,1,0,24,27636,38,...,-0.159143,0.000000,1.377071,2.533815,-0.785374,0.143423,-1.552244,0.086480,0.048726,0.321733
2399997,201812,TRAIN_399997,23187,115,17171,0,0,18,23187,33,...,0.126581,0.000000,0.000000,0.000000,-0.233282,0.143423,-0.211614,0.086480,0.048726,0.321733
2399998,201812,TRAIN_399998,0,71,0,1,0,27,0,-2,...,0.000000,0.000000,0.762016,0.986860,-1.998546,0.143423,0.199105,0.086480,0.048726,0.321733


In [9]:
# Drop the ID column so that only feature columns remain; df1 itself is left untouched.
df2 = df1.drop('ID', axis=1)

In [11]:
# Display the feature frame without the ID column.
df2

Unnamed: 0,기준년월,이용금액_R3M_신용체크,입회경과개월수_신용,_1순위카드이용금액,회원여부_이용가능_카드론,이용거절여부_카드론,최종카드발급경과월,이용금액_R3M_신용,_1순위카드이용건수,이용금액_R3M_체크,...,변동률_잔액_일시불_B1M,변동률_잔액_CA_B1M,혜택수혜율_R3M,혜택수혜율_B0M,잔액_한도소진율,증감율_카드론_분기,증감율_일시불_분기,증감율_체크_분기,증감율_카드론_전월,변동률_RV평잔
0,201807,196,67,3681,0,0,22,196,26,0,...,0.270752,0.000000,1.044401,1.280543,4.013659,0.143423,-0.787082,-7.880561,0.048726,0.321733
1,201807,13475,12,13323,1,0,18,13475,46,0,...,-0.670348,0.000000,0.000000,0.000000,7.318237,0.143423,1.121097,0.086480,0.048726,0.606623
2,201807,23988,124,24493,0,0,20,23988,28,0,...,0.058114,-0.014191,0.524159,1.208420,4.426060,0.143423,0.128676,0.086480,0.048726,0.340561
3,201807,3904,27,5933,0,0,17,3904,1,0,...,0.258943,0.000000,0.880925,1.657124,6.573680,0.143423,0.429497,0.086480,0.048726,0.321733
4,201807,1190,2,0,1,0,15,0,-2,1190,...,0.000000,0.000000,0.762016,0.986860,-2.044904,0.143423,-4.566167,-1.513947,0.048726,0.321733
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,201812,10755,209,5640,1,0,39,7267,3,3488,...,0.000000,0.000000,0.762016,0.986860,-1.890821,0.143423,0.199105,-0.212206,0.048726,0.321733
2399996,201812,27636,17,26357,1,0,24,27636,38,0,...,-0.159143,0.000000,1.377071,2.533815,-0.785374,0.143423,-1.552244,0.086480,0.048726,0.321733
2399997,201812,23187,115,17171,0,0,18,23187,33,0,...,0.126581,0.000000,0.000000,0.000000,-0.233282,0.143423,-0.211614,0.086480,0.048726,0.321733
2399998,201812,0,71,0,1,0,27,0,-2,0,...,0.000000,0.000000,0.762016,0.986860,-1.998546,0.143423,0.199105,0.086480,0.048726,0.321733


In [13]:
# Pairwise correlation matrix of df2's columns; shown only, the result is not stored.
df2.corr()

Unnamed: 0,기준년월,이용금액_R3M_신용체크,입회경과개월수_신용,_1순위카드이용금액,회원여부_이용가능_카드론,이용거절여부_카드론,최종카드발급경과월,이용금액_R3M_신용,_1순위카드이용건수,이용금액_R3M_체크,...,변동률_잔액_일시불_B1M,변동률_잔액_CA_B1M,혜택수혜율_R3M,혜택수혜율_B0M,잔액_한도소진율,증감율_카드론_분기,증감율_일시불_분기,증감율_체크_분기,증감율_카드론_전월,변동률_RV평잔
기준년월,1.000000,-0.013404,0.020216,-0.033708,0.005492,0.005308,0.103400,-0.015632,-0.026123,0.004078,...,-0.062000,0.039949,0.003221,0.004601,-0.055296,-0.018469,-0.134239,-0.007920,0.049775,-0.004327
이용금액_R3M_신용체크,-0.013404,1.000000,0.110286,0.884290,0.055223,0.009768,-0.089840,0.946124,0.682772,0.335934,...,-0.037468,-0.015433,-0.112003,-0.111565,0.364197,-0.012607,0.068013,-0.003616,-0.008321,0.169146
입회경과개월수_신용,0.020216,0.110286,1.000000,0.108974,0.191044,-0.193312,0.247809,0.111852,0.034857,0.015211,...,0.012345,0.000776,-0.067675,-0.065432,-0.082191,0.001086,-0.022305,0.002198,0.004217,0.153633
_1순위카드이용금액,-0.033708,0.884290,0.108974,1.000000,0.067554,-0.004051,-0.104270,0.934391,0.750783,0.012746,...,-0.043696,-0.019861,-0.128469,-0.129369,0.425546,-0.015792,0.082652,-0.011568,-0.012818,0.188945
회원여부_이용가능_카드론,0.005492,0.055223,0.191044,0.067554,1.000000,-0.635622,0.009859,0.058879,0.066882,-0.000740,...,-0.004359,0.012916,0.036009,0.036960,-0.205536,0.038062,0.001659,0.005389,-0.004374,0.085886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
증감율_카드론_분기,-0.018469,-0.012607,0.001086,-0.015792,0.038062,0.050721,0.009966,-0.013199,-0.010952,-0.000538,...,0.003122,0.004447,0.006055,0.005703,-0.059750,1.000000,0.016897,0.005153,-0.232663,-0.026503
증감율_일시불_분기,-0.134239,0.068013,-0.022305,0.082652,0.001659,-0.009410,-0.075472,0.065102,0.078337,0.020655,...,-0.009213,-0.021682,0.018290,0.019212,0.022515,0.016897,1.000000,0.020767,-0.019466,-0.014257
증감율_체크_분기,-0.007920,-0.003616,0.002198,-0.011568,0.005389,-0.002697,0.010859,-0.013423,-0.018118,0.027876,...,0.005388,0.002345,0.000612,0.000292,-0.019422,0.005153,0.020767,1.000000,-0.001804,-0.009194
증감율_카드론_전월,0.049775,-0.008321,0.004217,-0.012818,-0.004374,0.017172,0.014411,-0.009013,-0.006825,0.000522,...,-0.013632,0.019028,0.002447,0.002262,-0.022599,-0.232663,-0.019466,-0.001804,1.000000,-0.010107


### 상관계수 높은 컬럼 쌍 확인

In [15]:
THRESH = 0.8            # keep only column pairs with |r| >= THRESH

# 1) Keep numeric columns only (object / category columns are excluded).
num_df = df2.select_dtypes(include=[np.number])

# 2) Pairwise correlation matrix (DataFrame.corr with its default method).
corr = num_df.corr()

# 3) Keep the strict upper triangle. k=1 starts one step above the diagonal, so
#    self-correlations (diagonal) and mirrored duplicates (lower triangle) are
#    masked out and every pair appears exactly once.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
upper = corr.where(upper_mask)

# 4) Long format (col1, col2, corr), filtered by |r| and sorted descending.
#    The axes and the value column are named explicitly instead of relying on the
#    auto-generated "level_0" / "level_1" / 0 labels of reset_index(), which only
#    exist when the correlation matrix axes are unnamed.
#    Masked cells are NaN; they can never satisfy the threshold filter.
corr_pairs = (
    upper.rename_axis(index="col1", columns="col2")
         .stack()                                        # one row per (col1, col2)
         .rename("corr")
         .reset_index()
         .assign(abs_corr=lambda d: d["corr"].abs())     # add absolute value
         .query("abs_corr >= @THRESH")                   # threshold filter
         .sort_values("abs_corr", ascending=False)       # strongest pairs first
)

print(f"상관계수 |r| ≥ {THRESH} 인 쌍 개수:", len(corr_pairs))
display(corr_pairs.head(20))   # preview of the 20 strongest pairs

상관계수 |r| ≥ 0.8 인 쌍 개수: 436


Unnamed: 0,col1,col2,corr,abs_corr
33360,카드론이용건수_누적,카드론이용월수_누적,0.994363,0.994363
21808,쇼핑_온라인_이용금액,_1순위쇼핑업종_이용금액,0.991888,0.991888
46299,방문횟수_앱_B0M,방문일수_앱_B0M,0.991887,0.991887
12920,이용건수_체크_B0M,이용금액_체크_B0M,0.991871,0.991871
43239,대표청구서수령지구분코드,청구서수령방법,0.990924,0.990924
46184,방문횟수_PC_B0M,방문일수_PC_B0M,0.987854,0.987854
36581,이용금액_페이_오프라인_R6M,이용금액_A페이_R6M,0.985415,0.985415
46144,방문일수_앱_R6M,방문횟수_앱_R6M,0.982765,0.982765
2475,_1순위카드이용건수,이용건수_일시불_B0M,0.979249,0.979249
2806,이용금액_R3M_체크,최대이용금액_체크_R12M,0.978989,0.978989


In [32]:
# Show the full list of highly correlated pairs (pandas truncates the middle rows).
display(corr_pairs)

Unnamed: 0,col1,col2,corr,abs_corr
33360,카드론이용건수_누적,카드론이용월수_누적,0.994363,0.994363
21808,쇼핑_온라인_이용금액,_1순위쇼핑업종_이용금액,0.991888,0.991888
46299,방문횟수_앱_B0M,방문일수_앱_B0M,0.991887,0.991887
12920,이용건수_체크_B0M,이용금액_체크_B0M,0.991871,0.991871
43239,대표청구서수령지구분코드,청구서수령방법,0.990924,0.990924
...,...,...,...,...
42353,선입금원금_B5M,선결제건수_R3M,0.801301,0.801301
2335,이용금액_R3M_신용,정상입금원금_B0M,0.801150,0.801150
16663,이용금액_일시불_R12M,정상청구원금_B5M,0.800286,0.800286
13333,이용금액_일시불_B0M,정상입금원금_B0M,0.800075,0.800075


### Segment 컬럼과 비교
- 상관계수가 높게 나온 컬럼 쌍 중에서, Segment 컬럼과의 상관계수(절댓값)가 더 낮은 컬럼을 제거한다.
- 먼저 Segment 컬럼을 붙여준다.
- 상관계수가 높은 2개의 컬럼과 Segment 컬럼의 상관계수를 비교하여 낮은 쪽을 삭제한다.

In [43]:
# Greedy pruning of highly correlated feature pairs.
#
# Inputs (defined in earlier cells)
#   corr_pairs : DataFrame with columns col1 / col2 / corr / abs_corr, one row per
#                highly correlated column pair, sorted by abs_corr descending.
#   segment_df : DataFrame providing the "Segment" label. Its rows must be in the
#                same order as df1 / df2 because labels are attached by position.
#   df2        : feature DataFrame (df1 without the ID column).
#
# Steps
#   1) Label-encode Segment into integer codes.
#   2) Walk corr_pairs in order; while both columns of a pair are still alive,
#      drop the one whose |correlation with Segment| is smaller.
#   3) drop_col = dropped columns (in the order they were dropped),
#      save_col = columns that appear in corr_pairs and survived.
#
# NOTE(review): correlating against label-encoded codes treats Segment as an
# ordinal variable — confirm that this ordering is meaningful.

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# ─────────────────────────────────────────
# 1) Encode Segment and align it with df2
# ─────────────────────────────────────────
le = LabelEncoder()
seg_num = le.fit_transform(segment_df["Segment"])

if len(seg_num) != len(df2):
    raise ValueError(
        f"segment_df has {len(seg_num)} rows but df2 has {len(df2)}; "
        "labels cannot be attached by position."
    )

# Labels are attached positionally, so a different row order would silently
# mislabel every row. Verify the order through the ID column when it is present.
if "ID" in segment_df.columns and not np.array_equal(
    segment_df["ID"].to_numpy(), df1["ID"].to_numpy()
):
    raise ValueError("segment_df['ID'] and df1['ID'] are not in the same row order.")

# A Series on df2's index is enough for Series.corr; this avoids deep-copying the
# whole feature frame just to add one column.
seg_series = pd.Series(seg_num, index=df2.index, name="_segment_num")

# ─────────────────────────────────────────
# 2) Pruning loop
# ─────────────────────────────────────────
_seg_corr_cache = {}


def _abs_corr_with_segment(col):
    """Return |r| between df2[col] and the encoded Segment, computed once per column."""
    if col not in _seg_corr_cache:
        _seg_corr_cache[col] = abs(df2[col].corr(seg_series))
    return _seg_corr_cache[col]


drop_cols = set()       # fast membership test for already-dropped columns
drop_order = []         # same columns, in the order they were dropped

for c1, c2 in zip(corr_pairs["col1"], corr_pairs["col2"]):
    # Skip the pair if either side is already dropped or missing from df2.
    if (c1 in drop_cols) or (c2 in drop_cols):
        continue
    if (c1 not in df2.columns) or (c2 not in df2.columns):
        continue

    r1 = _abs_corr_with_segment(c1)
    r2 = _abs_corr_with_segment(c2)

    # Drop the column less correlated with Segment. On a tie (or when a
    # comparison involves NaN) col2 is dropped.
    loser = c1 if r1 < r2 else c2
    drop_cols.add(loser)
    drop_order.append(loser)

# ─────────────────────────────────────────
# 3) Final column lists
# ─────────────────────────────────────────
all_cols = pd.unique(corr_pairs[["col1", "col2"]].values.ravel())
save_cols = [c for c in all_cols if c not in drop_cols]

# Names used by the following cells. drop_col keeps insertion order so the
# printed list is reproducible across kernel restarts.
drop_col = list(drop_order)
save_col = save_cols

print("삭제한 컬럼(drop_col) :", drop_col)
print("남은 컬럼(save_col)  :", save_col)
print("삭제 개수 / 남은 개수:", len(drop_col), "/", len(save_col))

삭제한 컬럼(drop_col) : ['인입일수_IB_R6M', '교통_버스지하철이용금액', '이용건수_A페이_R6M', '이용금액_페이_오프라인_R6M', '이용금액_연체_B0M', '_3순위업종_이용금액', '잔액_할부_B0M', '이용개월수_페이_온라인_R6M', '방문횟수_앱_R6M', '청구서발송여부_B0', '월중평잔_일시불', '소지카드수_이용가능_신용', '이용건수_페이_온라인_B0M', '이용건수_할부_무이자_R12M', '평잔_CA_6M', '연체건수_R3M', '선결제건수_R3M', '증감율_이용건수_CA_분기', '_2순위업종_이용금액', '이용건수_C페이_R6M', '이용개월수_결제일_R3M', '청구금액_B0', '할부건수_무이자_3M_R12M', '납부_통신비이용금액', '일시불ONLY전환가능여부', '이용금액_CA_B0M', '방문일수_앱_R6M', '한도증액횟수_R12M', '이용금액_A페이_R6M', '이용횟수_선결제_R6M', '정상청구원금_B0M', '증감율_이용금액_할부_분기', '이용건수_페이_오프라인_R6M', '_1순위쇼핑업종_이용금액', '_1순위카드이용건수', '이용개월수_간편결제_R6M', '이용개월수_D페이_R6M', '평잔_할부_6M', '잔액_할부_B1M', '이용금액_할부_무이자_R12M', 'RP건수_통신_B0M', '할부금액_3M_R12M', '이용건수_페이_오프라인_B0M', '이용건수_할부_무이자_B0M', '할부건수_3M_R12M', 'RV현금서비스이자율_할인전', '이용금액_페이_온라인_R6M', '이용금액_간편결제_B0M', '홈페이지_선결제건수_R3M', '이용건수_할부_유이자_R12M', '이용금액_할부_유이자_R12M', '할부금액_유이자_3M_R12M', '교통_주유이용금액', '이용건수_온라인_B0M', '이용건수_D페이_R6M', '이용금액_오프라인_B0M', '쇼핑_슈퍼마켓_이용금액', '홈페이지_금융건수_R3M', '이용개월수_선결제_R6M', '캠페인접촉건수_R12M', '최종카

In [47]:
# Remove the columns listed in drop_col from df2, then display the reduced frame.
df3 = df2.drop(columns=drop_col)
df3

Unnamed: 0,기준년월,이용금액_R3M_신용체크,입회경과개월수_신용,이용거절여부_카드론,최종카드발급경과월,직장시도명,소지카드수_유효_신용,카드신청건수,_2순위카드이용금액,최초한도금액,...,변동률_잔액_B1M,변동률_잔액_일시불_B1M,변동률_잔액_CA_B1M,혜택수혜율_B0M,잔액_한도소진율,증감율_카드론_분기,증감율_일시불_분기,증감율_체크_분기,증감율_카드론_전월,변동률_RV평잔
0,201807,196,67,0,22,9,1,0,0,0,...,0.261886,0.270752,0.000000,1.280543,4.013659,0.143423,-0.787082,-7.880561,0.048726,0.321733
1,201807,13475,12,0,18,1,1,0,0,0,...,-0.563388,-0.670348,0.000000,0.000000,7.318237,0.143423,1.121097,0.086480,0.048726,0.606623
2,201807,23988,124,0,20,9,1,0,0,0,...,-0.046516,0.058114,-0.014191,1.208420,4.426060,0.143423,0.128676,0.086480,0.048726,0.340561
3,201807,3904,27,0,17,8,2,1,0,0,...,0.023821,0.258943,0.000000,1.657124,6.573680,0.143423,0.429497,0.086480,0.048726,0.321733
4,201807,1190,2,0,15,4,1,1,0,0,...,0.000000,0.000000,0.000000,0.986860,-2.044904,0.143423,-4.566167,-1.513947,0.048726,0.321733
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,201812,10755,209,0,39,11,1,0,0,0,...,0.000000,0.000000,0.000000,0.986860,-1.890821,0.143423,0.199105,-0.212206,0.048726,0.321733
2399996,201812,27636,17,0,24,12,1,0,0,0,...,-0.203251,-0.159143,0.000000,2.533815,-0.785374,0.143423,-1.552244,0.086480,0.048726,0.321733
2399997,201812,23187,115,0,18,9,1,0,0,0,...,0.027319,0.126581,0.000000,0.000000,-0.233282,0.143423,-0.211614,0.086480,0.048726,0.321733
2399998,201812,0,71,0,27,8,1,0,0,0,...,0.000000,0.000000,0.000000,0.986860,-1.998546,0.143423,0.199105,0.086480,0.048726,0.321733


### 남은 컬럼들과 Segment 컬럼과의 상관계수를 확인한다.

In [52]:
"""
Rank the remaining features by their correlation with Segment.

df3        : feature DataFrame left after removing the columns in drop_col
segment_df : DataFrame providing the 'Segment' column (assumed row-aligned with df3)

Steps
-----
1) Label-encode Segment into integer codes
2) Attach the codes to a copy of df3
3) Correlate every feature column with the encoded Segment
4) Sort by absolute correlation, largest first, and display
"""

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# ──────────────────────────────────────────────
# 1) Encode Segment and attach it to a copy of df3
# ──────────────────────────────────────────────
le = LabelEncoder()
seg_num = le.fit_transform(segment_df["Segment"])

df4 = df3.assign(_segment_num=seg_num)      # new frame: df3 + encoded Segment

# ──────────────────────────────────────────────
# 2) Numeric-only working copy
#    - object columns are replaced by their category codes
# ──────────────────────────────────────────────
object_columns = df4.select_dtypes(include="object").columns
df_corr_ready = df4.assign(
    **{name: df4[name].astype("category").cat.codes for name in object_columns}
)

# 3) Correlation of every feature column with the encoded Segment
segment_codes = df_corr_ready["_segment_num"]
feature_frame = df_corr_ready.drop(columns=["_segment_num"])
corr_series = feature_frame.corrwith(segment_codes)

# 4) Tabulate and sort by absolute correlation (descending)
corr_df = corr_series.to_frame("corr")
corr_df["abs_corr"] = corr_df["corr"].abs()
corr_df = corr_df.sort_values("abs_corr", ascending=False)

display(corr_df)

Unnamed: 0,corr,abs_corr
정상청구원금_B5M,-6.607380e-01,6.607380e-01
이용금액_R3M_신용체크,-6.228267e-01,6.228267e-01
이용금액대,-5.682901e-01,5.682901e-01
정상입금원금_B0M,-5.492801e-01,5.492801e-01
_2순위쇼핑업종_이용금액,-4.873637e-01,4.873637e-01
...,...,...
시장단기연체여부_R6M,1.307753e-03,1.307753e-03
시장단기연체여부_R3M,1.071772e-03,1.071772e-03
시장연체상환여부_R6M,2.984966e-04,2.984966e-04
기준년월,-6.874129e-20,6.874129e-20


### 남은 컬럼 중 Segment와의 상관계수의 절댓값이 0.1 미만인 컬럼을 제거한다.

In [55]:
# ----------------------------------------------------------
# corr_df : per-column correlation with Segment (columns: 'corr', 'abs_corr'),
#           indexed by feature name
# df3     : feature frame after pruning correlated pairs
#           (the Segment column is not attached to it)
# ----------------------------------------------------------

# Minimum |r| with Segment that a column needs in order to be kept.
MIN_ABS_CORR = 0.1

# 1) Columns to remove.
#    Written as "not (|r| >= threshold)" instead of "|r| < threshold": a NaN
#    correlation (e.g. from a constant column) fails both comparisons, so the
#    plain "<" test would silently keep columns whose correlation is undefined.
low_corr_mask = ~(corr_df["abs_corr"] >= MIN_ABS_CORR)
low_corr_cols = corr_df.loc[low_corr_mask].index.tolist()
print(f"삭제 대상 컬럼 (|r| < {MIN_ABS_CORR}):", low_corr_cols)

# 2) New frame without those columns (df3 itself is left untouched).
df5 = df3.drop(columns=low_corr_cols, errors="ignore")

print("원본 df3 shape :", df3.shape)
print("df5  shape     :", df5.shape)   # check the resulting column count

삭제 대상 컬럼 (|r| < 0.1): ['이용금액_B페이_R6M', '일시상환론한도금액', '할인건수_B0M', '여유_Pet이용금액', '여유_항공이용금액', '한도증액금액_R12M', 'RP건수_아파트_B0M', '컨택건수_이용유도_청구서_B0M', '증감율_이용건수_할부_전월', '이용건수_선결제_B0M', 'RV_최대잔액_R12M', '이용금액_연체_R6M', '증감율_이용금액_일시불_전월', '혜택수혜율_B0M', '여유_공원이용금액', '컨택건수_이용유도_TM_B0M', '증감율_이용건수_체크_전월', '컨택건수_이용유도_인터넷_R6M', '할부금액_부분_12M_R12M', '이용건수_카드론_R12M', '연체건수_R6M', '할인건수_R3M', '변동률_일시불평잔', '증감율_이용건수_CA_전월', '월상환론한도금액', '변동률_CA평잔', '컨택건수_신용발급_TM_R6M', '컨택건수_이용유도_LMS_B0M', '카드신청건수', '강제한도감액금액_R12M', 'CL이자율_할인전', 'RV전환가능여부', '이용거절여부_카드론', '증감율_이용건수_할부_분기', 'RV최소결제비율', '증감율_이용금액_CA_분기', 'RP건수_가스_B0M', '컨택건수_포인트소진_TM_R6M', '컨택건수_이용유도_청구서_R6M', '직장시도명', '증감율_이용금액_체크_전월', '청구서수령방법', 'RP건수_전기_B0M', '증감율_카드론_분기', 'RP건수_보험_B0M', '최종카드발급경과월', '증감율_이용금액_할부_전월', '캠페인접촉일수_R12M', '변동률_할부평잔', '최종카드론_거치개월수', '할부금액_14M_R12M', 'RP건수_제휴사서비스직접판매_B0M', '증감율_이용금액_CA_전월', '변동률_잔액_CA_B1M', '연체감액여부_R3M', '할부금액_유이자_14M_R12M', '증감_RP건수_전월', '증감율_체크_분기', '증감율_이용건수_일시불_전월', '자발한도감액금액_R12M', '증감율_카드론_전월', '변동률_잔액_일시불_B1M',

In [57]:
# Display the feature frame that remains after the low-correlation filter.
df5

Unnamed: 0,이용금액_R3M_신용체크,입회경과개월수_신용,소지카드수_유효_신용,_2순위카드이용금액,최초한도금액,CA한도금액,CA이자율_할인전,시장연체상환여부_R3M,이용건수_할부_B0M,이용건수_할부_유이자_B0M,...,IB문의건수_사용승인내역_R6M,IB문의건수_선결제_R6M,불만제기후경과월_R12M,홈페이지_금융건수_R6M,홈페이지_선결제건수_R6M,인입횟수_ARS_R6M,컨택건수_이용유도_TM_R6M,컨택건수_이용유도_EM_R6M,잔액_한도소진율,변동률_RV평잔
0,196,67,1,0,0,7270,22.995207,0,1,0,...,0,0,12,0,0,1,3,57,4.013659,0.321733
1,13475,12,1,0,0,5718,14.793821,0,0,0,...,0,0,12,0,0,0,2,2,7.318237,0.606623
2,23988,124,1,0,0,35207,22.014276,0,0,0,...,0,0,12,11,6,0,2,12,4.426060,0.340561
3,3904,27,2,0,0,6531,22.998014,0,1,1,...,0,0,12,0,0,1,2,35,6.573680,0.321733
4,1190,2,1,0,0,47149,14.661948,0,0,0,...,0,0,0,0,0,0,7,0,-2.044904,0.321733
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,10755,209,1,0,0,10167,15.243670,0,0,0,...,0,0,0,0,0,0,0,0,-1.890821,0.321733
2399996,27636,17,1,0,0,31159,14.843464,0,0,0,...,0,0,12,0,0,0,0,58,-0.785374,0.321733
2399997,23187,115,1,0,0,19429,17.038599,0,0,0,...,0,0,12,0,0,0,0,0,-0.233282,0.321733
2399998,0,71,1,0,0,4228,15.182880,0,0,0,...,0,0,0,0,0,0,0,0,-1.998546,0.321733


### '기준년월', 'ID' 컬럼을 추가한다.

In [65]:
# ---------------------------------------------------------
# df1 : raw frame (contains '기준년월' and 'ID')
# df5 : selected feature set
# Goal: put df1's '기준년월' and 'ID' at the front of df5.
# ---------------------------------------------------------

KEY_COLS = ["기준년월", "ID"]

# Values are attached by position, so both frames must have the same row count.
if len(df1) != len(df5):
    raise ValueError(
        f"df1 has {len(df1)} rows but df5 has {len(df5)}; "
        "key columns cannot be attached by position."
    )

# (1) Start from a fresh frame without the key columns.
#     drop() returns a new object, so the frame shown by earlier cells is not
#     mutated, and errors="ignore" makes the cell safe to re-run: without it a
#     second execution fails in insert() because the columns already exist.
df5 = df5.drop(columns=KEY_COLS, errors="ignore")

# (2) Insert '기준년월' at position 0 and 'ID' at position 1.
for position, key in enumerate(KEY_COLS):
    df5.insert(loc=position, column=key, value=df1[key].values)

# (3) Check the result.
print("df5 shape :", df5.shape)
display(df5.head())

df5 shape : (2400000, 92)


Unnamed: 0,기준년월,ID,이용금액_R3M_신용체크,입회경과개월수_신용,소지카드수_유효_신용,_2순위카드이용금액,최초한도금액,CA한도금액,CA이자율_할인전,시장연체상환여부_R3M,...,IB문의건수_사용승인내역_R6M,IB문의건수_선결제_R6M,불만제기후경과월_R12M,홈페이지_금융건수_R6M,홈페이지_선결제건수_R6M,인입횟수_ARS_R6M,컨택건수_이용유도_TM_R6M,컨택건수_이용유도_EM_R6M,잔액_한도소진율,변동률_RV평잔
0,201807,TRAIN_000000,196,67,1,0,0,7270,22.995207,0,...,0,0,12,0,0,1,3,57,4.013659,0.321733
1,201807,TRAIN_000001,13475,12,1,0,0,5718,14.793821,0,...,0,0,12,0,0,0,2,2,7.318237,0.606623
2,201807,TRAIN_000002,23988,124,1,0,0,35207,22.014276,0,...,0,0,12,11,6,0,2,12,4.42606,0.340561
3,201807,TRAIN_000003,3904,27,2,0,0,6531,22.998014,0,...,0,0,12,0,0,1,2,35,6.57368,0.321733
4,201807,TRAIN_000004,1190,2,1,0,0,47149,14.661948,0,...,0,0,0,0,0,0,7,0,-2.044904,0.321733


### 저장한다.

In [69]:
# Write the final frame (df5) to a parquet file in the working directory.
df5.to_parquet('train_all_high_corr.parquet')