In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

In [2]:
# Load the input table from parquet.
# NOTE(review): the "02_..._라벨인코딩" file name suggests this is the
# label-encoded output of an earlier step for the test split — confirm.
df = pd.read_parquet('result/test/02_test_승인매출정보_라벨인코딩.parquet')

In [3]:
# 상수값 컬럼 제거
def remove_constant_columns(df):
    """Return only the columns of ``df`` that hold more than one distinct value.

    Distinct values are counted with ``DataFrame.nunique()`` at its default
    settings, which do not count missing values. A column that is entirely
    NaN, or one repeated value plus NaN, is therefore treated as constant
    and removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The varying columns of ``df``, in their original order.
    """
    distinct_counts = df.nunique()
    is_varying = distinct_counts.gt(1)
    return df.loc[:, is_varying]

# 결측치 비율 기준 컬럼 제거
def remove_high_na_columns(df, threshold=0.3):
    """Drop columns whose fraction of missing values is strictly above ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    threshold : float, default 0.3
        Maximum tolerated share of missing values per column (0.3 == 30%).
        Columns whose share is exactly equal to the threshold are kept.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the columns that exceeded the threshold.

    Side effects
    ------------
    Prints one line with the threshold (as a percentage) and the number of
    columns being dropped.
    """
    null_ratio = df.isnull().mean()
    to_drop = null_ratio[null_ratio > threshold].index
    # Format with ``:g`` rather than ``int(threshold * 100)``: int() truncates
    # binary floating-point error (0.29 * 100 == 28.999...), which would log
    # "28%" for a 29% threshold. ``:g`` also keeps fractional percentages.
    print(f"⤷ 결측치 {threshold * 100:g}% 초과 컬럼 수: {len(to_drop)}")
    return df.drop(columns=to_drop)

# 낮은 분산 컬럼 제거
def remove_low_variance_columns(df, threshold=0.001):
    """Keep only the columns that scikit-learn's ``VarianceThreshold`` selects.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    threshold : float, default 0.001
        Passed straight to ``VarianceThreshold(threshold=...)``. The variance
        is computed on the raw values, so the cut-off depends on each
        column's scale.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` flagged by ``selector.get_support()``, in their
        original order.

    Raises
    ------
    ValueError
        Raised by scikit-learn when no column meets the threshold.
    """
    selector = VarianceThreshold(threshold=threshold)
    # Only the fitted support mask is needed. ``fit_transform`` would also
    # build a full transformed copy of the data that is never used.
    selector.fit(df)
    return df.loc[:, selector.get_support()]

# 상관관계 높은 컬럼 제거
def remove_highly_correlated(df, threshold=0.7):
    """Drop every column that is strongly correlated with an earlier column.

    The absolute correlation matrix from ``df.corr()`` is reduced to its
    strict upper triangle, so each column is compared only with the columns
    that come before it in ``df``. A column is dropped when any of those
    absolute correlations is strictly greater than ``threshold``. The
    comparison covers all earlier columns, including ones that are themselves
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    threshold : float, default 0.7
        Absolute-correlation cut-off.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        ``df`` without the dropped columns, and the dropped column labels in
        column order.

    Side effects
    ------------
    Prints one line with the threshold and the number of dropped columns.
    """
    corr_matrix = df.corr().abs()
    # Strict upper triangle (k=1): the diagonal of self-correlations and the
    # mirrored lower half are masked out, so each pair is inspected once.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    # One vectorized column-wise reduction instead of a Python-level any()
    # per column. Masked cells are NaN, and NaN > threshold is False.
    exceeds = (upper > threshold).any(axis=0)
    to_drop = [column for column, flagged in zip(upper.columns, exceeds) if flagged]
    print(f"⤷ 상관관계 {threshold} 초과 컬럼 수: {len(to_drop)}")
    return df.drop(columns=to_drop), to_drop

# 전체 전처리 실행 함수
def preprocess_by_correlation_only(df_numeric, na_thresh=0.3, var_thresh=0.001, corr_thresh=0.7):
    """Apply the four column filters in sequence, logging the column count after each.

    Stages, in order: constant columns, columns with too many missing values,
    low-variance columns, then highly correlated columns.

    Parameters
    ----------
    df_numeric : pandas.DataFrame
        Frame to filter (the name indicates numeric columns are expected).
    na_thresh : float, default 0.3
        Forwarded to ``remove_high_na_columns`` as ``threshold``.
    var_thresh : float, default 0.001
        Forwarded to ``remove_low_variance_columns`` as ``threshold``.
    corr_thresh : float, default 0.7
        Forwarded to ``remove_highly_correlated`` as ``threshold``.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        The filtered frame and the list of columns removed by the
        correlation stage.
    """
    def _report(label, frame):
        # Each progress line is the label followed by the current column count.
        print(label, frame.shape[1])

    _report(" 초기 수치형 컬럼 수:", df_numeric)

    without_constants = remove_constant_columns(df_numeric)
    _report(" 상수값 컬럼 제거 후:", without_constants)

    without_sparse = remove_high_na_columns(without_constants, threshold=na_thresh)
    _report(" 결측치 컬럼 제거 후:", without_sparse)

    without_flat = remove_low_variance_columns(without_sparse, threshold=var_thresh)
    _report(" 낮은 분산 컬럼 제거 후:", without_flat)

    decorrelated, dropped_corr_cols = remove_highly_correlated(without_flat, threshold=corr_thresh)
    _report(" 상관관계 중복 제거 후:", decorrelated)

    return decorrelated, dropped_corr_cols

In [4]:
# Keep only the numeric-dtype columns.
df_numeric = df.select_dtypes(include=[np.number])

# Run the filtering pipeline with its default thresholds.
# NOTE(review): the surviving columns are derived from this dataset alone; if
# another split (e.g. training data) is filtered separately, the two column
# sets may differ — confirm they are aligned before modelling.
df_cleaned, dropped_corr_cols = preprocess_by_correlation_only(df_numeric)

# Summarise the result; only the first 10 dropped column names are printed.
print(" 최종 남은 컬럼 수:", df_cleaned.shape[1])
print(" 상관관계로 제거된 컬럼 수:", len(dropped_corr_cols))
print(" 제거된 컬럼 목록:", dropped_corr_cols[:10]) 

 초기 수치형 컬럼 수: 405
 상수값 컬럼 제거 후: 370
⤷ 결측치 30% 초과 컬럼 수: 2
 결측치 컬럼 제거 후: 368
 낮은 분산 컬럼 제거 후: 357
⤷ 상관관계 0.7 초과 컬럼 수: 259
 상관관계 중복 제거 후: 98
 최종 남은 컬럼 수: 98
 상관관계로 제거된 컬럼 수: 259
 제거된 컬럼 목록: ['최종이용일자_신판', '최종이용일자_일시불', '이용건수_신판_B0M', '이용건수_일시불_B0M', '이용건수_할부_무이자_B0M', '이용금액_일시불_B0M', '이용금액_할부_B0M', '이용금액_할부_유이자_B0M', '이용금액_할부_무이자_B0M', '이용금액_CA_B0M']


In [5]:
# Show the filtered frame as the cell output (the repr is truncated for a frame this size).
df_cleaned

Unnamed: 0,기준년월,최종이용일자_기본,최종이용일자_CA,최종이용일자_카드론,최종이용일자_체크,최종이용일자_할부,이용건수_신용_B0M,이용건수_할부_B0M,이용건수_할부_유이자_B0M,이용건수_CA_B0M,...,이용개월수_선결제_R6M,이용횟수_연체_R6M,가맹점매출금액_B1M,연체입금원금_B0M,건수_할부전환_R6M,신청건수_ATM_CA_B0,승인거절건수_B0M,승인거절건수_R3M,승인거절건수_BL_R3M,승인거절건수_기타_R3M
0,201807,20180731,10101,10101,10101,20160913,28,0,0,0,...,0,0,0,2814,0,0,0,0,0,0
1,201807,20180725,20170710,20171107,20180731,20180722,8,2,0,0,...,0,0,0,918,0,2,0,0,0,0
2,201807,20180711,10101,10101,10101,20160703,59,0,0,0,...,0,0,6663,222,0,0,0,3,0,0
3,201807,20180731,20150822,20150801,10101,20160701,37,0,0,0,...,0,3,0,0,0,0,0,0,0,0
4,201807,20180716,20130917,10101,20180707,20180708,19,1,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,201812,20170228,10101,10101,10101,10101,-1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599996,201812,20181130,10101,10101,10101,10101,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599997,201812,10101,10101,10101,10101,10101,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599998,201812,20181230,10101,10101,20181225,20180608,78,0,0,0,...,6,0,0,16746,0,0,0,0,0,0


In [6]:
# Persist the filtered columns; index=False leaves the DataFrame index out of the file.
df_cleaned.to_parquet("result/test/03_test_승인매출정보_전처리.parquet", index=False)