# 승인매출정보 EDA

In [1]:
import sys
sys.path.append('../utils')

In [2]:
from sklearn.pipeline import Pipeline
from sklearn import set_config
from preprocessing import (
    DateElapsedTransformer, NumericFeaturePreprocessor, ObjectFeaturePreprocessor, NumericTypeOptimizer,
)

date_transformer = DateElapsedTransformer()
numeric_transformer = NumericFeaturePreprocessor()
object_transformer = ObjectFeaturePreprocessor()
post_optimizer = NumericTypeOptimizer(mode='post')

preprocessing_pipeline = Pipeline([
    ('object', object_transformer),
    ('date', date_transformer),
    ('numeric', numeric_transformer),
    ('post_optimizer', post_optimizer),
])

# Set display configuration to visualize the pipeline
set_config(display='diagram')

# Display the pipeline
preprocessing_pipeline

In [3]:
import numpy as np

def optimize_numeric_types(df, verbose=True):
    """
    DataFrame의 수치형(int, float) 컬럼을 숫자 범위에 맞춰 다운캐스팅하여 메모리를 최적화
    
    Parameters:
    - df (pd.DataFrame): 입력 데이터프레임
    - verbose (bool): 최적화 전후 메모리 사용량 출력 여부

    Returns:
    - pd.DataFrame: 최적화된 데이터프레임
    """
    start_mem = df.memory_usage(deep=True).sum() / 1024**2 # MB단위로 변환
    
    for col in df.select_dtypes(include=['int', 'float']).columns:
        col_type = df[col].dtypes
        
        if np.issubdtype(col_type, np.integer):
            c_min = df[col].min()
            c_max = df[col].max()
            
            if c_min >= 0:
                if c_max < np.iinfo(np.uint8).max:
                    df[col] = df[col].astype(np.uint8)
                elif c_max < np.iinfo(np.uint16).max:
                    df[col] = df[col].astype(np.uint16)
                elif c_max < np.iinfo(np.uint32).max:
                    df[col] = df[col].astype(np.uint32)
                else:
                    df[col] = df[col].astype(np.uint64)
            else:
                if np.iinfo(np.int8).min <= c_min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif np.iinfo(np.int16).min <= c_min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif np.iinfo(np.int32).min <= c_min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)
                    
        elif np.issubdtype(col_type, np.floating):
            c_min = df[col].min()
            c_max = df[col].max()
            
            if np.finfo(np.float16).min <= c_min and c_max <= np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif np.finfo(np.float32).min <= c_min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    
    if verbose:
        print(f"🔹 메모리 사용량: {start_mem:.2f} MB → {end_mem:.2f} MB  ({100*(start_mem-end_mem)/start_mem:.1f}% 감소)")
    
    return df

In [4]:
from sklearn.preprocessing import LabelEncoder
from data_loader import (
    load_data,
    load_segment
)

tx_df, tx_test_df = load_data("3.승인매출정보")

✅ File: ../../dataset/train/3.승인매출정보\201807_train_승인매출정보.parquet Completed!
✅ File: ../../dataset/train/3.승인매출정보\201808_train_승인매출정보.parquet Completed!
✅ File: ../../dataset/train/3.승인매출정보\201809_train_승인매출정보.parquet Completed!
✅ File: ../../dataset/train/3.승인매출정보\201810_train_승인매출정보.parquet Completed!
✅ File: ../../dataset/train/3.승인매출정보\201811_train_승인매출정보.parquet Completed!
✅ File: ../../dataset/train/3.승인매출정보\201812_train_승인매출정보.parquet Completed!
🔹 Shape : (2400000, 406)

✅ File: ../../dataset/test/3.승인매출정보\201807_test_승인매출정보.parquet Completed!
✅ File: ../../dataset/test/3.승인매출정보\201808_test_승인매출정보.parquet Completed!
✅ File: ../../dataset/test/3.승인매출정보\201809_test_승인매출정보.parquet Completed!
✅ File: ../../dataset/test/3.승인매출정보\201810_test_승인매출정보.parquet Completed!
✅ File: ../../dataset/test/3.승인매출정보\201811_test_승인매출정보.parquet Completed!
✅ File: ../../dataset/test/3.승인매출정보\201812_test_승인매출정보.parquet Completed!
🔹 Shape : (600000, 406)


In [5]:
X = optimize_numeric_types(tx_df)
X_test = optimize_numeric_types(tx_test_df)

🔹 메모리 사용량: 9165.26 MB → 3628.61 MB  (60.4% 감소)
🔹 메모리 사용량: 2288.73 MB → 900.56 MB  (60.7% 감소)


In [6]:
X = X.drop(columns=['ID', '기준년월'])
X_test = X_test.drop(columns=['ID', '기준년월'])

X = preprocessing_pipeline.fit_transform(X)
X_test = preprocessing_pipeline.transform(X_test)

y = load_segment()
le = LabelEncoder()
y_encoded = le.fit_transform(y)

Object Feature Preprocessor Fitting...
Object Feature Preprocessor Transforming...
Date Elapsed Transformer Fitting...
✅ Total date columns: ['최종이용일자_기본', '최종이용일자_신판', '최종이용일자_CA', '최종이용일자_카드론', '최종이용일자_체크', '최종이용일자_일시불', '최종이용일자_할부', '최종카드론_대출일자']
✅ Total date columns to keep: 8
Date Elapsed Transformer Transforming...
✅ Transformed Complete!
🔹 Transformation Time: 5.20 seconds
🔹 Shape after transformation: (2400000, 509)
Numeric Feature Preprocessor Fitting...


  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan


✅ Total numeric columns: ['이용건수_신용_B0M', '이용건수_신판_B0M', '이용건수_일시불_B0M', '이용건수_할부_B0M', '이용건수_할부_유이자_B0M', '이용건수_할부_무이자_B0M', '이용건수_CA_B0M', '이용건수_체크_B0M', '이용건수_카드론_B0M', '이용금액_일시불_B0M', '이용금액_할부_B0M', '이용금액_할부_유이자_B0M', '이용금액_할부_무이자_B0M', '이용금액_CA_B0M', '이용금액_체크_B0M', '이용금액_카드론_B0M', '이용후경과월_신용', '이용후경과월_신판', '이용후경과월_일시불', '이용후경과월_할부', '이용후경과월_할부_유이자', '이용후경과월_할부_무이자', '이용후경과월_부분무이자', '이용후경과월_CA', '이용후경과월_체크', '이용후경과월_카드론', '이용건수_신용_R12M', '이용건수_신판_R12M', '이용건수_일시불_R12M', '이용건수_할부_R12M', '이용건수_할부_유이자_R12M', '이용건수_할부_무이자_R12M', '이용건수_부분무이자_R12M', '이용건수_CA_R12M', '이용건수_체크_R12M', '이용건수_카드론_R12M', '이용금액_일시불_R12M', '이용금액_할부_R12M', '이용금액_할부_유이자_R12M', '이용금액_할부_무이자_R12M', '이용금액_부분무이자_R12M', '이용금액_CA_R12M', '이용금액_체크_R12M', '이용금액_카드론_R12M', '최대이용금액_일시불_R12M', '최대이용금액_할부_R12M', '최대이용금액_할부_유이자_R12M', '최대이용금액_할부_무이자_R12M', '최대이용금액_부분무이자_R12M', '최대이용금액_CA_R12M', '최대이용금액_체크_R12M', '최대이용금액_카드론_R12M', '이용개월수_신용_R12M', '이용개월수_신판_R12M', '이용개월수_일시불_R12M', '이용개월수_할부_R12M', '이용개월수_할부_유이자_R12M', '이용개월수_할부_

# Feature Selection

In [7]:
from feature import FeatureSelector

feature_engineering_pipeline = Pipeline([
    ('feature_selector', FeatureSelector()),
    ('optimize', post_optimizer),
])

X = feature_engineering_pipeline.fit_transform(X, y_encoded)
X_test = feature_engineering_pipeline.transform(X_test)

🎯 Fitting RandomForest for Feature Selection...
✅ Selected 82 features out of 478
📊 Top Selected Features by Importance:
1) 	이용건수_신용_B0M (0.0055)
2) 	이용건수_신판_B0M (0.0058)
3) 	이용건수_일시불_B0M (0.0073)
4) 	이용건수_체크_B0M (0.0109)
5) 	이용금액_일시불_B0M (0.0328)
6) 	이용금액_CA_B0M (0.0050)
7) 	이용금액_체크_B0M (0.0099)
8) 	이용후경과월_CA (0.0033)
9) 	이용후경과월_체크 (0.0028)
10) 	이용건수_CA_R12M (0.0033)
11) 	이용금액_일시불_R12M (0.0025)
12) 	이용금액_할부_R12M (0.0030)
13) 	이용금액_할부_무이자_R12M (0.0050)
14) 	이용금액_CA_R12M (0.0081)
15) 	이용금액_체크_R12M (0.0072)
16) 	최대이용금액_일시불_R12M (0.0232)
17) 	최대이용금액_할부_R12M (0.0028)
18) 	최대이용금액_할부_무이자_R12M (0.0030)
19) 	최대이용금액_CA_R12M (0.0086)
20) 	최대이용금액_체크_R12M (0.0180)
21) 	이용개월수_신용_R12M (0.0156)
22) 	이용개월수_신판_R12M (0.0151)
23) 	이용개월수_일시불_R12M (0.0107)
24) 	이용개월수_CA_R12M (0.0045)
25) 	이용개월수_체크_R12M (0.0066)
26) 	이용건수_CA_R6M (0.0021)
27) 	이용건수_체크_R6M (0.0025)
28) 	이용금액_일시불_R6M (0.0028)
29) 	이용금액_CA_R6M (0.0034)
30) 	이용금액_체크_R6M (0.0037)
31) 	이용개월수_체크_R6M (0.0039)
32) 	이용건수_신판_R3M (0.0033)
33) 	이용건수_일시불_



Numeric Type Optimizer Transforming...


  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)


🧠 [mode=post] 메모리 최적화: 1501.46 MB → 375.37 MB (75.0% 감소)




Numeric Type Optimizer Transforming...


  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)
  return arr.astype(dtype, copy=True)


🧠 [mode=post] 메모리 최적화: 375.37 MB → 93.84 MB (75.0% 감소)


  return arr.astype(dtype, copy=True)


In [8]:
import pandas as pd

train_df = pd.concat([tx_df[['ID', '기준년월']], X], axis=1)
test_df = pd.concat([tx_test_df[['ID', '기준년월']], X_test], axis=1)

train_df.to_parquet('../../dataset/train/tx.parquet', index=False)
test_df.to_parquet('../../dataset/test/tx.parquet', index=False)