# 성과정보 EDA

In [1]:
import sys
# Make the project-local helper modules under ../utils importable.
# The membership check keeps this cell idempotent: re-running it does not
# pile up duplicate entries on sys.path.
UTILS_DIR = '../utils'
if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)

In [2]:
from sklearn.pipeline import Pipeline
from sklearn import set_config
from preprocessing import (
    DateElapsedTransformer, NumericFeaturePreprocessor, ObjectFeaturePreprocessor, NumericTypeOptimizer,
)

# Render estimators as diagrams when they are the last expression of a cell.
set_config(display='diagram')

# One instance per preprocessing stage; they stay at module level so later
# cells can refer to them by name.
pre_optimizer = NumericTypeOptimizer(mode='pre')
date_transformer = DateElapsedTransformer()
numeric_transformer = NumericFeaturePreprocessor()
object_transformer = ObjectFeaturePreprocessor()
post_optimizer = NumericTypeOptimizer(mode='post')

# Execution order: pre-optimizer -> object -> date -> numeric -> post-optimizer.
preprocessing_steps = [
    ('pre_optimizer', pre_optimizer),
    ('object', object_transformer),
    ('date', date_transformer),
    ('numeric', numeric_transformer),
    ('post_optimizer', post_optimizer),
]
preprocessing_pipeline = Pipeline(steps=preprocessing_steps)

# Show the assembled pipeline as the cell output.
preprocessing_pipeline

In [3]:
from sklearn.preprocessing import LabelEncoder
from data_loader import (
    load_data,
    load_segment
)

# Identifier columns that are excluded from the model features; kept in one
# place so the train and test frames are always stripped identically.
KEY_COLUMNS = ['ID', '기준년월']

perf_df, perf_test_df = load_data("8.성과정보")

# Fit the preprocessing on the training features only, then apply the already
# fitted pipeline to the test features.
X = perf_df.drop(columns=KEY_COLUMNS)
X = preprocessing_pipeline.fit_transform(X)
X_test = preprocessing_pipeline.transform(perf_test_df.drop(columns=KEY_COLUMNS))

# Encode the segment labels as integers for the downstream estimators.
# NOTE(review): this assumes load_segment() returns labels in the same row
# order as perf_df -- confirm against the loader.
y = load_segment()
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Fail fast with a readable message if features and labels cannot be paired,
# instead of surfacing the mismatch later inside a supervised fit.
assert len(y_encoded) == len(X), (
    f"feature/label row mismatch: X has {len(X)} rows, y has {len(y_encoded)}"
)

X.info()

✅ File: ../../dataset/train/8.성과정보\201807_train_성과정보.parquet Completed!
✅ File: ../../dataset/train/8.성과정보\201808_train_성과정보.parquet Completed!
✅ File: ../../dataset/train/8.성과정보\201809_train_성과정보.parquet Completed!
✅ File: ../../dataset/train/8.성과정보\201810_train_성과정보.parquet Completed!
✅ File: ../../dataset/train/8.성과정보\201811_train_성과정보.parquet Completed!
✅ File: ../../dataset/train/8.성과정보\201812_train_성과정보.parquet Completed!
🔹 Shape : (2400000, 49)

✅ File: ../../dataset/test/8.성과정보\201807_test_성과정보.parquet Completed!
✅ File: ../../dataset/test/8.성과정보\201808_test_성과정보.parquet Completed!
✅ File: ../../dataset/test/8.성과정보\201809_test_성과정보.parquet Completed!
✅ File: ../../dataset/test/8.성과정보\201810_test_성과정보.parquet Completed!
✅ File: ../../dataset/test/8.성과정보\201811_test_성과정보.parquet Completed!
✅ File: ../../dataset/test/8.성과정보\201812_test_성과정보.parquet Completed!
🔹 Shape : (600000, 49)
Numeric Type Optimizer Transforming...
🧠 [mode=pre] 메모리 최적화: 860.60 MB → 860.60 MB (0.0% 감소)
Object 

## Feature Selection

In [4]:
from feature import FeatureSelector

# Feature selection followed by a numeric dtype optimisation pass.
#
# A fresh NumericTypeOptimizer is created here on purpose. `post_optimizer` is
# already the final step of `preprocessing_pipeline`, and Pipeline fits its
# steps in place without cloning them, so reusing that instance would re-fit an
# object owned by the other pipeline.
# NOTE(review): whether a re-fit changes the optimizer's state depends on its
# implementation -- a separate instance is safe either way.
feature_engineering_pipeline = Pipeline([
    ('feature_selector', FeatureSelector()),
    ('optimize', NumericTypeOptimizer(mode='post')),
])

# Selection is supervised: fit on the training features with the encoded
# labels, then apply the same selection to the test features.
X = feature_engineering_pipeline.fit_transform(X, y_encoded)
X_test = feature_engineering_pipeline.transform(X_test)

🎯 Fitting RandomForest for Feature Selection...
✅ Selected 14 features out of 47
📊 Top Selected Features by Importance:
1) 	증감율_이용건수_체크_전월 (0.0274)
2) 	증감율_이용금액_일시불_전월 (0.0213)
3) 	증감율_이용금액_체크_전월 (0.0509)
4) 	잔액_신판평균한도소진율_r6m (0.0365)
5) 	잔액_신판최대한도소진율_r6m (0.0339)
6) 	잔액_신판평균한도소진율_r3m (0.0237)
7) 	잔액_신판최대한도소진율_r3m (0.0493)
8) 	잔액_신판ca평균한도소진율_r6m (0.0492)
9) 	잔액_신판ca최대한도소진율_r6m (0.0685)
10) 	잔액_신판ca평균한도소진율_r3m (0.0508)
11) 	잔액_신판ca최대한도소진율_r3m (0.0681)
12) 	변동률_일시불평잔 (0.0235)
13) 	혜택수혜율_R3M (0.0780)
14) 	혜택수혜율_B0M (0.1301)




Numeric Type Optimizer Transforming...
🧠 [mode=post] 메모리 최적화: 64.09 MB → 64.09 MB (0.0% 감소)




Numeric Type Optimizer Transforming...
🧠 [mode=post] 메모리 최적화: 16.02 MB → 16.02 MB (0.0% 감소)


In [5]:
import pandas as pd

# Re-attach the identifier columns that were dropped before preprocessing and
# persist the selected features.
KEY_COLUMNS = ['ID', '기준년월']

# pd.concat(axis=1) aligns rows by index label, not by position. Both sides are
# given a fresh 0..n-1 index so rows are paired positionally even if the
# pipeline output does not carry the same index as the raw frame. The index is
# not written to parquet (index=False), so this does not change the saved data
# when the indexes already matched.
train_keys = perf_df[KEY_COLUMNS].reset_index(drop=True)
test_keys = perf_test_df[KEY_COLUMNS].reset_index(drop=True)

# Positional pairing is only meaningful when no rows were added or dropped.
assert len(train_keys) == len(X), (
    f"train row mismatch: {len(train_keys)} key rows vs {len(X)} feature rows"
)
assert len(test_keys) == len(X_test), (
    f"test row mismatch: {len(test_keys)} key rows vs {len(X_test)} feature rows"
)

train_df = pd.concat([train_keys, X.reset_index(drop=True)], axis=1)
test_df = pd.concat([test_keys, X_test.reset_index(drop=True)], axis=1)

train_df.to_parquet('../../dataset/train/perf.parquet', index=False)
test_df.to_parquet('../../dataset/test/perf.parquet', index=False)