# 채널정보 EDA

In [None]:
import sys

# Make modules under ../utils importable from this notebook (presumably where
# `preprocessing`, `data_loader` and `feature` live -- confirm against the repo).
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate entries in sys.path.
utils_dir = '../utils'
if utils_dir not in sys.path:
    sys.path.append(utils_dir)

In [2]:
from sklearn import set_config
from sklearn.pipeline import Pipeline
from preprocessing import (
    DateElapsedTransformer,
    NumericFeaturePreprocessor,
    NumericTypeOptimizer,
    ObjectFeaturePreprocessor,
)

# One transformer instance per preprocessing stage. They are bound to
# module-level names so that later cells can refer to them.
pre_optimizer = NumericTypeOptimizer(mode='pre')
date_transformer = DateElapsedTransformer()
numeric_transformer = NumericFeaturePreprocessor()
object_transformer = ObjectFeaturePreprocessor()
post_optimizer = NumericTypeOptimizer(mode='post')

# (step name, transformer) pairs, listed in the order the pipeline runs them.
preprocessing_steps = [
    ('pre_optimizer', pre_optimizer),
    ('object', object_transformer),
    ('date', date_transformer),
    ('numeric', numeric_transformer),
    ('post_optimizer', post_optimizer),
]
preprocessing_pipeline = Pipeline(steps=preprocessing_steps)

# Render estimators as an HTML diagram instead of their plain-text repr.
set_config(display='diagram')

# Last expression of the cell: the notebook displays the pipeline diagram.
preprocessing_pipeline

In [3]:
from sklearn.preprocessing import LabelEncoder
from data_loader import load_data, load_segment

# Columns dropped before the frames are handed to the preprocessing pipeline.
key_columns = ['ID', '기준년월']

channel_df, channel_test_df = load_data("6.채널정보")

# Fit the pipeline on the training frame only, then reuse the fitted pipeline
# for the test frame. The dropped-column frames are passed inline so no extra
# reference to the raw feature frame is kept around.
X = preprocessing_pipeline.fit_transform(channel_df.drop(columns=key_columns))
X_test = preprocessing_pipeline.transform(channel_test_df.drop(columns=key_columns))

# Encode the segment labels as integers; `le` is kept so the mapping can be
# inverted later.
y = load_segment()
le = LabelEncoder()
y_encoded = le.fit_transform(y)

✅ File: ../../dataset/train/6.채널정보\201807_train_채널정보.parquet Completed!
✅ File: ../../dataset/train/6.채널정보\201808_train_채널정보.parquet Completed!
✅ File: ../../dataset/train/6.채널정보\201809_train_채널정보.parquet Completed!
✅ File: ../../dataset/train/6.채널정보\201810_train_채널정보.parquet Completed!
✅ File: ../../dataset/train/6.채널정보\201811_train_채널정보.parquet Completed!
✅ File: ../../dataset/train/6.채널정보\201812_train_채널정보.parquet Completed!
🔹 Shape : (2400000, 105)

✅ File: ../../dataset/test/6.채널정보\201807_test_채널정보.parquet Completed!
✅ File: ../../dataset/test/6.채널정보\201808_test_채널정보.parquet Completed!
✅ File: ../../dataset/test/6.채널정보\201809_test_채널정보.parquet Completed!
✅ File: ../../dataset/test/6.채널정보\201810_test_채널정보.parquet Completed!
✅ File: ../../dataset/test/6.채널정보\201811_test_채널정보.parquet Completed!
✅ File: ../../dataset/test/6.채널정보\201812_test_채널정보.parquet Completed!
🔹 Shape : (600000, 105)
Numeric Type Optimizer Transforming...
🧠 [mode=pre] 메모리 최적화: 2914.05 MB → 1362.23 MB (53.3% 감소)
Ob

## Feature Selection

In [4]:
from feature import FeatureSelector
from preprocessing import NumericTypeOptimizer

# Feature selection followed by a numeric dtype optimisation pass.
#
# A dedicated NumericTypeOptimizer is created for this pipeline instead of
# reusing `post_optimizer`. sklearn's Pipeline fits its steps in place (no
# cloning), and `post_optimizer` is already the last step of
# `preprocessing_pipeline`; refitting that shared instance here on the reduced
# feature set could overwrite whatever state the preprocessing pipeline relies
# on for later `transform` calls.
feature_engineering_pipeline = Pipeline([
    ('feature_selector', FeatureSelector()),
    ('optimize', NumericTypeOptimizer(mode='post')),
])

# Fit on the training features with the encoded labels, then apply the fitted
# selection to the test features.
X = feature_engineering_pipeline.fit_transform(X, y_encoded)
X_test = feature_engineering_pipeline.transform(X_test)

🎯 Fitting RandomForest for Feature Selection...
✅ Selected 24 features out of 82
📊 Top Selected Features by Importance:
1) 	인입일수_ARS_R6M (0.0329)
2) 	인입월수_ARS_R6M (0.0264)
3) 	인입일수_ARS_B0M (0.0131)
4) 	방문월수_PC_R6M (0.0528)
5) 	방문후경과월_PC_R6M (0.0490)
6) 	방문일수_앱_R6M (0.0359)
7) 	방문월수_앱_R6M (0.0314)
8) 	방문후경과월_앱_R6M (0.0280)
9) 	방문횟수_PC_B0M (0.0145)
10) 	방문일수_PC_B0M (0.0126)
11) 	방문횟수_앱_B0M (0.0211)
12) 	방문일수_앱_B0M (0.0155)
13) 	인입횟수_IB_R6M (0.0156)
14) 	상담건수_R6M (0.0199)
15) 	불만제기후경과월_R12M (0.0732)
16) 	당사멤버쉽_방문횟수_R6M (0.0323)
17) 	당사멤버쉽_방문월수_R6M (0.0466)
18) 	홈페이지_금융건수_R6M (0.0497)
19) 	홈페이지_선결제건수_R6M (0.0300)
20) 	홈페이지_금융건수_R3M (0.0371)
21) 	홈페이지_선결제건수_R3M (0.0156)
22) 	방문횟수_PC_R6M_1회 이상 (0.0145)
23) 	OS구분코드_IOS (0.0124)
24) 	OS구분코드_unknown (0.1526)




Numeric Type Optimizer Transforming...
🧠 [mode=post] 메모리 최적화: 109.86 MB → 57.22 MB (47.9% 감소)
Numeric Type Optimizer Transforming...
🧠 [mode=post] 메모리 최적화: 27.47 MB → 14.31 MB (47.9% 감소)




In [5]:
import pandas as pd


def _attach_keys(source_df: pd.DataFrame, features: pd.DataFrame) -> pd.DataFrame:
    """Return the ID / 기준년월 columns of *source_df* placed next to *features*.

    ``pd.concat(axis=1)`` aligns rows on the index with an outer join. If the
    index of *features* does not match the index of *source_df*, the result
    silently gains extra rows padded with NaN. The row-count check turns that
    situation into an explicit error before anything is written to disk.

    Raises:
        ValueError: if the combined frame does not have exactly as many rows
            as both inputs.
    """
    keys = source_df[['ID', '기준년월']]
    combined = pd.concat([keys, features], axis=1)
    if not len(combined) == len(keys) == len(features):
        raise ValueError(
            "Row mismatch while attaching key columns: "
            f"keys={len(keys)}, features={len(features)}, combined={len(combined)}. "
            "The feature frame's index does not line up with the source frame."
        )
    return combined


# Re-attach the identifier columns that were dropped before preprocessing.
train_df = _attach_keys(channel_df, X)
test_df = _attach_keys(channel_test_df, X_test)

# index=False: the row index is not stored in the parquet files.
train_df.to_parquet('../../dataset/train/channel.parquet', index=False)
test_df.to_parquet('../../dataset/test/channel.parquet', index=False)