In [None]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
# Dataset splits and the month codes (July through December) covered by the data.
data_splits = ["train", "test"]
months = [f"{month:02d}" for month in range(7, 13)]

# Per-category lookup: source folder name, file-name suffix, and variable prefix.
# Each row below is (category key, folder, variable prefix); the file suffix is
# always identical to the category key.
data_categories = {
    category: {"folder": folder_name, "suffix": category, "var_prefix": prefix}
    for category, folder_name, prefix in (
        ("회원정보", "1.회원정보", "customer"),
        ("신용정보", "2.신용정보", "credit"),
        ("승인매출정보", "3.승인매출정보", "sales"),
        ("청구정보", "4.청구입금정보", "billing"),
        ("잔액정보", "5.잔액정보", "balance"),
        ("채널정보", "6.채널정보", "channel"),
        ("마케팅정보", "7.마케팅정보", "marketing"),
        ("성과정보", "8.성과정보", "performance"),
    )
}

# Data root when running locally.
root_path = '../data/open'

# Data root when running on Colab (swap with the line above).
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

#### 머신러닝 돌릴 월, 카테고리 선택 ⚠️ 이 부분을 꼭 확인해서 설정한 뒤에 실행!!

In [None]:
# Category to process in this run; must be one of the keys of data_categories.
select_category = "회원정보"

# Pull the folder name and file suffix registered for the chosen category.
folder, suffix = (
    data_categories[select_category][field] for field in ("folder", "suffix")
)

In [None]:
def get_top_8_features(model, X, y, top_n=8):
    """Fit ``model`` on ``(X, y)`` and return its most important feature names.

    Parameters
    ----------
    model : estimator
        Object providing ``fit(X, y)`` and, once fitted, a
        ``feature_importances_`` attribute with one value per column of ``X``.
        The model is fitted in place.
    X : pandas.DataFrame
        Feature matrix; its column labels are the names that get returned.
    y : array-like
        Target values, passed straight through to ``model.fit``.
    top_n : int, default 8
        Maximum number of feature names to return. The default keeps the
        behavior the function name advertises.

    Returns
    -------
    list
        At most ``top_n`` column labels of ``X``, ordered by decreasing
        importance. Features with equal importance keep the column order
        of ``X``.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError(f"top_n must be a positive integer, got {top_n!r}")

    model.fit(X, y)
    importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_,
    # A stable sort keeps tied importances (e.g. several zero-importance
    # features) in column order, so the selection is reproducible.
    }).sort_values(by='importance', ascending=False, kind='mergesort')
    return importance_df['feature'].head(top_n).tolist()

In [None]:
### 1. Load the data
# Cleaned training-split parquet file of the selected category.
file_name = f"{root_path}/{data_splits[0]}/{folder}/cleaned_{suffix}.parquet"
df = pd.read_parquet(file_name)

### 2. Collapse the one-hot segment columns into a single label
# idxmax(axis=1) gives, per row, the name of the Segment_* column holding the
# largest value; that column name is then mapped to an integer code 0-4.
df['Segment'] = (
    df[['Segment_A', 'Segment_B', 'Segment_C', 'Segment_D', 'Segment_E']]
    .idxmax(axis=1)
    .map({
        'Segment_A': 0,
        'Segment_B': 1,
        'Segment_C': 2,
        'Segment_D': 3,
        'Segment_E': 4,
    })
)

### 3. Choose the feature columns
# Everything except the identifier columns and every form of the label is a feature.
exclude_cols = [
    'ID', '기준년월',
    'Segment_A', 'Segment_B', 'Segment_C', 'Segment_D', 'Segment_E',
    'Segment',
]
target = 'Segment'  # single merged target
feature_cols = [name for name in df.columns if name not in exclude_cols]

X, y = df[feature_cols], df[target]

### 4. Take the 8 features with the highest importance
xgb_top8 = get_top_8_features(
    XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    X,
    y,
)
# Alternative models, currently disabled:
#lgb_top8 = get_top_8_features(LGBMClassifier(), X, y)
#cat_top8 = get_top_8_features(CatBoostClassifier(verbose=0), X, y)

In [None]:
# Save the reduced feature set

meta_cols = ['ID', '기준년월']
seg_cols = ['Segment_A', 'Segment_B', 'Segment_C', 'Segment_D', 'Segment_E']

# The one-hot segment columns are appended only when the selected suffix is
# "회원정보"; every other category is saved with identifiers and top features only.
df_xgb = df[meta_cols + xgb_top8 + (seg_cols if suffix == "회원정보" else [])]
#df_lgb = df[meta_cols + lgb_top8]
#df_cat = df[meta_cols + cat_top8]

# Written into the same split/category folder the cleaned input was read from.
output_path = f'{root_path}/{data_splits[0]}/{folder}'
df_xgb.to_parquet(f'{output_path}/xgb_top8_{suffix}.parquet')
#df_lgb.to_parquet(f'{output_path}/lgb_top8_{suffix}.parquet')
#df_cat.to_parquet(f'{output_path}/cat_top8_{suffix}.parquet')

In [None]:
# Show df_xgb as the cell output to inspect the columns that were selected.
df_xgb