In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

# 경고 뜨지 않게 설정
import warnings
warnings.filterwarnings('ignore')

# 그래프 설정
sns.set()

# 그래프 기본 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 학습 모델 저장을 위한 라이브러리
import pickle

from sklearn.metrics import confusion_matrix, classification_report

In [2]:
data_type = "train"
# month = "07"
# category = "청구입금정보"

# local
root_path = '../data/open'

# colab
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

drive_folder = f'{root_path}/{data_type}/4.청구입금정보/'

In [3]:
# 1. 데이터 불러오기
df = pd.read_parquet(f'{drive_folder}cleaned_청구정보.parquet')
df

# 2. 컬럼 분리
segment_cols = ['Segment_A', 'Segment_B', 'Segment_C', 'Segment_D', 'Segment_E']
X = df.drop(columns=['ID'] + segment_cols)
y = df[segment_cols].idxmax(axis=1)  # 'Segment_A' ~ 'Segment_E'

# ❗ 3. 문자열 라벨 인코딩
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # → 0~4 숫자

# 4. train/test 분리
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


In [4]:
# 5. 모델 학습
model = XGBClassifier(
    objective="multi:softmax",
    num_class=5,
    eval_metric="mlogloss",
    n_estimators=300,
    tree_method='hist',
    device='cuda',
    use_label_encoder=False,
    random_state=42,
)
model.fit(X_train, y_train)

In [10]:
# 6. 중요도 추출
importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
print("🎯 XGBoost Feature Importance - TOP 8")
print(importances.head(8))

🎯 XGBoost Feature Importance - TOP 8
청구금액_R6M          0.391430
할인건수_R3M          0.065162
포인트_적립포인트_R12M    0.043432
청구서수령방법           0.031252
청구서발송여부_B0        0.023568
포인트_이용포인트_R12M    0.022664
포인트_포인트_건별_R3M    0.022301
청구금액_R3M          0.021837
dtype: float32


In [12]:
# 1. 상위 8개 feature만 추출
top_8_features = importances.head(8).index.tolist()

In [14]:
# 2. X 기준과 행 일치 보장 + ID, 기준년월 포함해서 최종 컬럼 구성
df_selected = df.loc[X.index, ['ID', '기준년월'] + top_8_features]

In [16]:
# 4. 저장
df_selected.to_parquet(f'{drive_folder}xgb_top8_청구정보.parquet', index=False)

In [20]:
# 데이터 불러오기
df100 = pd.read_parquet(f'{drive_folder}xgb_top8_청구정보.parquet')
df100

Unnamed: 0,ID,기준년월,청구금액_R6M,할인건수_R3M,포인트_적립포인트_R12M,청구서수령방법,청구서발송여부_B0,포인트_이용포인트_R12M,포인트_포인트_건별_R3M,청구금액_R3M
0,TRAIN_000000,201807,88693,0,3460,2,1,4008,1827,46588
1,TRAIN_000001,201807,16861,0,0,0,1,0,0,10530
2,TRAIN_000002,201807,165221,0,17135,1,1,8312,5153,85931
3,TRAIN_000003,201807,127371,0,6464,0,1,8815,2523,61518
4,TRAIN_000004,201807,155,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
2399995,TRAIN_399995,201812,0,0,2379,0,0,6971,0,0
2399996,TRAIN_399996,201812,99849,0,64696,1,1,59260,10890,37515
2399997,TRAIN_399997,201812,41073,0,0,0,1,0,0,22274
2399998,TRAIN_399998,201812,0,0,0,0,0,0,0,0


In [134]:
# 데이터 불러오기
df200 = pd.read_parquet('../data/open/train/xgb_top8_All.parquet')
df200

Unnamed: 0,기준년월,ID,입회경과개월수_신용,_1순위카드이용금액,이용금액_R3M_신용체크,최종카드발급경과월,회원여부_이용가능_카드론,_1순위카드이용건수,연령,이용거절여부_카드론,...,컨택건수_CA_청구서_R6M,컨택건수_이용유도_인터넷_R6M,변동률_RVCA평잔,증감율_이용금액_일시불_전월,증감율_이용금액_체크_전월,변동률_잔액_CA_B1M,변동률_CA평잔,증감율_이용금액_CA_분기,증감율_이용건수_할부_전월,증감율_이용금액_CA_전월
0,201807,TRAIN_000000,67,3681,196,22,0,26,40,0,...,0,2,0.999998,0.313300,0.000000,0.000000,0.999700,0.000000,1.999996,0.0
1,201807,TRAIN_000001,12,13323,13475,18,1,46,30,0,...,2,6,0.999998,-1.010817,0.000000,0.000000,0.999998,0.000000,0.000000,0.0
2,201807,TRAIN_000002,124,24493,23988,20,0,28,30,0,...,0,3,0.999998,0.289565,0.000000,-0.014191,0.852567,-0.151124,-1.999996,0.0
3,201807,TRAIN_000003,27,5933,3904,17,0,1,40,0,...,0,0,0.999998,0.276947,0.000000,0.000000,0.999877,-0.067794,-1.999996,0.0
4,201807,TRAIN_000004,2,0,1190,15,1,-2,40,0,...,0,1,0.999998,0.000028,0.000000,0.000000,0.999998,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,201812,TRAIN_399995,209,5640,10755,39,1,3,70,0,...,0,0,0.999998,0.000027,-0.063101,0.000000,0.999998,0.000000,0.000000,0.0
2399996,201812,TRAIN_399996,17,26357,27636,24,1,38,50,0,...,0,0,0.999998,0.207099,0.000000,0.000000,0.999998,0.000000,0.000000,0.0
2399997,201812,TRAIN_399997,115,17171,23187,18,0,33,30,0,...,0,0,0.999998,0.229573,0.000000,0.000000,0.999998,0.000000,0.000000,0.0
2399998,201812,TRAIN_399998,71,0,0,27,1,-2,40,0,...,0,1,0.999998,0.000034,0.000000,0.000000,0.999998,0.000000,0.000000,0.0


In [138]:
df200.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400000 entries, 0 to 2399999
Data columns (total 71 columns):
 #   Column              Dtype  
---  ------              -----  
 0   기준년월                int32  
 1   ID                  object 
 2   입회경과개월수_신용          int32  
 3   _1순위카드이용금액          int32  
 4   이용금액_R3M_신용체크       int32  
 5   최종카드발급경과월           int32  
 6   회원여부_이용가능_카드론       int32  
 7   _1순위카드이용건수          int32  
 8   연령                  int32  
 9   이용거절여부_카드론          int32  
 10  Segment_A           bool   
 11  Segment_B           bool   
 12  Segment_C           bool   
 13  Segment_D           bool   
 14  Segment_E           bool   
 15  카드이용한도금액_A수준복합      int32  
 16  일시불ONLY전환가능여부       int32  
 17  RV실사용여부             int32  
 18  상향가능CA한도금액          int32  
 19  카드이용한도금액            int32  
 20  카드이용한도금액_B1M        int32  
 21  CA한도금액              int32  
 22  강제한도감액횟수_R12M       int32  
 23  정상청구원금_B5M          int32  
 24  최대이용금액_체크_R12M      int3