In [2]:
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import gc

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

### 데이터 세팅

In [3]:
# Read the saved train and test tables from parquet (train first, then test).
train_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "./saved_data/train_df.parquet",
        "./saved_data/test_df.parquet",
    )
)

In [4]:
from sklearn.preprocessing import LabelEncoder

# 1. Split into features and target: every column except "ID" and "Segment"
#    is used as a feature; "Segment" is the target.
feature_cols = list(
    filter(lambda name: name not in ("ID", "Segment"), train_df.columns)
)
X = train_df.loc[:, feature_cols].copy()
y = train_df.loc[:, "Segment"].copy()

# 2. Encode the target labels as integers. The fitted encoder is kept so that
#    class names / inverse mapping are available later.
le_target = LabelEncoder().fit(y)
y_encoded = le_target.transform(y)

#### 데이터 정리

In [5]:
# Integer-encode every object-dtype feature: fit one LabelEncoder per column on
# the train data and reuse it for the test data.
#
# The categorical columns are detected from (and the encoders fitted on) the
# untouched `train_df`, not `X`. `X` is overwritten with integer codes below,
# so detecting from `X` would find no object columns on a second run of this
# cell and would leave the freshly copied `X_test` unencoded.
categorical_features = [
    col for col in feature_cols if pd.api.types.is_object_dtype(train_df[col])
]

X_test = test_df.copy()

encoders = {}  # fitted LabelEncoder for each categorical column

for col in categorical_features:
    le_train = LabelEncoder()
    X[col] = le_train.fit_transform(train_df[col])
    encoders[col] = le_train

    # The test data may contain labels that never occur in train
    # (e.g. train only has "A" and "B" but test also has "C").
    unseen_labels = set(X_test[col]) - set(le_train.classes_)
    if unseen_labels:
        # Append in a fixed order: iterating a set of strings is not stable
        # across interpreter runs, which would give unseen labels different
        # integer codes from one run to the next. key=str keeps the sort from
        # failing if the leftovers are not all of one type.
        le_train.classes_ = np.append(
            le_train.classes_, sorted(unseen_labels, key=str)
        )
    X_test[col] = le_train.transform(X_test[col])

# Run the garbage collector to release memory that is no longer referenced.
gc.collect()

212

In [6]:
# Hold out 20% of the training rows as a validation set. Stratifying on the
# encoded target is requested so both parts keep the class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    stratify=y_encoded,
    test_size=0.2,
    random_state=42,
)

### Train model 1 :: XGBoost (X)

In [9]:
import xgboost as xgb

# Baseline XGBoost classifier: train on the GPU when possible, otherwise on CPU.
#
# Why fit() is inside the try: building an XGBClassifier only records its
# parameters, so a GPU problem can only surface once training starts. With the
# try wrapped around the constructor alone, the CPU branch was effectively
# unreachable and "GPU 사용" was printed unconditionally.
#
# `tree_method="hist"` + `device="cuda"` is the spelling the library's own
# deprecation warning recommends in place of `tree_method='gpu_hist'` / `gpu_id`.
# NOTE(review): depending on the XGBoost build, a missing GPU may only emit a
# warning and train on CPU instead of raising — confirm on the target machine.
try:
    xgb_model = xgb.XGBClassifier(
        tree_method="hist",
        device="cuda",
        random_state=42,
    )
    xgb_model.fit(X_train, y_train)
    print("XGBoost: GPU 사용")
except xgb.core.XGBoostError as gpu_error:
    # Only XGBoost's own errors trigger the fallback; anything else
    # (e.g. KeyboardInterrupt, bad input data) still propagates.
    print(f"XGBoost GPU training failed, falling back to CPU: {gpu_error}")
    xgb_model = xgb.XGBClassifier(random_state=42)
    xgb_model.fit(X_train, y_train)
    print("XGBoost: CPU 사용")

y_pred_xgb = xgb_model.predict(X_val)

# Evaluate on the validation split.
print("🎯 XGBoost 성능:")
print("Accuracy:", accuracy_score(y_val, y_pred_xgb))
print(classification_report(y_val, y_pred_xgb, target_names=le_target.classes_))


XGBoost: GPU 사용



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




🎯 XGBoost 성능:
Accuracy: 0.9230958333333333
              precision    recall  f1-score   support

           A       0.96      0.68      0.79       194
           B       1.00      0.52      0.68        29
           C       0.79      0.69      0.74     25518
           D       0.77      0.73      0.75     69848
           E       0.96      0.97      0.97    384411

    accuracy                           0.92    480000
   macro avg       0.89      0.72      0.78    480000
weighted avg       0.92      0.92      0.92    480000



#### XGBoost에서 중요한 feature 뽑아보기

In [12]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import platform

# 1. Choose a default Korean-capable font by operating system:
#    Windows -> Malgun Gothic, macOS (Darwin) -> AppleGothic,
#    anything else (Linux etc.) -> NanumGothic, which may need to be installed.
plt.rc(
    'font',
    family={'Windows': 'Malgun Gothic', 'Darwin': 'AppleGothic'}.get(
        platform.system(), 'NanumGothic'
    ),
)

# 2. Keep the minus sign from rendering as a broken glyph with these fonts.
plt.rcParams.update({'axes.unicode_minus': False})

In [19]:
import matplotlib.pyplot as plt
import pandas as pd

# Features whose importance is at or below this value are dropped.
# (The earlier comment said 0.01, but the filter has always used 0.001.)
IMPORTANCE_THRESHOLD = 0.001

# 1. Importance scores from the fitted XGBoost model.
importances = xgb_model.feature_importances_

# 2. Pair each score with its feature name, most important first.
feature_importance_df = pd.DataFrame({
    "feature": X_train.columns,
    "importance": importances
}).sort_values(by="importance", ascending=False)

# 3. Keep only rows above the threshold. The name `nonzero_importance_df` is
#    kept because the feature-selection cell below reads it, even though the
#    cut-off is IMPORTANCE_THRESHOLD rather than zero.
nonzero_importance_df = feature_importance_df[
    feature_importance_df["importance"] > IMPORTANCE_THRESHOLD
].copy()

# Show how many features survived, then the table itself.
print(nonzero_importance_df.shape)
nonzero_importance_df

(219, 2)


Unnamed: 0,feature,importance
495,정상청구원금_B5M,0.110704
399,이용금액_오프라인_B0M,0.043821
504,연속유실적개월수_기본_24M_카드,0.036203
41,이용금액_R3M_신용체크,0.030365
139,이용금액_체크_B0M,0.023127
...,...,...
603,RV_평균잔액_R6M,0.001025
148,이용후경과월_CA,0.001025
814,증감율_이용건수_체크_전월,0.001021
817,증감율_이용금액_신판_전월,0.001017


#### 재훈련

In [20]:
# Restrict the full training feature frame `X` to the features that passed
# the importance filter.
selected_features = list(nonzero_importance_df["feature"])
X_selected = X.loc[:, selected_features]

In [21]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Split the reduced feature set with the same test_size / random_state /
# stratify arguments as the main train/validation split.
X_train_sel, X_val_sel, y_train_sel, y_val_sel = train_test_split(
    X_selected, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Retrain XGBoost on the selected features, preferring the GPU.
#
# fit() sits inside the try because building an XGBClassifier only records its
# parameters; a GPU problem can only surface during training. With the try
# around the constructor alone, the CPU branch was effectively unreachable.
# `tree_method="hist"` + `device="cuda"` replaces the deprecated
# `tree_method='gpu_hist'` / `gpu_id`, as the library's warning recommends.
# NOTE(review): depending on the XGBoost build, a missing GPU may only emit a
# warning and train on CPU instead of raising — confirm on the target machine.
try:
    model_sel = xgb.XGBClassifier(
        tree_method="hist",
        device="cuda",
        random_state=42,
    )
    print("XGBoost(GPU) 모델로 학습 시작")
    model_sel.fit(X_train_sel, y_train_sel)
except xgb.core.XGBoostError as gpu_error:
    # Only XGBoost's own errors trigger the fallback; other exceptions propagate.
    print(f"XGBoost GPU training failed, falling back to CPU: {gpu_error}")
    model_sel = xgb.XGBClassifier(random_state=42)
    print("XGBoost(CPU) 모델로 학습 시작")
    model_sel.fit(X_train_sel, y_train_sel)

y_pred_sel = model_sel.predict(X_val_sel)

# Evaluate on the validation split.
print("🎯 XGBoost(선택된 변수 기반) 성능:")
print("Accuracy:", accuracy_score(y_val_sel, y_pred_sel))
print(classification_report(y_val_sel, y_pred_sel, target_names=le_target.classes_))


XGBoost(GPU) 모델로 학습 시작



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



🎯 XGBoost(선택된 변수 기반) 성능:
Accuracy: 0.9218854166666667
              precision    recall  f1-score   support

           A       0.96      0.66      0.78       194
           B       1.00      0.48      0.65        29
           C       0.78      0.69      0.73     25518
           D       0.77      0.72      0.74     69848
           E       0.96      0.97      0.96    384411

    accuracy                           0.92    480000
   macro avg       0.89      0.71      0.77    480000
weighted avg       0.92      0.92      0.92    480000



### Train model 2 :: LightGBM (X)

In [22]:
import lightgbm as lgb

# LightGBM baseline: only the random seed is set; no device option is passed,
# so every other setting is the library default.
lgb_model = lgb.LGBMClassifier(random_state=42)
y_pred_lgb = lgb_model.fit(X_train, y_train).predict(X_val)

# Evaluate on the validation split.
print("🎯 LightGBM 성능:")
print("Accuracy:", accuracy_score(y_true=y_val, y_pred=y_pred_lgb))
print(classification_report(y_true=y_val, y_pred=y_pred_lgb, target_names=le_target.classes_))

# Recorded result of this run: accuracy 0.89410625.
# XGBoost did better on the same validation split.

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.125407 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83570
[LightGBM] [Info] Number of data points in the train set: 1920000, number of used features: 737
[LightGBM] [Info] Start training from score -7.811109
[LightGBM] [Info] Start training from score -9.722904
[LightGBM] [Info] Start training from score -2.934402
[LightGBM] [Info] Start training from score -1.927457
[LightGBM] [Info] Start training from score -0.222076
🎯 LightGBM 성능:
Accuracy: 0.89410625
              precision    recall  f1-score   support

           A       0.10      0.06      0.08       194
           B       0.00      0.00      0.00        29
           C       0.71      0.60      0.65     25518
           D       0.68      0.63      0.66     69848
           E       0.94      0.96      0.95    384411

    accuracy                           0.89    480000
   macro avg       0.

### Train model 3 :: CatBoost (성능 가장 좋음)

하이퍼파라미터 튜닝

In [7]:
from catboost import CatBoostClassifier
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import numpy as np
import random

In [9]:
import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Inner train/validation split used only by Optuna.
#
# It is carved out of the *training* part (X_train / y_train). Splitting the
# full `X` with the same seed and arguments as the outer split would reproduce
# exactly the outer split, so hyperparameters would be tuned on the very rows
# (`X_val`) that are later used to report the tuned model's accuracy.
X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

def objective(trial):
    """Optuna objective for CatBoost hyperparameter search.

    Draws one hyperparameter combination from `trial`, fits a
    CatBoostClassifier on the notebook-level `X_train_sub` / `y_train_sub`,
    and scores it on `X_val_sub` / `y_val_sub`.

    Args:
        trial: the Optuna trial used to suggest hyperparameter values.

    Returns:
        Accuracy of the fitted model on the tuning validation split.
    """
    # Values searched by Optuna (suggest calls kept in their original order).
    search_space = {
        'iterations': trial.suggest_categorical('iterations', [100, 200, 300, 500]),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'depth': trial.suggest_int('depth', 2, 10),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 15),
        'border_count': trial.suggest_categorical('border_count', [32, 64, 128]),
        'random_strength': trial.suggest_int('random_strength', 1, 20),
    }
    # Settings held constant across all trials.
    fixed_settings = {
        'task_type': 'GPU',
        'verbose': 0,
        'random_state': 42,
    }

    clf = CatBoostClassifier(**search_space, **fixed_settings)
    clf.fit(X_train_sub, y_train_sub)
    return accuracy_score(y_val_sub, clf.predict(X_val_sub))

In [None]:
# Run the hyperparameter search (30 trials, maximizing validation accuracy).
# The sampler is seeded so the sequence of suggested hyperparameters is the
# same on every run; without a seed each run explores different candidates.
# NOTE(review): the scores themselves may still vary slightly between runs if
# GPU training is not bit-for-bit deterministic — confirm if exact
# reproducibility matters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30, show_progress_bar=True)
print("✅ Best Trial:")
print(study.best_trial.params)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
# Final training with the best hyperparameters found by Optuna.
#
# `study.best_trial.params` holds only the values drawn through
# `trial.suggest_*`. The settings that `objective` fixes for every trial
# (task_type, verbose, random_state) are not part of it, so they are passed
# again here; otherwise the final model would be trained with a different
# configuration (library-default device, logging and seed) than the trials.
best_params = study.best_trial.params
best_model = CatBoostClassifier(
    **best_params,
    task_type='GPU',
    verbose=0,
    random_state=42,
)
best_model.fit(X_train, y_train)

# Check performance on the validation split.
y_pred = best_model.predict(X_val)
print("🎯 최적 CatBoost 성능:")
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred, target_names=le_target.classes_))


In [None]:
# Predict with the tuned model.
# This cell used to read `random_search.best_estimator_`, but no
# `random_search` object is created anywhere in this notebook, so it raised
# NameError on a fresh run. The tuned model is the Optuna-based `best_model`.
# NOTE(review): this now repeats the evaluation of the tuning cell; drop the
# cell if the `best_cat_model` name is not needed elsewhere.
best_cat_model = best_model
y_pred = best_cat_model.predict(X_val)

print("🎯 CatBoost 최적화 성능:")
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred, target_names=le_target.classes_))

Train

In [23]:
from catboost import CatBoostClassifier

# CatBoost with default hyperparameters: GPU training, fixed seed, and
# verbose=0 to keep the training log quiet.
cat_model = CatBoostClassifier(task_type="GPU", random_state=42, verbose=0)
y_pred_cat = cat_model.fit(X_train, y_train).predict(X_val)

# Evaluate on the validation split.
print("🎯 CatBoost 성능:")
print("Accuracy:", accuracy_score(y_true=y_val, y_pred=y_pred_cat))
print(classification_report(y_true=y_val, y_pred=y_pred_cat, target_names=le_target.classes_))


🎯 CatBoost 성능:
Accuracy: 0.934975
              precision    recall  f1-score   support

           A       0.97      0.77      0.86       194
           B       0.88      0.76      0.81        29
           C       0.82      0.74      0.78     25518
           D       0.81      0.77      0.79     69848
           E       0.96      0.98      0.97    384411

    accuracy                           0.93    480000
   macro avg       0.89      0.80      0.84    480000
weighted avg       0.93      0.93      0.93    480000



### Prediction :: Test dataset

In [1]:
# NOTE(review): this whole cell is commented out. Before re-enabling it:
#   - `model` is not assigned anywhere in this notebook; replace it with one of
#     the fitted models (e.g. `cat_model`).
#   - the in-place drop of 'ID' presumably fails if the cell is run twice
#     without rebuilding `X_test` — consider a non-in-place drop.
# # Prepare the data
# X_test.drop(columns=['ID'],inplace=True)

# # Run row-level prediction
# y_test_pred = model.predict(X_test)

# # Convert the predictions
# ## Map the integer-encoded Segment classes (LabelEncoder) back to the original string labels
# ## e.g. 0 → 'A', 1 → 'B', 2 → 'C'
# y_test_pred_labels = le_target.inverse_transform(y_test_pred)


# # Attach the row-level predictions to test_data
# test_data = test_df.copy()  # keep the original intact
# test_data["pred_label"] = y_test_pred_labels

In [13]:
# test_data

Unnamed: 0,기준년월,ID,남녀구분코드,연령,회원여부_이용가능,회원여부_이용가능_CA,회원여부_이용가능_카드론,소지여부_신용,소지카드수_유효_신용,소지카드수_이용가능_신용,...,변동률_할부평잔,변동률_CA평잔,변동률_RVCA평잔,변동률_카드론평잔,변동률_잔액_B1M,변동률_잔액_일시불_B1M,변동률_잔액_CA_B1M,혜택수혜율_R3M,혜택수혜율_B0M,pred_label
0,201807,TEST_00000,1,40대,1,1,0,1,2,2,...,0.999998,0.999998,0.999998,0.999998,0.209395,0.231043,0.0,1.332770,1.780392,E
1,201807,TEST_00001,1,60대,1,1,0,1,1,1,...,1.044473,1.991974,0.999998,0.926569,-0.269161,-0.247241,0.0,0.000000,0.000000,E
2,201807,TEST_00002,1,40대,1,1,1,1,2,2,...,1.053083,0.999998,0.999998,0.999998,-0.120290,0.029270,0.0,4.123738,5.115589,D
3,201807,TEST_00003,2,40대,1,1,1,1,1,1,...,1.991630,0.999998,0.999998,0.999998,0.035807,-0.013359,0.0,0.093615,0.349994,E
4,201807,TEST_00004,2,40대,1,0,1,1,1,1,...,1.053743,0.999998,0.999998,0.999998,-0.538740,-0.449378,0.0,0.000000,0.000000,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,201812,TEST_99995,2,60대,0,0,0,0,0,0,...,0.999998,0.999998,0.999998,0.999998,0.000000,0.000000,0.0,,,E
599996,201812,TEST_99996,1,30대,1,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.143554,0.233616,0.0,8.564683,11.379632,E
599997,201812,TEST_99997,2,30대,1,1,1,1,1,1,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,,,E
599998,201812,TEST_99998,1,30대,1,1,1,1,3,3,...,0.333183,0.999998,0.999998,0.999998,-0.038153,-0.106142,0.0,-0.241530,0.499079,C


### Submission

In [None]:
# NOTE(review): this cell is commented out and depends on `test_data`, which is
# only created by the (also commented-out) prediction cell.
# Collapse the row-level predictions to one label per ID: for each ID, take the
# most frequent `pred_label` among its rows.
# submission = test_data.groupby("ID")["pred_label"] \
#     .agg(lambda x: x.value_counts().idxmax()) \
#     .reset_index()

# Rename the columns to ID / Segment.
# submission.columns = ["ID", "Segment"]

# Write the result to CSV without the index column.
# submission.to_csv('./base_submit.csv',index=False)