In [30]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import optuna
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

In [2]:
# local
root_path = '../data/open/머신러닝'

# colab
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

In [3]:
# 데이터 불러오기
file_name = 'cleaned_vif_filtered' # 이 부분만 수정하여 모델1,2,3 똑같이 검정
train_df = pd.read_parquet(f'{root_path}/{file_name}.parquet')

In [4]:
# # 학습용 피처/타겟 정의
# X = train_df.drop(columns=['ID', '기준년월', 'Segment', 'Segment1'])
# y = train_df['Segment1']

In [5]:
# 타겟 레이블 재구성
def get_target_label(df):
    segment_columns = ['Segment_A', 'Segment_B', 'Segment_C', 'Segment_D', 'Segment_E']
    return df[segment_columns].idxmax(axis=1).str[-1]

In [6]:
# 타겟 생성
train_df['Segment'] = get_target_label(train_df)

In [7]:
# 학습용 피처/타겟 정의
X = train_df.drop(columns=['Segment_A', 'Segment_B', 'Segment_C', 'Segment_D', 'Segment_E', 'Segment', 'ID', '기준년월'])
y = train_df['Segment']

In [8]:
# 학습/검증 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### 하이퍼파라미터 튜닝 (Optuna)

In [16]:
def objective(trial):
    params = {
        # boosting 반복 횟수
        "iterations": trial.suggest_int("iterations", 300, 1000),
        # 한번에 얼마나 빠르게 학습하지지
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        # 트리의 깊이, 깊을수록 복잡한 모델이지만 과적합 가능성 O
        "depth": trial.suggest_int("depth", 4, 10), # --> 튜닝
        # L2 정규화 , 높을수록 규제 강화 - 과적합 억제
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        # 수치형 피쳐를 이산화 할 때 사용하는 경계 개수 -> 수치형 많으면 영향력 있음
        "border_count": trial.suggest_int("border_count", 32, 255),
        # 트리 분기시 f.i 계산에 노이즈를 얼마나 줄 것인가
        "random_strength": trial.suggest_float("random_strength", 0.1, 1),
        # 샘플링의 다양성 조절 -> 과소적합일 땐 높이고, 과적합일 땐 낮
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        # 손실 함수 : 다중분류
        "loss_function": "MultiClass",
        # 평가 지표
        "eval_metric": "TotalF1:average=Micro",
        # 학습 로그를 몇 step마다 보여줄지
        "verbose": 100,
        # 트리 분기 시 feature 중요도 계산에 주는 노이즈 크기
        "random_state": 42
    }
    # early_stopping_rounds - 지정된 라운드 동안 성능 개선이 되지 않으면 학습 중지
    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=100)

    preds = model.predict(X_val)
    f1 = f1_score(y_val, preds, average='micro')
    return f1

In [18]:
# Optuna 튜닝 시작
study = optuna.create_study(direction="maximize")
# XGB 때 n_trials=30 으로 했었는데 너무 오래걸려서 일단 10으로 설정
# n_trials는 optuna가 시도할 하이퍼파라미터 조합의 개수 (몇번 학습할건지)
study.optimize(objective, n_trials=3)

[I 2025-07-14 09:46:46,505] A new study created in memory with name: no-name-a75b81b8-5e61-4730-a5ee-a4f6596dbce5


0:	learn: 0.8301745	test: 0.8301500	best: 0.8301500 (0)	total: 943ms	remaining: 11m 48s
100:	learn: 0.8581635	test: 0.8577208	best: 0.8577396 (99)	total: 1m 27s	remaining: 9m 27s
200:	learn: 0.8612724	test: 0.8607792	best: 0.8608000 (199)	total: 2m 54s	remaining: 7m 58s
300:	learn: 0.8626995	test: 0.8619646	best: 0.8619646 (300)	total: 4m 20s	remaining: 6m 30s
400:	learn: 0.8636396	test: 0.8627542	best: 0.8627542 (400)	total: 5m 44s	remaining: 5m 1s
500:	learn: 0.8642240	test: 0.8632958	best: 0.8633375 (493)	total: 7m 7s	remaining: 3m 34s
600:	learn: 0.8647844	test: 0.8638333	best: 0.8638604 (595)	total: 8m 29s	remaining: 2m 7s
700:	learn: 0.8651833	test: 0.8642188	best: 0.8642417 (698)	total: 9m 52s	remaining: 43.1s
751:	learn: 0.8654063	test: 0.8643917	best: 0.8644042 (750)	total: 10m 34s	remaining: 0us

bestTest = 0.8644041667
bestIteration = 750

Shrink model to first 751 iterations.


[I 2025-07-14 09:57:35,218] Trial 0 finished with value: 0.8644041666666666 and parameters: {'iterations': 752, 'learning_rate': 0.056786138248062154, 'depth': 4, 'l2_leaf_reg': 5.922062488968508, 'border_count': 51, 'random_strength': 0.9215326271654452, 'bagging_temperature': 0.6873423807269551}. Best is trial 0 with value: 0.8644041666666666.


0:	learn: 0.8408026	test: 0.8402500	best: 0.8402500 (0)	total: 1.08s	remaining: 13m 48s
100:	learn: 0.8592667	test: 0.8587771	best: 0.8587771 (100)	total: 1m 51s	remaining: 12m 18s
200:	learn: 0.8621573	test: 0.8615063	best: 0.8615063 (200)	total: 3m 37s	remaining: 10m 17s
300:	learn: 0.8636891	test: 0.8630000	best: 0.8630000 (300)	total: 5m 22s	remaining: 8m 22s
400:	learn: 0.8647146	test: 0.8637917	best: 0.8638042 (392)	total: 7m 5s	remaining: 6m 32s
500:	learn: 0.8654469	test: 0.8644792	best: 0.8644792 (500)	total: 8m 46s	remaining: 4m 43s
600:	learn: 0.8662177	test: 0.8650625	best: 0.8650625 (600)	total: 10m 29s	remaining: 2m 58s
700:	learn: 0.8668495	test: 0.8655875	best: 0.8656083 (699)	total: 12m 12s	remaining: 1m 13s
770:	learn: 0.8672734	test: 0.8659854	best: 0.8659854 (770)	total: 13m 25s	remaining: 0us

bestTest = 0.8659854167
bestIteration = 770



[I 2025-07-14 10:11:16,301] Trial 1 finished with value: 0.8659854166666666 and parameters: {'iterations': 771, 'learning_rate': 0.038095700753423875, 'depth': 6, 'l2_leaf_reg': 4.524545678387907, 'border_count': 57, 'random_strength': 0.20511204277041797, 'bagging_temperature': 0.2662357902949276}. Best is trial 1 with value: 0.8659854166666666.


0:	learn: 0.8421380	test: 0.8421458	best: 0.8421458 (0)	total: 2.57s	remaining: 14m 55s
100:	learn: 0.8585755	test: 0.8579187	best: 0.8579187 (100)	total: 4m 15s	remaining: 10m 27s
200:	learn: 0.8615063	test: 0.8608083	best: 0.8608083 (200)	total: 8m 26s	remaining: 6m 12s
300:	learn: 0.8632318	test: 0.8623771	best: 0.8623771 (300)	total: 13m 11s	remaining: 2m 6s
348:	learn: 0.8638562	test: 0.8628833	best: 0.8628833 (348)	total: 15m 26s	remaining: 0us

bestTest = 0.8628833333
bestIteration = 348



[I 2025-07-14 10:26:56,952] Trial 2 finished with value: 0.8628833333333333 and parameters: {'iterations': 349, 'learning_rate': 0.0257227144878083, 'depth': 7, 'l2_leaf_reg': 1.7523413142844229, 'border_count': 91, 'random_strength': 0.988590812305191, 'bagging_temperature': 0.14566319491569157}. Best is trial 1 with value: 0.8659854166666666.


In [20]:
# 결과 출력
print("Best F1 score:", study.best_value)
print("Best parameters:", study.best_params)

Best F1 score: 0.8659854166666666
Best parameters: {'iterations': 771, 'learning_rate': 0.038095700753423875, 'depth': 6, 'l2_leaf_reg': 4.524545678387907, 'border_count': 57, 'random_strength': 0.20511204277041797, 'bagging_temperature': 0.2662357902949276}


In [22]:
# 최적 파라미터로 모델 학습
# Optuna가 튜닝한 것이 아닌 것들은 다시 넣어줘야함.
# Optuna에 넣은 이유는 기본적으로 들어있어야 하기 때문에
best_params = study.best_params
best_params.update({
    "iterations": 500,
    "loss_function": "MultiClass",
    "eval_metric": "TotalF1:average=Micro",
    "random_state": 42,
    "verbose": 100
})

In [24]:
# 실제 모델
model = CatBoostClassifier(**best_params)
model.fit(X_train, y_train,
          eval_set=(X_val, y_val),
          early_stopping_rounds=30)

0:	learn: 0.8408026	test: 0.8402500	best: 0.8402500 (0)	total: 1.42s	remaining: 11m 48s
100:	learn: 0.8592667	test: 0.8587771	best: 0.8587771 (100)	total: 2m 16s	remaining: 9m
200:	learn: 0.8621573	test: 0.8615063	best: 0.8615063 (200)	total: 4m 3s	remaining: 6m 2s
300:	learn: 0.8636891	test: 0.8630000	best: 0.8630000 (300)	total: 5m 48s	remaining: 3m 50s
400:	learn: 0.8647146	test: 0.8637917	best: 0.8638042 (392)	total: 7m 31s	remaining: 1m 51s
499:	learn: 0.8654417	test: 0.8644729	best: 0.8644729 (491)	total: 9m 12s	remaining: 0us

bestTest = 0.8644729167
bestIteration = 491

Shrink model to first 492 iterations.


<catboost.core.CatBoostClassifier at 0x1324113b740>

In [26]:
# 실제 클래스별 개수

# 클래스 라벨 (예: LabelEncoder 사용한 경우)
labels = ['A', 'B', 'C', 'D', 'E']

true_counts = pd.Series(y_val).value_counts().reindex(labels, fill_value=0)
print("\n실제 클래스별 총 개수 (support):")
for label in labels:
    print(f"{label}: {true_counts[label]:,}개")


실제 클래스별 총 개수 (support):
A: 194개
B: 29개
C: 25,518개
D: 69,848개
E: 384,411개


In [44]:
# 예측 및 성능 출력
preds = model.predict(X_val)
print("튜닝 후 Micro F1:", f1_score(y_val, preds, average='micro'))

# 혼동행렬
# 혼동행렬을 DataFrame으로 보기 좋게 출력
cm = confusion_matrix(y_val, y_val, labels=labels)
cm_df = pd.DataFrame(cm, index=[f"실제_{l}" for l in labels], columns=[f"예측_{l}" for l in labels])

print("📊 Confusion Matrix")
display(cm_df)

튜닝 후 Micro F1: 0.8644729166666667
📊 Confusion Matrix


Unnamed: 0,예측_A,예측_B,예측_C,예측_D,예측_E
실제_A,194,0,0,0,0
실제_B,0,29,0,0,0
실제_C,0,0,25518,0,0
실제_D,0,0,0,69848,0
실제_E,0,0,0,0,384411


In [38]:
print(classification_report(y_val, y_val, target_names=['A', 'B', 'C', 'D', 'E']))

              precision    recall  f1-score   support

           A       1.00      1.00      1.00       194
           B       1.00      1.00      1.00        29
           C       1.00      1.00      1.00     25518
           D       1.00      1.00      1.00     69848
           E       1.00      1.00      1.00    384411

    accuracy                           1.00    480000
   macro avg       1.00      1.00      1.00    480000
weighted avg       1.00      1.00      1.00    480000

