In [1]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import optuna

In [2]:
# Data directory when running locally.
root_path = "../data/open/머신러닝"

# Data directory when running on Colab; uncomment to use it instead.
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

In [3]:
# Load the training data from a parquet file under root_path.
# Change only file_name to run the same evaluation for models 1, 2 and 3.
file_name = "201812_add_segment3"
train_df = pd.read_parquet("{}/{}.parquet".format(root_path, file_name))

In [4]:
# Target for training: the 'Segment1' column.
y = train_df['Segment1']
# Features for training: every column except the four listed here
# (the target itself is among them, so it cannot leak into X).
X = train_df.drop(['ID', '기준년월', 'Segment', 'Segment1'], axis=1)

In [5]:
# Hold out 20% of the rows for validation; stratify on y so both parts keep
# the same class proportions, and fix the seed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### 하이퍼파라미터 튜닝 (Optuna)

In [57]:
def objective(trial):
    """Optuna objective: fit one CatBoost candidate and return its validation micro-F1.

    Reads the notebook-level X_train, y_train, X_val and y_val.

    The loss function and evaluation metric match the ones used for the final
    model ("MultiClass" / "TotalF1:average=Micro"), so the early-stopping
    signal, the value returned to Optuna and the final training all optimise
    the same quantity.

    Args:
        trial: the optuna trial used to sample hyperparameters.

    Returns:
        Micro-averaged F1 of the fitted model's predictions on (X_val, y_val).

    Note:
        The validation set is used both for early stopping and for the
        returned score, so the score can be optimistic.
    """
    params = {
        # Maximum number of boosting iterations (early stopping may end sooner).
        "iterations": trial.suggest_int("iterations", 300, 2000),
        # Step size of each boosting update.
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        # Tree depth: deeper trees give a more complex model but overfit more easily.
        "depth": trial.suggest_int("depth", 4, 10),
        # L2 regularisation: higher values regularise more and curb overfitting.
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        # Number of borders used to discretise numeric features; matters more
        # when there are many numeric features.
        "border_count": trial.suggest_int("border_count", 32, 255),
        # Amount of randomness added when scoring candidate splits.
        "random_strength": trial.suggest_float("random_strength", 0.1, 1),
        # Controls sampling diversity: raise when underfitting, lower when overfitting.
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        # Loss function: same as the final model.
        "loss_function": "MultiClass",
        # Evaluation metric (drives early stopping): micro-F1, same as the final model.
        "eval_metric": "TotalF1:average=Micro",
        # Random seed, fixed for reproducibility.
        "random_state": 42,
    }

    model = CatBoostClassifier(**params)
    # early_stopping_rounds: stop when the eval metric has not improved for
    # this many rounds. verbose=100 prints a log line every 100 iterations; it
    # is passed here only, not duplicated in the constructor params.
    model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        early_stopping_rounds=50,
        verbose=100,
    )

    preds = model.predict(X_val)
    return f1_score(y_val, preds, average='micro')

In [51]:
# Start the Optuna search. The sampler is seeded so that re-running the
# notebook proposes the same sequence of hyperparameter combinations.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# n_trials is the number of hyperparameter combinations Optuna evaluates
# (one training run per trial).
# NOTE(review): an earlier note said this was lowered to 10 because 30 trials
# were too slow for the XGB run, but the code passes 30 - confirm which is intended.
study.optimize(objective, n_trials=30)

[I 2025-07-10 13:51:26,214] A new study created in memory with name: no-name-c56de6fd-ed12-4200-a399-a8ad94ed4194
[I 2025-07-10 13:51:26,421] Trial 0 finished with value: 0.8947368421052632 and parameters: {'learning_rate': 0.01612643468710046, 'depth': 3, 'l2_leaf_reg': 3.3428732594393424}. Best is trial 0 with value: 0.8947368421052632.


0:	learn: 0.8716216	test: 0.8684211	best: 0.8684211 (0)	total: 896us	remaining: 268ms
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.8947368421
bestIteration = 1

Shrink model to first 2 iterations.


[I 2025-07-10 13:51:26,723] Trial 1 finished with value: 0.8947368421052632 and parameters: {'learning_rate': 0.029433345943574094, 'depth': 5, 'l2_leaf_reg': 4.646977783805227}. Best is trial 0 with value: 0.8947368421052632.


0:	learn: 0.9121622	test: 0.8947368	best: 0.8947368 (0)	total: 7.01ms	remaining: 2.1s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.8947368421
bestIteration = 0

Shrink model to first 1 iterations.


[I 2025-07-10 13:51:27,004] Trial 2 finished with value: 0.8947368421052632 and parameters: {'learning_rate': 0.027997124683200057, 'depth': 4, 'l2_leaf_reg': 2.684219327899981}. Best is trial 0 with value: 0.8947368421052632.


0:	learn: 0.9121622	test: 0.8947368	best: 0.8947368 (0)	total: 6.11ms	remaining: 1.83s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.8947368421
bestIteration = 0

Shrink model to first 1 iterations.


[I 2025-07-10 13:51:27,279] Trial 3 finished with value: 0.9210526315789473 and parameters: {'learning_rate': 0.04265249441121411, 'depth': 5, 'l2_leaf_reg': 1.7716789152225405}. Best is trial 3 with value: 0.9210526315789473.


0:	learn: 0.9121622	test: 0.8947368	best: 0.8947368 (0)	total: 5.7ms	remaining: 1.7s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.9210526316
bestIteration = 1

Shrink model to first 2 iterations.


[I 2025-07-10 13:51:27,503] Trial 4 finished with value: 0.9210526315789473 and parameters: {'learning_rate': 0.025812051030850193, 'depth': 5, 'l2_leaf_reg': 1.586604214288525}. Best is trial 3 with value: 0.9210526315789473.


0:	learn: 0.9121622	test: 0.8947368	best: 0.8947368 (0)	total: 5.64ms	remaining: 1.69s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.9210526316
bestIteration = 1

Shrink model to first 2 iterations.


[I 2025-07-10 13:51:27,806] Trial 5 finished with value: 0.8947368421052632 and parameters: {'learning_rate': 0.012056091847977282, 'depth': 5, 'l2_leaf_reg': 3.9892915003159133}. Best is trial 3 with value: 0.9210526315789473.


0:	learn: 0.9121622	test: 0.8947368	best: 0.8947368 (0)	total: 11.7ms	remaining: 3.5s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.8947368421
bestIteration = 0

Shrink model to first 1 iterations.


[I 2025-07-10 13:51:28,015] Trial 6 finished with value: 0.8947368421052632 and parameters: {'learning_rate': 0.044266562607082435, 'depth': 3, 'l2_leaf_reg': 3.930814533318762}. Best is trial 3 with value: 0.9210526315789473.


0:	learn: 0.8716216	test: 0.8684211	best: 0.8684211 (0)	total: 2.09ms	remaining: 626ms
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.8947368421
bestIteration = 1

Shrink model to first 2 iterations.
0:	learn: 0.8716216	test: 0.8684211	best: 0.8684211 (0)	total: 5.26ms	remaining: 1.57s


[I 2025-07-10 13:51:28,201] Trial 7 finished with value: 0.8947368421052632 and parameters: {'learning_rate': 0.04295587651659916, 'depth': 3, 'l2_leaf_reg': 2.0781451764043446}. Best is trial 3 with value: 0.9210526315789473.


Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.8947368421
bestIteration = 1

Shrink model to first 2 iterations.
0:	learn: 0.9121622	test: 0.8947368	best: 0.8947368 (0)	total: 4.93ms	remaining: 1.47s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.8947368421
bestIteration = 0

Shrink model to first 1 iterations.


[I 2025-07-10 13:51:28,412] Trial 8 finished with value: 0.8947368421052632 and parameters: {'learning_rate': 0.037948850460134076, 'depth': 4, 'l2_leaf_reg': 1.2198805462854696}. Best is trial 3 with value: 0.9210526315789473.
[I 2025-07-10 13:51:28,683] Trial 9 finished with value: 0.8947368421052632 and parameters: {'learning_rate': 0.045557370116368234, 'depth': 5, 'l2_leaf_reg': 4.12057015072966}. Best is trial 3 with value: 0.9210526315789473.


0:	learn: 0.9121622	test: 0.8947368	best: 0.8947368 (0)	total: 7.72ms	remaining: 2.31s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.8947368421
bestIteration = 0

Shrink model to first 1 iterations.


In [55]:
# Report the best objective value and the hyperparameters that produced it.
print(f"Best F1 score: {study.best_value}")
print(f"Best parameters: {study.best_params}")

Best F1 score: 0.9210526315789473
Best parameters: {'learning_rate': 0.04265249441121411, 'depth': 5, 'l2_leaf_reg': 1.7716789152225405}


In [43]:
# Parameters for the final model.
# study.best_params holds only the values Optuna tuned, so the settings it did
# not tune are supplied here. The tuned values are unpacked last so they take
# precedence: a tuned "iterations" is no longer overwritten by the 500
# fallback, which applies only when "iterations" was not part of the search.
best_params = {
    "iterations": 500,
    "loss_function": "MultiClass",
    "eval_metric": "TotalF1:average=Micro",
    "random_state": 42,
    "verbose": 100,
    **study.best_params,
}

In [45]:
# Final model: train with the selected parameters, monitoring the validation
# set and stopping once the eval metric has not improved for 30 rounds.
# NOTE(review): objective() uses early_stopping_rounds=50 - confirm the
# difference in patience is intended.
model = CatBoostClassifier(**best_params)
model.fit(
    X_train,
    y_train,
    early_stopping_rounds=30,
    eval_set=(X_val, y_val),
)

0:	learn: 0.9121622	test: 0.8947368	best: 0.8947368 (0)	total: 3ms	remaining: 1.49s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.9210526316
bestIteration = 1

Shrink model to first 2 iterations.


<catboost.core.CatBoostClassifier at 0x2195faae750>

In [47]:
# Predict on the validation set and print the micro-averaged F1 after tuning.
preds = model.predict(X_val)
print("튜닝 후 Micro F1:", f1_score(y_true=y_val, y_pred=preds, average='micro'))

튜닝 후 Micro F1: 0.9210526315789473


In [None]:
# Predict on the test set with the model trained above. The estimator in this
# notebook is named `model`; `model1` is never defined and raised NameError.
# NOTE(review): test_df and X_test are not defined in the visible notebook -
# load the test data and build X_test with the same feature columns as X
# before running this cell.
test_df['Segment_pred'] = model.predict(X_test)

In [None]:
# Show how many test rows were assigned to each predicted value.
test_df.loc[:, 'Segment_pred'].value_counts()