In [1]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import optuna

In [2]:
# local
root_path = '../data/open/머신러닝'

# colab
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

In [3]:
# 데이터 불러오기
#train_file = '201812_vif_drop_All'
#train_file = '201812_corr_drop_All'
train_file = 'vif_one_segment'
train_df = pd.read_parquet(f'{root_path}/{train_file}.parquet')

test_file = f'{root_path}/test_12월_통합.parquet'
test_df = pd.read_parquet(test_file)

train_df

Unnamed: 0,ID,기준년월,소지카드수_유효_신용,소지카드수_이용가능_신용,입회경과개월수_신용,이용금액_R3M_신용_가족,이용여부_3M_해외겸용_본인,이용여부_3M_해외겸용_신용_본인,2순위신용체크구분_인코딩,CA한도금액,...,_3순위교통업종_이용금액,_1순위여유업종_이용금액,_1순위납부업종_이용금액,RP건수_B0M,RP유형건수_B0M,이용개월수_온라인_R6M,연속유실적개월수_기본_24M_카드,청구서발송여부_B0,잔액_일시불_B2M,Segment
0,TRAIN_000000,201807,1,1,67,0,0,0,0,7270,...,0,0,1883,2,2,1,13,1,1083,D
1,TRAIN_000000,201808,1,1,68,0,0,0,0,6293,...,0,0,1959,2,2,1,13,1,736,D
2,TRAIN_000000,201809,1,1,69,0,0,0,0,6449,...,0,0,2008,2,2,1,17,1,997,D
3,TRAIN_000000,201810,1,1,70,0,0,0,0,5990,...,0,0,1972,2,2,1,17,1,697,D
4,TRAIN_000000,201811,1,1,71,0,0,0,0,5816,...,0,0,1982,2,2,0,17,1,607,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,TRAIN_399999,201808,1,1,4,0,0,0,1,25757,...,0,0,0,0,0,0,2,1,0,E
2399996,TRAIN_399999,201809,1,1,5,0,0,0,1,26818,...,0,0,0,0,0,0,5,1,998,E
2399997,TRAIN_399999,201810,1,1,6,0,0,0,1,27543,...,0,0,0,0,0,0,5,1,814,E
2399998,TRAIN_399999,201811,1,1,7,0,0,0,1,27113,...,0,0,0,0,0,0,5,1,1485,E


### 1단계 -> Segment == E vs Other

In [5]:
train_df["Segment1"] = train_df["Segment"].apply(lambda x: "E" if x == "E" else "other")

In [6]:
exclude_cols = ['ID', '기준년월', 'Segment', 'Segment1']
feature_cols = [
    col for col in train_df.columns
    if col not in exclude_cols
]

In [7]:
# 학습용 피처/타겟 정의
X = train_df[feature_cols]
y = train_df['Segment1']

In [8]:
# X와 test_df 양쪽에 모두 존재하는 컬럼만 선택
common_cols = [col for col in X.columns if col in test_df.columns]
X = X[common_cols]
X_test = test_df[common_cols]

In [9]:
# 학습/검증 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
def objective(trial):
    params = {
        # boosting 반복 횟수
        "iterations": trial.suggest_int("iterations", 300, 1500),
        # 한번에 얼마나 빠르게 학습하지지
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        # 트리의 깊이, 깊을수록 복잡한 모델이지만 과적합 가능성 O
        "depth": trial.suggest_int("depth", 4, 10), # --> 튜닝
        # L2 정규화 , 높을수록 규제 강화 - 과적합 억제
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        # 수치형 피쳐를 이산화 할 때 사용하는 경계 개수 -> 수치형 많으면 영향력 있음
        "border_count": trial.suggest_int("border_count", 32, 255),
        # 트리 분기시 f.i 계산에 노이즈를 얼마나 줄 것인가
        "random_strength": trial.suggest_float("random_strength", 0.1, 1),
        # 샘플링의 다양성 조절 -> 과소적합일 땐 높이고, 과적합일 땐 낮게
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        # 손실 함수 : 다중분류
        "loss_function": "Logloss",
        # 평가 지표
        "eval_metric": "F1",
        # 학습 로그를 몇 step마다 보여줄지
        "verbose": 100,
        # 트리 분기 시 feature 중요도 계산에 주는 노이즈 크기
        "random_state": 42
    }
    # early_stopping_rounds - 지정된 라운드 동안 성능 개선이 되지 않으면 학습 중지
    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=100)

    preds = model.predict(X_val)
    f1 = f1_score(y_val, preds, average='micro')
    return f1

In [None]:
# Optuna 튜닝 시작
study = optuna.create_study(direction="maximize")
# XGB 때 n_trials=30 으로 했었는데 너무 오래걸려서 일단 20으로 설정
# n_trials는 optuna가 시도할 하이퍼파라미터 조합의 개수 (몇번 학습할건지)
study.optimize(objective, n_trials=30)

[I 2025-07-14 15:28:44,519] A new study created in memory with name: no-name-bc5fb7db-3054-49bd-b893-6131e3d25f5f


0:	learn: 0.6618653	test: 0.6596647	best: 0.6596647 (0)	total: 623ms	remaining: 6m 47s
100:	learn: 0.7028027	test: 0.6996149	best: 0.6996149 (100)	total: 40.4s	remaining: 3m 41s
200:	learn: 0.7110169	test: 0.7072485	best: 0.7072485 (200)	total: 1m 20s	remaining: 3m 1s
300:	learn: 0.7157627	test: 0.7112245	best: 0.7112245 (300)	total: 1m 59s	remaining: 2m 20s
400:	learn: 0.7200644	test: 0.7140997	best: 0.7141622 (398)	total: 2m 40s	remaining: 1m 41s
500:	learn: 0.7237175	test: 0.7168858	best: 0.7168916 (499)	total: 3m 20s	remaining: 1m 1s
600:	learn: 0.7270859	test: 0.7192302	best: 0.7192302 (600)	total: 4m 1s	remaining: 21.7s
654:	learn: 0.7288778	test: 0.7203919	best: 0.7203919 (654)	total: 4m 22s	remaining: 0us

bestTest = 0.7203918998
bestIteration = 654



[I 2025-07-14 15:33:25,644] Trial 0 finished with value: 0.8986875 and parameters: {'iterations': 655, 'learning_rate': 0.05989790431031373, 'depth': 8, 'l2_leaf_reg': 8.447911809305529, 'border_count': 197, 'random_strength': 0.8981347546296288, 'bagging_temperature': 0.170963903717959}. Best is trial 0 with value: 0.8986875.


0:	learn: 0.6469787	test: 0.6451828	best: 0.6451828 (0)	total: 350ms	remaining: 6m 30s
100:	learn: 0.7027971	test: 0.7000728	best: 0.7000728 (100)	total: 33.6s	remaining: 5m 37s
200:	learn: 0.7094913	test: 0.7063537	best: 0.7063537 (200)	total: 1m 6s	remaining: 5m
300:	learn: 0.7131177	test: 0.7098196	best: 0.7098196 (300)	total: 1m 38s	remaining: 4m 26s
400:	learn: 0.7156864	test: 0.7116901	best: 0.7117315 (396)	total: 2m 11s	remaining: 3m 54s
500:	learn: 0.7182854	test: 0.7135494	best: 0.7136177 (499)	total: 2m 44s	remaining: 3m 21s
600:	learn: 0.7204277	test: 0.7148753	best: 0.7148762 (597)	total: 3m 17s	remaining: 2m 48s
700:	learn: 0.7220802	test: 0.7160989	best: 0.7161783 (691)	total: 3m 50s	remaining: 2m 15s
800:	learn: 0.7238668	test: 0.7178642	best: 0.7178666 (799)	total: 4m 22s	remaining: 1m 43s
900:	learn: 0.7254176	test: 0.7191820	best: 0.7191820 (900)	total: 4m 55s	remaining: 1m 10s
1000:	learn: 0.7271912	test: 0.7199206	best: 0.7200184 (993)	total: 5m 28s	remaining: 37.4s

[I 2025-07-14 15:39:49,793] Trial 1 finished with value: 0.89868125 and parameters: {'iterations': 1115, 'learning_rate': 0.08657051577426762, 'depth': 6, 'l2_leaf_reg': 7.050956470299588, 'border_count': 213, 'random_strength': 0.22983573426023, 'bagging_temperature': 0.6837681451042293}. Best is trial 0 with value: 0.8986875.


0:	learn: 0.6496793	test: 0.6483510	best: 0.6483510 (0)	total: 321ms	remaining: 6m 57s
100:	learn: 0.6736502	test: 0.6708958	best: 0.6708958 (100)	total: 32.2s	remaining: 6m 23s
200:	learn: 0.6842503	test: 0.6822480	best: 0.6822480 (200)	total: 1m 5s	remaining: 5m 58s
300:	learn: 0.6926200	test: 0.6901076	best: 0.6901076 (300)	total: 1m 39s	remaining: 5m 29s
400:	learn: 0.6974526	test: 0.6947449	best: 0.6947449 (400)	total: 2m 11s	remaining: 4m 56s
500:	learn: 0.7007564	test: 0.6973775	best: 0.6973775 (500)	total: 2m 44s	remaining: 4m 22s
600:	learn: 0.7030169	test: 0.7000553	best: 0.7000553 (600)	total: 3m 16s	remaining: 3m 49s
700:	learn: 0.7049020	test: 0.7017721	best: 0.7018249 (699)	total: 3m 49s	remaining: 3m 16s
800:	learn: 0.7062852	test: 0.7029974	best: 0.7030125 (789)	total: 4m 22s	remaining: 2m 44s
900:	learn: 0.7072877	test: 0.7041327	best: 0.7041327 (900)	total: 4m 55s	remaining: 2m 11s
1000:	learn: 0.7082449	test: 0.7051705	best: 0.7051705 (1000)	total: 5m 27s	remaining: 

[I 2025-07-14 15:47:13,922] Trial 2 finished with value: 0.89435625 and parameters: {'iterations': 1302, 'learning_rate': 0.01516121561237823, 'depth': 6, 'l2_leaf_reg': 8.631680950506354, 'border_count': 50, 'random_strength': 0.16332812272715974, 'bagging_temperature': 0.567155328185512}. Best is trial 0 with value: 0.8986875.


0:	learn: 0.6673415	test: 0.6664174	best: 0.6664174 (0)	total: 415ms	remaining: 5m 30s
100:	learn: 0.7031177	test: 0.6997066	best: 0.6997317 (99)	total: 43.7s	remaining: 5m
200:	learn: 0.7127155	test: 0.7082109	best: 0.7082109 (200)	total: 1m 26s	remaining: 4m 16s
300:	learn: 0.7188693	test: 0.7128381	best: 0.7128381 (300)	total: 2m 9s	remaining: 3m 33s
400:	learn: 0.7241839	test: 0.7166328	best: 0.7166328 (400)	total: 2m 52s	remaining: 2m 50s
500:	learn: 0.7288743	test: 0.7197196	best: 0.7197380 (499)	total: 3m 35s	remaining: 2m 7s
600:	learn: 0.7334347	test: 0.7230989	best: 0.7231073 (599)	total: 4m 17s	remaining: 1m 24s
700:	learn: 0.7377615	test: 0.7258133	best: 0.7258133 (700)	total: 5m	remaining: 41.1s
796:	learn: 0.7418043	test: 0.7282667	best: 0.7282886 (793)	total: 5m 40s	remaining: 0us

bestTest = 0.7282885821
bestIteration = 793

Shrink model to first 794 iterations.


[I 2025-07-14 15:53:14,015] Trial 3 finished with value: 0.9014208333333333 and parameters: {'iterations': 797, 'learning_rate': 0.04975559371975358, 'depth': 9, 'l2_leaf_reg': 6.8487298295293, 'border_count': 62, 'random_strength': 0.2371385035335718, 'bagging_temperature': 0.3335986369552645}. Best is trial 3 with value: 0.9014208333333333.


0:	learn: 0.6479260	test: 0.6462606	best: 0.6462606 (0)	total: 393ms	remaining: 9m 41s
100:	learn: 0.7037552	test: 0.7008983	best: 0.7009053 (98)	total: 33.6s	remaining: 7m 38s
200:	learn: 0.7107229	test: 0.7073431	best: 0.7073431 (200)	total: 1m 6s	remaining: 7m 2s
300:	learn: 0.7140275	test: 0.7103370	best: 0.7103370 (300)	total: 1m 39s	remaining: 6m 29s
400:	learn: 0.7169210	test: 0.7124392	best: 0.7124392 (400)	total: 2m 12s	remaining: 5m 57s
500:	learn: 0.7192344	test: 0.7142455	best: 0.7142455 (500)	total: 2m 45s	remaining: 5m 23s
600:	learn: 0.7214660	test: 0.7158435	best: 0.7158461 (598)	total: 3m 18s	remaining: 4m 49s
700:	learn: 0.7236957	test: 0.7172472	best: 0.7172540 (694)	total: 3m 51s	remaining: 4m 16s
800:	learn: 0.7256246	test: 0.7186998	best: 0.7187398 (798)	total: 4m 24s	remaining: 3m 43s
900:	learn: 0.7273300	test: 0.7198921	best: 0.7201358 (889)	total: 4m 57s	remaining: 3m 10s
1000:	learn: 0.7290294	test: 0.7213278	best: 0.7213278 (1000)	total: 5m 30s	remaining: 2m

[I 2025-07-14 16:01:42,433] Trial 4 finished with value: 0.90030625 and parameters: {'iterations': 1479, 'learning_rate': 0.09515030303124601, 'depth': 6, 'l2_leaf_reg': 4.9309524389823345, 'border_count': 171, 'random_strength': 0.23196728139437084, 'bagging_temperature': 0.038162944098501184}. Best is trial 3 with value: 0.9014208333333333.


0:	learn: 0.6615734	test: 0.6601037	best: 0.6601037 (0)	total: 453ms	remaining: 8m 25s
100:	learn: 0.6915684	test: 0.6883547	best: 0.6883547 (100)	total: 45.6s	remaining: 7m 39s
200:	learn: 0.7036452	test: 0.6996811	best: 0.6996811 (200)	total: 1m 30s	remaining: 6m 54s
300:	learn: 0.7092545	test: 0.7050310	best: 0.7050310 (300)	total: 2m 15s	remaining: 6m 8s
400:	learn: 0.7130484	test: 0.7080932	best: 0.7080932 (400)	total: 2m 59s	remaining: 5m 21s
500:	learn: 0.7160195	test: 0.7108328	best: 0.7108328 (500)	total: 3m 44s	remaining: 4m 36s
600:	learn: 0.7186328	test: 0.7129207	best: 0.7129379 (599)	total: 4m 28s	remaining: 3m 51s
700:	learn: 0.7212121	test: 0.7146499	best: 0.7146499 (699)	total: 5m 13s	remaining: 3m 6s
800:	learn: 0.7237336	test: 0.7164798	best: 0.7164798 (800)	total: 5m 57s	remaining: 2m 21s
900:	learn: 0.7263056	test: 0.7183356	best: 0.7183356 (900)	total: 6m 41s	remaining: 1m 36s
1000:	learn: 0.7287007	test: 0.7198163	best: 0.7198937 (999)	total: 7m 24s	remaining: 52

[I 2025-07-14 16:10:20,185] Trial 5 finished with value: 0.8991166666666667 and parameters: {'iterations': 1118, 'learning_rate': 0.025536221404216224, 'depth': 9, 'l2_leaf_reg': 8.043288908041365, 'border_count': 196, 'random_strength': 0.3571735637904898, 'bagging_temperature': 0.25958073760798905}. Best is trial 3 with value: 0.9014208333333333.


0:	learn: 0.6560157	test: 0.6541533	best: 0.6541533 (0)	total: 488ms	remaining: 6m 5s
100:	learn: 0.7072088	test: 0.7027849	best: 0.7027849 (100)	total: 45.2s	remaining: 4m 50s
200:	learn: 0.7164246	test: 0.7110538	best: 0.7110538 (200)	total: 1m 29s	remaining: 4m 3s
300:	learn: 0.7233563	test: 0.7155541	best: 0.7156693 (298)	total: 2m 12s	remaining: 3m 18s
400:	learn: 0.7299117	test: 0.7198499	best: 0.7199323 (399)	total: 2m 56s	remaining: 2m 33s
500:	learn: 0.7355790	test: 0.7236963	best: 0.7236963 (500)	total: 3m 40s	remaining: 1m 49s
600:	learn: 0.7408926	test: 0.7271830	best: 0.7271840 (598)	total: 4m 24s	remaining: 1m 5s
700:	learn: 0.7461368	test: 0.7306040	best: 0.7306040 (700)	total: 5m 8s	remaining: 22s
750:	learn: 0.7485280	test: 0.7320703	best: 0.7320703 (750)	total: 5m 30s	remaining: 0us

bestTest = 0.73207028
bestIteration = 750



[I 2025-07-14 16:16:11,825] Trial 6 finished with value: 0.9026270833333333 and parameters: {'iterations': 751, 'learning_rate': 0.06480350736944289, 'depth': 9, 'l2_leaf_reg': 1.9802737081442907, 'border_count': 184, 'random_strength': 0.7896528030303464, 'bagging_temperature': 0.03367284433922724}. Best is trial 6 with value: 0.9026270833333333.


0:	learn: 0.5991701	test: 0.5971300	best: 0.5971300 (0)	total: 332ms	remaining: 2m 58s
100:	learn: 0.6977640	test: 0.6951247	best: 0.6951247 (100)	total: 32.7s	remaining: 2m 22s
200:	learn: 0.7062014	test: 0.7027999	best: 0.7027999 (200)	total: 1m 5s	remaining: 1m 51s
300:	learn: 0.7097491	test: 0.7064228	best: 0.7064445 (299)	total: 1m 37s	remaining: 1m 17s
400:	learn: 0.7122624	test: 0.7086890	best: 0.7086890 (400)	total: 2m 10s	remaining: 45.2s
500:	learn: 0.7143568	test: 0.7104936	best: 0.7104936 (500)	total: 2m 42s	remaining: 12.7s
539:	learn: 0.7151116	test: 0.7107297	best: 0.7107297 (539)	total: 2m 55s	remaining: 0us

bestTest = 0.7107297126
bestIteration = 539



[I 2025-07-14 16:19:27,302] Trial 7 finished with value: 0.8953625 and parameters: {'iterations': 540, 'learning_rate': 0.059897643923737406, 'depth': 6, 'l2_leaf_reg': 3.6234763000584485, 'border_count': 55, 'random_strength': 0.24913331780909215, 'bagging_temperature': 0.8081008018197535}. Best is trial 6 with value: 0.9026270833333333.


0:	learn: 0.6590962	test: 0.6590933	best: 0.6590933 (0)	total: 426ms	remaining: 6m 58s
100:	learn: 0.7086569	test: 0.7046196	best: 0.7046597 (99)	total: 40.2s	remaining: 5m 51s
200:	learn: 0.7171714	test: 0.7112104	best: 0.7112347 (198)	total: 1m 19s	remaining: 5m 9s
300:	learn: 0.7232236	test: 0.7162676	best: 0.7162676 (300)	total: 1m 58s	remaining: 4m 28s
400:	learn: 0.7284544	test: 0.7194548	best: 0.7194548 (400)	total: 2m 38s	remaining: 3m 50s
500:	learn: 0.7331409	test: 0.7227375	best: 0.7227375 (500)	total: 3m 18s	remaining: 3m 11s
600:	learn: 0.7375835	test: 0.7260366	best: 0.7260366 (600)	total: 3m 58s	remaining: 2m 31s
700:	learn: 0.7418525	test: 0.7288661	best: 0.7289420 (697)	total: 4m 36s	remaining: 1m 51s
800:	learn: 0.7457176	test: 0.7313523	best: 0.7313523 (800)	total: 5m 15s	remaining: 1m 12s
900:	learn: 0.7497058	test: 0.7335314	best: 0.7335314 (900)	total: 5m 54s	remaining: 32.7s
983:	learn: 0.7525563	test: 0.7350791	best: 0.7350791 (983)	total: 6m 27s	remaining: 0us


[I 2025-07-14 16:26:15,577] Trial 8 finished with value: 0.9035125 and parameters: {'iterations': 984, 'learning_rate': 0.09180467564708239, 'depth': 8, 'l2_leaf_reg': 7.96098294417899, 'border_count': 78, 'random_strength': 0.32652565130969546, 'bagging_temperature': 0.3416458552088947}. Best is trial 8 with value: 0.9035125.


0:	learn: 0.6590962	test: 0.6590933	best: 0.6590933 (0)	total: 385ms	remaining: 6m 23s
100:	learn: 0.7013223	test: 0.6980895	best: 0.6980895 (100)	total: 40.3s	remaining: 5m 57s
200:	learn: 0.7103444	test: 0.7068575	best: 0.7068575 (200)	total: 1m 19s	remaining: 5m 15s
300:	learn: 0.7152736	test: 0.7104442	best: 0.7104874 (293)	total: 1m 58s	remaining: 4m 34s
400:	learn: 0.7192947	test: 0.7132563	best: 0.7133238 (397)	total: 2m 37s	remaining: 3m 53s
500:	learn: 0.7230424	test: 0.7155619	best: 0.7155676 (499)	total: 3m 16s	remaining: 3m 14s
600:	learn: 0.7261048	test: 0.7178479	best: 0.7178479 (600)	total: 3m 55s	remaining: 2m 34s
700:	learn: 0.7292692	test: 0.7200998	best: 0.7201251 (696)	total: 4m 34s	remaining: 1m 55s
800:	learn: 0.7321702	test: 0.7223422	best: 0.7223422 (800)	total: 5m 13s	remaining: 1m 16s
900:	learn: 0.7349919	test: 0.7242123	best: 0.7242123 (900)	total: 5m 52s	remaining: 37.2s
995:	learn: 0.7375229	test: 0.7258490	best: 0.7258833 (993)	total: 6m 30s	remaining: 0u

[I 2025-07-14 16:33:05,349] Trial 9 finished with value: 0.90045625 and parameters: {'iterations': 996, 'learning_rate': 0.05303016646912162, 'depth': 8, 'l2_leaf_reg': 1.2970595495804869, 'border_count': 78, 'random_strength': 0.29497073250239125, 'bagging_temperature': 0.23138926837891893}. Best is trial 8 with value: 0.9035125.


0:	learn: 0.6297591	test: 0.6276310	best: 0.6276310 (0)	total: 321ms	remaining: 1m 36s
100:	learn: 0.6946572	test: 0.6925055	best: 0.6925055 (100)	total: 27.5s	remaining: 55.1s
200:	learn: 0.7031898	test: 0.7005754	best: 0.7005754 (200)	total: 54s	remaining: 27.4s
300:	learn: 0.7063434	test: 0.7037648	best: 0.7037648 (300)	total: 1m 21s	remaining: 541ms
302:	learn: 0.7064449	test: 0.7038255	best: 0.7038255 (302)	total: 1m 22s	remaining: 0us

bestTest = 0.703825529
bestIteration = 302



[I 2025-07-14 16:34:46,186] Trial 10 finished with value: 0.89345 and parameters: {'iterations': 303, 'learning_rate': 0.08023826762058159, 'depth': 4, 'l2_leaf_reg': 5.486619161987429, 'border_count': 116, 'random_strength': 0.5445249800243411, 'bagging_temperature': 0.9549985309656361}. Best is trial 8 with value: 0.9035125.


0:	learn: 0.6606092	test: 0.6577862	best: 0.6577862 (0)	total: 914ms	remaining: 12m 24s
100:	learn: 0.7139305	test: 0.7083989	best: 0.7083989 (100)	total: 1m 29s	remaining: 10m 32s
200:	learn: 0.7266176	test: 0.7175872	best: 0.7175872 (200)	total: 2m 56s	remaining: 9m
300:	learn: 0.7380427	test: 0.7255559	best: 0.7255559 (300)	total: 4m 24s	remaining: 7m 31s
400:	learn: 0.7485922	test: 0.7321205	best: 0.7321205 (400)	total: 5m 51s	remaining: 6m 3s
500:	learn: 0.7584208	test: 0.7378672	best: 0.7378672 (500)	total: 7m 19s	remaining: 4m 36s
600:	learn: 0.7674767	test: 0.7434749	best: 0.7434749 (600)	total: 8m 48s	remaining: 3m 8s
700:	learn: 0.7761788	test: 0.7487416	best: 0.7487774 (698)	total: 10m 16s	remaining: 1m 41s
800:	learn: 0.7840046	test: 0.7532957	best: 0.7532957 (800)	total: 11m 44s	remaining: 13.2s
815:	learn: 0.7851483	test: 0.7539210	best: 0.7539409 (814)	total: 11m 57s	remaining: 0us

bestTest = 0.7539409064
bestIteration = 814

Shrink model to first 815 iterations.


[I 2025-07-14 16:47:03,987] Trial 11 finished with value: 0.9104416666666667 and parameters: {'iterations': 816, 'learning_rate': 0.07577895433758029, 'depth': 10, 'l2_leaf_reg': 1.1815910731938999, 'border_count': 254, 'random_strength': 0.8798537958033361, 'bagging_temperature': 0.4226927136882493}. Best is trial 11 with value: 0.9104416666666667.


0:	learn: 0.6625715	test: 0.6601631	best: 0.6601631 (0)	total: 924ms	remaining: 14m 31s
100:	learn: 0.7123427	test: 0.7068795	best: 0.7068795 (100)	total: 1m 33s	remaining: 12m 59s
200:	learn: 0.7241754	test: 0.7161416	best: 0.7161416 (200)	total: 3m 4s	remaining: 11m 22s
300:	learn: 0.7345169	test: 0.7233582	best: 0.7233582 (300)	total: 4m 36s	remaining: 9m 49s
400:	learn: 0.7442273	test: 0.7293951	best: 0.7294360 (397)	total: 6m 8s	remaining: 8m 18s
500:	learn: 0.7529296	test: 0.7352869	best: 0.7352869 (500)	total: 7m 38s	remaining: 6m 45s
600:	learn: 0.7611834	test: 0.7403413	best: 0.7403413 (600)	total: 9m 10s	remaining: 5m 13s
700:	learn: 0.7688424	test: 0.7449163	best: 0.7449163 (700)	total: 10m 42s	remaining: 3m 42s
800:	learn: 0.7760078	test: 0.7491573	best: 0.7492130 (799)	total: 12m 14s	remaining: 2m 11s
900:	learn: 0.7831757	test: 0.7530876	best: 0.7532059 (898)	total: 13m 46s	remaining: 39.4s
943:	learn: 0.7859978	test: 0.7550076	best: 0.7550361 (942)	total: 14m 25s	remaini

[I 2025-07-14 17:01:50,955] Trial 12 finished with value: 0.9107479166666667 and parameters: {'iterations': 944, 'learning_rate': 0.07460152628950004, 'depth': 10, 'l2_leaf_reg': 9.887991072476051, 'border_count': 255, 'random_strength': 0.553095546824256, 'bagging_temperature': 0.4381998247104726}. Best is trial 12 with value: 0.9107479166666667.


0:	learn: 0.6632325	test: 0.6611631	best: 0.6611631 (0)	total: 891ms	remaining: 8m 3s
100:	learn: 0.7119501	test: 0.7067218	best: 0.7067218 (100)	total: 1m 28s	remaining: 6m 28s
200:	learn: 0.7238493	test: 0.7163006	best: 0.7163006 (200)	total: 2m 55s	remaining: 4m 59s
300:	learn: 0.7336287	test: 0.7229050	best: 0.7229208 (299)	total: 4m 21s	remaining: 3m 31s
400:	learn: 0.7433796	test: 0.7289102	best: 0.7289102 (400)	total: 5m 50s	remaining: 2m 4s
500:	learn: 0.7524641	test: 0.7342519	best: 0.7342519 (500)	total: 7m 18s	remaining: 37.7s
543:	learn: 0.7559632	test: 0.7367165	best: 0.7367165 (543)	total: 7m 57s	remaining: 0us

bestTest = 0.7367164864
bestIteration = 543



[I 2025-07-14 17:10:07,156] Trial 13 finished with value: 0.9043979166666667 and parameters: {'iterations': 544, 'learning_rate': 0.07293046754244933, 'depth': 10, 'l2_leaf_reg': 9.609959642113433, 'border_count': 253, 'random_strength': 0.7173934537812818, 'bagging_temperature': 0.4678084740609268}. Best is trial 12 with value: 0.9107479166666667.


0:	learn: 0.6631853	test: 0.6620144	best: 0.6620144 (0)	total: 887ms	remaining: 13m 15s
100:	learn: 0.7027707	test: 0.6988399	best: 0.6988399 (100)	total: 1m 29s	remaining: 11m 49s
200:	learn: 0.7141197	test: 0.7083607	best: 0.7083607 (200)	total: 2m 55s	remaining: 10m 10s
300:	learn: 0.7208799	test: 0.7140629	best: 0.7140629 (300)	total: 4m 21s	remaining: 8m 38s
400:	learn: 0.7267138	test: 0.7181069	best: 0.7181323 (399)	total: 5m 50s	remaining: 7m 14s
500:	learn: 0.7326769	test: 0.7216451	best: 0.7216451 (500)	total: 7m 16s	remaining: 5m 45s
600:	learn: 0.7382494	test: 0.7255608	best: 0.7255608 (600)	total: 8m 43s	remaining: 4m 18s
700:	learn: 0.7436296	test: 0.7287731	best: 0.7287731 (700)	total: 10m 9s	remaining: 2m 51s
800:	learn: 0.7486776	test: 0.7319208	best: 0.7320009 (798)	total: 11m 36s	remaining: 1m 24s
897:	learn: 0.7536420	test: 0.7355886	best: 0.7356521 (894)	total: 13m 1s	remaining: 0us

bestTest = 0.7356521041
bestIteration = 894

Shrink model to first 895 iterations.


[I 2025-07-14 17:23:27,580] Trial 14 finished with value: 0.90403125 and parameters: {'iterations': 898, 'learning_rate': 0.03952389587309868, 'depth': 10, 'l2_leaf_reg': 3.188652353245689, 'border_count': 244, 'random_strength': 0.5502698980015268, 'bagging_temperature': 0.5063266063933292}. Best is trial 12 with value: 0.9107479166666667.


0:	learn: 0.6613395	test: 0.6596480	best: 0.6596480 (0)	total: 928ms	remaining: 17m 59s
100:	learn: 0.7127977	test: 0.7076007	best: 0.7076007 (100)	total: 1m 26s	remaining: 15m 10s
200:	learn: 0.7249776	test: 0.7168420	best: 0.7168420 (200)	total: 2m 51s	remaining: 13m 43s
300:	learn: 0.7361241	test: 0.7234043	best: 0.7234043 (300)	total: 4m 17s	remaining: 12m 18s
400:	learn: 0.7464443	test: 0.7304606	best: 0.7305370 (398)	total: 5m 44s	remaining: 10m 55s
500:	learn: 0.7553442	test: 0.7357434	best: 0.7357434 (500)	total: 7m 9s	remaining: 9m 29s
600:	learn: 0.7641902	test: 0.7409710	best: 0.7409710 (600)	total: 8m 34s	remaining: 8m 3s
700:	learn: 0.7719327	test: 0.7458394	best: 0.7459165 (699)	total: 10m 1s	remaining: 6m 38s
800:	learn: 0.7796775	test: 0.7503805	best: 0.7503805 (800)	total: 11m 26s	remaining: 5m 12s
900:	learn: 0.7870346	test: 0.7548655	best: 0.7548655 (900)	total: 12m 52s	remaining: 3m 46s
1000:	learn: 0.7937903	test: 0.7588111	best: 0.7588111 (1000)	total: 14m 17s	rem

[I 2025-07-14 17:40:29,824] Trial 15 finished with value: 0.9141666666666667 and parameters: {'iterations': 1165, 'learning_rate': 0.07478361216767605, 'depth': 10, 'l2_leaf_reg': 4.086922676710861, 'border_count': 226, 'random_strength': 0.9749389319660008, 'bagging_temperature': 0.6217145182218249}. Best is trial 15 with value: 0.9141666666666667.


0:	learn: 0.6580936	test: 0.6562820	best: 0.6562820 (0)	total: 868ms	remaining: 18m 19s


In [None]:
# 이렇게 하면 총 10번 추가로 돌아감
# study.optimize(objective, n_trials=10)

In [None]:
# 결과 출력 (궁금하니까 --> 여기선 F1이 micro가 아니라 낮을 수 밖에 없다.)
print("Best F1 score:", study.best_value)
print("Best parameters:", study.best_params)

In [None]:
# 최적 파라미터로 모델 학습
# Optuna가 튜닝한 것이 아닌 것들은 다시 넣어줘야함.
# Optuna에 넣은 이유는 기본적으로 들어있어야 하기 때문에
best_params = study.best_params
best_params.update({
    "loss_function": "Logloss",
    "eval_metric": "F1",
    "random_state": 42,
    "verbose": 100
})

In [None]:
# 실제 모델
model = CatBoostClassifier(**best_params)
model.fit(X_train, y_train,
          eval_set=(X_val, y_val),
          early_stopping_rounds=50)

In [None]:
# 예측
test_df['Segment_pred'] = model.predict(X_test)

In [None]:
test_df['Segment_pred'].value_counts()

### 2단계 -> Segment == C or D vs Other (E 제거)

In [None]:
train_df2= train_df[train_df['Segment'] != 'E'].copy()
train_df2['Segment1'] = train_df2['Segment'].apply(lambda x: x if x in ['C', 'D'] else 'other')

In [None]:
exclude_cols = ['ID', '기준년월', 'Segment', 'Segment1']
feature_cols = [
    col for col in train_df.columns
    if col not in exclude_cols
]

In [None]:
# 학습용 피처/타겟 정의
X2 = train_df2[feature_cols]
y2 = train_df2['Segment1']

In [None]:
# 예측할 test subset (1단계 결과가 Other인 경우)
test_df2 = test_df[test_df['Segment_pred'] == 'other'].copy()

In [None]:
# X와 test_df 양쪽에 모두 존재하는 컬럼만 선택
common_cols = [col for col in X2.columns if col in test_df2.columns]
X2 = X2[common_cols]
X_test2 = test_df2[common_cols]

In [None]:
# 학습/검증 분리
X_train, X_val, y_train, y_val = train_test_split(X2, y2, test_size=0.2, random_state=42, stratify=y2)

In [None]:
def objective2(trial):
    params = {
        # boosting 반복 횟수
        "iterations": trial.suggest_int("iterations", 300, 1000),
        # 한번에 얼마나 빠르게 학습하지지
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        # 트리의 깊이, 깊을수록 복잡한 모델이지만 과적합 가능성 O
        "depth": trial.suggest_int("depth", 4, 8), # --> 튜닝
        # L2 정규화 , 높을수록 규제 강화 - 과적합 억제
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 3, 10),
        # 수치형 피쳐를 이산화 할 때 사용하는 경계 개수 -> 수치형 많으면 영향력 있음
        "border_count": trial.suggest_int("border_count", 32, 128),
        # 트리 분기시 f.i 계산에 노이즈를 얼마나 줄 것인가
        "random_strength": trial.suggest_float("random_strength", 0.5, 1.0),
        # 샘플링의 다양성 조절 -> 과소적합일 땐 높이고, 과적합일 땐 낮게
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.2, 1.0),
        # 손실 함수 : 다중분류
        "loss_function": "MultiClass",
        # 평가 지표
        "eval_metric": "TotalF1:average=Micro",
        # 학습 로그를 몇 step마다 보여줄지
        "verbose": 100,
        # 트리 분기 시 feature 중요도 계산에 주는 노이즈 크기
        "random_state": 42
    }
    # early_stopping_rounds - 지정된 라운드 동안 성능 개선이 되지 않으면 학습 중지
    model2 = CatBoostClassifier(**params)
    model2.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=30, verbose=100)

    preds = model2.predict(X_val)
    f1 = f1_score(y_val, preds, average='micro')
    return f1

In [None]:
# Optuna 튜닝 시작
study2 = optuna.create_study(direction="maximize")
# XGB 때 n_trials=30 으로 했었는데 너무 오래걸려서 일단 20으로 설정
# n_trials는 optuna가 시도할 하이퍼파라미터 조합의 개수 (몇번 학습할건지)
study2.optimize(objective2, n_trials=20)

In [None]:
# 결과 출력 (궁금하니까 --> 여기선 F1이 micro가 아니라 낮을 수 밖에 없다.)
print("Best F1 score:", study2.best_value)
print("Best parameters:", study2.best_params)

In [None]:
# 최적 파라미터로 모델 학습
# Optuna가 튜닝한 것이 아닌 것들은 다시 넣어줘야함.
# Optuna에 넣은 이유는 기본적으로 들어있어야 하기 때문에
best_params2 = study2.best_params
best_params2.update({
    "loss_function": "MultiClass",
    "eval_metric": "TotalF1:average=Micro",
    "random_state": 42,
    "verbose": 100
})

In [None]:
# 실제 모델
model2 = CatBoostClassifier(**best_params2)
model2.fit(X_train, y_train,
          eval_set=(X_val, y_val),
          early_stopping_rounds=30)

In [None]:
# 예측
test_df2['Segment_pred'] = model2.predict(X_test2).flatten()

In [None]:
test_df2['Segment_pred'].value_counts()

In [None]:
# 예측 결과 병합
# 중복된 ID 제거: 마지막 값을 기준으로 유지
test_df2_dedup = test_df2.drop_duplicates(subset='ID', keep='last')
test_df2_indexed = test_df2_dedup.set_index('ID')

# test_df의 Segment_pred를 ID 기준으로 업데이트
test_df.loc[test_df['ID'].isin(test_df2_indexed.index), 'Segment_pred'] = \
    test_df.loc[test_df['ID'].isin(test_df2_indexed.index), 'ID'].map(test_df2_indexed['Segment_pred'])

### 3단계 -> Segment == a or B vs Other (C, D, E 제거)

In [None]:
# 학습용 피처/타겟 정의
train_df3 = train_df[train_df['Segment'].isin(['A', 'B'])].copy()
train_df3['Segment1'] = train_df3['Segment']

In [None]:
exclude_cols = ['ID', '기준년월', 'Segment', 'Segment1']
feature_cols = [
    col for col in train_df.columns
    if col not in exclude_cols
]

In [None]:
# 학습용 피처/타겟 정의
X3 = train_df3[feature_cols]
y3 = train_df3['Segment1']

In [None]:
# 예측할 test subset (1단계 결과가 Other인 경우)
test_df3 = test_df[test_df['Segment_pred'] == 'other'].copy()

In [None]:
# X와 test_df 양쪽에 모두 존재하는 컬럼만 선택
common_cols = [col for col in X3.columns if col in test_df3.columns]
X3 = X3[common_cols]
X_test3 = test_df3[common_cols]

In [None]:
# 학습/검증 분리
X_train, X_val, y_train, y_val = train_test_split(X3, y3, test_size=0.2, random_state=42, stratify=y3)

In [None]:
def objective3(trial):
    params = {
        # boosting 반복 횟수
        "iterations": trial.suggest_int("iterations", 100, 500),
        # 한번에 얼마나 빠르게 학습하지지
        "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.2),
        # 트리의 깊이, 깊을수록 복잡한 모델이지만 과적합 가능성 O
        "depth": trial.suggest_int("depth", 3, 6), # --> 튜닝
        # L2 정규화 , 높을수록 규제 강화 - 과적합 억제
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 5, 20),
        # 수치형 피쳐를 이산화 할 때 사용하는 경계 개수 -> 수치형 많으면 영향력 있음
        "border_count": trial.suggest_int("border_count", 32, 64),
        # 트리 분기시 f.i 계산에 노이즈를 얼마나 줄 것인가
        "random_strength": trial.suggest_float("random_strength", 0.8, 2.0),
        # 샘플링의 다양성 조절 -> 과소적합일 땐 높이고, 과적합일 땐 낮게
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.5, 1.0),
        # 손실 함수 : 다중분류
        "loss_function": "MultiClass",
        # 평가 지표
        "eval_metric": "TotalF1:average=Micro",
        # 학습 로그를 몇 step마다 보여줄지
        "verbose": 100,
        # 트리 분기 시 feature 중요도 계산에 주는 노이즈 크기
        "random_state": 42
    }
    # early_stopping_rounds - 지정된 라운드 동안 성능 개선이 되지 않으면 학습 중지
    model3 = CatBoostClassifier(**params)
    model3.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=20, verbose=100)

    preds = model3.predict(X_val)
    f1 = f1_score(y_val, preds, average='micro')
    return f1

In [None]:
# Optuna 튜닝 시작
study3 = optuna.create_study(direction="maximize")
# XGB 때 n_trials=30 으로 했었는데 너무 오래걸려서 일단 20으로 설정
# n_trials는 optuna가 시도할 하이퍼파라미터 조합의 개수 (몇번 학습할건지)
study3.optimize(objective3, n_trials=15)

In [None]:
# 결과 출력 (궁금하니까 --> 여기선 F1이 micro가 아니라 낮을 수 밖에 없다.)
print("Best F1 score:", study3.best_value)
print("Best parameters:", study3.best_params)

In [None]:
# 최적 파라미터로 모델 학습
# Optuna가 튜닝한 것이 아닌 것들은 다시 넣어줘야함.
# Optuna에 넣은 이유는 기본적으로 들어있어야 하기 때문에
best_params3 = study3.best_params
best_params3.update({
    "loss_function": "MultiClass",
    "eval_metric": "TotalF1:average=Micro",
    "random_state": 42,
    "verbose": 100
})

In [None]:
# 실제 모델
model3 = CatBoostClassifier(**best_params3)
model3.fit(X_train, y_train,
          eval_set=(X_val, y_val),
          early_stopping_rounds=20)

In [None]:
# 예측
test_df3['Segment_pred'] = model3.predict(X_test3).flatten()

In [None]:
test_df3['Segment_pred'].value_counts()

In [None]:
# 예측 결과 병합
# 중복된 ID 제거: 마지막 값을 기준으로 유지
test_df3_dedup = test_df3.drop_duplicates(subset='ID', keep='last')
test_df3_indexed = test_df3_dedup.set_index('ID')

# test_df의 Segment_pred를 ID 기준으로 업데이트
test_df.loc[test_df['ID'].isin(test_df3_indexed.index), 'Segment_pred'] = \
    test_df.loc[test_df['ID'].isin(test_df3_indexed.index), 'ID'].map(test_df3_indexed['Segment_pred'])

# 최종 예측 결과 저장
submission = test_df[['ID', 'Segment_pred']].copy()
submission.rename(columns={'Segment_pred': 'Segment'}, inplace=True)
submission.to_parquet(f'../results/{train_file}_catboost_model3_submission5.parquet', index=False)
print(f"{train_file}_catboost_model3_submission 저장 완료!")