In [None]:
# 기본 라이브러리
import pandas as pd
import numpy as np
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score, log_loss, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score

from catboost import CatBoostClassifier

# Apply seaborn's default theme first; the matplotlib settings below are applied afterwards.
sns.set()

# Global matplotlib defaults, set in a single update:
# font family, default figure size, base font size, and ASCII minus signs on axes.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'figure.figsize': (12, 6),
    'font.size': 14,
    'axes.unicode_minus': False,
})

# 데이터 준비

In [46]:
# Load the feature table and the segment-label table from their parquet files.
X_df, y_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('train_all.parquet', 'segment_12.parquet')
)

In [47]:
# Separate the features from the target.
# The '기준년월' and 'ID' columns are dropped so they are not used as model features.
X = X_df.drop(columns=['기준년월', 'ID'])
y = y_df['Segment']

# Tile the labels so that there is one label per feature row.
# NOTE(review): this assumes X_df is N_LABEL_REPEATS stacked blocks, each in the
# same row order as y_df — confirm against how train_all.parquet was built.
N_LABEL_REPEATS = 6
y = pd.concat([y] * N_LABEL_REPEATS, ignore_index=True)

# Fail fast if the tiled labels do not line up with the feature rows;
# a silent length mismatch would otherwise surface much later (or not at all).
assert len(X) == len(y), (
    f"feature/label row count mismatch: len(X)={len(X)}, len(y)={len(y)}"
)

In [50]:
# Drop the references to the source DataFrames (X and y have already been
# derived from them), then run a garbage-collection pass.
del X_df
del y_df
gc.collect()

500

In [51]:
# Give both objects a fresh 0..n-1 index, discarding the previous one.
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

In [52]:
# Boolean mask that is True for every row whose Segment label is 'E'.
e_indices = y.eq('E')

# Show the mask as the cell output.
e_indices

0          False
1           True
2          False
3          False
4           True
           ...  
2399995     True
2399996    False
2399997    False
2399998     True
2399999     True
Name: Segment, Length: 2400000, dtype: bool

In [53]:
# 'E'를 제거한 나머지를 새로운 변수에 저장
X = X[e_indices == False]
y = y[e_indices == False]

In [56]:
# Label encoding
from sklearn.preprocessing import LabelEncoder

# Encode the target labels as integers; le_y keeps the fitted label mapping.
le_y = LabelEncoder().fit(y)
y = le_y.transform(y)

# Encoding of the feature columns in X is left disabled.
# Open question from the author: aren't these values already encoded?
# cat_cols = X.select_dtypes(include='object').columns
# le = LabelEncoder()
# for col in cat_cols:
#     X[col] = le.fit_transform(X[col].astype(str))

In [10]:
# # 인코더 저장
# import pickle

# with open('d_encoder.dat', 'wb') as fp:
#     pickle.dump(le_y, fp)


In [58]:
# Hold out 20% of the rows for validation, stratified on the encoded label.
# NOTE(review): the labels were tiled 6x earlier, so the same customer may appear
# in several rows; a row-level random split could then place one customer in both
# train and validation — confirm whether a group-wise split by ID is needed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize the features: the scaler learns its statistics from the
# training split only and is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [12]:
# # 스케일러 저장
# import pickle

# with open('d_scaler.dat', 'wb') as fp:
#     pickle.dump(scaler, fp)


|**변수**|**내용**|
|---|---|
|X, y|Segment 'E'를 제외한 전체 데이터셋 (y는 LabelEncoder로 인코딩됨)|
|le_y|타겟 y에 학습된 LabelEncoder 객체|
|X_train_scaled, X_val_scaled, y_train, y_val|스케일링된 train/val 데이터셋|

# E가 아닌 것 中 Segment 'D' vs 나머지

In [59]:
# Binary target for this step: 1 for Segment 'D', 0 for every other remaining segment.
# The encoded value of 'D' is looked up from the fitted encoder instead of being
# hard-coded, so the target stays correct if the set or order of classes changes;
# transform() raises ValueError if 'D' was never seen by the encoder.
d_code = le_y.transform(['D'])[0]
y_train_d = np.where(y_train == d_code, 1, 0)
y_val_d = np.where(y_val == d_code, 1, 0)

In [14]:
# Cross-validation splitter: 5 stratified, shuffled folds with a fixed seed.
skf2 = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

- catboost

In [None]:
def objective_step2_c(trial):
    """Optuna objective for the 'D' vs. rest CatBoost classifier.

    Draws one CatBoost hyperparameter configuration from ``trial``, evaluates it
    with cross-validation on ``X_train_scaled`` / ``y_train_d`` using the ``skf2``
    splitter, and returns the mean F1 score across the folds.

    Args:
        trial: Optuna trial used to sample the hyperparameter values.

    Returns:
        Mean of the per-fold F1 scores.
    """
    # Hyperparameters sampled for this trial (suggest calls run top to bottom).
    sampled = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        depth=trial.suggest_int('depth', 3, 12),
        iterations=trial.suggest_int('iterations', 100, 800, step=100),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0.0, 1.0),
        random_strength=trial.suggest_float('random_strength', 1e-9, 10.0, log=True),
        rsm=trial.suggest_float('rsm', 0.5, 1.0),
        leaf_estimation_iterations=trial.suggest_int('leaf_estimation_iterations', 1, 10),
    )
    # Settings held constant across all trials.
    fixed = dict(loss_function='Logloss', verbose=0, random_state=42)

    classifier = CatBoostClassifier(**sampled, **fixed)

    # error_score='raise' makes a failing fold raise instead of being scored as NaN.
    fold_scores = cross_val_score(
        classifier,
        X_train_scaled,
        y_train_d,
        cv=skf2,
        scoring='f1',
        error_score='raise',
    )
    return fold_scores.mean()

In [16]:
# Run the hyperparameter search, maximizing the mean cross-validated F1
# returned by objective_step2_c.
# The TPE sampler (Optuna's default) is seeded explicitly so that a re-run
# proposes the same sequence of trials; without a seed the search is not reproducible.
study_c = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_c.optimize(objective_step2_c, n_trials=10)

[I 2025-07-11 11:43:02,394] A new study created in memory with name: no-name-79e146a9-2f9b-42a0-9edd-68fa29ed6ff6


[I 2025-07-11 11:44:36,917] Trial 0 finished with value: 0.8980676804895898 and parameters: {'learning_rate': 0.28314481687107873, 'depth': 5, 'iterations': 100, 'subsample': 0.8531344301565364, 'l2_leaf_reg': 7.48896621054211, 'bagging_temperature': 0.5537237759998361, 'random_strength': 9.763406531468368, 'rsm': 0.5894248857179345, 'leaf_estimation_iterations': 10}. Best is trial 0 with value: 0.8980676804895898.
[I 2025-07-11 11:46:02,282] Trial 1 finished with value: 0.9007652561493685 and parameters: {'learning_rate': 0.12911163390660862, 'depth': 5, 'iterations': 200, 'subsample': 0.5916703187479357, 'l2_leaf_reg': 8.606491442618916, 'bagging_temperature': 0.6634754260522897, 'random_strength': 1.7434861923778218e-09, 'rsm': 0.6913624979360065, 'leaf_estimation_iterations': 4}. Best is trial 1 with value: 0.9007652561493685.
[I 2025-07-11 11:57:57,994] Trial 2 finished with value: 0.8988283735445529 and parameters: {'learning_rate': 0.050396196745816614, 'depth': 10, 'iterations'

In [None]:
# Train the final model on the full training split with the best
# hyperparameters found by the study.
best_params_cat = study_c.best_params
model_cat_best = CatBoostClassifier(
    random_state=42,
    verbose=100,
    **best_params_cat,
)
model_cat_best.fit(X_train_scaled, y_train_d)

In [61]:
# Predict the 'D' vs. rest class for every row of the scaled validation split.
pred_step2_c = model_cat_best.predict(X_val_scaled)

In [62]:
# Compute the validation metrics for the 'D' vs. rest predictions first,
# then print them in a fixed order.

# Accuracy
accc = accuracy_score(y_val_d, pred_step2_c)
# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
f1c = f1_score(y_val_d, pred_step2_c, average='macro')
# Confusion matrix
cmc = confusion_matrix(y_val_d, pred_step2_c)

print("Accuracy:", accc)
print("F1 Score (macro):", f1c)
print("Confusion Matrix:")
print(cmc)

# Per-class precision / recall / F1 report
print("\nClassification Report:")
print(classification_report(y_val_d, pred_step2_c))

Accuracy: 0.8804372842347525
F1 Score (macro): 0.8398704103007986
Confusion Matrix:
[[18024  7717]
 [ 3712 66137]]

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.70      0.76     25741
           1       0.90      0.95      0.92     69849

    accuracy                           0.88     95590
   macro avg       0.86      0.82      0.84     95590
weighted avg       0.88      0.88      0.88     95590



In [None]:
# import pickle
# with open('step2_model.dat', 'wb') as fp:
#     pickle.dump(model_cat_best, fp)