In [38]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

# 경고 뜨지 않게 설정
import warnings
warnings.filterwarnings('ignore')

# 그래프 설정
sns.set()

# 그래프 기본 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 학습 모델 저장을 위한 라이브러리
import pickle

from sklearn.metrics import confusion_matrix, classification_report

In [2]:
#data_type = "train"
# month = "07"
# category = "잔액정보"

# local
root_path = '../data/open/머신러닝'

# colab
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

drive_folder = f'{root_path}'

In [3]:
# 통합 parquet
df1 = pd.read_parquet(f'{drive_folder}/xgb_top8_All.parquet')

## 학습 (LGBM + Optuna)

In [8]:
# 1. Segment 라벨 만들기
segment_cols = ['Segment_A', 'Segment_B', 'Segment_C', 'Segment_D', 'Segment_E']
y = df1[segment_cols].idxmax(axis=1).map({
    'Segment_A': 0,
    'Segment_B': 1,
    'Segment_C': 2,
    'Segment_D': 3,
    'Segment_E': 4
})

In [10]:
# 2. X 구성
X = df1.drop(columns=segment_cols + ['ID', '기준년월'])

In [12]:
# 3. 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [48]:
import optuna
from lightgbm import early_stopping

def objective(trial):
    # 5. 실험할 파라미터 후보들을 정의
    params = {
        # 다중분류
        'objective': 'multiclass',  
        # 클래스 수 -> segment 5개
        'num_class': 5,
        # 평가 기준을 지정
        'metric': 'multi_logloss',
        # 학습 속도 --> 작을수록 안정적, 클수록 빠르지만 위험
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2), 
        # early stopping이 있으므로 길게 설정
        'n_estimators': 500,
        # 트리 깊이 제한 -> 과적합/과소적합 조절 -> 정확도 영향 높음 (보통 5~8)
        'max_depth': trial.suggest_int('max_depth', 3, 10), 
        # 트리에서 사용가능한 노드 -> 클수록 복잡한 규칙 학습 가능 -> 과적합 위험도 증가
        'num_leaves': trial.suggest_int('num_leaves', 20, 256), 
        # 최소 샘플 -> 작을수록 복잡한 패턴 학습 가능 -> 과적합 증가 / 정확도 영향 있음
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 100, 1000), 
        # 학습 데이터 중 몇 %만 랜덤하게 샘플링할지 -> 과적합 방지 -> 0.7~0.9 추천
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        # 피처를 랜덤 샘플링해서 트리마다 다르게 사용 -> 일반화 도움 -> 보통 0.7~0.9
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        # 사실 위와 거의 동일 (LightGBM만 씀)
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        # L1 정규화 -> 가중치를 작게 만듬
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 5.0),
        # L2 정규화 (릿지) -> 가중치 크기를 억제 -> 과적합 방지 -> 튜닝 2순위 정도
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 5.0),
        # 고정값, 로그 줄이기 -> 튜닝과는 무관
        'random_state': 42,
        'verbosity': -1
    }

    # 6. LGBM 모델 학습
    model = LGBMClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='multi_logloss',
        callbacks=[early_stopping(50)],
    )

    # 7. validation 정확도 계산
    preds = model.predict(X_val)
    acc = accuracy_score(y_val, preds)
    return acc  # Optuna가 이 값을 maximize할 것

In [50]:
# 8. 탐색 설정 + 실행
study = optuna.create_study(direction='maximize') # 정확도 -> 높을수록 좋다 !
study.optimize(objective, n_trials=30)  # 실험 횟수는 상황에 따라 조절 !

[I 2025-07-08 16:17:59,661] A new study created in memory with name: no-name-d0b82f69-b59e-4e4b-bc50-cda4de9813a9


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.262118


[I 2025-07-08 16:24:04,167] Trial 0 finished with value: 0.896775 and parameters: {'learning_rate': 0.02445798199718479, 'max_depth': 5, 'num_leaves': 113, 'min_data_in_leaf': 478, 'subsample': 0.7058175671687938, 'colsample_bytree': 0.9767952286768465, 'feature_fraction': 0.6361658202798193, 'lambda_l1': 1.8146040541404884, 'lambda_l2': 3.40230776580253}. Best is trial 0 with value: 0.896775.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.258854


[I 2025-07-08 16:28:46,282] Trial 1 finished with value: 0.89790625 and parameters: {'learning_rate': 0.048904393154552556, 'max_depth': 4, 'num_leaves': 224, 'min_data_in_leaf': 583, 'subsample': 0.9090993552682257, 'colsample_bytree': 0.9291679056257426, 'feature_fraction': 0.8589717121889116, 'lambda_l1': 4.705866635841337, 'lambda_l2': 1.0609988442959606}. Best is trial 1 with value: 0.89790625.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.229034


[I 2025-07-08 16:34:23,100] Trial 2 finished with value: 0.90931875 and parameters: {'learning_rate': 0.12519307363879148, 'max_depth': 6, 'num_leaves': 243, 'min_data_in_leaf': 255, 'subsample': 0.8180668947641132, 'colsample_bytree': 0.9884186762230243, 'feature_fraction': 0.8048564113603496, 'lambda_l1': 4.163426089960941, 'lambda_l2': 4.401714618094852}. Best is trial 2 with value: 0.90931875.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.254625


[I 2025-07-08 16:38:46,236] Trial 3 finished with value: 0.8999333333333334 and parameters: {'learning_rate': 0.16077702374332473, 'max_depth': 3, 'num_leaves': 233, 'min_data_in_leaf': 677, 'subsample': 0.8234769362436538, 'colsample_bytree': 0.6586455105153587, 'feature_fraction': 0.6216834562200153, 'lambda_l1': 2.566316156957007, 'lambda_l2': 0.6016734899696385}. Best is trial 2 with value: 0.90931875.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.28466


[I 2025-07-08 16:43:35,752] Trial 4 finished with value: 0.8886645833333333 and parameters: {'learning_rate': 0.012335320121332628, 'max_depth': 4, 'num_leaves': 72, 'min_data_in_leaf': 379, 'subsample': 0.6230622393039723, 'colsample_bytree': 0.7400207766709231, 'feature_fraction': 0.8822616062290418, 'lambda_l1': 3.1154568410917127, 'lambda_l2': 1.4102220782273212}. Best is trial 2 with value: 0.90931875.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.276393


[I 2025-07-08 16:47:37,406] Trial 5 finished with value: 0.8913208333333333 and parameters: {'learning_rate': 0.030484733218012962, 'max_depth': 3, 'num_leaves': 234, 'min_data_in_leaf': 501, 'subsample': 0.9695955997987118, 'colsample_bytree': 0.6601631042867661, 'feature_fraction': 0.7819064290751117, 'lambda_l1': 4.6640495923809215, 'lambda_l2': 1.7523245753782501}. Best is trial 2 with value: 0.90931875.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.238254


[I 2025-07-08 16:53:17,763] Trial 6 finished with value: 0.9053395833333333 and parameters: {'learning_rate': 0.13803849243142358, 'max_depth': 5, 'num_leaves': 148, 'min_data_in_leaf': 502, 'subsample': 0.7546211752915659, 'colsample_bytree': 0.6556958024552347, 'feature_fraction': 0.620055963917556, 'lambda_l1': 1.6447166352103437, 'lambda_l2': 3.0660206531463348}. Best is trial 2 with value: 0.90931875.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.245202


[I 2025-07-08 16:58:52,669] Trial 7 finished with value: 0.9029354166666667 and parameters: {'learning_rate': 0.07352225779271027, 'max_depth': 5, 'num_leaves': 163, 'min_data_in_leaf': 774, 'subsample': 0.6605188447185963, 'colsample_bytree': 0.8948743161393868, 'feature_fraction': 0.8158790968186029, 'lambda_l1': 0.08468750649202217, 'lambda_l2': 4.797792679962195}. Best is trial 2 with value: 0.90931875.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.2486


[I 2025-07-08 17:03:17,176] Trial 8 finished with value: 0.9016270833333333 and parameters: {'learning_rate': 0.10415480970816081, 'max_depth': 4, 'num_leaves': 103, 'min_data_in_leaf': 726, 'subsample': 0.663340035893479, 'colsample_bytree': 0.7371621903262968, 'feature_fraction': 0.7631966198267467, 'lambda_l1': 4.412073893782353, 'lambda_l2': 1.2587762926777142}. Best is trial 2 with value: 0.90931875.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.229415


[I 2025-07-08 17:10:49,444] Trial 9 finished with value: 0.91695 and parameters: {'learning_rate': 0.18444096092491774, 'max_depth': 8, 'num_leaves': 187, 'min_data_in_leaf': 574, 'subsample': 0.8691228891130385, 'colsample_bytree': 0.6370740133128872, 'feature_fraction': 0.804983162692035, 'lambda_l1': 2.069296392622274, 'lambda_l2': 0.06603823633119954}. Best is trial 9 with value: 0.91695.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.23883


[I 2025-07-08 17:16:06,558] Trial 10 finished with value: 0.90748125 and parameters: {'learning_rate': 0.18629184860851794, 'max_depth': 9, 'num_leaves': 23, 'min_data_in_leaf': 954, 'subsample': 0.8695422130915562, 'colsample_bytree': 0.6018958974379303, 'feature_fraction': 0.9583552128085203, 'lambda_l1': 0.8019347376096158, 'lambda_l2': 2.236323719855494}. Best is trial 9 with value: 0.91695.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.208779


[I 2025-07-08 17:23:48,963] Trial 11 finished with value: 0.91755625 and parameters: {'learning_rate': 0.12118794825804258, 'max_depth': 8, 'num_leaves': 187, 'min_data_in_leaf': 158, 'subsample': 0.7993232489658922, 'colsample_bytree': 0.848335381239864, 'feature_fraction': 0.7179140058975393, 'lambda_l1': 3.304483751231339, 'lambda_l2': 4.490487563766552}. Best is trial 11 with value: 0.91755625.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.212086


[I 2025-07-08 17:32:49,646] Trial 12 finished with value: 0.9251604166666667 and parameters: {'learning_rate': 0.1994756065112091, 'max_depth': 9, 'num_leaves': 183, 'min_data_in_leaf': 180, 'subsample': 0.757262749926822, 'colsample_bytree': 0.8254043496029254, 'feature_fraction': 0.706080869021128, 'lambda_l1': 3.490499587490567, 'lambda_l2': 0.023128215968533095}. Best is trial 12 with value: 0.9251604166666667.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.1871


[I 2025-07-08 17:41:01,563] Trial 13 finished with value: 0.92821875 and parameters: {'learning_rate': 0.19999168488425814, 'max_depth': 10, 'num_leaves': 191, 'min_data_in_leaf': 137, 'subsample': 0.7620767275689855, 'colsample_bytree': 0.8383679745027931, 'feature_fraction': 0.7061333827416832, 'lambda_l1': 3.3562440940365557, 'lambda_l2': 3.7503909863427727}. Best is trial 13 with value: 0.92821875.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.187452


[I 2025-07-08 17:49:12,281] Trial 14 finished with value: 0.9284520833333333 and parameters: {'learning_rate': 0.19667473997679916, 'max_depth': 10, 'num_leaves': 194, 'min_data_in_leaf': 115, 'subsample': 0.7506404220591687, 'colsample_bytree': 0.8084048370488958, 'feature_fraction': 0.6991441067821852, 'lambda_l1': 3.6444121593475796, 'lambda_l2': 3.5623594199166893}. Best is trial 14 with value: 0.9284520833333333.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.187897


[I 2025-07-08 17:57:43,954] Trial 15 finished with value: 0.9271145833333333 and parameters: {'learning_rate': 0.16351709632030567, 'max_depth': 10, 'num_leaves': 207, 'min_data_in_leaf': 100, 'subsample': 0.7483135688283566, 'colsample_bytree': 0.7741716482881111, 'feature_fraction': 0.6853535873156255, 'lambda_l1': 3.6770315735836223, 'lambda_l2': 3.7520005975193484}. Best is trial 14 with value: 0.9284520833333333.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.20602


[W 2025-07-08 18:05:36,960] Trial 16 failed with parameters: {'learning_rate': 0.16337507858713318, 'max_depth': 10, 'num_leaves': 133, 'min_data_in_leaf': 312, 'subsample': 0.7089774060053964, 'colsample_bytree': 0.9030156882727944, 'feature_fraction': 0.6685739468973757, 'lambda_l1': 2.650283465028716, 'lambda_l2': 2.7814353724210132} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\win11\anaconda3\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\win11\AppData\Local\Temp\ipykernel_28308\2661718865.py", line 48, in objective
    preds = model.predict(X_val)
            ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\win11\anaconda3\Lib\site-packages\lightgbm\sklearn.py", line 1597, in predict
    result = self.predict_proba(
             ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\win11\anaconda3\Lib\site-packages\lightgbm\sklearn.py", line 1

KeyboardInterrupt: 

In [52]:
# 9. 결과 출력
print("🎯 Best Accuracy:", study.best_value)
print("✅ Best Parameters:")
print(study.best_params)

🎯 Best Accuracy: 0.9284520833333333
✅ Best Parameters:
{'learning_rate': 0.19667473997679916, 'max_depth': 10, 'num_leaves': 194, 'min_data_in_leaf': 115, 'subsample': 0.7506404220591687, 'colsample_bytree': 0.8084048370488958, 'feature_fraction': 0.6991441067821852, 'lambda_l1': 3.6444121593475796, 'lambda_l2': 3.5623594199166893}


In [56]:
# 10. 전체 데이터로 재학습 (test 예측용)
best_model = LGBMClassifier(
    **study.best_params,
    objective='multiclass',
    num_class=5,
    metric='multi_logloss',
    n_estimators=500,
    random_state=42
)
best_model.fit(X, y)

In [58]:
df2 = pd.read_parquet(f'{drive_folder}/test_12월_통합.parquet')

In [60]:
# 1. 학습에 사용된 피처
train_features = best_model.feature_name_

In [62]:
# 2. test 데이터에서 ID/기준년월 제거
X_test = df2.drop(columns=['ID', '기준년월'], errors='ignore')

In [64]:
# 3. 누락된 컬럼 0으로 추가
missing_cols = [col for col in train_features if col not in X_test.columns]
for col in missing_cols:
    X_test[col] = 0

In [66]:
# 4. 순서 맞추기
X_test = X_test[train_features]

In [68]:
# 예측
test_preds = best_model.predict(X_test)

In [70]:
# 숫자 → 문자 라벨 매핑
label_map = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'}
df2['Segment'] = pd.Series(test_preds).map(label_map)

In [72]:
# 9. 최종 결과만 저장
df2[['ID', 'Segment']].to_parquet(f'{drive_folder}/LGBM_Optuna_segment_pred.parquet', index=False)
print("✅ 저장 완료: LGBM_Optuna_segment_pred.parquet (ID, Segment)")

✅ 저장 완료: LGBM_Optuna_segment_pred.parquet (ID, Segment)


In [74]:
df2 = pd.read_parquet(f'{drive_folder}/LGBM_Optuna_segment_pred.parquet')
df2

Unnamed: 0,ID,Segment
0,TEST_00000,E
1,TEST_00001,E
2,TEST_00002,E
3,TEST_00003,E
4,TEST_00004,E
...,...,...
99995,TEST_99995,E
99996,TEST_99996,E
99997,TEST_99997,E
99998,TEST_99998,C
