In [38]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 경고 뜨지 않게 설정
import warnings
warnings.filterwarnings('ignore')

# 그래프 설정
sns.set()

# 그래프 기본 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 학습 모델 저장을 위한 라이브러리
import pickle

import matplotlib as mpl
import matplotlib.font_manager as fm

In [2]:
df1 = pd.read_parquet('train_all_high_corr.parquet')
df2 = pd.read_parquet('train_segment.parquet')

In [3]:
df3 = pd.concat([df1, df2['Segment']], axis=1)

In [4]:
df3.drop(columns=['ID'], inplace=True)

In [5]:
# 0. X, y 분리
#─────────────────────────────────────────────
X = df3.drop(columns=['Segment'])
y = df3['Segment']

#─────────────────────────────────────────────
# 1. 학습/검증 세트로 먼저 나누기 (Stratified)
#─────────────────────────────────────────────
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,            # 80 : 20
    stratify=y,               # 클래스 비율 유지
    random_state=42
)

print('Train class distribution:\n', y_train.value_counts(normalize=True))
print('Val   class distribution:\n', y_val.value_counts(normalize=True))

Train class distribution:
 Segment
E    0.800855
D    0.145518
C    0.053163
A    0.000405
B    0.000060
Name: proportion, dtype: float64
Val   class distribution:
 Segment
E    0.800856
D    0.145517
C    0.053163
A    0.000404
B    0.000060
Name: proportion, dtype: float64


In [6]:
# 2. 학습 세트를 두 개 그룹으로 분할
#─────────────────────────────────────────────
group_AB  = ['A', 'B']
group_CDE = ['C', 'D', 'E']

# ── A, B
mask_AB_train = y_train.isin(group_AB)
mask_AB_val   = y_val.isin(group_AB)

X_train_AB, y_train_AB = X_train[mask_AB_train], y_train[mask_AB_train]
X_val_AB,   y_val_AB   = X_val[mask_AB_val],   y_val[mask_AB_val]

# ── C, D, E
mask_CDE_train = y_train.isin(group_CDE)
mask_CDE_val   = y_val.isin(group_CDE)

X_train_CDE, y_train_CDE = X_train[mask_CDE_train], y_train[mask_CDE_train]
X_val_CDE,   y_val_CDE   = X_val[mask_CDE_val],   y_val[mask_CDE_val]

In [7]:
# ──────────────────────────────────────
# 0. 라이브러리
# ──────────────────────────────────────
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# ──────────────────────────────────────
# 1. A, B 모델 (Binary)
# ──────────────────────────────────────
le_ab = LabelEncoder()
y_train_AB_enc = le_ab.fit_transform(y_train_AB)   # 'A','B' → 0,1
y_val_AB_enc   = le_ab.transform(y_val_AB)

model_AB = lgb.LGBMClassifier(
    objective='binary',
    class_weight='balanced',   # 자동으로 소수 클래스에 가중치
    n_estimators=1000,
    learning_rate=0.05,
    random_state=42
)

model_AB.fit(
    X_train_AB, y_train_AB_enc,
    eval_set=[(X_val_AB, y_val_AB_enc)],
    eval_metric='binary_logloss',
    callbacks=[lgb.early_stopping(100, verbose=False)]
)

# ──────────────────────────────────────
# 2. C, D, E 모델 (Multi-class)
# ──────────────────────────────────────
le_cde = LabelEncoder()
y_train_CDE_enc = le_cde.fit_transform(y_train_CDE)   # 'C','D','E' → 0,1,2
y_val_CDE_enc   = le_cde.transform(y_val_CDE)

model_CDE = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    class_weight='balanced',
    n_estimators=1500,
    learning_rate=0.05,
    random_state=42
)

model_CDE.fit(
    X_train_CDE, y_train_CDE_enc,
    eval_set=[(X_val_CDE, y_val_CDE_enc)],
    eval_metric='multi_logloss',
    callbacks=[lgb.early_stopping(100, verbose=False)]
)

# ──────────────────────────────────────
# 3. 검증 세트 성능 확인
# ──────────────────────────────────────
print("── Group AB ──")
y_pred_AB = le_ab.inverse_transform(model_AB.predict(X_val_AB))
print(classification_report(y_val_AB, y_pred_AB))
print(confusion_matrix(y_val_AB, y_pred_AB))

print("\n── Group CDE ──")
y_pred_CDE = le_cde.inverse_transform(model_CDE.predict(X_val_CDE))
print(classification_report(y_val_CDE, y_pred_CDE))
print(confusion_matrix(y_val_CDE, y_pred_CDE))

[LightGBM] [Info] Number of positive: 115, number of negative: 778
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001991 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9701
[LightGBM] [Info] Number of data points in the train set: 893, number of used features: 85
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.135603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16131
[LightGBM] [Info] Number of data points in the train set: 1919107, number of used features: 90
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098

### C,D,E 모델 (SMOTE + 가중치)

In [9]:
# 0) 준비
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# 숫자 라벨 매핑
idx_C, idx_D, idx_E = le_cde.transform(['C','D','E'])   # 예: 0,1,2
n_E = (y_train_CDE_enc == idx_E).sum()                  # 다수 클래스(E) 개수
target_n = int(n_E * 0.9)                               # E의 90 %까지 증폭

# 1) SMOTE : C·D 를 target_n 개수까지 오버샘플링
sm = SMOTE(
    sampling_strategy={idx_C: target_n, idx_D: target_n},
    random_state=42, k_neighbors=5
)
X_res, y_res = sm.fit_resample(X_train_CDE, y_train_CDE_enc)

# 2) 클래스 가중치 : D 쪽을 한 번 더 강조
cw = {idx_C: 1.0, idx_D: 2.5, idx_E: 0.5}

model_CDE_tuned = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    class_weight=cw,
    n_estimators=1800,
    learning_rate=0.04,
    num_leaves=95,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.9,
    random_state=42
)

model_CDE_tuned.fit(
    X_res, y_res,
    eval_set=[(X_val_CDE, y_val_CDE_enc)],
    eval_metric='multi_logloss',
    callbacks=[lgb.early_stopping(120, verbose=False)]
)

print("\n── CDE (SMOTE + 가중치) ──")
pred = model_CDE_tuned.predict(X_val_CDE)
print(classification_report(y_val_CDE_enc, pred, target_names=['C','D','E']))
print(confusion_matrix(y_val_CDE_enc, pred))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.322415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16666
[LightGBM] [Info] Number of data points in the train set: 4305393, number of used features: 90
[LightGBM] [Info] Start training from score -1.400088
[LightGBM] [Info] Start training from score -0.483797
[LightGBM] [Info] Start training from score -1.987874

── CDE (SMOTE + 가중치) ──
              precision    recall  f1-score   support

           C       0.83      0.73      0.77     25518
           D       0.52      0.89      0.65     69848
           E       0.98      0.86      0.92    384411

    accuracy                           0.86    479777
   macro avg       0.78      0.83      0.78    479777
weighted avg       0.91      0.86      0.87    479777

[[ 18589   6314    615]
 [  2185  62263   5400]
 [  1701  51931 330779]]


### Optuna 적용

In [11]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install -U lightgbm

Note: you may need to restart the kernel to use updated packages.


In [40]:
def objective(trial):
    params = {
        "objective": "multiclass",
        "num_class": 3,
        "metric": "multi_logloss",          # ★ 지표 지정
        "verbosity": -1,
        "learning_rate": trial.suggest_float("lr", 0.02, 0.15, log=True),
        "num_leaves": trial.suggest_int("leaves", 31, 191, step=16),
        "min_child_samples": trial.suggest_int("mcs", 10, 60),
        "feature_fraction": trial.suggest_float("ff", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bf", 0.6, 1.0),
        "lambda_l2": trial.suggest_float("l2", 0.0, 5.0),
        "class_weight": {idx_C:1.0, idx_D:2.0, idx_E:0.5},
        "seed": 42,
    }

    cv = StratifiedKFold(4, shuffle=True, random_state=42)
    f1s = []
    for tr_idx, va_idx in cv.split(X_res, y_res):
        mdl = lgb.LGBMClassifier(**params, n_estimators=500)
        mdl.fit(
            X_res.iloc[tr_idx], y_res[tr_idx],
            eval_set=[(X_res.iloc[va_idx], y_res[va_idx])],
            eval_metric='multi_logloss',            # ★ fit에도 전달 OK
            callbacks=[lgb.early_stopping(30, verbose=False)]
        )
        preds = mdl.predict(X_res.iloc[va_idx])
        f1s.append(f1_score(y_res[va_idx], preds, average="macro"))
    return np.mean(f1s)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)   # 빠르게 10회만
print("Best params:", study.best_params)

[I 2025-07-10 08:25:47,235] A new study created in memory with name: no-name-30340b20-6242-4206-b403-2f914cb1efc1
[I 2025-07-10 08:43:53,851] Trial 0 finished with value: 0.8278086859106778 and parameters: {'lr': 0.0566875982011417, 'leaves': 31, 'mcs': 27, 'ff': 0.880842589464308, 'bf': 0.8034440659430521, 'l2': 1.5931584886576071}. Best is trial 0 with value: 0.8278086859106778.


Best params: {'lr': 0.0566875982011417, 'leaves': 31, 'mcs': 27, 'ff': 0.880842589464308, 'bf': 0.8034440659430521, 'l2': 1.5931584886576071}


### Optuna로 찾은 최적의 파라미터

In [42]:
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# ───────────────────────────────────────────────
# 1. 레이블 인코딩
# ───────────────────────────────────────────────
le_cde = LabelEncoder()
y_train_CDE_enc = le_cde.fit_transform(y_train_CDE)
y_val_CDE_enc   = le_cde.transform(y_val_CDE)

# ───────────────────────────────────────────────
# 2. 모델 정의 (Optuna 최적 파라미터 적용)
# ───────────────────────────────────────────────
model_CDE = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    learning_rate=0.0567,
    num_leaves=31,
    min_child_samples=27,
    feature_fraction=0.8808,
    bagging_fraction=0.8034,
    reg_lambda=1.5931,
    class_weight='balanced',
    n_estimators=1500,
    random_state=42
)

# ───────────────────────────────────────────────
# 3. 학습
# ───────────────────────────────────────────────
model_CDE.fit(
    X_train_CDE, y_train_CDE_enc,
    eval_set=[(X_val_CDE, y_val_CDE_enc)],
    eval_metric='multi_logloss',
    callbacks=[lgb.early_stopping(100, verbose=False)]
)

# ───────────────────────────────────────────────
# 4. 검증 성능 평가
# ───────────────────────────────────────────────
y_pred_CDE = model_CDE.predict(X_val_CDE)
y_pred_CDE_label = le_cde.inverse_transform(y_pred_CDE)

print("── CDE 모델 평가 (Optuna 튜닝 적용) ──")
print(classification_report(y_val_CDE, y_pred_CDE_label))
print(confusion_matrix(y_val_CDE, y_pred_CDE_label))


── CDE 모델 평가 (Optuna 튜닝 적용) ──
              precision    recall  f1-score   support

           C       0.62      0.84      0.71     25518
           D       0.53      0.78      0.63     69848
           E       0.98      0.87      0.92    384411

    accuracy                           0.86    479777
   macro avg       0.71      0.83      0.76    479777
weighted avg       0.89      0.86      0.87    479777

[[ 21342   3638    538]
 [  8620  54245   6983]
 [  4619  43610 336182]]


### GATE 모델 학습

In [3]:
# 1. 라벨 준비
y_train_gate = y_train.apply(lambda x: 'AB' if x in ['A', 'B'] else 'CDE')
y_val_gate   = y_val.apply(lambda x: 'AB' if x in ['A', 'B'] else 'CDE')

# 2. 인코딩
from sklearn.preprocessing import LabelEncoder
le_gate = LabelEncoder()
y_train_gate_enc = le_gate.fit_transform(y_train_gate)  # 'AB'→0, 'CDE'→1 또는 반대일 수 있음
y_val_gate_enc   = le_gate.transform(y_val_gate)

# 3. LightGBM 분류기
import lightgbm as lgb
gate_model = lgb.LGBMClassifier(
    objective='binary',
    n_estimators=1000,
    learning_rate=0.05,
    class_weight='balanced',
    random_state=42
)

# 4. 학습
gate_model.fit(
    X_train, y_train_gate_enc,
    eval_set=[(X_val, y_val_gate_enc)],
    eval_metric='binary_logloss',
    callbacks=[lgb.early_stopping(100, verbose=False)]
)

# 5. 확인 (선택)
from sklearn.metrics import classification_report
y_pred_gate = le_gate.inverse_transform(gate_model.predict(X_val))
print("=== Gate Model 평가 (AB vs CDE) ===")
print(classification_report(y_val_gate, y_pred_gate))

NameError: name 'y_train' is not defined

### GATE 모델 적용

In [5]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# 1) 게이트 확률
gate_proba = gate_model.predict_proba(X_val)

# 클래스 순서 확인 후 AB의 인덱스 가져오기
ab_index = np.where(le_gate.classes_ == 'AB')[0][0]
p_ab_gate = gate_proba[:, ab_index]
p_cde_gate = 1 - p_ab_gate

# 서브모델 확률
proba_ab   = model_AB.predict_proba(X_val)
proba_cde  = model_CDE.predict_proba(X_val)

# 클래스 인덱스 매핑
idx_A = le_ab.transform(['A'])[0]
idx_B = le_ab.transform(['B'])[0]
idx_C = le_cde.transform(['C'])[0]
idx_D = le_cde.transform(['D'])[0]
idx_E = le_cde.transform(['E'])[0]

# 조건부 확률 결합
p_final = np.column_stack([
    proba_ab[:, idx_A] * p_ab_gate,
    proba_ab[:, idx_B] * p_ab_gate,
    proba_cde[:, idx_C] * p_cde_gate,
    proba_cde[:, idx_D] * p_cde_gate,
    proba_cde[:, idx_E] * p_cde_gate
])
p_final /= p_final.sum(axis=1, keepdims=True)  # 정규화

# 예측 & 평가
label_order = np.array(['A', 'B', 'C', 'D', 'E'])
y_pred = label_order[p_final.argmax(axis=1)]

print("=== 최종 5-클래스 평가 (게이트 + 서브모델) ===")
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

NameError: name 'gate_model' is not defined

### 튜닝한 CDE 모델 사용한 GATE 모델 적용

In [65]:
# 튜닝된 CDE 모델 적용
model_CDE = model_CDE_tuned  # ★ 중요: 여기서 기존 모델을 튜닝된 걸로 교체!

# ─────────────────────────────────────────────
# 1. 확률 가져오기 & 결합
# ─────────────────────────────────────────────
import numpy as np
from sklearn.metrics import classification_report

# 게이트 확률
gate_proba = gate_model.predict_proba(X_val)

# 클래스 순서 확인 후 AB의 인덱스 가져오기
ab_index = np.where(le_gate.classes_ == 'AB')[0][0]
p_ab_gate = gate_proba[:, ab_index]
p_cde_gate = 1 - p_ab_gate


# 서브모델 확률
proba_ab  = model_AB.predict_proba(X_val)           # (n, 2)
proba_cde = model_CDE.predict_proba(X_val)          # (n, 3)

# 클래스 인덱스 매칭
idx_A = le_ab.transform(['A'])[0]
idx_B = le_ab.transform(['B'])[0]
idx_C = le_cde.transform(['C'])[0]
idx_D = le_cde.transform(['D'])[0]
idx_E = le_cde.transform(['E'])[0]

# 조건부 확률 결합
p_final = np.column_stack([
    proba_ab[:, idx_A] * p_ab_gate,
    proba_ab[:, idx_B] * p_ab_gate,
    proba_cde[:, idx_C] * p_cde_gate,
    proba_cde[:, idx_D] * p_cde_gate,
    proba_cde[:, idx_E] * p_cde_gate
])
p_final /= p_final.sum(axis=1, keepdims=True)

# 최종 예측
label_order = np.array(['A', 'B', 'C', 'D', 'E'])
y_pred = label_order[p_final.argmax(axis=1)]

print("✅ [최종 평가] 게이트 + 튜닝된 CDE 모델")
print(classification_report(y_val, y_pred))

✅ [최종 평가] 게이트 + 튜닝된 CDE 모델
              precision    recall  f1-score   support

           A       0.88      0.95      0.91       194
           B       0.93      0.93      0.93        29
           C       0.83      0.73      0.77     25518
           D       0.52      0.89      0.65     69848
           E       0.98      0.86      0.92    384411

    accuracy                           0.86    480000
   macro avg       0.83      0.87      0.84    480000
weighted avg       0.91      0.86      0.87    480000



### C,D 서브모델 학습

In [55]:
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier

# 1) C vs D 데이터 추출
cd_mask = y_train_CDE.isin(['C', 'D'])
X_train_CD = X_train_CDE[cd_mask]
y_train_CD = y_train_CDE[cd_mask]

# 2) Label 인코딩
le_cd = LabelEncoder()
y_train_CD_enc = le_cd.fit_transform(y_train_CD)  # 'C'→0, 'D'→1

# 3) SMOTE 적용
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_CD, y_train_CD_enc)

# 4) 모델 학습
model_CD = LGBMClassifier(
    objective='binary',
    class_weight='balanced',
    n_estimators=1000,
    learning_rate=0.05,
    random_state=42
)

model_CD.fit(X_res, y_res)


In [59]:
# 기존 CDE 확률
proba_cde = model_CDE.predict_proba(X_val)

# 1) 게이트 확률
gate_proba = gate_model.predict_proba(X_val)

# 클래스 순서 확인 후 AB의 인덱스 가져오기
ab_index = np.where(le_gate.classes_ == 'AB')[0][0]
p_ab_gate = gate_proba[:, ab_index]
p_cde_gate = 1 - p_ab_gate

# 기존 서브 확률
idx_C = le_cde.transform(['C'])[0]
idx_D = le_cde.transform(['D'])[0]
idx_E = le_cde.transform(['E'])[0]

# C+D 총합 확률
p_c_or_d = proba_cde[:, idx_C] + proba_cde[:, idx_D]

# CD 서브모델 확률
proba_cd = model_CD.predict_proba(X_val)

idx_c = le_cd.transform(['C'])[0]
idx_d = le_cd.transform(['D'])[0]

# C, D 보정 확률
p_c_final = p_cde_gate * p_c_or_d * proba_cd[:, idx_c]
p_d_final = p_cde_gate * p_c_or_d * proba_cd[:, idx_d]
p_e_final = p_cde_gate * proba_cde[:, idx_E]

# AB 확률
proba_ab = model_AB.predict_proba(X_val)
idx_A = le_ab.transform(['A'])[0]
idx_B = le_ab.transform(['B'])[0]

p_a_final = p_ab_gate * proba_ab[:, idx_A]
p_b_final = p_ab_gate * proba_ab[:, idx_B]

# 결합
p_final = np.column_stack([p_a_final, p_b_final, p_c_final, p_d_final, p_e_final])
p_final /= p_final.sum(axis=1, keepdims=True)

# 예측
label_order = np.array(['A', 'B', 'C', 'D', 'E'])
y_pred = label_order[p_final.argmax(axis=1)]

print("=== 최종 5-클래스 평가 (CD 보정 포함) ===")
print(classification_report(y_val, y_pred))

=== 최종 5-클래스 평가 (CD 보정 포함) ===
              precision    recall  f1-score   support

           A       0.88      0.95      0.91       194
           B       0.93      0.93      0.93        29
           C       0.67      0.72      0.69     25518
           D       0.52      0.81      0.63     69848
           E       0.98      0.87      0.92    384411

    accuracy                           0.86    480000
   macro avg       0.79      0.86      0.82    480000
weighted avg       0.90      0.86      0.87    480000



In [67]:
import pickle

# 모델 저장
with open('방법1/model_AB_lgbm.dat', 'wb') as f:
    pickle.dump(model_AB, f)

with open('방법1/model_CDE_tuned_lgbm.dat', 'wb') as f:
    pickle.dump(model_CDE_tuned, f)

with open('방법1/gate_model_lgbm.dat', 'wb') as f:
    pickle.dump(gate_model, f)

# 레이블 인코더도 저장
with open('방법1/label_encoder_AB.dat', 'wb') as f:
    pickle.dump(le_ab, f)

with open('방법1/label_encoder_CDE.dat', 'wb') as f:
    pickle.dump(le_cde, f)

In [1]:
p_ab = gate_model.predict_proba(X_test)[:, 1]
print("AB로 분류된 샘플 수:", (p_ab > 0.5).sum())
print("CDE로 분류된 샘플 수:", (p_ab <= 0.5).sum())

NameError: name 'gate_model' is not defined