In [1]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, classification_report, make_scorer
import joblib

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 데이터 로드
train_processed = pd.read_csv("./processed_train.csv")
test_processed = pd.read_csv("./processed_test.csv")

In [3]:
# Feature & Target 분리
X = train_processed.drop(columns=['허위매물여부', 'ID'])  # Feature
y = train_processed['허위매물여부']  # Target
X_test = test_processed.drop(columns=['ID'])  # Test Feature

In [4]:
# === 1. Optuna를 활용한 하이퍼파라미터 최적화 ===
def objective(trial):
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }
    model = XGBClassifier(**param, use_label_encoder=False, eval_metric="mlogloss", random_state=42)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=skf, scoring=make_scorer(f1_score, average='macro'))
    return np.mean(scores)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)
print("Best Hyperparameters:", study.best_params)

best_params = study.best_params
model = XGBClassifier(**best_params, use_label_encoder=False, eval_metric="mlogloss", random_state=42)


[I 2025-02-22 17:09:18,736] A new study created in memory with name: no-name-2b279b85-3445-4ada-866b-d37fb1dac93f
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-02-22 17:09:25,166] Trial 0 finished with value: 0.9115413118815183 and parameters: {'n_estimators': 500, 'learning_rate': 0.1895011467472017, 'max_depth': 8, 'subsample': 0.789953982705476, 'colsample_bytree': 0.7867590530044832}. Best is trial 0 with value: 0.9115413118815183.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not 

Best Hyperparameters: {'n_estimators': 700, 'learning_rate': 0.04986719114621643, 'max_depth': 10, 'subsample': 0.7965343904624916, 'colsample_bytree': 0.9162911857836075}


In [5]:
# === 2. K-Fold Cross Validation 적용 ===
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=skf, scoring=make_scorer(f1_score, average='macro'))
print(f"Cross-Validation Macro F1 Scores: {scores}")
print(f"Mean Macro F1 Score: {np.mean(scores):.4f}")

# 최적 모델 학습 및 저장
model.fit(X, y)
joblib.dump(model, "./best_model.pkl")
print("최적 모델 저장 완료!")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-Validation Macro F1 Scores: [0.92722692 0.90350741 0.93122807 0.90479686 0.90783699]
Mean Macro F1 Score: 0.9149


Parameters: { "use_label_encoder" } are not used.



최적 모델 저장 완료!


In [6]:
# === 3. 테스트 데이터 예측 ===
test_features = test_processed.drop(columns=['ID'], errors='ignore')
test_features = test_features[X.columns]  # 훈련 데이터 컬럼과 동일하게 맞춤

test_predictions = model.predict(test_features)
test_submission = pd.DataFrame({"ID": test_processed['ID'], "허위매물여부": test_predictions})
test_submission.to_csv("./XGBoost_submission.csv", index=False)

print("모델 학습 및 평가 완료! 제출 파일 저장 완료!")

모델 학습 및 평가 완료! 제출 파일 저장 완료!
