## Model 학습

In [36]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [37]:
# 1. Load the preprocessed train / test tables
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ("processed_train.csv", "processed_test.csv")
)

# 2. Separate the target from the input features
y = train["가격(백만원)"]  # target variable
X = train.drop(columns=["ID", y.name])  # inputs: every column except the ID and the target

# 3. Hold out 20% of the rows for validation (fixed seed keeps the split repeatable)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [38]:
# 4. Hyperparameter tuning with Optuna
def objective(trial):
    """Fit an XGBoost regressor with trial-sampled settings and score it.

    Reads the module-level ``X_train`` / ``y_train`` for fitting and
    ``X_valid`` / ``y_valid`` for scoring.

    Args:
        trial: Optuna trial used to sample the hyperparameter values.

    Returns:
        RMSE of the fitted model's predictions on the validation split.
    """
    # Sampled settings; the suggest calls run in this exact order.
    sampled = dict(
        n_estimators=trial.suggest_int("n_estimators", 100, 1000, step=50),
        max_depth=trial.suggest_int("max_depth", 3, 15),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
        reg_lambda=trial.suggest_float("reg_lambda", 0.1, 10.0),
        reg_alpha=trial.suggest_float("reg_alpha", 0.1, 10.0),
    )
    # Settings shared by every trial: regression loss, RMSE metric, fixed seed.
    fixed = dict(
        objective="reg:squarederror",
        eval_metric="rmse",
        random_state=42,
    )

    regressor = xgb.XGBRegressor(**sampled, **fixed)

    # Plain fit: no eval_set is passed, so there is no early stopping.
    regressor.fit(X_train, y_train)

    valid_mse = mean_squared_error(y_valid, regressor.predict(X_valid))
    return np.sqrt(valid_mse)

In [39]:
# 5. Run the Optuna search for the best hyperparameters.
# Without an explicit sampler, create_study() seeds its sampler randomly on each
# run, so the reported best parameters (and everything trained from them) would
# differ between "Restart & Run All" executions. Seeding the TPE sampler
# (Optuna's default for single-objective studies) makes the search repeatable.
study = optuna.create_study(
    direction="minimize",  # the objective returns validation RMSE; lower is better
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)  # run 30 trials

# Report the best hyperparameters found
best_params = study.best_params
print("✅ Best Hyperparameters:", best_params)

[I 2025-01-31 15:16:52,217] A new study created in memory with name: no-name-538bc379-dbbe-4cf9-9aee-a0cb32ae1839
[I 2025-01-31 15:16:54,569] Trial 0 finished with value: 1.681689089508803 and parameters: {'n_estimators': 750, 'max_depth': 15, 'learning_rate': 0.01911905950563548, 'subsample': 0.798604446860558, 'colsample_bytree': 0.5387701922409799, 'reg_lambda': 7.3292402051526615, 'reg_alpha': 6.048731707950487}. Best is trial 0 with value: 1.681689089508803.
[I 2025-01-31 15:16:54,722] Trial 1 finished with value: 10.355702591784665 and parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.012940236345443546, 'subsample': 0.9442519861094846, 'colsample_bytree': 0.9230802344028752, 'reg_lambda': 0.16798749926694265, 'reg_alpha': 5.9595299663481125}. Best is trial 0 with value: 1.681689089508803.
[I 2025-01-31 15:16:55,544] Trial 2 finished with value: 1.6164462712371774 and parameters: {'n_estimators': 750, 'max_depth': 5, 'learning_rate': 0.012917975841499632, 'subs

✅ Best Hyperparameters: {'n_estimators': 750, 'max_depth': 8, 'learning_rate': 0.018860104948008546, 'subsample': 0.8376120157293532, 'colsample_bytree': 0.8496722701475468, 'reg_lambda': 7.707175013795221, 'reg_alpha': 6.27960219205632}


In [40]:
# 6. Train the final XGBoost model with the tuned hyperparameters
# (same fixed seed as the tuning trials; fitted on the training split only).
best_model = xgb.XGBRegressor(random_state=42, **best_params)
best_model.fit(X=X_train, y=y_train)

In [41]:
# 7. Predict on the validation split and report RMSE.
# NOTE: X_valid / y_valid is the same split the Optuna objective scores trials on,
# so this RMSE is measured on data that was also used for model selection.
y_pred = best_model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=y_pred))
print("✅ 최적화된 XGBoost Validation RMSE:", rmse)

✅ 최적화된 XGBoost Validation RMSE: 1.4033061352674312


In [42]:
# 8. Predict on the test data
# Only the ID column is removed; assumes the remaining test columns match the
# training features — TODO confirm.
test_features = test.drop("ID", axis="columns")
test_predictions = best_model.predict(test_features)

In [43]:
# 9. Build the submission file
# Predictions are attached by position; assumes sample_submission.csv rows are in
# the same order as processed_test.csv — TODO confirm.
submission = pd.read_csv("sample_submission.csv").assign(
    **{"가격(백만원)": test_predictions}
)

# Save the submission file (without the index column)
submission.to_csv("submission.csv", index=False)
print("✅ 제출 파일 생성 완료! → submission.csv")

✅ 제출 파일 생성 완료! → submission.csv
