In [1]:
# --- 1. 라이브러리, 경로, 로거 설정 ---

# 데이터 분석 및 처리를 위한 필수 라이브러리
import pandas as pd
import numpy as np
import random
import os, sys
from datetime import datetime                                         
from tqdm import tqdm
import warnings
import joblib
import json

# 시각화를 위한 라이브러리
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.model_selection import learning_curve

# 모델링 및 기계 학습을 위한 라이브러리
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error

# 하이퍼파라미터 최적화를 위한 라이브러리
import optuna 

# 불필요한 경고 메시지 무시
warnings.filterwarnings('ignore')
plt.style.use('ggplot')


# --- 1.1. 로거(실행 기록 로그 저장) 임포트 ---
# 사용자가 요청한 외부 logger.py 모듈을 임포합니다.
try:
    src_path = os.path.abspath(os.path.join(os.getcwd(), "../../src/log"))
    sys.path.insert(0, src_path)
    from logger import Logger
except ImportError:
    print("오류: 'logger.py'를 찾을 수 없습니다. 'src/log' 경로를 확인하세요.")
    # 간단한 대체 로거 정의
    class Logger:
        def __init__(self, log_path): print(f"대체 로거 활성화. 로그는 기록되지 않습니다.")
        def write(self, message, **kwargs): print(message)
        def start_redirect(self): pass
        def close(self): pass


# --- 1.2. 경로 및 환경 변수 설정 (사용자 요청 기반) ---
# 현재 시간 기준 년월일_시각 문자열 생성
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# 실행 로그 저장 경로 설정
LOG_DIR                         = '../../data/logs/price_prediction_7_logs'
LOG_FILENAME                    = f"price_prediction_{timestamp}.log"
LOG_PATH                        = os.path.join(LOG_DIR, LOG_FILENAME)
os.makedirs(LOG_DIR, exist_ok=True)
logger = Logger(log_path=LOG_PATH)

# 데이터 및 결과물 경로 설정
RAW_DIR                         = '../../data/processed/clean_data'
TRAIN_FILENAME                  = 'train.csv'
TEST_FILENAME                   = 'test.csv'
TRAIN_PATH                      = os.path.join(RAW_DIR, TRAIN_FILENAME)
TEST_PATH                       = os.path.join(RAW_DIR, TEST_FILENAME)

PARAMS_DIR                      = '../../data/processed/params'
PARAMS_FILENAME                 = 'best_params_7.json'
PARAMS_PATH                     = os.path.join(PARAMS_DIR, PARAMS_FILENAME)

SUBMISSION_DIR                  = '../../data/processed/submissions'
SUBMISSION_TEMPLATE_FILENAME    = 'baseline_code_sample_submission.csv'
SUBMISSION_FILENAME             = f'price_prediction_7_submission_{timestamp}.csv'
SUBMISSION_TEMPLATE_PATH        = os.path.join(SUBMISSION_DIR, SUBMISSION_TEMPLATE_FILENAME)
SUBMISSION_PATH                 = os.path.join(SUBMISSION_DIR, SUBMISSION_FILENAME)

IMAGE_DIR                       = '../../images/price_prediction_7/1'
IMAGE_FILENAME                  = 'price_prediction_7_model.pkl'
IMAGE_PATH                      = os.path.join(IMAGE_DIR, IMAGE_FILENAME)

MODEL_DIR                       = '../../model'
MODEL_FILENAME                  = 'price_prediction_7_model.pkl'
MODEL_PATH                      = os.path.join(MODEL_DIR, MODEL_FILENAME)

# 결과 저장 디렉토리 생성
os.makedirs(PARAMS_DIR, exist_ok=True)
os.makedirs(SUBMISSION_DIR, exist_ok=True)
os.makedirs(IMAGE_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

logger.start_redirect()
logger.write("="*60)
logger.write(">> [price_prediction8] 아파트 가격 예측 모델링 시작")
logger.write("="*60)

2025-07-17 00:50:07 | >> 표준 출력 및 오류를 로그 파일로 리디렉션 시작
2025-07-17 00:50:07 | >> [price_prediction8] 아파트 가격 예측 모델링 시작
2025-07-17 00:50:24 | >> [1단계 완료] 라이브러리, 경로, 로거 초기화 및 시드 고정 성공!
2025-07-17 00:50:28 | >> [2단계 시작] 데이터 로드를 시작합니다.
2025-07-17 00:50:29 | >> 원본 데이터 Shape - Train: (1110101, 32), Test: (9272, 32)
2025-07-17 00:50:30 | >> Train/Test 병합 후 Shape: (1119373, 32)
2025-07-17 00:50:30 | >> [2단계 완료] 데이터 로드 성공.
2025-07-17 00:50:34 | >> [3단계 시작] 피처 엔지니어링을 시작합니다.
2025-07-17 00:50:34 | >> 4.1. 날짜/기본/주기성 피처 생성 완료.
2025-07-17 00:50:34 | >> 4.2. 교통 가중합 피처 생성 완료.
2025-07-17 00:50:39 | >> 4.3. K-Means 군집화 피처 생성 완료.
2025-07-17 00:50:40 | >> 4.4. 시차(Lag) 및 이동평균(Rolling) 피처 생성 완료.
2025-07-17 00:50:40 | >> 4.5. 상호작용 피처 생성 완료.
2025-07-17 00:50:40 | >> 인코딩 대상 범주형 피처: ['계약일자', '자치구', '법정동', '브랜드등급']
2025-07-17 00:50:40 | >> 결측치 처리 전, NA 개수: 1179953
2025-07-17 00:50:41 | >> 결측치 처리 후, NA 개수: 0
2025-07-17 00:50:41 | >> 최종 피처 수: 47
2025-07-17 00:50:41 | >> 4.6. 범주형 인코딩 및 최종 데이터 분리 완료.
2025-07-17 00:50:41 |

[I 2025-07-17 00:51:04,951] A new study created in memory with name: no-name-253cebbd-24bd-4ea6-bad0-bc135ce337eb


2025-07-17 00:51:04 | >> [5단계 시작] Optuna 최적화를 시작합니다. (탐색 횟수: 30)
2025-07-17 00:51:04 | >> Optuna용 데이터 분할 - Train: (888080, 47), Validation: (222021, 47)


[I 2025-07-17 00:51:14,992] Trial 0 finished with value: 0.3009236043011199 and parameters: {'learning_rate': 0.0396865946088216, 'feature_fraction': 0.8863148419276745, 'bagging_fraction': 0.9557073318187208, 'bagging_freq': 1, 'num_leaves': 55, 'max_depth': 15, 'min_child_samples': 26}. Best is trial 0 with value: 0.3009236043011199.


2025-07-17 00:51:14 | Trial 0 | RMSE: 0.30092 | Params: {'learning_rate': 0.0396865946088216, 'feature_fraction': 0.8863148419276745, 'bagging_fraction': 0.9557073318187208, 'bagging_freq': 1, 'num_leaves': 55, 'max_depth': 15, 'min_child_samples': 26}


[I 2025-07-17 00:51:22,826] Trial 1 finished with value: 0.3010525719034734 and parameters: {'learning_rate': 0.026975331024314604, 'feature_fraction': 0.9355034488491221, 'bagging_fraction': 0.7901697242383773, 'bagging_freq': 7, 'num_leaves': 49, 'max_depth': 8, 'min_child_samples': 42}. Best is trial 0 with value: 0.3009236043011199.


2025-07-17 00:51:22 | Trial 1 | RMSE: 0.30105 | Params: {'learning_rate': 0.026975331024314604, 'feature_fraction': 0.9355034488491221, 'bagging_fraction': 0.7901697242383773, 'bagging_freq': 7, 'num_leaves': 49, 'max_depth': 8, 'min_child_samples': 42}


[I 2025-07-17 00:51:32,988] Trial 2 finished with value: 0.30093951325843976 and parameters: {'learning_rate': 0.022054572082914017, 'feature_fraction': 0.8706660404851607, 'bagging_fraction': 0.9815456654967585, 'bagging_freq': 2, 'num_leaves': 35, 'max_depth': 13, 'min_child_samples': 36}. Best is trial 0 with value: 0.3009236043011199.


2025-07-17 00:51:32 | Trial 2 | RMSE: 0.30094 | Params: {'learning_rate': 0.022054572082914017, 'feature_fraction': 0.8706660404851607, 'bagging_fraction': 0.9815456654967585, 'bagging_freq': 2, 'num_leaves': 35, 'max_depth': 13, 'min_child_samples': 36}


[I 2025-07-17 00:51:41,531] Trial 3 finished with value: 0.3009632363072088 and parameters: {'learning_rate': 0.029072814311615867, 'feature_fraction': 0.9320534612923851, 'bagging_fraction': 0.8100613900844122, 'bagging_freq': 3, 'num_leaves': 49, 'max_depth': 18, 'min_child_samples': 23}. Best is trial 0 with value: 0.3009236043011199.


2025-07-17 00:51:41 | Trial 3 | RMSE: 0.30096 | Params: {'learning_rate': 0.029072814311615867, 'feature_fraction': 0.9320534612923851, 'bagging_fraction': 0.8100613900844122, 'bagging_freq': 3, 'num_leaves': 49, 'max_depth': 18, 'min_child_samples': 23}


[I 2025-07-17 00:51:51,484] Trial 4 finished with value: 0.30051711051598373 and parameters: {'learning_rate': 0.047569972958855844, 'feature_fraction': 0.7462846006812045, 'bagging_fraction': 0.8896775778423445, 'bagging_freq': 6, 'num_leaves': 70, 'max_depth': 17, 'min_child_samples': 15}. Best is trial 4 with value: 0.30051711051598373.


2025-07-17 00:51:51 | Trial 4 | RMSE: 0.30052 | Params: {'learning_rate': 0.047569972958855844, 'feature_fraction': 0.7462846006812045, 'bagging_fraction': 0.8896775778423445, 'bagging_freq': 6, 'num_leaves': 70, 'max_depth': 17, 'min_child_samples': 15}


[I 2025-07-17 00:52:01,664] Trial 5 finished with value: 0.3009983903957259 and parameters: {'learning_rate': 0.027275251805191136, 'feature_fraction': 0.894790626947719, 'bagging_fraction': 0.746612380226238, 'bagging_freq': 6, 'num_leaves': 84, 'max_depth': 14, 'min_child_samples': 17}. Best is trial 4 with value: 0.30051711051598373.


2025-07-17 00:52:01 | Trial 5 | RMSE: 0.30100 | Params: {'learning_rate': 0.027275251805191136, 'feature_fraction': 0.894790626947719, 'bagging_fraction': 0.746612380226238, 'bagging_freq': 6, 'num_leaves': 84, 'max_depth': 14, 'min_child_samples': 17}


[I 2025-07-17 00:52:13,846] Trial 6 finished with value: 0.3010575511091315 and parameters: {'learning_rate': 0.038353410274342525, 'feature_fraction': 0.9536969551180067, 'bagging_fraction': 0.9067542379156972, 'bagging_freq': 1, 'num_leaves': 97, 'max_depth': 12, 'min_child_samples': 11}. Best is trial 4 with value: 0.30051711051598373.


2025-07-17 00:52:13 | Trial 6 | RMSE: 0.30106 | Params: {'learning_rate': 0.038353410274342525, 'feature_fraction': 0.9536969551180067, 'bagging_fraction': 0.9067542379156972, 'bagging_freq': 1, 'num_leaves': 97, 'max_depth': 12, 'min_child_samples': 11}


[I 2025-07-17 00:52:25,519] Trial 7 finished with value: 0.3007896255333186 and parameters: {'learning_rate': 0.03851389010547768, 'feature_fraction': 0.7858228178991106, 'bagging_fraction': 0.8089051506648363, 'bagging_freq': 3, 'num_leaves': 73, 'max_depth': 14, 'min_child_samples': 50}. Best is trial 4 with value: 0.30051711051598373.


2025-07-17 00:52:25 | Trial 7 | RMSE: 0.30079 | Params: {'learning_rate': 0.03851389010547768, 'feature_fraction': 0.7858228178991106, 'bagging_fraction': 0.8089051506648363, 'bagging_freq': 3, 'num_leaves': 73, 'max_depth': 14, 'min_child_samples': 50}


[I 2025-07-17 00:52:38,653] Trial 8 finished with value: 0.30092174511601066 and parameters: {'learning_rate': 0.022392747946764754, 'feature_fraction': 0.8710452509751077, 'bagging_fraction': 0.9669325046010785, 'bagging_freq': 1, 'num_leaves': 86, 'max_depth': 17, 'min_child_samples': 14}. Best is trial 4 with value: 0.30051711051598373.


2025-07-17 00:52:38 | Trial 8 | RMSE: 0.30092 | Params: {'learning_rate': 0.022392747946764754, 'feature_fraction': 0.8710452509751077, 'bagging_fraction': 0.9669325046010785, 'bagging_freq': 1, 'num_leaves': 86, 'max_depth': 17, 'min_child_samples': 14}


[I 2025-07-17 00:52:46,966] Trial 9 finished with value: 0.3012340044923166 and parameters: {'learning_rate': 0.041358222119844, 'feature_fraction': 0.9480873587577867, 'bagging_fraction': 0.9000012787867286, 'bagging_freq': 2, 'num_leaves': 41, 'max_depth': 13, 'min_child_samples': 50}. Best is trial 4 with value: 0.30051711051598373.


2025-07-17 00:52:46 | Trial 9 | RMSE: 0.30123 | Params: {'learning_rate': 0.041358222119844, 'feature_fraction': 0.9480873587577867, 'bagging_fraction': 0.9000012787867286, 'bagging_freq': 2, 'num_leaves': 41, 'max_depth': 13, 'min_child_samples': 50}


[I 2025-07-17 00:52:56,912] Trial 10 finished with value: 0.3004617338827768 and parameters: {'learning_rate': 0.04963909869150565, 'feature_fraction': 0.7045430381564848, 'bagging_fraction': 0.8578592091558065, 'bagging_freq': 5, 'num_leaves': 69, 'max_depth': 20, 'min_child_samples': 20}. Best is trial 10 with value: 0.3004617338827768.


2025-07-17 00:52:56 | Trial 10 | RMSE: 0.30046 | Params: {'learning_rate': 0.04963909869150565, 'feature_fraction': 0.7045430381564848, 'bagging_fraction': 0.8578592091558065, 'bagging_freq': 5, 'num_leaves': 69, 'max_depth': 20, 'min_child_samples': 20}


[I 2025-07-17 00:53:06,319] Trial 11 finished with value: 0.3004479149078333 and parameters: {'learning_rate': 0.04955777190985409, 'feature_fraction': 0.7005744962702186, 'bagging_fraction': 0.8710546547083553, 'bagging_freq': 5, 'num_leaves': 68, 'max_depth': 20, 'min_child_samples': 20}. Best is trial 11 with value: 0.3004479149078333.


2025-07-17 00:53:06 | Trial 11 | RMSE: 0.30045 | Params: {'learning_rate': 0.04955777190985409, 'feature_fraction': 0.7005744962702186, 'bagging_fraction': 0.8710546547083553, 'bagging_freq': 5, 'num_leaves': 68, 'max_depth': 20, 'min_child_samples': 20}


[I 2025-07-17 00:53:20,185] Trial 12 finished with value: 0.3003901642860133 and parameters: {'learning_rate': 0.012324243515471595, 'feature_fraction': 0.7008455924888016, 'bagging_fraction': 0.8500095937221054, 'bagging_freq': 5, 'num_leaves': 60, 'max_depth': 20, 'min_child_samples': 22}. Best is trial 12 with value: 0.3003901642860133.


2025-07-17 00:53:20 | Trial 12 | RMSE: 0.30039 | Params: {'learning_rate': 0.012324243515471595, 'feature_fraction': 0.7008455924888016, 'bagging_fraction': 0.8500095937221054, 'bagging_freq': 5, 'num_leaves': 60, 'max_depth': 20, 'min_child_samples': 22}


[I 2025-07-17 00:53:33,231] Trial 13 finished with value: 0.3006945636988546 and parameters: {'learning_rate': 0.013069244390429141, 'feature_fraction': 0.7897699884040081, 'bagging_fraction': 0.7066061397313775, 'bagging_freq': 5, 'num_leaves': 62, 'max_depth': 20, 'min_child_samples': 31}. Best is trial 12 with value: 0.3003901642860133.


2025-07-17 00:53:33 | Trial 13 | RMSE: 0.30069 | Params: {'learning_rate': 0.013069244390429141, 'feature_fraction': 0.7897699884040081, 'bagging_fraction': 0.7066061397313775, 'bagging_freq': 5, 'num_leaves': 62, 'max_depth': 20, 'min_child_samples': 31}


[I 2025-07-17 00:53:52,246] Trial 14 finished with value: 0.30045284724362425 and parameters: {'learning_rate': 0.010189539159855626, 'feature_fraction': 0.7016096375498521, 'bagging_fraction': 0.8505799331466516, 'bagging_freq': 4, 'num_leaves': 81, 'max_depth': 19, 'min_child_samples': 28}. Best is trial 12 with value: 0.3003901642860133.


2025-07-17 00:53:52 | Trial 14 | RMSE: 0.30045 | Params: {'learning_rate': 0.010189539159855626, 'feature_fraction': 0.7016096375498521, 'bagging_fraction': 0.8505799331466516, 'bagging_freq': 4, 'num_leaves': 81, 'max_depth': 19, 'min_child_samples': 28}


[I 2025-07-17 00:54:04,502] Trial 15 finished with value: 0.3005524192370435 and parameters: {'learning_rate': 0.015768350242790774, 'feature_fraction': 0.805054917319801, 'bagging_fraction': 0.8719774594158092, 'bagging_freq': 5, 'num_leaves': 59, 'max_depth': 10, 'min_child_samples': 34}. Best is trial 12 with value: 0.3003901642860133.


2025-07-17 00:54:04 | Trial 15 | RMSE: 0.30055 | Params: {'learning_rate': 0.015768350242790774, 'feature_fraction': 0.805054917319801, 'bagging_fraction': 0.8719774594158092, 'bagging_freq': 5, 'num_leaves': 59, 'max_depth': 10, 'min_child_samples': 34}


[I 2025-07-17 00:54:14,981] Trial 16 finished with value: 0.30049765585754373 and parameters: {'learning_rate': 0.032438327377184675, 'feature_fraction': 0.7464258471034285, 'bagging_fraction': 0.9310248454702008, 'bagging_freq': 4, 'num_leaves': 78, 'max_depth': 16, 'min_child_samples': 22}. Best is trial 12 with value: 0.3003901642860133.


2025-07-17 00:54:14 | Trial 16 | RMSE: 0.30050 | Params: {'learning_rate': 0.032438327377184675, 'feature_fraction': 0.7464258471034285, 'bagging_fraction': 0.9310248454702008, 'bagging_freq': 4, 'num_leaves': 78, 'max_depth': 16, 'min_child_samples': 22}


[I 2025-07-17 00:54:26,012] Trial 17 finished with value: 0.3004656952353524 and parameters: {'learning_rate': 0.01917788298056522, 'feature_fraction': 0.7394936462017243, 'bagging_fraction': 0.8251075177996439, 'bagging_freq': 7, 'num_leaves': 64, 'max_depth': 18, 'min_child_samples': 10}. Best is trial 12 with value: 0.3003901642860133.


2025-07-17 00:54:26 | Trial 17 | RMSE: 0.30047 | Params: {'learning_rate': 0.01917788298056522, 'feature_fraction': 0.7394936462017243, 'bagging_fraction': 0.8251075177996439, 'bagging_freq': 7, 'num_leaves': 64, 'max_depth': 18, 'min_child_samples': 10}


[I 2025-07-17 00:54:37,410] Trial 18 finished with value: 0.30069134799714664 and parameters: {'learning_rate': 0.033139454120094954, 'feature_fraction': 0.8242669296024254, 'bagging_fraction': 0.7583758295510976, 'bagging_freq': 6, 'num_leaves': 96, 'max_depth': 20, 'min_child_samples': 19}. Best is trial 12 with value: 0.3003901642860133.


2025-07-17 00:54:37 | Trial 18 | RMSE: 0.30069 | Params: {'learning_rate': 0.033139454120094954, 'feature_fraction': 0.8242669296024254, 'bagging_fraction': 0.7583758295510976, 'bagging_freq': 6, 'num_leaves': 96, 'max_depth': 20, 'min_child_samples': 19}


[I 2025-07-17 00:54:46,623] Trial 19 finished with value: 0.300559448224418 and parameters: {'learning_rate': 0.044496807324798666, 'feature_fraction': 0.7288905510698559, 'bagging_fraction': 0.8405735772276233, 'bagging_freq': 5, 'num_leaves': 53, 'max_depth': 11, 'min_child_samples': 24}. Best is trial 12 with value: 0.3003901642860133.


2025-07-17 00:54:46 | Trial 19 | RMSE: 0.30056 | Params: {'learning_rate': 0.044496807324798666, 'feature_fraction': 0.7288905510698559, 'bagging_fraction': 0.8405735772276233, 'bagging_freq': 5, 'num_leaves': 53, 'max_depth': 11, 'min_child_samples': 24}


[I 2025-07-17 00:54:56,198] Trial 20 finished with value: 0.3004273126466398 and parameters: {'learning_rate': 0.03448170619726, 'feature_fraction': 0.7690508631730698, 'bagging_fraction': 0.9329756036445532, 'bagging_freq': 3, 'num_leaves': 42, 'max_depth': 18, 'min_child_samples': 30}. Best is trial 12 with value: 0.3003901642860133.


2025-07-17 00:54:56 | Trial 20 | RMSE: 0.30043 | Params: {'learning_rate': 0.03448170619726, 'feature_fraction': 0.7690508631730698, 'bagging_fraction': 0.9329756036445532, 'bagging_freq': 3, 'num_leaves': 42, 'max_depth': 18, 'min_child_samples': 30}


[I 2025-07-17 00:55:05,590] Trial 21 finished with value: 0.3003969850592194 and parameters: {'learning_rate': 0.03543648107449196, 'feature_fraction': 0.7696625640321834, 'bagging_fraction': 0.9391682789608273, 'bagging_freq': 3, 'num_leaves': 32, 'max_depth': 18, 'min_child_samples': 31}. Best is trial 12 with value: 0.3003901642860133.


2025-07-17 00:55:05 | Trial 21 | RMSE: 0.30040 | Params: {'learning_rate': 0.03543648107449196, 'feature_fraction': 0.7696625640321834, 'bagging_fraction': 0.9391682789608273, 'bagging_freq': 3, 'num_leaves': 32, 'max_depth': 18, 'min_child_samples': 31}


[I 2025-07-17 00:55:14,368] Trial 22 finished with value: 0.30034835899456774 and parameters: {'learning_rate': 0.03445908547306797, 'feature_fraction': 0.771314941441125, 'bagging_fraction': 0.9256481173143276, 'bagging_freq': 3, 'num_leaves': 31, 'max_depth': 18, 'min_child_samples': 40}. Best is trial 22 with value: 0.30034835899456774.


2025-07-17 00:55:14 | Trial 22 | RMSE: 0.30035 | Params: {'learning_rate': 0.03445908547306797, 'feature_fraction': 0.771314941441125, 'bagging_fraction': 0.9256481173143276, 'bagging_freq': 3, 'num_leaves': 31, 'max_depth': 18, 'min_child_samples': 40}


[I 2025-07-17 00:55:23,311] Trial 23 finished with value: 0.3007322156800647 and parameters: {'learning_rate': 0.03546545880812925, 'feature_fraction': 0.8240072409081879, 'bagging_fraction': 0.9340677870669357, 'bagging_freq': 2, 'num_leaves': 32, 'max_depth': 16, 'min_child_samples': 40}. Best is trial 22 with value: 0.30034835899456774.


2025-07-17 00:55:23 | Trial 23 | RMSE: 0.30073 | Params: {'learning_rate': 0.03546545880812925, 'feature_fraction': 0.8240072409081879, 'bagging_fraction': 0.9340677870669357, 'bagging_freq': 2, 'num_leaves': 32, 'max_depth': 16, 'min_child_samples': 40}


[I 2025-07-17 00:55:33,052] Trial 24 finished with value: 0.3005803194663453 and parameters: {'learning_rate': 0.031343279456587284, 'feature_fraction': 0.7726296525541818, 'bagging_fraction': 0.9944045041510977, 'bagging_freq': 3, 'num_leaves': 39, 'max_depth': 19, 'min_child_samples': 44}. Best is trial 22 with value: 0.30034835899456774.


2025-07-17 00:55:33 | Trial 24 | RMSE: 0.30058 | Params: {'learning_rate': 0.031343279456587284, 'feature_fraction': 0.7726296525541818, 'bagging_fraction': 0.9944045041510977, 'bagging_freq': 3, 'num_leaves': 39, 'max_depth': 19, 'min_child_samples': 44}


[I 2025-07-17 00:55:42,846] Trial 25 finished with value: 0.30088767885635065 and parameters: {'learning_rate': 0.025080173528265966, 'feature_fraction': 0.8379361364942928, 'bagging_fraction': 0.9504052321155165, 'bagging_freq': 4, 'num_leaves': 30, 'max_depth': 16, 'min_child_samples': 37}. Best is trial 22 with value: 0.30034835899456774.


2025-07-17 00:55:42 | Trial 25 | RMSE: 0.30089 | Params: {'learning_rate': 0.025080173528265966, 'feature_fraction': 0.8379361364942928, 'bagging_fraction': 0.9504052321155165, 'bagging_freq': 4, 'num_leaves': 30, 'max_depth': 16, 'min_child_samples': 37}


[I 2025-07-17 00:55:52,828] Trial 26 finished with value: 0.30026362921165267 and parameters: {'learning_rate': 0.03583567852577661, 'feature_fraction': 0.7239517342245836, 'bagging_fraction': 0.9114303614123138, 'bagging_freq': 4, 'num_leaves': 46, 'max_depth': 19, 'min_child_samples': 46}. Best is trial 26 with value: 0.30026362921165267.


2025-07-17 00:55:52 | Trial 26 | RMSE: 0.30026 | Params: {'learning_rate': 0.03583567852577661, 'feature_fraction': 0.7239517342245836, 'bagging_fraction': 0.9114303614123138, 'bagging_freq': 4, 'num_leaves': 46, 'max_depth': 19, 'min_child_samples': 46}


[I 2025-07-17 00:56:01,545] Trial 27 finished with value: 0.30140784129741005 and parameters: {'learning_rate': 0.04228446225555124, 'feature_fraction': 0.9994426348050525, 'bagging_fraction': 0.9104842675840319, 'bagging_freq': 4, 'num_leaves': 47, 'max_depth': 19, 'min_child_samples': 45}. Best is trial 26 with value: 0.30026362921165267.


2025-07-17 00:56:01 | Trial 27 | RMSE: 0.30141 | Params: {'learning_rate': 0.04228446225555124, 'feature_fraction': 0.9994426348050525, 'bagging_fraction': 0.9104842675840319, 'bagging_freq': 4, 'num_leaves': 47, 'max_depth': 19, 'min_child_samples': 45}


[I 2025-07-17 00:56:13,249] Trial 28 finished with value: 0.30028693568528364 and parameters: {'learning_rate': 0.01693419110682041, 'feature_fraction': 0.724391005191311, 'bagging_fraction': 0.8844572040975355, 'bagging_freq': 4, 'num_leaves': 57, 'max_depth': 17, 'min_child_samples': 47}. Best is trial 26 with value: 0.30026362921165267.


2025-07-17 00:56:13 | Trial 28 | RMSE: 0.30029 | Params: {'learning_rate': 0.01693419110682041, 'feature_fraction': 0.724391005191311, 'bagging_fraction': 0.8844572040975355, 'bagging_freq': 4, 'num_leaves': 57, 'max_depth': 17, 'min_child_samples': 47}


[I 2025-07-17 00:56:24,340] Trial 29 finished with value: 0.3003471297959803 and parameters: {'learning_rate': 0.019168528210882312, 'feature_fraction': 0.7226002348538336, 'bagging_fraction': 0.8917252729269521, 'bagging_freq': 4, 'num_leaves': 54, 'max_depth': 15, 'min_child_samples': 47}. Best is trial 26 with value: 0.30026362921165267.


2025-07-17 00:56:24 | Trial 29 | RMSE: 0.30035 | Params: {'learning_rate': 0.019168528210882312, 'feature_fraction': 0.7226002348538336, 'bagging_fraction': 0.8917252729269521, 'bagging_freq': 4, 'num_leaves': 54, 'max_depth': 15, 'min_child_samples': 47}
2025-07-17 00:56:24 | >> Optuna 최적화 완료. 총 30번의 trial 실행.
2025-07-17 00:56:24 | >> 최적 RMSE: 0.30026
2025-07-17 00:56:24 | >> 최적 하이퍼파라미터: {'learning_rate': 0.03583567852577661, 'feature_fraction': 0.7239517342245836, 'bagging_fraction': 0.9114303614123138, 'bagging_freq': 4, 'num_leaves': 46, 'max_depth': 19, 'min_child_samples': 46}
2025-07-17 00:56:24 | >> 최적 파라미터를 '../../data/processed/params/best_params_7.json'에 저장했습니다.
2025-07-17 00:56:24 | >> [5단계 완료] 하이퍼파라미터 최적화 성공.
2025-07-17 00:59:54 | >> [6단계 시작] 최종 모델 학습 및 예측을 시작합니다. (CV 폴드 수: 4)
2025-07-17 00:59:54 | --- Fold 1/4 학습 시작 ---
2025-07-17 00:59:54 | - Train Index: 0 ~ 222020 (size: 222021)
2025-07-17 00:59:54 | - Validation Index: 222021 ~ 444040 (size: 222020)
2025-07-17 01:00:0

In [2]:
# ==============================================================================
# --- 2. 🚀 하이퍼파라미터 및 실행 환경 설정 (Config 클래스) ---
# ==============================================================================
class Config:
    IS_SAMPLING = False
    SAMPLING_FRAC = 0.3
    SEED = 42
    N_SPLITS_TS = 4
    N_TOP_FEATURES = 100
    N_TRIALS_OPTUNA = 30

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(Config.SEED)
logger.write(">> [1단계 완료] 라이브러리, 경로, 로거 초기화 및 시드 고정 성공!")

In [3]:
# ==============================================================================
# --- 3. 데이터 로드 및 병합 ---
# ==============================================================================
try:
    logger.write("\n>> [2단계 시작] 데이터 로드를 시작합니다.")
    train_df = pd.read_csv(TRAIN_PATH)
    test_df = pd.read_csv(TEST_PATH)
    submission_df = pd.read_csv(SUBMISSION_TEMPLATE_PATH)

    logger.write(f">> 원본 데이터 Shape - Train: {train_df.shape}, Test: {test_df.shape}")

    if Config.IS_SAMPLING:
        logger.write(f">> 샘플링 모드 활성화: 데이터의 {Config.SAMPLING_FRAC * 100}%만 사용합니다.")
        train_df = train_df.sample(frac=Config.SAMPLING_FRAC, random_state=Config.SEED).reset_index(drop=True)
        logger.write(f">> 샘플링 후 Train Shape: {train_df.shape}")

    all_df = pd.concat([train_df.drop(columns=['target']), test_df], axis=0).reset_index(drop=True)
    logger.write(f">> Train/Test 병합 후 Shape: {all_df.shape}")
    logger.write(">> [2단계 완료] 데이터 로드 성공.")

except FileNotFoundError as e:
    logger.write(f">> [오류] 데이터 파일 로드 실패: {e}. '{TRAIN_PATH}' 또는 '{TEST_PATH}' 경로를 확인하세요.", print_error=True)
    logger.close()
    sys.exit()
except Exception as e:
    logger.write(f">> [오류] 2단계(데이터 로드) 중 문제 발생: {e}", print_error=True)
    logger.close()
    sys.exit()

In [None]:
# ==============================================================================
# --- 4. 🛠️ 피처 엔지니어링 (v6 아이디어 포함 및 개선) ---
# ==============================================================================
try:
    logger.write("\n>> [3단계 시작] 피처 엔지니어링을 시작합니다.")

    # --- 4.1. 날짜 및 기본 파생 변수 ---
    try:
        # '계약년월'을 날짜 타입으로 변환 (YYYYMM 형식이라고 가정)
        all_df['계약년월'] = pd.to_datetime(all_df['계약년월'], format='%Y%m')
        all_df['계약년'] = all_df['계약년월'].dt.year
        all_df['계약월'] = all_df['계약년월'].dt.month
        
        # '건축년도'를 '연식'으로 수정하여 '건물나이' 계산
        all_df['건물나이'] = all_df['계약년'] - all_df['연식'] 
        
        # 주기성 피처 생성
        all_df['계약월_sin'] = np.sin(2 * np.pi * all_df['계약월'] / 12)
        all_df['계약월_cos'] = np.cos(2 * np.pi * all_df['계약월'] / 12)
        logger.write(">> 4.1. 날짜/기본/주기성 피처 생성 완료.")
    except Exception as e:
        logger.write(f">> [오류] 4.1(날짜 피처) 생성 중 문제 발생: {e}", print_error=True)

    # --- 4.2. 교통 가중합 피처 (v6 아이디어) ---
    try:
        transport_cols = ['반경_1km_지하철역_수', '반경_500m_지하철역_수', '반경_300m_지하철역_수',
                          '반경_1km_버스정류장_수', '반경_500m_버스정류장_수', '반경_300m_버스정류장_수']
        if all(col in all_df.columns for col in transport_cols):
            all_df['가중지하철'] = all_df['반경_1km_지하철역_수'] * 1.0 + all_df['반경_500m_지하철역_수'] * 1.5 + all_df['반경_300m_지하철역_수'] * 2.0
            all_df['가중버스'] = all_df['반경_1km_버스정류장_수'] * 1.0 + all_df['반경_500m_버스정류장_수'] * 1.5 + all_df['반경_300m_버스정류장_수'] * 2.0
            logger.write(">> 4.2. 교통 가중합 피처 생성 완료.")
        else:
            logger.write(">> 4.2. 교통 관련 컬럼이 없어 가중합 피처를 생성하지 않았습니다.")
    except Exception as e:
        logger.write(f">> [오류] 4.2(교통 피처) 생성 중 문제 발생: {e}", print_error=True)

    # --- 4.3. K-Means 군집화 피처 (v6 아이디어) ---
    try:
        cluster_features = ['좌표X', '좌표Y', '전용면적', '건물나이', '층']
        if all(col in all_df.columns for col in cluster_features):
            scaler = StandardScaler()
            df_scaled = scaler.fit_transform(all_df[cluster_features])
            kmeans = KMeans(n_clusters=10, random_state=Config.SEED, n_init=10)
            all_df['아파트군집'] = kmeans.fit_predict(df_scaled)
            logger.write(">> 4.3. K-Means 군집화 피처 생성 완료.")
        else:
            logger.write(">> 4.3. 군집화 관련 컬럼이 없어 군집 피처를 생성하지 않았습니다.")
    except Exception as e:
        logger.write(f">> [오류] 4.3(K-Means) 생성 중 문제 발생: {e}", print_error=True)
        
    # --- 4.4. [개선] 베이스라인 아이디어 기반 통계 피처 생성 ---
    # Lag/Rolling 방식 대신, 시장 변화에 더 안정적인 통계 피처를 생성합니다.
    try:
        # 1. 훈련 데이터에만 '면적당가격' 임시 피처 생성 (Data Leakage 방지)
        train_only_df = all_df[all_df['target'].notna()].copy()
        # 타겟 변수에 로그 변환 후 면적당 가격 계산
        train_only_df['면적당가격'] = np.log1p(train_only_df['target']) / train_only_df['전용면적']

        # 2. 법정동별 통계 피처 생성
        dong_stats = train_only_df.groupby('법정동').agg(
            동별_평균_면적당가격=('면적당가격', 'mean'),
            동별_std_면적당가격=('면적당가격', 'std')
        ).reset_index()

        # 3. 자치구별 통계 피처 생성
        gu_stats = train_only_df.groupby('자치구').agg(
            구별_평균_면적당가격=('면적당가격', 'mean'),
            구별_std_면적당가격=('면적당가격', 'std')
        ).reset_index()

        # 4. 생성된 통계 피처를 전체 데이터에 병합
        all_df = pd.merge(all_df, dong_stats, on='법정동', how='left')
        all_df = pd.merge(all_df, gu_stats, on='자치구', how='left')
        
        logger.write(">> 4.4. [개선] 면적당 가격 기반 통계 피처 생성 완료.")

    except Exception as e:
        logger.write(f">> [오류] 4.4(통계 피처) 생성 중 문제 발생: {e}", print_error=True)
        
    # --- 군집별 통계 피처 생성 ---
    try:
        cluster_stats = train_only_df.groupby('아파트군집').agg(
            군집별_평균_면적당가격=('면적당가격', 'mean'),
            군집별_std_면적당가격=('면적당가격', 'std')
        ).reset_index()

        all_df = pd.merge(all_df, cluster_stats, on='아파트군집', how='left')
        logger.write(">> [추가] K-Means 군집 기반 통계 피처 생성 완료.")
    except Exception as e:
        logger.write(f">> [오류] 군집 기반 통계 피처 생성 중 문제 발생: {e}", print_error=True)

    # --- 4.5. 상호작용 및 기타 피처 ---
    try:
        all_df['면적_x_나이'] = all_df['전용면적'] * all_df['건물나이']
        all_df['면적_x_층'] = all_df['전용면적'] * all_df['층']
        all_df['강남_x_면적'] = all_df['강남3구여부'] * all_df['전용면적']
        logger.write(">> 4.5. 상호작용 피처 생성 완료.")
    except Exception as e:
        logger.write(f">> [오류] 4.5(상호작용 피처) 생성 중 문제 발생: {e}", print_error=True)

    # --- 4.6. 최종 처리 ---
    try:
        categorical_features = all_df.select_dtypes(include=['object']).columns.tolist()
        logger.write(f">> 인코딩 대상 범주형 피처: {categorical_features}")
        for col in categorical_features:
            all_df[col] = LabelEncoder().fit_transform(all_df[col].astype(str))

        # 불필요한 원본 컬럼이 있다면 여기서 제거할 수 있습니다.
        all_df = all_df.drop(columns=['계약년월'])
        
        # 결측치 처리 후 데이터 분리
        logger.write(f">> 결측치 처리 전, NA 개수: {all_df.isna().sum().sum()}")
        all_df = all_df.fillna(0) # 시계열 피처에서 발생한 NA를 0으로 채움
        logger.write(f">> 결측치 처리 후, NA 개수: {all_df.isna().sum().sum()}")
        
        X_train = all_df.iloc[:len(train_df)].copy()
        X_test = all_df.iloc[len(train_df):].copy()
        y_train_log = np.log1p(train_df['target'])
        
        logger.write(f">> 최종 피처 수: {len(X_train.columns)}")
        logger.write(">> 4.6. 범주형 인코딩 및 최종 데이터 분리 완료.")
    except Exception as e:
        logger.write(f">> [오류] 4.6(최종 처리) 중 문제 발생: {e}", print_error=True)

    logger.write(">> [3단계 완료] 피처 엔지니어링 성공.")

except Exception as e:
    logger.write(f">> [오류] 3단계(피처 엔지니어링) 전체 프로세스 중 문제 발생: {e}", print_error=True)
    logger.close()
    sys.exit()

In [5]:
# ==============================================================================
# --- 5. ⚡️ 빠른 피처 선택 ---
# ==============================================================================

try:
    logger.write(f"\n>> [4단계 시작] 상위 {Config.N_TOP_FEATURES}개 피처를 선택합니다.")
    temp_model = lgb.LGBMRegressor(random_state=Config.SEED)
    temp_model.fit(X_train, y_train_log)
    feature_importances = pd.DataFrame({'feature': X_train.columns, 'importance': temp_model.feature_importances_}).sort_values('importance', ascending=False)
    
    logger.write(">> 피처 중요도 :")
    logger.write(str(feature_importances))

    all_features = X_train.columns.tolist()
    top_features = feature_importances['feature'].head(Config.N_TOP_FEATURES).tolist()
    discarded_features = [f for f in all_features if f not in top_features]
    
    X_train = X_train[top_features]
    X_test = X_test[top_features]
    
    logger.write(f">> 피처 선택 완료. 선택된 피처 수: {len(top_features)}")
    logger.write(f">> 총 {len(all_features)}개 중 {len(top_features)}개 선택, {len(discarded_features)}개 제외.")
    logger.write(f">> 선택된 피처 목록 : {top_features}")
    logger.write(f">> 제외된 피처 목록 : {discarded_features}")
    logger.write(">> [4단계 완료] 피처 선택 성공.")
except Exception as e:
    logger.write(f">> [오류] 4단계(피처 선택) 중 문제 발생: {e}", print_error=True)
    logger.close()
    sys.exit()

In [6]:
# ==============================================================================
# --- 6. 🧠 Optuna 하이퍼파라미터 최적화 ---
# ==============================================================================

def objective(trial):
    params = {
        'device': 'cuda',
        'objective': 'regression_l1', 'metric': 'rmse', 'n_estimators': 2000,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'num_leaves': trial.suggest_int('num_leaves', 30, 100),
        'max_depth': trial.suggest_int('max_depth', 7, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'verbose': -1, 'n_jobs': -1, 'seed': Config.SEED,
    }
    
    model = lgb.LGBMRegressor(**params)
    model.fit(optuna_X_train, optuna_y_train, eval_set=[(optuna_X_val, optuna_y_val)], callbacks=[lgb.early_stopping(100, verbose=False)])
    rmse = np.sqrt(mean_squared_error(optuna_y_val, model.predict(optuna_X_val)))
    
    # 각 trial의 결과를 로깅
    logger.write(f"  Trial {trial.number} | RMSE: {rmse:.5f} | Params: {trial.params}")
    
    return rmse
        
try:
    logger.write(f"\n>> [5단계 시작] Optuna 최적화를 시작합니다. (탐색 횟수: {Config.N_TRIALS_OPTUNA})")
    optuna_X_train, optuna_X_val, optuna_y_train, optuna_y_val = train_test_split(X_train, y_train_log, test_size=0.2, shuffle=False)
    logger.write(f">> Optuna용 데이터 분할 - Train: {optuna_X_train.shape}, Validation: {optuna_X_val.shape}")

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=Config.N_TRIALS_OPTUNA)
    
    best_params = study.best_params
    logger.write(f">> Optuna 최적화 완료. 총 {len(study.trials)}번의 trial 실행.")
    logger.write(f">> 최적 RMSE: {study.best_value:.5f}")
    logger.write(f">> 최적 하이퍼파라미터: {best_params}")

    # 최적 파라미터 저장
    with open(PARAMS_PATH, 'w') as f:
        json.dump(best_params, f, indent=4)
    logger.write(f">> 최적 파라미터를 '{PARAMS_PATH}'에 저장했습니다.")
    logger.write(">> [5단계 완료] 하이퍼파라미터 최적화 성공.")

except Exception as e:
    logger.write(f">> [오류] 5단계(Optuna 최적화) 중 문제 발생: {e}", print_error=True)
    logger.write(">> [힌트] 'device_type = gpu' 관련 오류는 LightGBM이 GPU 지원 없이 설치되었을 수 있습니다.", print_error=True)
    logger.close()
    sys.exit()

In [None]:
# ==============================================================================
# --- 7. 🚂 최종 모델 학습 및 예측 (TimeSeriesSplit) ---
# ==============================================================================
try:
    logger.write(f"\n>> [6단계 시작] 최종 모델 학습 및 예측을 시작합니다. (CV 폴드 수: {Config.N_SPLITS_TS})")
    
    # Learning Rate 미세 조정
    final_params = best_params.copy()
    # 기존 학습률보다 20% 낮춰서 더 정교하게 학습
    final_params['learning_rate'] = best_params['learning_rate'] * 0.8 
    # 학습 기회를 더 주기 위해 n_estimators 증가
    final_params['n_estimators'] = 3000 
    
    logger.write(f">> 최종 학습 파라미터 (미세 조정 적용): {final_params}")
    
    ts_cv = TimeSeriesSplit(n_splits=Config.N_SPLITS_TS)
    oof_preds = np.zeros(len(X_train))
    test_preds = np.zeros(len(X_test))
    fold_scores = []
    fold_models = []

    final_params = best_params.copy()
    final_params.update({'device': 'cuda', 'objective': 'regression_l1', 'metric': 'rmse', 'n_estimators': 10000, 'verbose': -1, 'n_jobs': -1, 'seed': Config.SEED})

    final_model = None
    for fold, (train_idx, val_idx) in enumerate(ts_cv.split(X_train)):
        logger.write(f"\n--- Fold {fold+1}/{Config.N_SPLITS_TS} 학습 시작 ---")
        X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train_log.iloc[train_idx]
        X_val_fold, y_val_fold = X_train.iloc[val_idx], y_train_log.iloc[val_idx]
        logger.write(f"  - Train Index: {train_idx[0]} ~ {train_idx[-1]} (size: {len(train_idx)})")
        logger.write(f"  - Validation Index: {val_idx[0]} ~ {val_idx[-1]} (size: {len(val_idx)})")

        model = lgb.LGBMRegressor(**final_params)
        model.fit(X_train_fold, y_train_fold, 
                  eval_set=[(X_val_fold, y_val_fold)], 
                  eval_metric='rmse', 
                  callbacks=[lgb.early_stopping(200, verbose=False)])

        fold_models.append(model)
        val_preds = model.predict(X_val_fold)
        oof_preds[val_idx] = val_preds
        fold_rmse = np.sqrt(mean_squared_error(y_val_fold, val_preds))
        fold_scores.append(fold_rmse)
        
        logger.write(f"  - Fold {fold+1} RMSE: {fold_rmse:.5f}")
        logger.write(f"  - Best Iteration: {model.best_iteration_}")

        test_preds += model.predict(X_test) / Config.N_SPLITS_TS
        if fold == Config.N_SPLITS_TS - 1:
            final_model = model

    oof_rmse = np.sqrt(mean_squared_error(y_train_log.iloc[np.concatenate([val_idx for _, val_idx in ts_cv.split(X_train)])], 
                                          oof_preds[oof_preds != 0]))
    logger.write("\n>> CV 학습 결과 요약:")
    logger.write(f"  - 각 Fold별 RMSE: {[round(score, 5) for score in fold_scores]}")
    logger.write(f"  - 평균 Fold RMSE: {np.mean(fold_scores):.5f} (±{np.std(fold_scores):.5f})")
    logger.write(f"  - 전체 OOF RMSE: {oof_rmse:.5f}")
    logger.write(">> [6단계 완료] 최종 모델 학습 성공.")

except Exception as e:
    logger.write(f">> [오류] 6단계(최종 모델 학습) 중 문제 발생: {e}", print_error=True)
    logger.close()
    sys.exit()

In [None]:
# ==============================================================================
# --- 8. 📄 제출 파일 생성 및 모델 저장 ---
# ==============================================================================
try:
    logger.write("\n>> [7단계 시작] 제출 파일 생성 및 모델 저장을 시작합니다.")
    final_predictions = np.expm1(test_preds)
    final_predictions[final_predictions < 0] = 0
    
    submission_df['target'] = final_predictions
    submission_df.to_csv(SUBMISSION_PATH, index=False)
    logger.write(f">> 제출 파일 '{SUBMISSION_PATH}' 생성이 완료되었습니다.")
    logger.write(">> 제출 파일 미리보기 (상위 5개):")
    logger.write(str(submission_df.head()))

    if final_model:
        joblib.dump(final_model, MODEL_PATH)
        logger.write(f">> 학습된 최종 모델이 '{MODEL_PATH}'에 저장되었습니다.")
    else:
        logger.write(">> [경고] 최종 모델이 생성되지 않아 저장하지 못했습니다.", print_error=True)
    logger.write(">> [7단계 완료] 제출 파일 및 모델 저장 성공.")

except Exception as e:
    logger.write(f">> [오류] 7단계(제출 및 저장) 중 문제 발생: {e}", print_error=True)

finally:
    logger.write("\n" + "="*60)
    logger.write("🎉 모든 프로세스가 성공적으로 종료되었습니다.")
    logger.write("="*60)
    logger.close()

In [None]:
# ==============================================================================
# --- 9. 📊 최종 모델 결과 시각화, 분석 및 이미지 저장 ---
# ==============================================================================

IMAGE_DIR                       = '../../images/price_prediction_7/1'
IMAGE_FILENAME                  = 'price_prediction_7_model.pkl'
IMAGE_PATH                      = os.path.join(IMAGE_DIR, IMAGE_FILENAME)
os.makedirs(IMAGE_DIR, exist_ok=True)

try:
    logger.write("\n>> [8단계 시작] 모델 결과 시각화 및 분석 파일 저장을 시작합니다...")
    logger.write(f">> 시각화 결과가 저장될 경로: {IMAGE_DIR}")

    # 1. 피처 중요도 시각화
    try:
        logger.write(">> 8.1. 피처 중요도 이미지 저장 중...")
        all_importances = pd.DataFrame()
        for i, model in enumerate(fold_models):
            fold_importance = pd.DataFrame({'feature': X_train.columns, 'importance': model.feature_importances_, 'fold': i + 1})
            all_importances = pd.concat([all_importances, fold_importance], axis=0)
        
        mean_importances = all_importances.groupby('feature')['importance'].mean().sort_values(ascending=False)
        
        plt.figure(figsize=(12, 10)); sns.barplot(x=mean_importances.head(20).values, y=mean_importances.head(20).index)
        plt.title('상위 20개 피처 중요도 (평균)', fontsize=16); plt.savefig(os.path.join(IMAGE_DIR, '01_feature_importance.png'), bbox_inches='tight'); plt.close()
        logger.write(">> 피처 중요도 이미지 저장 완료.")
    except Exception as e:
        logger.write(f">> [오류] 8.1(피처 중요도) 시각화 중 문제 발생: {e}", print_error=True)

    # 2. 실제 값 vs OOF 예측 값 비교
    try:
        logger.write(">> 8.2. 실제 값 vs OOF 예측 값 비교 이미지 저장 중...")
        plt.figure(figsize=(10, 10)); sns.scatterplot(x=np.expm1(y_train_log.iloc[oof_indices]), y=np.expm1(oof_preds[oof_preds != 0]), alpha=0.3)
        plt.plot([0, np.expm1(y_train_log).max()], [0, np.expm1(y_train_log).max()], 'r--', lw=2)
        plt.xlabel("실제 값 (원)"); plt.ylabel("OOF 예측 값 (원)"); plt.title('실제 값 vs OOF 예측 값 비교', fontsize=16)
        plt.savefig(os.path.join(IMAGE_DIR, '02_actual_vs_oof_scatter.png'), bbox_inches='tight'); plt.close()
        logger.write(">> 실제 값 vs OOF 예측 값 비교 이미지 저장 완료.")
    except Exception as e:
        logger.write(f">> [오류] 8.2(실제값vs예측값) 시각화 중 문제 발생: {e}", print_error=True)

    # 3. 잔차 분포
    try:
        logger.write(">> 8.3. 잔차 분포 확인 이미지 저장 중...")
        residuals = np.expm1(y_train_log.iloc[oof_indices]) - np.expm1(oof_preds[oof_preds != 0])
        plt.figure(figsize=(10, 6)); sns.histplot(residuals, kde=True, bins=50)
        plt.title('잔차(실제-예측) 분포 (OOF 기반)', fontsize=16); plt.xlabel("잔차 (원)")
        plt.savefig(os.path.join(IMAGE_DIR, '03_residuals_distribution.png'), bbox_inches='tight'); plt.close()
        logger.write(">> 잔차 분포 확인 이미지 저장 완료.")
    except Exception as e:
        logger.write(f">> [오류] 8.3(잔차 분포) 시각화 중 문제 발생: {e}", print_error=True)

    # 4. 예측 분포 비교
    try:
        logger.write(">> 8.4. OOF 예측과 테스트 예측의 분포 비교 이미지 저장 중...")
        plt.figure(figsize=(10, 6)); sns.kdeplot(np.expm1(oof_preds[oof_preds != 0]), label='OOF 예측 값', fill=True)
        sns.kdeplot(final_predictions, label='테스트 데이터 예측 값', fill=True)
        plt.title('OOF 예측과 테스트 예측의 분포 비교', fontsize=16); plt.xlabel("예측 값 (원)"); plt.legend()
        plt.savefig(os.path.join(IMAGE_DIR, '04_prediction_distribution_comparison.png'), bbox_inches='tight'); plt.close()
        logger.write(">> OOF 예측과 테스트 예측의 분포 비교 이미지 저장 완료.")
    except Exception as e:
        logger.write(f">> [오류] 8.4(예측 분포) 시각화 중 문제 발생: {e}", print_error=True)

    # 5. SHAP 요약 플롯
    try:
        logger.write(">> 8.5. SHAP 요약 플롯 분석 및 저장 중...")
        explainer = shap.TreeExplainer(fold_models[-1]) # 마지막 모델 사용
        shap_sample = X_train.sample(2000, random_state=Config.SEED) if len(X_train) > 2000 else X_train
        shap_values = explainer.shap_values(shap_sample)
        
        plt.figure()
        shap.summary_plot(shap_values, shap_sample, show=False)
        plt.title("SHAP 요약 플롯 (마지막 폴드 모델)", fontsize=16)
        plt.savefig(os.path.join(IMAGE_DIR, '05_shap_summary_plot.png'), bbox_inches='tight'); plt.close()
        logger.write(">> SHAP 요약 플롯 저장 완료.")
    except Exception as e:
        logger.write(f">> [오류] 8.5(SHAP) 분석 중 문제 발생: {e}", print_error=True)

    # 6. 학습 곡선
    try:
        logger.write(">> 8.6. 학습 곡선 이미지 저장 중...")
        train_sizes, train_scores, validation_scores = learning_curve(
            estimator=lgb.LGBMRegressor(**best_params, verbosity=-1, random_state=Config.SEED, device='cuda'),
            X=X_train, y=y_train_log, train_sizes=np.linspace(0.1, 1.0, 5),
            cv=TimeSeriesSplit(n_splits=3), scoring='neg_root_mean_squared_error', n_jobs=-1)
        
        train_scores_mean = -train_scores.mean(axis=1)
        validation_scores_mean = -validation_scores.mean(axis=1)

        plt.figure(figsize=(10, 6)); plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
        plt.plot(train_sizes, validation_scores_mean, 'o-', color="g", label="Cross-validation score")
        plt.title("학습 곡선 (Learning Curve)", fontsize=16); plt.xlabel("학습 데이터 샘플 수"); plt.ylabel("RMSE"); plt.legend(loc="best"); plt.grid()
        plt.savefig(os.path.join(IMAGE_DIR, '06_learning_curve.png'), bbox_inches='tight'); plt.close()
        logger.write(">> 학습 곡선 이미지 저장 완료.")
    except Exception as e:
        logger.write(f">> [오류] 8.6(학습 곡선) 생성 중 문제 발생: {e}", print_error=True)

    logger.write(">> [8단계 완료] 모든 시각화 및 분석 파일 저장 완료.")

except Exception as e:
    logger.write(f">> [오류] 8단계(시각화 및 분석) 전체 프로세스 중 문제 발생: {e}", print_error=True)

finally:
    logger.write("\n" + "="*60)
    logger.write("🎉 모든 프로세스가 성공적으로 종료되었습니다.")
    logger.write("="*60)
    logger.close()