# 최종 가격 예측 모델: 시금치(Voting) vs 오이(XGBoost)
*시금치는 다양한 모델을 결합한 보팅 앙상블로, 오이는 XGBoost 단일 모델의 하이퍼파라미터를 집중 튜닝하여 각각 최적의 모델을 구축합니다.*

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings

# Suppress every warning (no category/module filter is given), presumably to
# keep the cell outputs below readable.
# NOTE(review): a blanket 'ignore' also hides warnings that may point at real
# problems (e.g. deprecations) — consider narrowing it.
warnings.filterwarnings('ignore')

## 1. 데이터 불러오기 및 통합 전처리

In [None]:
# Load and preprocess the data (same steps as in the earlier notebooks).

# Raw inputs: three wide event tables (cold wave / typhoon / heat wave),
# regional prices, regional weather, and the spinach/cucumber trade table.
cold_df, wind_df, hot_df, price_df = (
    pd.read_excel(xlsx_path)
    for xlsx_path in (
        '../data/mon_cold.xlsx',
        '../data/mon_wind.xlsx',
        '../data/mon_hot.xlsx',
        '../data/region_price.xlsx',
    )
)
weather_df = pd.read_csv('../data/region_weather.csv')
trade_df = pd.read_excel('../data/spinach_cucumber_df.xlsx')

# Reshape each wide event table to long form: every non-region column header
# becomes a '날짜' value and the cell goes into the event's own column.
cold_df_melted, wind_df_melted, hot_df_melted = (
    wide_df.melt(id_vars=['지역'], var_name='날짜', value_name=event_col)
    for wide_df, event_col in (
        (cold_df, '한파발생'),
        (wind_df, '태풍발생'),
        (hot_df, '폭염발생'),
    )
)

# Normalise the date column of these three tables to 'YYYY-MM' strings so they
# share a monthly join key; unparseable dates are coerced to missing values.
# NOTE(review): the melted event tables are not normalised here — presumably
# their column headers are already 'YYYY-MM' strings; verify against the files.
for df in (price_df, weather_df, trade_df):
    parsed_dates = pd.to_datetime(df['날짜'], errors='coerce')
    df['날짜'] = parsed_dates.dt.strftime('%Y-%m')

# Left-join everything onto the price table: weather and the three event
# tables by (region, date), then the trade table by (item, date).
region_date_keys = ['지역', '날짜']
merged_df = price_df
for right_df, join_keys in (
    (weather_df, region_date_keys),
    (cold_df_melted, region_date_keys),
    (wind_df_melted, region_date_keys),
    (hot_df_melted, region_date_keys),
    (trade_df, ['품목', '날짜']),
):
    merged_df = merged_df.merge(right_df, on=join_keys, how='left')

# Keep only rows with a known, strictly positive average price, then fill
# every remaining missing value (in any column) with 0.
priced_df = merged_df.dropna(subset=['평균가격'])
merged_df = priced_df[priced_df['평균가격'] > 0].copy()
merged_df = merged_df.fillna(0)

# Turn the date strings back into timestamps and derive year/month features.
timestamps = pd.to_datetime(merged_df['날짜'])
merged_df['날짜'] = timestamps
merged_df['연도'] = timestamps.dt.year
merged_df['월'] = timestamps.dt.month

# One-hot encode the region; drop_first leaves out the first category.
merged_df = pd.get_dummies(
    merged_df, columns=['지역'], prefix='지역', drop_first=True
)
print('데이터 통합 및 전처리 완료.')

데이터 통합 및 전처리 완료.


## 2. 시금치 모델: 보팅(Voting) 앙상블

In [3]:
def train_spinach_voting_model(df):
    """Tune and evaluate a voting-ensemble price model for spinach ('시금치').

    Filters ``df`` to spinach rows, holds out 20% of them as a test set,
    grid-searches a ``VotingRegressor`` (random forest, XGBoost, LightGBM and
    ridge) with 3-fold cross-validation scored by R², then prints the R² on
    the held-out set together with the best hyperparameters.

    Args:
        df: Preprocessed frame with an item column '품목', the target column
            '평균가격', the columns listed in ``base_features`` below, and
            any one-hot region columns whose names start with '지역_'.

    Returns:
        The fitted ``GridSearchCV`` object, so the tuned ensemble can be
        reused for prediction instead of being discarded.

    Raises:
        ValueError: If ``df`` contains no spinach rows.
        KeyError: If a required feature or target column is missing.
    """
    item = '시금치'
    target_df = df[df['품목'] == item].copy()
    # Fail early with a clear message instead of an opaque error raised from
    # inside train_test_split when the item is absent.
    if target_df.empty:
        raise ValueError(f"no rows with 품목 == '{item}'; cannot train the model")

    base_features = ['평균기온(°C)', '월합강수량(00~24h만)(mm)', '평균풍속(m/s)', '최심적설(cm)', '한파발생', '태풍발생', '폭염발생', '연도', '월', '수출중량', '수입중량']
    region_features = [col for col in target_df.columns if col.startswith('지역_')]
    features = base_features + region_features
    target = '평균가격'

    # Report every missing column at once rather than whichever one pandas
    # happens to hit first.
    missing = [col for col in (*features, target) if col not in target_df.columns]
    if missing:
        raise KeyError(f'missing required columns: {missing}')

    # NOTE(review): the split is random, not chronological; with '연도'/'월'
    # among the features this may leak later periods into training — confirm
    # that this is intended.
    X_train, X_test, y_train, y_test = train_test_split(
        target_df[features], target_df[target], test_size=0.2, random_state=42
    )

    # Base learners combined by the voting ensemble.
    # NOTE(review): Ridge is fit on the same unscaled features as the tree
    # models — confirm that is acceptable.
    rfr = RandomForestRegressor(random_state=42)
    xgb = XGBRegressor(random_state=42)
    lgbm = LGBMRegressor(random_state=42, verbose=-1)
    ridge = Ridge(random_state=42)
    voting_reg = VotingRegressor(
        estimators=[('rf', rfr), ('xgb', xgb), ('lgbm', lgbm), ('ridge', ridge)]
    )

    # Hyperparameter grid (2 * 2 * 2 * 3 = 24 candidates); keys are prefixed
    # with the estimator names given to the VotingRegressor.
    params = {
        'rf__n_estimators': [100, 200],
        'xgb__n_estimators': [100, 200],
        'lgbm__n_estimators': [100, 200],
        'ridge__alpha': [0.1, 1.0, 10.0],
    }

    print('--- 시금치 보팅 모델 튜닝 시작 ---')
    grid_search = GridSearchCV(
        estimator=voting_reg, param_grid=params, cv=3, n_jobs=-1,
        scoring='r2', verbose=2,
    )
    grid_search.fit(X_train, y_train)

    # Evaluate the selected configuration on the held-out 20%.
    y_pred = grid_search.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    print('--- 시금치 최종 보팅 모델 평가 결과 ---')
    print(f'R-squared: {r2:.4f}')
    print(f'Best Hyperparameters: {grid_search.best_params_}')
    print('-'*80)

    return grid_search

## 3. 오이 모델: XGBoost 집중 튜닝

In [4]:
def train_cucumber_xgboost_model(df):
    """Tune and evaluate a single XGBoost price model for cucumber ('오이').

    Filters ``df`` to cucumber rows, holds out 20% of them as a test set,
    grid-searches an ``XGBRegressor`` with 3-fold cross-validation scored by
    R², then prints the R² on the held-out set together with the best
    hyperparameters.

    Args:
        df: Preprocessed frame with an item column '품목', the target column
            '평균가격', the columns listed in ``base_features`` below, and
            any one-hot region columns whose names start with '지역_'.

    Returns:
        The fitted ``GridSearchCV`` object, so the tuned model can be reused
        for prediction instead of being discarded.

    Raises:
        ValueError: If ``df`` contains no cucumber rows.
        KeyError: If a required feature or target column is missing.
    """
    item = '오이'
    target_df = df[df['품목'] == item].copy()
    # Fail early with a clear message instead of an opaque error raised from
    # inside train_test_split when the item is absent.
    if target_df.empty:
        raise ValueError(f"no rows with 품목 == '{item}'; cannot train the model")

    base_features = ['평균기온(°C)', '월합강수량(00~24h만)(mm)', '평균풍속(m/s)', '최심적설(cm)', '한파발생', '태풍발생', '폭염발생', '연도', '월', '수출중량', '수입중량']
    region_features = [col for col in target_df.columns if col.startswith('지역_')]
    features = base_features + region_features
    target = '평균가격'

    # Report every missing column at once rather than whichever one pandas
    # happens to hit first.
    missing = [col for col in (*features, target) if col not in target_df.columns]
    if missing:
        raise KeyError(f'missing required columns: {missing}')

    # NOTE(review): the split is random, not chronological; with '연도'/'월'
    # among the features this may leak later periods into training — confirm
    # that this is intended.
    X_train, X_test, y_train, y_test = train_test_split(
        target_df[features], target_df[target], test_size=0.2, random_state=42
    )

    # XGBoost and its hyperparameter grid (3 * 3 * 3 * 2 * 2 = 108 candidates).
    xgb = XGBRegressor(random_state=42)
    params = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
    }

    print('--- 오이 XGBoost 모델 튜닝 시작 ---')
    grid_search = GridSearchCV(
        estimator=xgb, param_grid=params, cv=3, n_jobs=-1,
        scoring='r2', verbose=2,
    )
    grid_search.fit(X_train, y_train)

    # Evaluate the selected configuration on the held-out 20%.
    y_pred = grid_search.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    print('--- 오이 최종 XGBoost 모델 평가 결과 ---')
    print(f'R-squared: {r2:.4f}')
    print(f'Best Hyperparameters: {grid_search.best_params_}')
    print('-'*80)

    return grid_search

In [5]:
# Run model training: tune and evaluate the spinach voting ensemble, then the
# cucumber XGBoost model, both on the merged, preprocessed dataset.
train_spinach_voting_model(merged_df)
train_cucumber_xgboost_model(merged_df)

--- 시금치 보팅 모델 튜닝 시작 ---
Fitting 3 folds for each of 24 candidates, totalling 72 fits
--- 시금치 최종 보팅 모델 평가 결과 ---
R-squared: 0.7081
Best Hyperparameters: {'lgbm__n_estimators': 200, 'rf__n_estimators': 100, 'ridge__alpha': 0.1, 'xgb__n_estimators': 200}
--------------------------------------------------------------------------------
--- 오이 XGBoost 모델 튜닝 시작 ---
Fitting 3 folds for each of 108 candidates, totalling 324 fits
--- 오이 최종 XGBoost 모델 평가 결과 ---
R-squared: 0.8952
Best Hyperparameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.8}
--------------------------------------------------------------------------------
