# config.py

In [1]:
# config.py

import os

# --- 기본 설정 ---
RANDOM_STATE = 42
BASE_DIR = "../"

# --- 경로 설정 ---
DATA_DIR = os.path.join(BASE_DIR, 'data')
OUTPUT_DIR = os.path.join(BASE_DIR, 'output')
MODEL_DIR = os.path.join(BASE_DIR, 'models')
SUBMISSION_DIR = os.path.join(OUTPUT_DIR, 'submissions')

# --- 데이터 경로 ---
TRAIN_CSV_PATH = os.path.join(DATA_DIR, 'train', 'train.csv')
TEST_DIR_PATH = os.path.join(DATA_DIR, 'test')
SUBMISSION_CSV_PATH = os.path.join(DATA_DIR, 'submission', 'sample_submission.csv')

# --- 피처 엔지니어링 및 샘플링 설정 ---
INPUT_DAYS = 28
OUTPUT_DAYS = 7

FEATURE_ENGINEERING = {
    # [수정] 1일부터 28일까지 모든 lag 사용
    'lags': list(range(1, 29)),
    # [수정] 베이스라인에는 rolling window가 없으므로 비활성화
    'rolling_windows': [],
    # [수정] 베이스라인에 사용된 기본 날짜 피처만 남김
    'date_features': ['dayofweek', 'month'],
    
    # [추가] 영업장 위치(좌표) 정보
    'store_xy': {
        "담하": (620, 350),
        "미라시아": (620, 350),
        "느티나무 셀프BBQ": (500, 235),
        "포레스트릿": (770, 260),
        "연회장": (900, 350),
        "카페테리아": (345, 300),
        "화담숲주막": (920, 150),
        "화담숲카페": (920, 150),
        "라그로타": (300, 300)
    }
}

SAMPLING = {
    # [수정] 팀원의 기준인 14로 변경
    'default_zero_run_threshold': 14,
    # [수정] 베이스라인에는 품목별 설정이 없으므로 비워둠
    'item_specific_zero_run_thresholds': {}
}

# --- 검증(Validation) 설정 ---
CV_SETTINGS = {
    'strategy': 'TimeSeriesSplit',
    'n_splits': 5
}

# # --- 모델링 설정 ---
#     base_model = XGBRegressor(
#         n_estimators      = 2000,
#         learning_rate     = 0.05,
#         max_depth         = 6,
#         subsample         = 0.8,
#         colsample_bytree  = 0.8,
#         objective         = 'reg:squarederror',
#         random_state      = 42,
#         tree_method       = 'gpu_hist' if gpu else 'hist',
#         predictor        = 'gpu_predictor' ,
#         # early_stopping_rounds 는 MultiOutputRegressor 안에서
#         # 자동 지원이 안 되므로 일단 제외, 아래 manual callback 참고
#     )
MODELS = {
    'XGBoost_baseline': {
        'categorical_features': ['영업장명_메뉴명', 'dow', 'month'],
        'model_params': {
            'n_estimators': 2000,
            'learning_rate': 0.05,
            'max_depth': 6,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': RANDOM_STATE,
            'n_jobs': 2,
            # [수정] objective 파라미터 제거
            'objective': 'reg:squarederror', # MSE를 목표로 학습
            
            'tree_method': 'hist',
            # 'tree_mathod': 'gpu_hist',
            # 'predictor': 'gpu_predictor'
        }
    },
}

# Preprocessing

In [2]:
import pandas as pd

In [3]:
raw_df = pd.read_csv(TRAIN_CSV_PATH)

In [4]:
df = raw_df.copy()

In [5]:
all_items_ = df['영업장명_메뉴명'].unique()
all_items_[:5]

array(['느티나무 셀프BBQ_1인 수저세트', '느티나무 셀프BBQ_BBQ55(단체)',
       '느티나무 셀프BBQ_대여료 30,000원', '느티나무 셀프BBQ_대여료 60,000원',
       '느티나무 셀프BBQ_대여료 90,000원'], dtype=object)

In [6]:
df['영업일자'] = pd.to_datetime(df['영업일자'])
start_date, end_date = df['영업일자'].min(), df['영업일자'].max()
all_dates = pd.date_range(start=start_date, end=end_date, freq='D')
multi_index = pd.MultiIndex.from_product([all_items_, all_dates], names=['영업장명_메뉴명', '영업일자'])
df_reindexed = df.set_index(['영업장명_메뉴명', '영업일자']).reindex(multi_index).reset_index()
df_reindexed['매출수량'] = df_reindexed['매출수량'].fillna(0).astype(int)
split_data = df_reindexed['영업장명_메뉴명'].str.split('_', n=1, expand=True)
df_reindexed['영업장명'], df_reindexed['메뉴명'] = split_data[0], split_data[1]

display(df_reindexed)

Unnamed: 0,영업장명_메뉴명,영업일자,매출수량,영업장명,메뉴명
0,느티나무 셀프BBQ_1인 수저세트,2023-01-01,0,느티나무 셀프BBQ,1인 수저세트
1,느티나무 셀프BBQ_1인 수저세트,2023-01-02,0,느티나무 셀프BBQ,1인 수저세트
2,느티나무 셀프BBQ_1인 수저세트,2023-01-03,0,느티나무 셀프BBQ,1인 수저세트
3,느티나무 셀프BBQ_1인 수저세트,2023-01-04,0,느티나무 셀프BBQ,1인 수저세트
4,느티나무 셀프BBQ_1인 수저세트,2023-01-05,0,느티나무 셀프BBQ,1인 수저세트
...,...,...,...,...,...
102671,화담숲카페_현미뻥스크림,2024-06-11,12,화담숲카페,현미뻥스크림
102672,화담숲카페_현미뻥스크림,2024-06-12,10,화담숲카페,현미뻥스크림
102673,화담숲카페_현미뻥스크림,2024-06-13,14,화담숲카페,현미뻥스크림
102674,화담숲카페_현미뻥스크림,2024-06-14,12,화담숲카페,현미뻥스크림


In [7]:
processed_df = df_reindexed.copy()


In [8]:
def _create_calendar_features(df):
    # [수정] config 설정에 따라 베이스라인에 필요한 피처만 생성하도록 단순화
    date_features_config = FEATURE_ENGINEERING.get('date_features', [])
    
    if 'dayofweek' in date_features_config: 
        df['dow'] = df['영업일자'].dt.dayofweek
    if 'month' in date_features_config: 
        df['month'] = df['영업일자'].dt.month
    # ... (weekofyear, is_holiday 등은 config에 없으므로 생성되지 않음) ...
    return df

processed_df = _create_calendar_features(processed_df)
processed_df

Unnamed: 0,영업장명_메뉴명,영업일자,매출수량,영업장명,메뉴명,dow,month
0,느티나무 셀프BBQ_1인 수저세트,2023-01-01,0,느티나무 셀프BBQ,1인 수저세트,6,1
1,느티나무 셀프BBQ_1인 수저세트,2023-01-02,0,느티나무 셀프BBQ,1인 수저세트,0,1
2,느티나무 셀프BBQ_1인 수저세트,2023-01-03,0,느티나무 셀프BBQ,1인 수저세트,1,1
3,느티나무 셀프BBQ_1인 수저세트,2023-01-04,0,느티나무 셀프BBQ,1인 수저세트,2,1
4,느티나무 셀프BBQ_1인 수저세트,2023-01-05,0,느티나무 셀프BBQ,1인 수저세트,3,1
...,...,...,...,...,...,...,...
102671,화담숲카페_현미뻥스크림,2024-06-11,12,화담숲카페,현미뻥스크림,1,6
102672,화담숲카페_현미뻥스크림,2024-06-12,10,화담숲카페,현미뻥스크림,2,6
102673,화담숲카페_현미뻥스크림,2024-06-13,14,화담숲카페,현미뻥스크림,3,6
102674,화담숲카페_현미뻥스크림,2024-06-14,12,화담숲카페,현미뻥스크림,4,6


In [9]:
def _create_spatial_features(df):
    store_xy_map = FEATURE_ENGINEERING.get('store_xy', {})
    if not store_xy_map:
        return df
        
    # [수정] 튜플의 인덱스를 사용하여 x, y 좌표에 접근
    df['x'] = df['영업장명'].map(lambda s: store_xy_map.get(s, (None, None))[0])
    df['y'] = df['영업장명'].map(lambda s: store_xy_map.get(s, (None, None))[1])
    return df
processed_df = _create_spatial_features(processed_df)
processed_df

Unnamed: 0,영업장명_메뉴명,영업일자,매출수량,영업장명,메뉴명,dow,month,x,y
0,느티나무 셀프BBQ_1인 수저세트,2023-01-01,0,느티나무 셀프BBQ,1인 수저세트,6,1,500,235
1,느티나무 셀프BBQ_1인 수저세트,2023-01-02,0,느티나무 셀프BBQ,1인 수저세트,0,1,500,235
2,느티나무 셀프BBQ_1인 수저세트,2023-01-03,0,느티나무 셀프BBQ,1인 수저세트,1,1,500,235
3,느티나무 셀프BBQ_1인 수저세트,2023-01-04,0,느티나무 셀프BBQ,1인 수저세트,2,1,500,235
4,느티나무 셀프BBQ_1인 수저세트,2023-01-05,0,느티나무 셀프BBQ,1인 수저세트,3,1,500,235
...,...,...,...,...,...,...,...,...,...
102671,화담숲카페_현미뻥스크림,2024-06-11,12,화담숲카페,현미뻥스크림,1,6,920,150
102672,화담숲카페_현미뻥스크림,2024-06-12,10,화담숲카페,현미뻥스크림,2,6,920,150
102673,화담숲카페_현미뻥스크림,2024-06-13,14,화담숲카페,현미뻥스크림,3,6,920,150
102674,화담숲카페_현미뻥스크림,2024-06-14,12,화담숲카페,현미뻥스크림,4,6,920,150


In [10]:
def _create_lag_features(df):
    lags = FEATURE_ENGINEERING.get('lags', [])
    for lag in lags:
        df[f'lag_{lag}'] = df.groupby('영업장명_메뉴명')['매출수량'].shift(lag)
    return df

processed_df = _create_lag_features(processed_df)
processed_df.head()

Unnamed: 0,영업장명_메뉴명,영업일자,매출수량,영업장명,메뉴명,dow,month,x,y,lag_1,...,lag_19,lag_20,lag_21,lag_22,lag_23,lag_24,lag_25,lag_26,lag_27,lag_28
0,느티나무 셀프BBQ_1인 수저세트,2023-01-01,0,느티나무 셀프BBQ,1인 수저세트,6,1,500,235,,...,,,,,,,,,,
1,느티나무 셀프BBQ_1인 수저세트,2023-01-02,0,느티나무 셀프BBQ,1인 수저세트,0,1,500,235,0.0,...,,,,,,,,,,
2,느티나무 셀프BBQ_1인 수저세트,2023-01-03,0,느티나무 셀프BBQ,1인 수저세트,1,1,500,235,0.0,...,,,,,,,,,,
3,느티나무 셀프BBQ_1인 수저세트,2023-01-04,0,느티나무 셀프BBQ,1인 수저세트,2,1,500,235,0.0,...,,,,,,,,,,
4,느티나무 셀프BBQ_1인 수저세트,2023-01-05,0,느티나무 셀프BBQ,1인 수저세트,3,1,500,235,0.0,...,,,,,,,,,,


# Modeling

In [12]:
from tqdm import tqdm
import numpy as np
import itertools


def create_sliding_window_samples(processed_df):
    print("--- 2차 전처리(Tabular 변환) 시작 ---")
    
    horizon = OUTPUT_DAYS
    lag_cols = [f'lag_{l}' for l in FEATURE_ENGINEERING['lags']]
    X_rows, y_rows = [], []

    for item_name, grp in tqdm(processed_df.groupby('영업장명_메뉴명'), desc="Sliding Window Generation"):
        threshold = SAMPLING['item_specific_zero_run_thresholds'].get(
            item_name, SAMPLING['default_zero_run_threshold']
        )
        
        # ==================== 수정된 부분 ====================
        # for 루프의 범위를 len(grp) - horizon + 1 에서 +1 을 제거합니다.
        for i in range(len(grp) - horizon):
        # =======================================================
            features = grp.iloc[[i]]
            target = grp['매출수량'].iloc[i+1 : i+1+horizon]
            
            lag_values = features[lag_cols].values.flatten()
            if np.all(lag_values == 0): continue
            
            zero_runs = [len(list(g)) for v, g in itertools.groupby(lag_values) if v == 0]
            max_zero_run = max(zero_runs, default=0)
            
            if max_zero_run >= threshold: continue

            X_rows.append(features)
            y_rows.append(target.values)

    if not X_rows:
        print("경고: 생성된 학습 샘플이 없습니다. 필터링 임계값이 너무 낮을 수 있습니다.")
        return pd.DataFrame(), pd.DataFrame()

    X = pd.concat(X_rows, ignore_index=True)
    y = pd.DataFrame(y_rows, columns=[f't+{h+1}' for h in range(horizon)])
    
    print(f"--- 2차 전처리(Tabular 변환) 완료 ---")
    print(f"생성된 X shape: {X.shape}, y shape: {y.shape}")
    
    return X, y

X, y = create_sliding_window_samples(processed_df)

--- 2차 전처리(Tabular 변환) 시작 ---


Sliding Window Generation: 100%|██████████| 193/193 [00:40<00:00,  4.76it/s]


--- 2차 전처리(Tabular 변환) 완료 ---
생성된 X shape: (78140, 37), y shape: (78140, 7)


In [17]:
display(X.head())
display(y)

Unnamed: 0,영업장명_메뉴명,영업일자,매출수량,영업장명,메뉴명,dow,month,x,y,lag_1,...,lag_19,lag_20,lag_21,lag_22,lag_23,lag_24,lag_25,lag_26,lag_27,lag_28
0,느티나무 셀프BBQ_1인 수저세트,2023-01-01,0,느티나무 셀프BBQ,1인 수저세트,6,1,500,235,,...,,,,,,,,,,
1,느티나무 셀프BBQ_1인 수저세트,2023-01-02,0,느티나무 셀프BBQ,1인 수저세트,0,1,500,235,0.0,...,,,,,,,,,,
2,느티나무 셀프BBQ_1인 수저세트,2023-01-03,0,느티나무 셀프BBQ,1인 수저세트,1,1,500,235,0.0,...,,,,,,,,,,
3,느티나무 셀프BBQ_1인 수저세트,2023-01-04,0,느티나무 셀프BBQ,1인 수저세트,2,1,500,235,0.0,...,,,,,,,,,,
4,느티나무 셀프BBQ_1인 수저세트,2023-01-05,0,느티나무 셀프BBQ,1인 수저세트,3,1,500,235,0.0,...,,,,,,,,,,


Unnamed: 0,t+1,t+2,t+3,t+4,t+5,t+6,t+7
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
78135,16,145,60,25,43,0,12
78136,145,60,25,43,0,12,10
78137,60,25,43,0,12,10,14
78138,25,43,0,12,10,14,12


In [25]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from typing import Dict, Any, List
from sklearn.model_selection import TimeSeriesSplit

In [33]:
def _get_model_instance(model_name: str, model_params: Dict) -> Any:
    if model_name == 'XGBoost_baseline':
        cat_features = MODELS[model_name]['categorical_features']
        
        # [수정] 수치형 피처 리스트를 명확하게 지정
        num_features = [
            'x', 'y'
        ] + [col for col in X.columns if col.startswith('lag_')]
        
        # `ColumnTransformer`는 `영업일자`와 `매출수량` 등 불필요한 컬럼을 받지 않도록
        # `X`에서 해당 컬럼을 제외한 상태로 학습이 진행되므로,
        # `preprocessing.py`의 결과물인 `X` 데이터셋에 있는 피처들을 정확히 지정해야 합니다.
        all_features = cat_features + num_features
        
        # `ColumnTransformer`에 들어갈 최종 데이터셋을 준비합니다.
        preprocessor = ColumnTransformer(
            transformers=[
                ('onehot', OneHotEncoder(handle_unknown='ignore'), cat_features),
                ('passthrough', 'passthrough', num_features)
            ],
            # 나머지 피처들은 무시합니다.
            remainder='drop'
        )
        
        base_model = XGBRegressor(**model_params)
        
        return Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('multi_output_regressor', MultiOutputRegressor(base_model, n_jobs=-1))
        ])
    else:
        raise ValueError(f"지원하지 않는 모델입니다: {model_name}")

scores = {}
for model_name, model_info in MODELS.items():
    model = _get_model_instance(model_name, model_info['model_params'])
    tss = TimeSeriesSplit(n_splits=CV_SETTINGS['n_splits'])
    fold_rmses = []
    X_data = X.drop(columns=['영업일자', '매출수량', '영업장명', '메뉴명'], errors='ignore')
    y_data = y
    
    for fold, (train_index, val_index) in enumerate(tss.split(X_data)):
        print(f"\n========== FOLD {fold + 1} / {CV_SETTINGS['n_splits']} ==========")
        X_train, X_val = X_data.iloc[train_index], X_data.iloc[val_index]
        y_train, y_val = y_data.iloc[train_index], y_data.iloc[val_index]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        
        rmse_per_horizon = ((y_pred - y_val.values) ** 2).mean(axis=0) ** 0.5
        avg_rmse_fold = np.mean(rmse_per_horizon)
        fold_rmses.append(avg_rmse_fold)
        print(f"FOLD {fold + 1} - Validation MAE: {avg_rmse_fold:.4f}")
    
    avg_rmse = np.mean(fold_rmses)
    scores[model_name] = avg_rmse
    print(f"\n[{model_name}] 평균 Validation MAE: {avg_rmse:.4f}")


FOLD 1 - Validation MAE: 8.3122

FOLD 2 - Validation MAE: 8.5753

FOLD 3 - Validation MAE: 17.0035

FOLD 4 - Validation MAE: 28.8789

FOLD 5 - Validation MAE: 69.2839

[XGBoost_baseline] 평균 Validation MAE: 26.4107
