In [17]:
import argparse
from datetime import timedelta
from pathlib import Path

import pandas as pd
from tqdm import tqdm


In [4]:
# Input/output locations used by the rest of the notebook (all relative paths).
train_path="../data/train/train.csv"
test_dir  ="../data/test"        # folder holding TEST00, 01, ...
out_path  ="../data/submission/sample_submission.csv"

# Bundle the paths in a Namespace so later cells read them as
# args.train / args.test / args.out.
args = argparse.Namespace(train=train_path, test=test_dir, out=out_path)

'../data/train/train.csv'

In [7]:
def read_data(path):
    """Load the CSV found at ``path`` into a DataFrame with pandas' default parser settings."""
    frame = pd.read_csv(path)
    return frame

In [10]:
def add_calendar_feats(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with calendar features derived from '영업일자'.

    '영업일자' is parsed to datetime, and two columns are added:
    'dow' (day of week, 0 = Monday) and 'month' (1-12).
    The input frame is left untouched.
    """
    parsed = pd.to_datetime(df['영업일자'])
    return df.assign(**{
        '영업일자': parsed,
        'dow': parsed.dt.dayofweek,
        'month': parsed.dt.month,
    })

# Load the training CSV and attach the 'dow' / 'month' calendar columns.
train_df = add_calendar_feats(read_data(args.train))

In [11]:
# Show the loaded training frame as the cell output.
train_df

Unnamed: 0,영업일자,영업장명_메뉴명,매출수량,dow,month
0,2023-01-01,느티나무 셀프BBQ_1인 수저세트,0,6,1
1,2023-01-02,느티나무 셀프BBQ_1인 수저세트,0,0,1
2,2023-01-03,느티나무 셀프BBQ_1인 수저세트,0,1,1
3,2023-01-04,느티나무 셀프BBQ_1인 수저세트,0,2,1
4,2023-01-05,느티나무 셀프BBQ_1인 수저세트,0,3,1
...,...,...,...,...,...
102671,2024-06-11,화담숲카페_현미뻥스크림,12,1,6
102672,2024-06-12,화담숲카페_현미뻥스크림,10,2,6
102673,2024-06-13,화담숲카페_현미뻥스크림,14,3,6
102674,2024-06-14,화담숲카페_현미뻥스크림,12,4,6


In [18]:
def ensure_daily_continuity(df: pd.DataFrame) -> pd.DataFrame:
    """
    Make a single item's time series gap-free at daily frequency.

    The frame is indexed by '영업일자', sorted, and re-indexed onto every
    calendar day between its first and last date. Days that were missing get
    '매출수량' = 0; every other column is forward-filled from the previous
    existing row. '영업일자' is returned as a regular (first) column.
    """
    by_date = df.set_index('영업일자').sort_index()

    # Every calendar day between the first and last observation of this item.
    every_day = pd.date_range(
        start=by_date.index.min(),
        end=by_date.index.max(),
        freq='D',
    )

    return (
        by_date
        .reindex(every_day)          # rows for missing days appear as NaN
        .fillna({'매출수량': 0})      # no record for a day -> zero sales
        .ffill()                     # remaining columns carry the last known value
        .reset_index()
        .rename(columns={'index': '영업일자'})
    )
# ==============================================================
# ==============================================================

In [19]:
# -------------------------------
# 2. 슬라이딩 윈도우 생성
# -------------------------------

def build_lag_samples(
    df: pd.DataFrame,
    lags: int = 28,
    horizon: int = 7,
    train_mode: bool = True
):
    """
    Turn per-item daily sales into lag-feature rows.

    Each '영업장명_메뉴명' group is first made gap-free with
    ``ensure_daily_continuity``. Groups with fewer than ``lags`` days are skipped.

    Train mode
        One row per forecast origin ``i`` such that ``lags`` past days and
        ``horizon`` future days both exist. Features are lag_1 (most recent)
        ... lag_{lags}, plus 'dow' / 'month' of the first forecast day, which is
        also stored as 'ref_date' (used for the time-based split and dropped
        before fitting).
        Returns ``(X, y)`` where ``y`` has columns 't+1' ... 't+{horizon}'.

    Test mode
        One row per group built from its last ``lags`` days. 'dow' / 'month'
        describe the first forecast day (last observed day + 1), matching the
        definition used in train mode.
        Returns ``(X_test, keys)`` where ``keys`` is a list of
        ``(item key, first forecast date)`` tuples aligned with the rows of ``X_test``.

    When no window can be built, empty frames with the expected columns are returned.
    """
    key_col = '영업장명_메뉴명'
    lag_names = [f'lag_{l}' for l in range(1, lags + 1)]
    X_rows, y_rows, keys = [], [], []

    for key, grp in tqdm(df.groupby(key_col), desc='window'):
        g = ensure_daily_continuity(grp)
        sales = g['매출수량'].values
        dates = g['영업일자']

        if len(sales) < lags:
            # Not enough history to fill the lag block: skip this item.
            continue

        if train_mode:
            # The last valid origin is len(sales) - horizon (its target slice
            # ends exactly at the final observation), hence the "+ 1".
            for i in range(lags, len(sales) - horizon + 1):
                lag_block = sales[i - lags:i][::-1]    # reversed so the newest value is lag_1
                ref_date = dates.iloc[i]               # first forecast day

                X_row = {
                    key_col: key,
                    'dow': int(ref_date.dayofweek),
                    'month': int(ref_date.month),
                    'ref_date': ref_date,
                }
                X_row.update(zip(lag_names, map(float, lag_block)))

                X_rows.append(X_row)
                y_rows.append(sales[i:i + horizon])    # t+1 ... t+H
        else:
            lag_block = sales[-lags:][::-1]
            # Calendar features must describe the first forecast day, exactly
            # as in train mode; using the last observed day would shift them by one.
            pred_start = dates.iloc[-1] + timedelta(days=1)

            X_row = {
                key_col: key,
                'dow': int(pred_start.dayofweek),
                'month': int(pred_start.month),
            }
            X_row.update(zip(lag_names, map(float, lag_block)))

            X_rows.append(X_row)
            keys.append((key, pred_start))

    # Explicit column list keeps the schema stable even when no rows were produced.
    x_cols = [key_col, 'dow', 'month'] + (['ref_date'] if train_mode else []) + lag_names
    X = pd.DataFrame(X_rows, columns=x_cols)

    if train_mode:
        y_cols = [f't+{h}' for h in range(1, horizon + 1)]
        y = pd.DataFrame(y_rows, columns=y_cols)
        return X, y
    else:
        return X, keys
# Build the training windows: 28 lag features per row, 7-day-ahead targets.
X_train, y_train = build_lag_samples(train_df, lags=28, horizon=7, train_mode=True)

window: 100%|██████████| 193/193 [00:01<00:00, 147.85it/s]


In [23]:
# Preview the first feature rows and their matching target rows.
display(X_train.head())
display(y_train.head())

Unnamed: 0,영업장명_메뉴명,dow,month,ref_date,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,...,lag_19,lag_20,lag_21,lag_22,lag_23,lag_24,lag_25,lag_26,lag_27,lag_28
0,느티나무 셀프BBQ_1인 수저세트,6,1,2023-01-29,0.0,8.0,0.0,0.0,2.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,느티나무 셀프BBQ_1인 수저세트,0,1,2023-01-30,8.0,0.0,8.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,느티나무 셀프BBQ_1인 수저세트,1,1,2023-01-31,0.0,8.0,0.0,8.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,느티나무 셀프BBQ_1인 수저세트,2,2,2023-02-01,4.0,0.0,8.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,느티나무 셀프BBQ_1인 수저세트,3,2,2023-02-02,6.0,4.0,0.0,8.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,t+1,t+2,t+3,t+4,t+5,t+6,t+7
0,8,0,4,6,2,0,2
1,0,4,6,2,0,2,2
2,4,6,2,0,2,2,4
3,6,2,0,2,2,4,5
4,2,0,2,2,4,5,0


In [26]:
import itertools, numpy as np, pandas as pd

# Names of the lag feature columns; range(1, 29) matches the lags=28 used to build X_train.
lag_cols = [f'lag_{i}' for i in range(1, 29)]  # lag_1 … lag_28

def max_zero_run(arr):
    """Return the length of the longest run of consecutive zeros in ``arr`` (0 if there is none)."""
    longest = 0
    current = 0
    for value in arr:
        if value == 0.0:
            current += 1
            if current > longest:
                longest = current
        else:
            # Any non-zero value ends the current run.
            current = 0
    return longest

# Longest run of consecutive zeros among the lag features of each row.
X_train['max_zero_run'] = X_train[lag_cols].apply(max_zero_run, axis=1)

# 1) Keep only rows whose longest zero run is shorter than the threshold
#    (uses the max_zero_run column computed just above).
THR = 14                     # cut-off: rows with a zero run of THR or more are dropped
mask_keep = X_train['max_zero_run'] < THR

print(f"드롭 비율: {(~mask_keep).mean():.1%}")  # share of rows removed by the filter

# 2) Data used for fitting / tuning (helper column removed again).
X_fit = X_train.loc[mask_keep].drop(['max_zero_run'], axis=1)
y_fit = y_train.loc[mask_keep]

드롭 비율: 23.0%


In [30]:
# Preview the filtered features and targets; the original row index is kept.
display(X_fit.head())
display(y_fit.head())

Unnamed: 0,영업장명_메뉴명,dow,month,ref_date,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,...,lag_19,lag_20,lag_21,lag_22,lag_23,lag_24,lag_25,lag_26,lag_27,lag_28
3,느티나무 셀프BBQ_1인 수저세트,2,2,2023-02-01,4.0,0.0,8.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,느티나무 셀프BBQ_1인 수저세트,3,2,2023-02-02,6.0,4.0,0.0,8.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,느티나무 셀프BBQ_1인 수저세트,4,2,2023-02-03,2.0,6.0,4.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,느티나무 셀프BBQ_1인 수저세트,5,2,2023-02-04,0.0,2.0,6.0,4.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,느티나무 셀프BBQ_1인 수저세트,6,2,2023-02-05,2.0,0.0,2.0,6.0,4.0,0.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,t+1,t+2,t+3,t+4,t+5,t+6,t+7
3,6,2,0,2,2,4,5
4,2,0,2,2,4,5,0
5,0,2,2,4,5,0,6
6,2,2,4,5,0,6,6
7,2,4,5,0,6,6,3


In [5]:
from typing import List, Tuple
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

# ------------------------------------------------------------------
# 1. 시간 기준 검증 분할 (그대로 재사용)
# ------------------------------------------------------------------
def time_based_split(X: pd.DataFrame,
                     y: pd.DataFrame,
                     valid_days: int = 14) -> Tuple[List[int], List[int]]:
    """
    Split row labels by 'ref_date': the most recent ``valid_days`` days form
    the validation set, everything up to and including the cutoff is training.

    Returns ``(train_idx, val_idx)`` as lists of index labels of ``X``.
    ``y`` is accepted for signature symmetry but not used.
    """
    ref_dates = X['ref_date']
    cutoff = ref_dates.max() - pd.Timedelta(days=valid_days)

    older = ref_dates[ref_dates <= cutoff]
    recent = ref_dates[ref_dates > cutoff]
    return older.index.tolist(), recent.index.tolist()

# ------------------------------------------------------------------
# 2. XGBoost 학습 함수
# ------------------------------------------------------------------
def train_xgboost(
    X: pd.DataFrame,
    y: pd.DataFrame,
    gpu: bool = True,
    valid_days: int = 14,
):
    """
    Fit one XGBoost regressor per forecast step inside a preprocessing pipeline.

    Parameters
    ----------
    X : feature frame with 'ref_date', the categorical columns
        '영업장명_메뉴명', 'dow', 'month', and numeric 'lag_*' columns.
    y : target frame aligned with ``X`` by index, one column per forecast step.
    gpu : when True use the GPU tree builder and GPU predictor; when False
        both run on CPU.
    valid_days : number of most recent days (by 'ref_date') held out for validation.

    Returns
    -------
    The fitted sklearn ``Pipeline`` (ColumnTransformer -> MultiOutputRegressor(XGBRegressor)).
    The model is fitted on the training part of the split only; the held-out
    part is used just to print the per-step RMSE.
    """
    # -- 1. time-based validation split ---------------------------------
    train_idx, val_idx = time_based_split(X, y, valid_days=valid_days)

    # 'ref_date' is only needed for the split, not as a feature.
    X_tr  = X.loc[train_idx].drop(columns=['ref_date'])
    X_val = X.loc[val_idx].drop(columns=['ref_date'])
    y_tr  = y.loc[train_idx]
    y_val = y.loc[val_idx]

    # -- 2. preprocessing: one-hot for categoricals, lags passed through --
    cat_cols = ['영업장명_메뉴명', 'dow', 'month']
    num_cols = [c for c in X_tr.columns if c.startswith('lag_')]

    preproc = ColumnTransformer(
        transformers=[
            # handle_unknown='ignore': unseen items at predict time encode as all zeros
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
            ('num', 'passthrough',                   num_cols)
        ]
    )

    # -- 3. base regressor (cloned once per target by MultiOutputRegressor) --
    # NOTE(review): 'gpu_hist' and 'predictor' are the pre-2.0 XGBoost spellings;
    # XGBoost >= 2.0 expects tree_method='hist' together with device='cuda'.
    base_model = XGBRegressor(
        n_estimators      = 2000,
        learning_rate     = 0.05,
        max_depth         = 6,
        subsample         = 0.8,
        colsample_bytree  = 0.8,
        objective         = 'reg:squarederror',
        random_state      = 42,
        tree_method       = 'gpu_hist' if gpu else 'hist',
        # Must follow the gpu flag; forcing the GPU predictor on a CPU-only run is wrong.
        predictor         = 'gpu_predictor' if gpu else 'cpu_predictor',
        # early_stopping_rounds is not wired through MultiOutputRegressor,
        # so the full n_estimators are always trained.
    )

    model = Pipeline(steps=[
        ('prep', preproc),
        ('xgb',  MultiOutputRegressor(base_model, n_jobs=-1))
    ])

    # -- 4. fit ----------------------------------------------------------
    model.fit(X_tr, y_tr)

    # -- 5. validation RMSE per forecast step (skipped if nothing was held out) --
    if len(X_val) > 0:
        preds = model.predict(X_val)
        rmse  = ((preds - y_val.values) ** 2).mean(axis=0) ** 0.5
        print("[Validation] RMSE per horizon:", rmse.round(2))

    return model
# Train on the zero-run-filtered samples; gpu is left at its default (True).
model = train_xgboost(X_fit, y_fit)
def predict_test(
    model,
    X_test: pd.DataFrame,
    keys,
    test_prefix: str | None = None,
    horizon: int = HORIZON
) -> pd.DataFrame:
    """
    model : sklearn Pipeline (prep + MultiOutputRegressor(XGBRegressor))
    X_test: build_lag_samples(...) 결과 DataFrame (ref_date 컬럼 없음)
    """
    # ① CatBoost Pool 제거 → DataFrame 그대로 전달
    preds = model.predict(X_test)           # shape = (n_rows, 7)

    # ② 1차원 예외 처리 (샘플이 1행일 때)
    if preds.ndim == 1:
        preds = preds.reshape(1, -1)

    rows = []
    for (key, start_date), p in zip(keys, preds):
        for h in range(horizon):
            # (A) 파일 접두어 형식
            if test_prefix is not None:
                date_label = f"{test_prefix}+{h+1}일"
            # (B) datetime 형식
            else:
                date_label = start_date + timedelta(days=h)

            rows.append({
                '영업장명_메뉴명': key,
                '영업일자'      : date_label,
                '예측수량'      : max(0.0, float(p[h]))
            })
    return pd.DataFrame(rows)

# (2) Collect the test files ----------------------------------------
# args.test may point at a single CSV or at a directory of TEST*.csv files.
test_path = Path(args.test)
test_files = ([test_path] if test_path.is_file()
              else sorted(test_path.glob("TEST*.csv")))
if not test_files:
    raise FileNotFoundError(f"{test_path} 에서 TEST*.csv 를 찾을 수 없습니다.")
print(f"[INFO] 예측 대상 TEST 파일 {len(test_files)}개")

# (3) Predict file by file and collect the long-format results ------
pred_list = []
for p in test_files:                    # e.g. p = PosixPath('.../TEST_00.csv')
    tdf          = add_calendar_feats(read_data(p))
    X_tst, keys  = build_lag_samples(tdf, train_mode=False)

    prefix = p.stem.split('.')[0]       # file name without extension, e.g. 'TEST_00'
    pred_df = predict_test(model, X_tst, keys, test_prefix=prefix)
    pred_list.append(pred_df)

if not pred_list:
    raise RuntimeError("예측된 결과가 없습니다. TEST 파일/데이터 확인!")

# (4) Concatenate and save -------------------------------------------
full_pred = (pd.concat(pred_list, ignore_index=True)
              .sort_values(['영업장명_메뉴명','영업일자']))
out_path = Path(args.out)
if out_path.is_dir():
    out_path = out_path / "submission.csv"
# to_csv does not create missing directories, so make sure the target folder exists.
out_path.parent.mkdir(parents=True, exist_ok=True)
full_pred.to_csv(out_path, index=False)
print(f"[DONE] 최종 {full_pred.shape[0]:,}행 저장 → {out_path.resolve()}")

# Reload the long-format predictions from the file written just above
# (out_path) rather than from a hard-coded Colab Drive path, so this step
# always converts the predictions produced by this run.
df = pd.read_csv(out_path)
# The submission converter expects the value column to be called '매출수량'.
df = df.rename(columns={'예측수량': '매출수량'})

def convert_to_submission_format(pred_df: pd.DataFrame, sample_submission: pd.DataFrame):
    """
    Pivot long-format predictions into the wide layout of ``sample_submission``.

    Parameters
    ----------
    pred_df : frame with columns '영업일자', '영업장명_메뉴명', '매출수량'.
    sample_submission : template whose first column is '영업일자' and whose
        remaining columns are item names.

    Returns
    -------
    A copy of the template in which every item column is float and holds the
    prediction for that (date, item) pair, or 0 where no prediction exists.
    The template itself is not modified.
    """
    # (date, item) -> predicted quantity
    pred_dict = dict(zip(
        zip(pred_df['영업일자'], pred_df['영업장명_메뉴명']),
        pred_df['매출수량']
    ))

    final_df = sample_submission.copy()
    dates = final_df['영업일자'].tolist()

    # Fill one whole column at a time instead of casting and writing cell by cell
    # inside a row x column loop.
    for col in final_df.columns[1:]:  # item columns
        final_df[col] = [float(pred_dict.get((date, col), 0)) for date in dates]

    return final_df

# Load the wide submission template, fill it with the predictions and export it.
# NOTE(review): the template path is a hard-coded absolute Google Drive location,
# unlike the relative args.* paths used earlier — confirm it exists in the
# environment this notebook is run in.
sample_submission = pd.read_csv("/content/drive/MyDrive/data/baseline_submission.csv")
submission = convert_to_submission_format(df, sample_submission)
# utf-8-sig writes a BOM at the start of the file.
submission.to_csv('baseline_submission_xgb_deadcut.csv', index=False, encoding='utf-8-sig')

NameError: name 'read_data' is not defined