**LG Aimers 7기 본선 제출용 코드 (해커톤 본선은 LG인화원에서 제공된 서버로 진행되어 최종 코드는 남아 있지 않으며, 아래는 마지막으로 코랩에 저장해 둔 코드입니다)**

LightGBM + XGBoost + Meta 데이터(예선에 사용했던 LSTM 제거)

Public 점수 0.53972점으로 17등이었지만, 일반화 성능에 집중한 결과

Private 점수 0.54399점으로 9등을 달성했습니다.

https://dacon.io/competitions/official/236594/leaderboard


In [None]:
## ============================================================
# Resort F&B 7-day Forecast — LGB + XGB (+ meta 28-day rolls)
# - 시간 퍼지 GroupKFold OOF → α(h,DOW[,season]) 튜닝 → γ/β & 캘맵 보정
# - FULL 재학습 멀티시드 백깅 → TEST별 독립 추론
# - meta: group/hwadam/room/ski/weather 의 28일 롤링 통합 (누출 방지)
# - 강화: price 피처, 동요일 반복 신호, 동시매장 모멘텀, 시즌별 α·q
# ============================================================
import os, glob, re, math, gc, warnings, random, unicodedata
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder

# ---------------- Repro & blend knobs ----------------
SEED = 42
# Seed both the stdlib and the NumPy global generators with the same value.
random.seed(SEED)
np.random.seed(SEED)

OOB_BLEND = 0.40   # initial XGB share when blending on OOF predictions
ALPHA_XGB = 0.40   # fallback XGB share at inference time
MAX_AX_SUM = 0.70  # upper bound on the tree-model share

# ---------------- path ----------------
def _pick(*candidates):
    """Return the first candidate that is a string naming an existing path.

    Non-string candidates are skipped.  When no candidate exists on disk the
    last candidate is returned as-is, whatever it is.
    """
    existing = (
        cand for cand in candidates
        if isinstance(cand, str) and os.path.exists(cand)
    )
    return next(existing, candidates[-1])

# Resolve the data root: prefer '/mnt/data', then a local 'data' folder,
# otherwise fall back to the current directory.
if os.path.exists('/mnt/data'):
    BASE_DIR = '/mnt/data'
elif os.path.exists('data'):
    BASE_DIR = 'data'
else:
    BASE_DIR = './'

# Each location below is the first existing candidate; when none exists,
# _pick returns the last candidate, so BASE_DIR is the effective fallback
# for the directory constants.
TRAIN_DIR       = _pick(os.path.join(BASE_DIR, 'train'), BASE_DIR)
TEST_DIR        = _pick(os.path.join(BASE_DIR, 'test'), BASE_DIR)
TRAIN_META_DIR  = _pick(os.path.join(BASE_DIR, 'meta'),
                        os.path.join(TRAIN_DIR, 'meta'),
                        BASE_DIR)
TEST_META_DIR   = _pick(os.path.join(TEST_DIR, 'meta'),
                        os.path.join(BASE_DIR, 'test', 'meta'),
                        os.path.join(BASE_DIR, 'meta'),
                        BASE_DIR)

# Main input files.  These may point at non-existent paths when nothing is
# found; the optional ones are later read through _safe_read_csv.
TRAIN_PATH        = _pick(os.path.join(TRAIN_DIR, 'train.csv'),
                          os.path.join(BASE_DIR, 'train.csv'))
SAMPLE_SUB_PATH   = _pick(os.path.join(BASE_DIR, 'sample_submission.csv'),
                          os.path.join(BASE_DIR, 'data', 'sample_submission.csv'))
PRICE_PATH        = _pick(os.path.join(BASE_DIR, 'price.csv'),
                          os.path.join(TRAIN_DIR, 'price.csv'))

# meta paths (train)
TRAIN_GROUP_PATH   = _pick(os.path.join(TRAIN_META_DIR, 'TRAIN_group.csv'),
                           os.path.join(BASE_DIR, 'TRAIN_group.csv'))
TRAIN_HWADAM_PATH  = _pick(os.path.join(TRAIN_META_DIR, 'TRAIN_hwadam.csv'),
                           os.path.join(BASE_DIR, 'TRAIN_hwadam.csv'))
TRAIN_ROOM_PATH    = _pick(os.path.join(TRAIN_META_DIR, 'TRAIN_room.csv'),
                           os.path.join(BASE_DIR, 'TRAIN_room.csv'))
TRAIN_SKI_PATH     = _pick(os.path.join(TRAIN_META_DIR, 'TRAIN_ski.csv'),
                           os.path.join(BASE_DIR, 'TRAIN_ski.csv'))
TRAIN_WEATHER_PATH = _pick(os.path.join(TRAIN_META_DIR, 'TRAIN_weather.csv'),
                           os.path.join(BASE_DIR, 'TRAIN_weather.csv'))

def _pick_test(fname):
    """Locate a TEST_xx meta file: TEST_META_DIR first, then BASE_DIR."""
    in_meta_dir = os.path.join(TEST_META_DIR, fname)
    in_base_dir = os.path.join(BASE_DIR, fname)
    return _pick(in_meta_dir, in_base_dir)

# Meta CSVs share the 'TEST_' prefix with the test inputs (TEST_group_XX.csv,
# TEST_hwadam_XX.csv, ...), so they are excluded from the list of test files.
_TEST_META_PREFIXES = ('TEST_group_', 'TEST_hwadam_', 'TEST_room_',
                       'TEST_ski_', 'TEST_weather_')

# A set removes the duplicates that appear when TEST_DIR falls back to
# BASE_DIR and both globs therefore scan the same directory.
TEST_FILES = sorted({
    path
    for folder in (TEST_DIR, BASE_DIR)
    for path in glob.glob(os.path.join(folder, 'TEST_*.csv'))
    if not os.path.basename(path).startswith(_TEST_META_PREFIXES)
})
OUT_PATH = os.path.join(BASE_DIR, 'submission.csv')
print(f"[Path] BASE_DIR={BASE_DIR}")
print(f"        TRAIN_DIR={TRAIN_DIR}")
print(f"        TEST_DIR={TEST_DIR}")

# ---------------- Utils ----------------
def _canon_text(s: str) -> str:
    """Canonicalise a label: NFKC-normalise, drop invisible characters, strip.

    ``None`` maps to the empty string; any other value goes through ``str``.
    """
    if s is None:
        return ''
    text = unicodedata.normalize('NFKC', str(s))
    # BOM, zero-width space and non-breaking space are removed outright.
    for invisible in ('\ufeff', '\u200b', '\xa0'):
        text = text.replace(invisible, '')
    return text.strip()

def _clean_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` with every column label passed through ``_canon_text``."""
    renamed = {}
    for col in df.columns:
        renamed[col] = _canon_text(col)
    return df.rename(columns=renamed)

def _force_kor_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Rename English column names (date/key/y) to the Korean originals.

    Only columns that are actually present are renamed; the rest are kept.
    """
    english_to_korean = (
        ('date', '영업일자'),
        ('key', '영업장명_메뉴명'),
        ('y', '매출수량'),
    )
    present = {eng: kor for eng, kor in english_to_korean if eng in df.columns}
    return df.rename(columns=present)

def _normalize_key_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the '영업장명_메뉴명' key canonicalised.

    Frames without that column are copied unchanged.
    """
    key_col = '영업장명_메뉴명'
    result = df.copy()
    if key_col not in result.columns:
        return result
    result[key_col] = result[key_col].map(_canon_text)
    return result

def _safe_read_csv(path):
    """Best-effort CSV reader for optional inputs.

    Returns the frame with cleaned column labels, or ``None`` when ``path``
    is empty, does not exist, or cannot be parsed.  A failed read is reported
    with ``print`` (warnings are globally silenced in this script) so that an
    unreadable meta file does not silently turn into missing features.
    """
    if not path or not os.path.exists(path):
        return None
    try:
        df = pd.read_csv(path)
    except Exception as exc:  # best-effort boundary: report and keep going
        print(f"[Warn] failed to read {path}: {type(exc).__name__}: {exc}")
        return None
    try:
        return _clean_columns(df)
    except Exception as exc:  # keep the original "never raise" contract
        print(f"[Warn] failed to clean columns of {path}: {type(exc).__name__}: {exc}")
        return None

# --- Calibration shrink helper ---
def _shrink(x, tau=0.5):
    """Pull a multiplicative factor ``x`` towards 1 by the fraction ``tau``.

    ``tau=0`` gives 1.0, ``tau=1`` gives ``x`` back.
    """
    strength = float(tau)
    excess = float(x) - 1.0
    return 1.0 + strength * excess

# Shrink strengths, keyed by the calibration term they apply to.
TAU = {
    'gamma': 1.05,  # gamma shrink
    'store': 0.40,
    'sdow': 0.40,
    'sh': 0.40,
    'winter': 0.50,
    'wk': 0.25,
    'season': 0.30,
    'aff': 0.20,
}

# ---------------- Domain: stores/holidays ----------------
HEAVY_STORES = {'담하', '미라시아'}
HEAVY_W = 1.35


def store_weight(name: str) -> float:
    """Return HEAVY_W when ``name`` starts with a heavy-store prefix, else 1.0."""
    label = str(name)
    is_heavy = any(label.startswith(prefix) for prefix in HEAVY_STORES)
    return HEAVY_W if is_heavy else 1.0

def is_hwadam_store(s: str) -> bool:
    """True when the store name starts with '화담숲주막' or '화담숲카페'."""
    return str(s).startswith(('화담숲주막', '화담숲카페'))

def _season_id_from_dt(dt: pd.Timestamp) -> int:
    """Map a date's month to a season id: 0 Mar-May, 1 Jun-Aug, 2 Sep-Nov, 3 otherwise."""
    month = int(dt.month)
    if 3 <= month <= 5:
        return 0
    if 6 <= month <= 8:
        return 1
    if 9 <= month <= 11:
        return 2
    return 3

def _spring_autumn_active(dt: pd.Timestamp) -> int:
    """Return 1 for months 4, 5, 6, 9, 10 and 11, else 0."""
    active_months = [4, 5, 6, 9, 10, 11]
    month = int(dt.month)
    return 1 if month in active_months else 0

def _summer_family(dt: pd.Timestamp) -> int:
    """Return 1 for July and August, else 0."""
    month = int(dt.month)
    return 1 if month in [7, 8] else 0

# Multipliers keyed by (store, season name).  NOTE(review): not referenced in
# the visible part of the file; presumably consumed by later calibration code.
STORE_SEASON_AFFINITY = {
    ('화담숲주막', 'spring'): 1.15,
    ('화담숲주막', 'autumn'): 1.20,
    ('화담숲카페', 'spring'): 1.15,
    ('화담숲카페', 'autumn'): 1.20,
    ('미라시아', 'summer'): 1.08,
    ('느티나무 셀프BBQ', 'summer'): 1.10,
}

# Korean public holidays (weekend days included) as a set of datetime.date.
# Substitute, temporary and election-day holidays are listed as well so the
# table agrees with the official calendar for 2023-2025.
K_HOLS = set(pd.to_datetime([
    # 2023
    "2023-01-01","2023-01-21","2023-01-22","2023-01-23","2023-01-24",
    "2023-03-01","2023-05-05","2023-05-27",
    "2023-05-29",  # substitute holiday for Buddha's Birthday
    "2023-06-06","2023-08-15",
    "2023-09-28","2023-09-29","2023-09-30",
    "2023-10-02",  # temporary holiday
    "2023-10-03","2023-10-09",
    "2023-12-25",
    # 2024
    "2024-01-01","2024-02-09","2024-02-10","2024-02-11","2024-02-12",
    "2024-03-01",
    "2024-04-10",  # general election day
    "2024-05-05","2024-05-06","2024-05-15","2024-06-06",
    "2024-08-15","2024-09-16","2024-09-17","2024-09-18",
    "2024-10-01",  # temporary holiday (Armed Forces Day)
    "2024-10-03",
    "2024-10-09","2024-12-25",
    # 2025
    "2025-01-01","2025-01-27","2025-01-28","2025-01-29",
    "2025-01-30",  # last day of the Seollal holiday
    "2025-03-01",
    "2025-03-03",  # substitute holiday for March 1st
    "2025-05-05","2025-05-06",
    "2025-06-03",  # presidential election day
    "2025-06-06","2025-08-15",
    "2025-10-03",
    "2025-10-05",  # first day of the Chuseok holiday (Sunday)
    "2025-10-06","2025-10-07","2025-10-08","2025-10-09",
    "2025-12-25",
]).date)

def is_holiday(ts) -> pd.Series:
    """Return a 0/1 integer Series flagging which dates are in K_HOLS.

    ``ts`` may be a Series, an index/list of dates, or a single date-like
    value; anything that is not already a Series is wrapped into one.
    """
    td = pd.to_datetime(ts)
    if not isinstance(td, pd.Series):
        td = pd.Series(td)
    return td.dt.date.map(lambda d: int(d in K_HOLS)).astype(int)

def season_code(m: int) -> int:
    """Map a month number to 0 (Dec-Feb), 1 (Mar-May), 2 (Jun-Aug) or 3 (other)."""
    month_groups = ((12, 1, 2), (3, 4, 5), (6, 7, 8))
    for code, months in enumerate(month_groups):
        if m in months:
            return code
    return 3

def _is_winter_date(dt: pd.Timestamp) -> int:
    """Return 1 for dates from Dec 10 through Mar 10 (inclusive), else 0."""
    month, day = dt.month, dt.day
    if month in [1, 2]:
        return 1
    if month == 12:
        return int(day >= 10)
    if month == 3:
        return int(day <= 10)
    return 0

# ---------------- Store/menu ecosystem priors ----------------
# Raw per-store prior values; they are rescaled below so their mean is 1.
_STORE_PRIOR_RAW = {
    '포레스트릿': 47.84,
    '화담숲주막': 34.38,
    '카페테리아': 18.86,
    '담하': 5.50,
    '미라시아': 5.50,
    '라그로타': 1.31,
}
if _STORE_PRIOR_RAW:
    _sp_mean = float(np.mean(list(_STORE_PRIOR_RAW.values())))
else:
    _sp_mean = 1.0
STORE_PRIOR = {}
for _store_key, _raw_val in _STORE_PRIOR_RAW.items():
    STORE_PRIOR[_store_key] = (_raw_val / _sp_mean) if _sp_mean > 0 else 1.0


def store_prior_index(store: str) -> float:
    """Mean-normalised prior for ``store``; 1.0 for stores not in the table."""
    return float(STORE_PRIOR.get(str(store), 1.0))

# ---------------- Time-pattern priors (월/요일) ----------------
def _normalise_prior_table(raw):
    """Divide every value by the table mean; returns (mean, normalised dict)."""
    mean_val = float(np.mean(list(raw.values()))) if raw else 1.0
    scaled = {k: (v / mean_val if mean_val > 0 else 1.0) for k, v in raw.items()}
    return mean_val, scaled


# Month priors: only the listed months carry a value, others default to 1.0.
_MONTH_PRIOR_RAW = {1: 21.64, 2: 17.37, 3: 2.64, 10: 15.47, 12: 13.71}
_m_mean, MONTH_PRIOR = _normalise_prior_table(_MONTH_PRIOR_RAW)

# Day-of-week priors (0=Monday ... 6=Sunday in pandas' dayofweek convention).
_DOW_PRIOR_RAW = {0: 7.76, 4: 12.26, 5: 15.28, 6: 12.72}
_d_mean, DOW_PRIOR = _normalise_prior_table(_DOW_PRIOR_RAW)


def month_prior(m: int) -> float:
    """Mean-normalised prior for month ``m``; 1.0 when the month is not listed."""
    return float(MONTH_PRIOR.get(int(m), 1.0))


def dow_prior(d: int) -> float:
    """Mean-normalised prior for weekday ``d``; 1.0 when the day is not listed."""
    return float(DOW_PRIOR.get(int(d), 1.0))

# ---------------- General helpers ----------------
def split_store_menu(x: str):
    """Split 'store_menu' at the first underscore into (store, menu).

    Keys without an underscore are returned as ("UNKNOWN", key).
    """
    text = str(x)
    store, sep, menu = text.partition("_")
    if not sep:
        return "UNKNOWN", text
    return store, menu

def ensure_full_daily_index_multi(df, value_cols):
    """Expand every series to a gap-free daily calendar.

    For each '영업장명_메뉴명' key the rows are reindexed onto every day
    between that key's first and last '영업일자'.  Inserted days get 0.0 in
    each of ``value_cols``; the per-key frames are stacked in groupby order.
    """
    frame = df.copy()
    frame['영업일자'] = pd.to_datetime(frame['영업일자'])

    pieces = []
    for key, rows in frame.groupby('영업장명_메뉴명'):
        rows = rows.sort_values('영업일자')
        first_day = rows['영업일자'].min()
        last_day = rows['영업일자'].max()
        calendar = pd.date_range(first_day, last_day, freq="D")

        filled = rows.set_index('영업일자').reindex(calendar)
        filled = filled.rename_axis('영업일자').reset_index()
        filled['영업장명_메뉴명'] = key
        for col in value_cols:
            filled[col] = filled[col].fillna(0.0)
        pieces.append(filled)
    return pd.concat(pieces, ignore_index=True)

def build_feature_from_window(y28: np.ndarray, y28_raw: np.ndarray):
    """Summarise one 28-day history window as a flat feature dict.

    ``y28`` is clipped at zero before use; ``y28_raw`` keeps its sign and
    feeds the negative-value features.  The lag loop and the trend fit both
    assume 28 entries.
    """
    clipped = np.clip(y28.astype(float), 0.0, None)
    raw = y28_raw.astype(float)

    # lag_1 is the most recent day, lag_28 the oldest.
    feats = {f'lag_{k}': float(clipped[-k]) for k in range(1, 29)}

    # Rolling statistics over the trailing 7 / 14 / 28 days.
    for tag, width in (('r7', 7), ('r14', 14), ('r28', 28)):
        tail = clipped[-width:]
        feats[f'{tag}_mean'] = float(tail.mean())
        feats[f'{tag}_std'] = float(tail.std(ddof=0))
        feats[f'{tag}_max'] = float(tail.max())
        feats[f'{tag}_sum'] = float(tail.sum())
        feats[f'{tag}_zero_share'] = float((tail == 0).mean())
        feats[f'{tag}_nz_share'] = float((tail > 0).mean())

    # Momentum: last week against the week before and against the window.
    last_week = clipped[-7:].mean()
    prior_week_vals = clipped[-14:-7]
    prior_week = prior_week_vals.mean() if prior_week_vals.size > 0 else 0.0
    feats['mom_7_7'] = float(last_week / (prior_week + 1e-6))
    feats['mom_7_28'] = float(last_week / (clipped.mean() + 1e-6))

    # Least-squares slope of log1p(y) against the day number 1..28.
    day_no = np.arange(1, 29)
    log_y = np.log1p(clipped)
    sxx = np.sum((day_no - day_no.mean()) ** 2)
    if sxx == 0:
        slope = 0.0
    else:
        slope = np.sum((day_no - day_no.mean()) * (log_y - log_y.mean())) / sxx
    feats['trend_slope'] = float(slope)

    # Recency and size of the last positive value.
    positive_at = np.where(clipped > 0)[0]
    if len(positive_at) > 0:
        feats['days_since_last_nz'] = float(28 - (positive_at[-1] + 1))
        feats['last_nz_val'] = float(clipped[positive_at[-1]])
    else:
        feats['days_since_last_nz'] = float(28)
        feats['last_nz_val'] = float(0.0)

    # Negative raw values: counts, magnitude, share and recency.
    is_neg = (raw < 0)
    for width in (7, 14, 28):
        feats[f'neg_cnt_{width}'] = float(is_neg[-width:].sum())
    if is_neg.any():
        feats['neg_sum_abs_28'] = float(np.abs(raw[is_neg][-28:]).sum())
    else:
        feats['neg_sum_abs_28'] = 0.0
    feats['neg_min_28'] = float(raw[-28:].min())
    feats['neg_share_28'] = float((raw[-28:] < 0).mean())
    negative_at = np.where(raw < 0)[0]
    if len(negative_at) > 0:
        feats['days_since_last_neg'] = float(28 - (negative_at[-1] + 1))
    else:
        feats['days_since_last_neg'] = float(28)

    # --- same-weekday repeat signal (4 weeks) ---
    same_dow = np.array([clipped[-7], clipped[-14], clipped[-21], clipped[-28]])
    feats['dow_repeat_mean4'] = float(same_dow.mean())
    feats['dow_repeat_std4'] = float(same_dow.std(ddof=0))
    feats['dow_repeat_max4'] = float(same_dow.max())
    feats['dow_repeat_min4'] = float(same_dow.min())
    return feats

# ============================================================
# Meta loaders (weather fix included)
# ============================================================
def _extract_weather_series(wdf):
    """Extract three daily weather series: temp_mean, temp_range, rain_sum.

    The date comes from '일시', else '영업일자', else an existing 'date'
    column.  Each output is ``None`` when no column name contains the
    matching keyword; all three are ``None`` when the frame is missing, empty
    or has no usable date column.
    """
    nothing = (None, None, None)
    if wdf is None or wdf.empty:
        return nothing

    w = wdf.copy()
    source_col = next((c for c in ('일시', '영업일자') if c in w.columns), None)
    if source_col is not None:
        # Drop the time of day so several readings per day share one key.
        w['date'] = pd.to_datetime(w[source_col]).dt.date
    elif 'date' not in w.columns:
        return nothing
    w['date'] = pd.to_datetime(w['date'])

    def _first_column_containing(keyword):
        matches = [c for c in w.columns if keyword in str(c)]
        return matches[0] if matches else None

    mean_col = _first_column_containing('평균기온')
    range_col = _first_column_containing('일교차')
    rain_col = _first_column_containing('강수량')

    by_day = w.groupby('date')
    temp_mean = by_day[mean_col].mean().astype(float) if mean_col else None
    temp_rng = by_day[range_col].mean().astype(float) if range_col else None
    rain = by_day[rain_col].sum().astype(float) if rain_col else None
    return temp_mean, temp_rng, rain

def _load_train_meta_bundle(train_dates_index: pd.DatetimeIndex, store_names: np.ndarray):
    """Load the TRAIN_* meta CSVs and assemble the training meta bundle.

    Returns a dict with:
      * ``site_df`` - daily frame indexed by the unique training dates, with
        one column per meta source that could be read (missing values 0.0);
      * ``group_df`` - the numeric per-column group table, or ``None``;
      * ``group_store_map`` - store name -> first group column whose name is
        a prefix of the store name (``None`` when nothing matches).
    """
    def _read_dated(path):
        # Read a meta CSV and parse its first column into a 'date' column.
        frame = _safe_read_csv(path)
        if frame is None or frame.empty:
            return None
        frame['date'] = pd.to_datetime(frame.iloc[:, 0])
        return frame

    def _non_date_columns(frame):
        return [c for c in frame.columns if c not in ['date']]

    def _numeric_by_date(frame, cols):
        # Non-numeric cells become NaN instead of raising.
        return frame.set_index('date')[cols].apply(pd.to_numeric, errors='coerce')

    # group
    group_df = None
    site_group = None
    gdf = _read_dated(TRAIN_GROUP_PATH)
    if gdf is not None:
        group_df = _numeric_by_date(gdf, _non_date_columns(gdf)).fillna(0.0).sort_index()
        site_group = group_df.sum(axis=1)

    # hwadam
    site_hwadam = None
    hdf = _read_dated(TRAIN_HWADAM_PATH)
    if hdf is not None:
        site_hwadam = _numeric_by_date(hdf, _non_date_columns(hdf)).sum(axis=1).sort_index()

    # room
    site_room = None
    rdf = _read_dated(TRAIN_ROOM_PATH)
    if rdf is not None:
        site_room = _numeric_by_date(rdf, _non_date_columns(rdf)).sum(axis=1).sort_index()

    # ski
    site_ski = None
    sdf = _read_dated(TRAIN_SKI_PATH)
    if sdf is not None:
        ski_cols = [c for c in sdf.columns if ('1일' in c) or ('내장객' in c)]
        if not ski_cols:
            # No daily-total column found: sum all non-date columns instead.
            ski_cols = _non_date_columns(sdf)
        site_ski = _numeric_by_date(sdf, ski_cols).sum(axis=1).sort_index()

    # weather
    temp_mean = temp_rng = rain = None
    wdf = _safe_read_csv(TRAIN_WEATHER_PATH)
    if wdf is not None:
        temp_mean, temp_rng, rain = _extract_weather_series(wdf)

    # One row per unique training date; series are aligned on that index.
    site_df = pd.DataFrame(index=pd.DatetimeIndex(sorted(pd.unique(train_dates_index))))
    named_series = {
        'site_group': site_group,
        'site_hwadam': site_hwadam,
        'site_room': site_room,
        'site_ski': site_ski,
        'temp_mean': temp_mean,
        'temp_range': temp_rng,
        'rain_sum': rain,
    }
    for col_name, series in named_series.items():
        if series is not None:
            site_df[col_name] = series
    site_df = site_df.sort_index().fillna(0.0)

    # group_store_map
    group_store_map = {}
    if group_df is not None:
        group_cols = list(group_df.columns)
        for store in np.unique(store_names.astype(str)):
            # First matching column wins; None when there is no match.
            group_store_map[store] = next(
                (c for c in group_cols if store.startswith(str(c))), None
            )
    return dict(site_df=site_df, group_df=group_df, group_store_map=group_store_map)

def _load_test_meta_bundle(tid: int):
    """Load the TEST_*_{tid:02d} meta CSVs for one test set.

    Returns a dict with ``site_df`` (daily frame of whichever meta series
    could be read, possibly empty) and ``group_df`` (numeric group table or
    ``None``).
    """
    def _read_dated(kind):
        # Locate and read TEST_<kind>_<tid>.csv; parse column 0 into 'date'.
        frame = _safe_read_csv(_pick_test(f'TEST_{kind}_{tid:02d}.csv'))
        if frame is None or frame.empty:
            return None
        frame['date'] = pd.to_datetime(frame.iloc[:, 0])
        return frame

    def _non_date_columns(frame):
        return [c for c in frame.columns if c not in ['date']]

    def _numeric_by_date(frame, cols):
        # Non-numeric cells become NaN instead of raising.
        return frame.set_index('date')[cols].apply(pd.to_numeric, errors='coerce')

    # group
    group_df = None
    site_group = None
    gdf = _read_dated('group')
    if gdf is not None:
        group_df = _numeric_by_date(gdf, _non_date_columns(gdf)).fillna(0.0).sort_index()
        site_group = group_df.sum(axis=1)

    # hwadam
    site_hwadam = None
    hdf = _read_dated('hwadam')
    if hdf is not None:
        site_hwadam = _numeric_by_date(hdf, _non_date_columns(hdf)).sum(axis=1).sort_index()

    # room
    site_room = None
    rdf = _read_dated('room')
    if rdf is not None:
        site_room = _numeric_by_date(rdf, _non_date_columns(rdf)).sum(axis=1).sort_index()

    # ski
    site_ski = None
    sdf = _read_dated('ski')
    if sdf is not None:
        ski_cols = [c for c in sdf.columns if ('1일' in c) or ('내장객' in c)]
        if not ski_cols:
            # No daily-total column found: sum all non-date columns instead.
            ski_cols = _non_date_columns(sdf)
        site_ski = _numeric_by_date(sdf, ski_cols).sum(axis=1).sort_index()

    # weather
    temp_mean = temp_rng = rain = None
    wdf = _safe_read_csv(_pick_test(f'TEST_weather_{tid:02d}.csv'))
    if wdf is not None:
        temp_mean, temp_rng, rain = _extract_weather_series(wdf)

    site_df = pd.DataFrame()
    named_series = {
        'site_group': site_group,
        'site_hwadam': site_hwadam,
        'site_room': site_room,
        'site_ski': site_ski,
        'temp_mean': temp_mean,
        'temp_range': temp_rng,
        'rain_sum': rain,
    }
    for col_name, series in named_series.items():
        if series is not None:
            site_df[col_name] = series
    if not site_df.empty:
        site_df = site_df.sort_index().fillna(0.0)
    return dict(site_df=site_df, group_df=group_df)

# meta → 28일 윈도우에서 요약 피처 생성
def _meta_feats_from_dates(dates_28: np.ndarray,
                           store_name: str,
                           site_df: pd.DataFrame,
                           group_df: pd.DataFrame = None,
                           group_store_map: dict = None,
                           prefix: str = 'meta'):
    """Summarise site-level meta series over one 28-day window.

    For every site column three features are produced: the mean of the last
    7 days, the mean of the last 28 days and the last value.  Dates missing
    from ``site_df`` count as 0.0.

    The seven standard site columns are always emitted, with 0.0 when a
    column (or the whole frame) is unavailable, so the returned keys do not
    depend on which meta files happened to load.  Any additional columns in
    ``site_df`` follow after them.

    Three ``{prefix}_group_store_*`` features hold the same summaries for the
    group column mapped to ``store_name`` (0.0 when there is no mapping).
    """
    standard_cols = ['site_group', 'site_hwadam', 'site_room', 'site_ski',
                     'temp_mean', 'temp_range', 'rain_sum']
    window_idx = pd.to_datetime(dates_28)

    if site_df is None or site_df.empty:
        sub = None
        extra_cols = []
    else:
        sub = site_df.reindex(window_idx).fillna(0.0)
        extra_cols = [c for c in sub.columns if c not in standard_cols]

    out = {}
    for c in standard_cols + extra_cols:
        if sub is not None and c in sub.columns:
            arr = sub[c].values.astype(float)
            r7, r28, last = float(arr[-7:].mean()), float(arr[-28:].mean()), float(arr[-1])
        else:
            # Column unavailable: keep the feature with a neutral value.
            r7 = r28 = last = 0.0
        out[f'{prefix}_{c}_r7_mean'] = r7
        out[f'{prefix}_{c}_r28_mean'] = r28
        out[f'{prefix}_{c}_last'] = last

    # Per-store group counts (when a mapped column exists).
    v7 = v28 = vlast = 0.0
    if (group_df is not None) and (group_store_map is not None):
        gcol = group_store_map.get(str(store_name))
        if gcol and (gcol in group_df.columns):
            subg = group_df.reindex(window_idx)[gcol].fillna(0.0).values.astype(float)
            v7, v28, vlast = float(subg[-7:].mean()), float(subg[-28:].mean()), float(subg[-1])
    out[f'{prefix}_group_store_r7_mean'] = v7
    out[f'{prefix}_group_store_r28_mean'] = v28
    out[f'{prefix}_group_store_last'] = vlast
    return out

# ---------------- Load ----------------
# Read the training table and standardise its column names to date/key/y_raw.
print(f"[Load] {TRAIN_PATH}")
train_raw = pd.read_csv(TRAIN_PATH)
train_raw = _clean_columns(train_raw); train_raw = _force_kor_cols(train_raw)
train_raw = train_raw.rename(columns={"영업일자":"date","영업장명_메뉴명":"key","매출수량":"y_raw"})
train_raw["date"] = pd.to_datetime(train_raw["date"])
train_raw["y_raw"] = train_raw["y_raw"].astype(float)

# --- price map ---
# Optional price table: per-key average price plus store-level and global
# means used as fallbacks.  All three stay empty/zero when the file is absent.
# NOTE(review): price keys are passed through _canon_text here while the
# training keys are not; confirm that the two key sets still match exactly.
PRICE_MAP = {}; STORE_PRICE_MEAN = {}; GLOBAL_PRICE_MEAN = 0.0
pdf = _safe_read_csv(PRICE_PATH)
if pdf is not None and not pdf.empty:
    pdf = pdf.rename(columns={'영업장명_메뉴명':'key','평균판매금액':'avg_price'})
    pdf['key'] = pdf['key'].map(_canon_text)
    pdf['avg_price'] = pd.to_numeric(pdf['avg_price'], errors='coerce').fillna(0.0)
    pdf['store'], pdf['menu'] = zip(*pdf['key'].map(split_store_menu))
    PRICE_MAP = dict(zip(pdf['key'], pdf['avg_price'].astype(float)))
    STORE_PRICE_MEAN = pdf.groupby('store')['avg_price'].mean().to_dict()
    GLOBAL_PRICE_MEAN = float(pdf['avg_price'].mean())

# y is the raw quantity clipped at zero; y_raw keeps negative values.
train = train_raw.copy()
train["y"] = train["y_raw"].clip(lower=0.0)
train["store"], train["menu"] = zip(*train["key"].map(split_store_menu))

print("Train:", train.shape, "| Period:", train["date"].min().date(), "~", train["date"].max().date())
print("Unique series:", train["key"].nunique(), "| Stores:", train["store"].nunique())
print(f"Zero share: {(train['y']==0).mean():.3f}")

# Fill calendar gaps per series (missing days become 0) and restore the
# short column names afterwards.
train_full = ensure_full_daily_index_multi(
    train[["date","key","y","y_raw"]].rename(columns={"date":"영업일자","key":"영업장명_메뉴명"}),
    value_cols=["y","y_raw"]
)
train_full = train_full.rename(columns={"영업일자":"date","영업장명_메뉴명":"key"})
train_full["store"], train_full["menu"] = zip(*train_full["key"].map(split_store_menu))

# ---------------- Store totals (for peer/ecosystem) ----------------
# Daily total of y per store, re-keyed as (store, date) and sorted so it can
# be looked up by store first.
_store_tot_ser = train_full.groupby(['date','store'])['y'].sum()
STORE_TOTAL_SER = _store_tot_ser.rename_axis(['date','store']).swaplevel(0,1).sort_index()

def _lookup_store_total_series(store_name, dates, ser):
    """Return ``ser`` values for ``store_name`` on each of ``dates``.

    ``ser`` is indexed by (store, date); pairs that are not present yield 0.0.
    """
    store_level = np.array([store_name] * len(dates), dtype=object)
    date_level = pd.to_datetime(dates)
    wanted = pd.MultiIndex.from_arrays([store_level, date_level], names=['store', 'date'])
    return ser.reindex(wanted, fill_value=0.0).values

# ---------------- Train meta bundle ----------------
# Load the training-side meta tables once; the window builder reads
# site_df / group_df / group_store_map from this bundle.
TRAIN_META = _load_train_meta_bundle(train_full['date'], train_full['store'].values)

# ---------------- Build windows (Tabular only) ----------------
print("[Windows] 28->7 supervised windows ...")

def make_supervised_windows(df_item, store_name, menu_name):
    """Build 28-day-history -> 7-day-target samples for one store/menu series.

    ``df_item`` must hold the series sorted by '영업일자' with 'y' and
    'y_raw' columns.  Returns three parallel lists: feature dicts, 7-value
    target arrays, and meta dicts (anchor date, store name, menu name).  The
    anchor date is the last day of the 28-day history window.  Series with
    fewer than 35 days produce no samples.
    """
    vals     = df_item['y'].values.astype(float)
    vals_raw = df_item['y_raw'].values.astype(float)
    dates    = pd.to_datetime(df_item['영업일자']).values
    # Store-wide daily totals aligned with this series' dates.
    st28_all = _lookup_store_total_series(store_name, dates, STORE_TOTAL_SER)

    X_rows=[]; Y_rows=[]; meta=[]
    if len(vals)<35: return X_rows, Y_rows, meta
    # `end` is the index of the last history day; 7 future days must remain.
    for end in range(27, len(vals)-7):
        y28  = vals[end-27:end+1]
        y28r = vals_raw[end-27:end+1]
        st28 = st28_all[end-27:end+1]
        # Peer demand = store total minus this item's own sales (floored at 0).
        peer28 = np.clip(st28 - y28, 0.0, None)
        dts = pd.to_datetime(dates[end-27:end+1])

        feats = build_feature_from_window(y28, y28r)
        feats['store_prior']    = store_prior_index(store_name)
        feats['store_r7_mean']  = float(st28[-7:].mean())
        feats['store_r28_mean'] = float(st28[-28:].mean())
        feats['peer_r7_mean']   = float(peer28[-7:].mean())
        feats['peer_r28_mean']  = float(peer28[-28:].mean())
        feats['share_in_store_r7']  = float(y28[-7:].sum()/(st28[-7:].sum()+1e-6))
        feats['share_in_store_r28'] = float(y28[-28:].sum()/(st28[-28:].sum()+1e-6))

        # Store-total and peer momentum (last 7 days vs. the 7 days before)
        def _mom(a,b): return float(a/(b+1e-6))
        feats['store_mom_7_7'] = _mom(st28[-7:].mean(),  st28[-14:-7].mean())
        feats['peer_mom_7_7']  = _mom(peer28[-7:].mean(), peer28[-14:-7].mean())

        # ---- meta 28-day rolling features (no leakage: past 28 days only)
        mf = _meta_feats_from_dates(
            dates_28=dts,
            store_name=store_name,
            site_df=TRAIN_META['site_df'],
            group_df=TRAIN_META['group_df'],
            group_store_map=TRAIN_META['group_store_map'],
            prefix='meta'
        )
        feats.update(mf)

        # Price features: item price, falling back to the store mean and then
        # the global mean; price_index is the price relative to the store mean.
        key_name = f"{store_name}_{menu_name}"
        p = PRICE_MAP.get(key_name, STORE_PRICE_MEAN.get(store_name, GLOBAL_PRICE_MEAN))
        mu_s = STORE_PRICE_MEAN.get(store_name, GLOBAL_PRICE_MEAN if GLOBAL_PRICE_MEAN>0 else 1.0)
        feats['avg_price']   = float(p if p is not None else 0.0)
        feats['price_index'] = float((p/(mu_s+1e-6)) if p and mu_s else 1.0)

        feats['store_name'] = store_name
        feats['menu_name']  = menu_name
        # Target: the 7 days immediately after the anchor day.
        yf  = vals[end+1:end+8]
        X_rows.append(feats); Y_rows.append(yf)
        meta.append({'anchor_date': pd.to_datetime(dates[end]),
                     'store_name': store_name,
                     'menu_name' : menu_name})
    return X_rows, Y_rows, meta

def df_to_windows(df_full):
    """Build supervised windows for every series in ``df_full``.

    Returns (feature DataFrame, target array, meta DataFrame) with rows in
    groupby order of '영업장명_메뉴명'; series yielding no windows are skipped.
    """
    feature_rows, target_rows, meta_rows = [], [], []
    for key, rows in df_full.groupby('영업장명_메뉴명'):
        store, menu = split_store_menu(key)
        ordered = rows.sort_values('영업일자')
        X, Y, M = make_supervised_windows(ordered, store, menu)
        if not X:
            continue
        feature_rows += X
        target_rows += Y
        meta_rows += M
    return pd.DataFrame(feature_rows), np.array(target_rows), pd.DataFrame(meta_rows)

# Build all training windows; df_to_windows expects the Korean column names.
X_base, Y, META = df_to_windows(
    train_full[["date","key","y","y_raw"]].rename(columns={"date":"영업일자","key":"영업장명_메뉴명"})
)
# Give both frames a fresh 0..n-1 index so they stay row-aligned.
META = META.reset_index(drop=True); X_base = X_base.reset_index(drop=True)

# Encoders
def fit_le_with_unk(values):
    """Fit a LabelEncoder on the string form of ``values`` plus an '<UNK>' class."""
    classes = set(pd.Series(values).astype(str).unique().tolist())
    classes.add("<UNK>")
    encoder = LabelEncoder()
    encoder.fit(sorted(classes))
    return encoder

# Fit one encoder per categorical name column (each includes '<UNK>').
le_store = fit_le_with_unk(META['store_name'])
le_menu  = fit_le_with_unk(META['menu_name'])

# Integer codes for the training rows.
X_base["store_le"] = le_store.transform(X_base["store_name"].astype(str).fillna("<UNK>"))
X_base["menu_le"]  = le_menu.transform(X_base["menu_name"].astype(str).fillna("<UNK>"))

def add_kw(df):
    """Return a copy of ``df`` with keyword-indicator columns added.

    Adds one 0/1 column per menu keyword (``kw_m_<kw>``, matched against
    'menu_name'), one per store keyword (``kw_s_<kw>``, matched against
    'store_name'), and 'menu_len' with the menu name length.
    """
    MENU_KW = ['세트','라떼','커피','아메리카노','맥주','소주','와인','막걸리','사케','피자','파스타','국수','라면','우동',
               '볶음','탕','스테이크','버거','샐러드','밥','비빔','디저트','케이크','아이스','주스','티','차','빵',
               '샌드위치','BBQ','꼬치','튀김','만두','키즈']
    STORE_KW = ['카페','주막','BBQ','라그로타','담하','미라시아','연회장','포레스트릿','화담','카페테리아']

    result = df.copy()
    result['store_name'] = result['store_name'].astype(str).fillna('')
    result['menu_name'] = result['menu_name'].astype(str).fillna('')

    menus = result['menu_name']
    stores = result['store_name']

    # Collect the new columns in insertion order: menu flags, store flags,
    # then the length feature.
    new_cols = {}
    for kw in MENU_KW:
        new_cols[f'kw_m_{kw}'] = menus.str.contains(kw).astype(int)
    for kw in STORE_KW:
        new_cols[f'kw_s_{kw}'] = stores.str.contains(kw).astype(int)
    new_cols['menu_len'] = menus.str.len().astype(int)
    return result.assign(**new_cols)

# Add keyword flags, then drop the raw name columns from the model matrix.
X_base = add_kw(X_base)
X_feat = X_base.drop(columns=['store_name','menu_name']).copy()
# Per-row anchor date and store name, row-aligned with X_feat.
anchor = META['anchor_date']
stores_for_rows = META['store_name'].values

print("Supervised X:", X_feat.shape, "Y:", Y.shape)

# ---------------- Site/Store calendar profiles ----------------
print("[Profiles] build Site/Store calendar indices (DOW & WOY) ...")
tf_ = train_full[['date','store','y']].copy()
tf_['date'] = pd.to_datetime(tf_['date'])

# Site level: total daily demand, then the mean for each weekday / ISO week
# divided by the overall daily mean (1.0 when that mean is not positive).
site_daily = tf_.groupby('date')['y'].sum().sort_index()
site_mu = site_daily.mean()
SITE_DOW_IDX = {d: (site_daily[site_daily.index.dayofweek==d].mean()/site_mu if site_mu>0 else 1.0) for d in range(7)}
SITE_WOY_IDX = {}
tmp = site_daily.to_frame('y')
tmp['woy'] = tmp.index.isocalendar().week.astype(int)
for w, g in tmp.groupby('woy'):
    SITE_WOY_IDX[int(w)] = (g['y'].mean()/site_mu if site_mu>0 else 1.0)

# Store level: the same two indices, keyed by (store, weekday) and
# (store, ISO week), relative to each store's own daily mean.
store_daily = tf_.groupby(['date','store'])['y'].sum().rename('y').reset_index()
STORE_DOW_IDX = {}
STORE_WOY_IDX = {}
for s, g in store_daily.groupby('store'):
    g = g.sort_values('date').copy()
    mu = g['y'].mean()
    if mu<=0:
        # Store without positive demand: neutral weekday index; no week
        # entries are stored for it.
        for d in range(7): STORE_DOW_IDX[(s,d)] = 1.0
        continue
    g['dow'] = g['date'].dt.dayofweek
    for d, gg in g.groupby('dow'):
        STORE_DOW_IDX[(s,int(d))] = gg['y'].mean()/mu
    g['woy'] = g['date'].dt.isocalendar().week.astype(int)
    for w, gg in g.groupby('woy'):
        STORE_WOY_IDX[(s,int(w))] = gg['y'].mean()/mu

def horizon_calendar(anchor_dates: pd.Series, h: int) -> pd.DataFrame:
    """Calendar features for the day ``h`` days after each anchor date.

    ``h`` is 1-based: the anchor is the last observed day, so ``h=1`` is the
    first forecast day and ``h=7`` the last.  Column names carry the same
    ``h`` (``dow_h1`` ... ``dow_h7``).

    The target date is ``anchor + h``.  The earlier ``anchor + h + 1`` moved
    every calendar feature one day past the day being predicted, because the
    callers in this file already pass the 1-based horizon.
    """
    td = pd.to_datetime(anchor_dates) + pd.to_timedelta(h, unit='D')
    dow = td.dt.dayofweek
    mon = td.dt.month
    woy = td.dt.isocalendar().week.astype(int)
    hol = is_holiday(td)
    # 1 when the day before or the day after the target date is a holiday.
    hol_adj = ((td - pd.Timedelta(days=1)).dt.date.map(lambda d: int(d in K_HOLS)) |
               (td + pd.Timedelta(days=1)).dt.date.map(lambda d: int(d in K_HOLS))).astype(int)
    is_winter = td.map(_is_winter_date).astype(int)
    season_id = td.map(_season_id_from_dt).astype(int)      # 0 spring, 1 summer, 2 autumn, 3 winter
    spring_autumn_act = td.map(_spring_autumn_active).astype(int)
    summer_fam = td.map(_summer_family).astype(int)

    dow_prior_vals = dow.map(dow_prior).astype(float)
    mon_prior_vals = mon.map(month_prior).astype(float)
    return pd.DataFrame({
        f'dow_h{h}': dow.astype(int),
        f'month_h{h}': mon.astype(int),
        f'woy_h{h}': woy.astype(int),
        f'is_weekend_h{h}': (dow>=5).astype(int),
        f'is_holiday_h{h}': hol,
        f'is_hol_adj_h{h}': hol_adj,
        f'season_h{h}': mon.map(season_code).astype(int),
        f'season_id_h{h}': season_id,
        f'is_winter_h{h}': is_winter,
        f'spring_autumn_active_h{h}': spring_autumn_act,
        f'summer_family_h{h}': summer_fam,
        f'is_winter_weekend_h{h}': (is_winter & (dow>=5)).astype(int),
        f'dow_prior_h{h}': dow_prior_vals.values,
        f'month_prior_h{h}': mon_prior_vals.values,
    })

def _profile_block(anchor_series: pd.Series, h: int, store_series: pd.Series) -> pd.DataFrame:
    """Site/store weekday and week-of-year indices for horizon ``h`` (0-based).

    The weekday and ISO week are read from ``horizon_calendar`` for step
    ``h + 1``; keys missing from a profile table fall back to 1.0.
    """
    step = h + 1
    cal = horizon_calendar(anchor_series, step)
    dows = [int(d) for d in cal[f'dow_h{step}'].values]
    woys = [int(w) for w in cal[f'woy_h{step}'].values]
    store_names = [str(s) for s in store_series.values]

    columns = {
        f'site_dow_idx_h{step}': [SITE_DOW_IDX.get(d, 1.0) for d in dows],
        f'site_woy_idx_h{step}': [SITE_WOY_IDX.get(w, 1.0) for w in woys],
        f'store_dow_idx_h{step}': [
            STORE_DOW_IDX.get((s, d), 1.0) for s, d in zip(store_names, dows)
        ],
        f'store_woy_idx_h{step}': [
            STORE_WOY_IDX.get((s, w), 1.0) for s, w in zip(store_names, woys)
        ],
    }
    return pd.DataFrame({name: np.array(vals, dtype=float) for name, vals in columns.items()})

def add_h_feats(Xb: pd.DataFrame, anchor_series: pd.Series, h: int, store_series: pd.Series):
    """Append the horizon-specific calendar and profile columns to ``Xb``.

    ``h`` is 0-based; the added columns are named for step ``h + 1`` (+1..+7).
    All parts are re-indexed from zero before being joined column-wise.
    """
    parts = [
        Xb,
        horizon_calendar(anchor_series, h + 1),
        _profile_block(anchor_series, h, store_series),
    ]
    return pd.concat([part.reset_index(drop=True) for part in parts], axis=1)

# weighted SMAPE
def smape_store_weighted(y_true, y_pred, stores_arr):
    """Store-weighted SMAPE over the entries whose true value is positive.

    SMAPE is averaged within each store, then the store means are combined
    with ``store_weight`` as weights.  Returns 0.0 when no true value is
    positive.
    """
    keep = y_true > 0
    if not np.any(keep):
        return 0.0

    actual = y_true[keep]
    predicted = y_pred[keep]
    store_ids = stores_arr[keep]
    per_row = 2.0 * np.abs(actual - predicted) / (np.abs(actual) + np.abs(predicted) + 1e-8)

    weighted_sum = 0.0
    weight_total = 0.0
    for store in np.unique(store_ids):
        in_store = (store_ids == store)
        w = store_weight(store)
        weighted_sum += w * per_row[in_store].mean()
        weight_total += w
    return float(weighted_sum / weight_total)

def lgb_wsmape_feval(preds, dset):
    """LightGBM custom metric: weighted SMAPE over rows with a positive label.

    Returns ``(name, value, is_higher_better)``; the value is 0.0 when no
    label is positive.  Rows are weighted by the dataset weights, or equally
    when the dataset has none.
    """
    labels = dset.get_label()
    weights = dset.get_weight()
    if weights is None:
        weights = np.ones_like(labels)

    positive = labels > 0
    if not np.any(positive):
        return ('wSMAPE', 0.0, False)

    denom = np.abs(labels) + np.abs(preds) + 1e-8
    smape = 2.0 * np.abs(labels - preds) / denom
    score = np.average(smape[positive], weights=weights[positive])
    return ('wSMAPE', float(score), False)

def xgb_wsmape_feval(preds, dtrain):
    """XGBoost custom metric: weighted SMAPE over rows with a positive label.

    Returns ``(name, value)``; the value is 0.0 when no label is positive.
    A missing or empty weight vector means equal weights.
    """
    labels = dtrain.get_label()
    weights = dtrain.get_weight()
    if weights is None or len(weights) == 0:
        weights = np.ones_like(labels)

    positive = labels > 0
    if not np.any(positive):
        return 'wSMAPE', 0.0

    denom = np.abs(labels) + np.abs(preds) + 1e-8
    smape = 2.0 * np.abs(labels - preds) / denom
    score = np.average(smape[positive], weights=weights[positive])
    return 'wSMAPE', float(score)

# ---- XGBoost compatibility helpers ----
def xgb_train_compat(params, dtrain, num_boost_round, evals, early_stopping_rounds=200, verbose_eval=200):
    """Train an XGBoost booster with wSMAPE early stopping across versions.

    The metric is passed as ``custom_metric`` first.  XGBoost documents that
    this keyword hands the metric transformed predictions when a builtin
    objective is used, which is what a SMAPE on the original scale needs.
    Versions without that keyword raise ``TypeError`` and the legacy
    ``feval`` keyword is tried next.  If neither is accepted, training falls
    back to the builtin 'rmse' metric (unless ``params`` already sets
    ``eval_metric``).

    NOTE(review): on the legacy ``feval`` path the metric may receive raw
    margins rather than transformed predictions; confirm for the installed
    version if that path is ever hit.
    """
    common = dict(num_boost_round=num_boost_round, evals=evals,
                  early_stopping_rounds=early_stopping_rounds,
                  verbose_eval=verbose_eval)
    for metric_kw in ('custom_metric', 'feval'):
        try:
            return xgb.train(params, dtrain, **{metric_kw: xgb_wsmape_feval}, **common)
        except TypeError:
            # Keyword not supported by this XGBoost version; try the next one.
            continue
    params2 = dict(params); params2.setdefault('eval_metric', 'rmse')
    return xgb.train(params2, dtrain, **common)

def xgb_best_iteration(model):
    """Return the model's ``best_iteration`` if set, else a positive
    ``best_ntree_limit``, else ``None``.

    NOTE(review): the two attributes are returned interchangeably although
    they look like an index and a count respectively; confirm callers treat
    the value accordingly.
    """
    best = getattr(model, 'best_iteration', None)
    if best is None:
        limit = getattr(model, 'best_ntree_limit', None)
        usable = limit is not None and limit > 0
        return int(limit) if usable else None
    return int(best)

def xgb_predict_compat(model, dmatrix):
    """Predict with the trees up to and including the model's best iteration.

    ``best_iteration`` is a 0-based round index while ``iteration_range`` is a
    half-open ``[begin, end)`` interval, so the end must be
    ``best_iteration + 1``. Passing the index itself would drop the best round,
    and for ``best_iteration == 0`` would produce ``(0, 0)``, which XGBoost
    interprets as "use every tree".

    Args:
        model: trained booster (anything with a compatible ``predict``).
        dmatrix: data to predict on.

    Returns:
        Whatever ``model.predict`` returns. All trees are used when the model
        carries no best-iteration information.
    """
    best_it = getattr(model, 'best_iteration', None)
    best_ntree = getattr(model, 'best_ntree_limit', None)

    if best_it is not None:
        end = int(best_it) + 1          # index -> exclusive end
    elif best_ntree is not None and best_ntree > 0:
        end = int(best_ntree)           # already a count
    else:
        end = None

    try:
        if end is not None:
            return model.predict(dmatrix, iteration_range=(0, end))
        return model.predict(dmatrix)
    except TypeError:
        # Older API without `iteration_range`: use the legacy tree-count keyword.
        if best_ntree is not None and best_ntree > 0:
            return model.predict(dmatrix, ntree_limit=best_ntree)
        return model.predict(dmatrix)

# ---------------- Model params (tuned) ----------------
# LightGBM: Poisson objective. The built-in metric is switched off ("None").
# NOTE(review): `n_estimators` is an iteration-count alias; confirm how it interacts
# with any `num_boost_round` passed to lgb.train() by the callers.
lgb_params = {
    "objective": "poisson",
    "learning_rate": 0.045,
    "num_leaves": 48,
    "feature_fraction": 0.85,
    "bagging_fraction": 0.75,
    "bagging_freq": 1,
    "min_data_in_leaf": 60,
    "max_depth": -1,
    "lambda_l1": 1e-3,
    "lambda_l2": 4e-2,
    "metric": "None",
    "n_estimators": 3000,
    "verbosity": -1,
    "seed": SEED,
    "feature_fraction_seed": SEED,
    "bagging_seed": SEED,
    "deterministic": True,
    "force_row_wise": True,
}

# XGBoost: Tweedie objective on histogram trees. The default eval metric is disabled.
XGB_TREE_METHOD = 'hist'
xgb_params_base = {
    'objective': 'reg:tweedie',
    'tweedie_variance_power': 1.2,
    'eta': 0.045,
    'max_depth': 8,
    'min_child_weight': 6,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'reg_lambda': 1.0,
    'reg_alpha': 0.10,
    'tree_method': XGB_TREE_METHOD,
    'disable_default_eval_metric': 1,
    'seed': SEED,
    'sampling_method': 'uniform',
    'max_bin': 384,
    'monotone_constraints': '()',
}

def get_cat_cols(cols):
    """Pick the columns of ``cols`` that are to be treated as categorical.

    Selected are the two label-encoded ids (``store_le``, ``menu_le``) when
    present, every column whose name starts with one of the per-horizon
    calendar prefixes, and every keyword column (``kw_m_*`` / ``kw_s_*``).

    Args:
        cols: list of column names.

    Returns:
        Sorted list of the selected names, without duplicates.
    """
    calendar_prefixes = (
        'dow_h','month_h','woy_h','is_weekend_h','is_holiday_h','is_hol_adj_h',
        'season_h','season_id_h','is_winter_h','is_winter_weekend_h',
        'spring_autumn_active_h','summer_family_h'
    )
    keyword_prefixes = ('kw_m_','kw_s_')

    selected = {name for name in ('store_le','menu_le') if name in cols}
    selected.update(name for name in cols if name.startswith(calendar_prefixes))
    selected.update(name for name in cols if name.startswith(keyword_prefixes))
    return sorted(selected)

# ---------------- Train 7 heads with GroupKFold (LGB + XGB OOF) ----------------
# For every horizon h (0..6) one LightGBM and one XGBoost model are fitted per CV fold.
# Outputs of this section:
#   oof_lgb / oof_xgb   : out-of-fold predictions, same shape as Y (column h = horizon h)
#   models_cv_lgb / xgb : per-horizon fold models together with the feature column list
#   avg_iters_lgb / xgb : best iteration of every fitted model (read later to size the full retrain)
print("[Train] LGB + XGB heads with GROUP time-purged CV ...")
oof_lgb = np.zeros_like(Y, dtype=float)
oof_xgb = np.zeros_like(Y, dtype=float)
models_cv_lgb = []
models_cv_xgb = []
avg_iters_lgb=[]; avg_iters_xgb=[]

META_store_series = META['store_name']

# Groups: anchor week × store. GroupKFold keeps each group inside a single fold.
# NOTE(review): GroupKFold alone does not leave a time gap between train and validation
# weeks; confirm this matches the intended "time-purged" scheme.
groups = pd.to_datetime(META['anchor_date']).dt.to_period('W').astype(str) + "_" + META['store_name'].astype(str)
gkf = GroupKFold(n_splits=5)
# The same five splits are reused for all horizons.
split_iter = list(gkf.split(X_feat, groups=groups))

for h in range(7):
    print(f"  - Horizon +{h+1}d")
    # Target of this head; negative values are clipped to 0.
    yh = Y[:,h].astype(float).clip(min=0.0)
    Xh = add_h_feats(X_feat, anchor, h, META_store_series)

    cal_h = horizon_calendar(anchor, h+1)
    is_weekend = cal_h[f'is_weekend_h{h+1}'].values
    mon = cal_h[f'month_h{h+1}'].values
    is_winter = cal_h[f'is_winter_h{h+1}'].values

    # recency & weights
    # rec grows quadratically from 0.5 (anchor 240+ days before the latest anchor) to 1.3 (latest anchor).
    max_anchor = pd.to_datetime(anchor).max()
    days_from_edge = (pd.to_datetime(anchor) - (max_anchor - pd.Timedelta(days=240))).dt.days.clip(lower=0, upper=240).values
    rec = 0.5 + 0.8*(days_from_edge/240.0)**2

    # Small positive targets (<= 25% quantile of positives, or <= their mean when there are
    # at most 50 positives) get a 10% weight bonus.
    pos = yh[yh>0]
    thr_low = np.quantile(pos, 0.25) if pos.size>50 else (pos.mean() if pos.size>0 else 0.0)
    lowpos_bonus = np.where((yh>0) & (yh<=thr_low), 1.10, 1.0)

    # Sample weight = store weight × recency × low-positive bonus, then multiplicative
    # boosts for weekend, Dec/Jan/Feb, winter, and winter weekends.
    sw = np.array([store_weight(s) for s in stores_for_rows]) * rec * lowpos_bonus
    sw = sw * (1.17**(is_weekend)) * (1.10**(pd.Series(mon).isin([12,1,2]).astype(int).values))
    sw = sw * (1.12**(is_winter)) * (1.06**(is_winter & is_weekend))
    # Rows with a zero target get zero weight.
    sw = sw * (yh>0)

    # Hwadam peak-season / summer weighting
    spring_autumn_act = cal_h[f'spring_autumn_active_h{h+1}'].values.astype(int)
    summer_fam = cal_h[f'summer_family_h{h+1}'].values.astype(int)
    is_hwadam_arr = np.array([is_hwadam_store(s) for s in stores_for_rows]).astype(int)
    summer_aff = np.array([STORE_SEASON_AFFINITY.get((s,'summer'),1.0) for s in stores_for_rows])
    sw = sw * (1.05 ** (is_hwadam_arr & spring_autumn_act))
    sw = sw * np.where((summer_fam==1) & (summer_aff>1.0), 1.03, 1.0)

    cat_cols = get_cat_cols(Xh.columns.tolist())
    fold_models_lgb=[]; fold_models_xgb=[];
    oof_col_lgb=np.zeros_like(yh); oof_col_xgb=np.zeros_like(yh)

    for fold,(tr,va) in enumerate(split_iter):
        # LGB
        # Early stopping (patience 200) on the custom wSMAPE feval; the training set is
        # listed in valid_sets too, so its score is logged alongside the validation score.
        # NOTE(review): lgb_params also sets n_estimators; verify which of n_estimators
        # and num_boost_round=5000 LightGBM actually applies as the round cap.
        dtr = lgb.Dataset(Xh.iloc[tr], label=yh[tr], weight=sw[tr], categorical_feature=cat_cols, free_raw_data=False)
        dva = lgb.Dataset(Xh.iloc[va], label=yh[va], weight=sw[va], categorical_feature=cat_cols, free_raw_data=False)
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(200)]
        model_lgb = lgb.train(lgb_params, dtr, valid_sets=[dtr,dva], valid_names=['train','valid'],
                              feval=lgb_wsmape_feval, num_boost_round=5000, callbacks=callbacks)
        # Fall back to the number of trained iterations if no best iteration was recorded.
        best_it_lgb = model_lgb.best_iteration or model_lgb.current_iteration()
        avg_iters_lgb.append(best_it_lgb)
        oof_col_lgb[va] = model_lgb.predict(Xh.iloc[va], num_iteration=best_it_lgb)
        fold_models_lgb.append(model_lgb)

        # XGB
        # Same rows and weights as LightGBM; the seed differs per fold and per horizon.
        xtr = xgb.DMatrix(Xh.iloc[tr], label=yh[tr], weight=sw[tr], feature_names=Xh.columns.tolist())
        xva = xgb.DMatrix(Xh.iloc[va], label=yh[va], weight=sw[va], feature_names=Xh.columns.tolist())
        params = dict(xgb_params_base); params['seed'] = SEED + fold + 100*h
        evals = [(xtr,'train'), (xva,'valid')]
        model_xgb = xgb_train_compat(params, xtr, num_boost_round=5000, evals=evals,
                                     early_stopping_rounds=200, verbose_eval=200)
        best_it_xgb = xgb_best_iteration(model_xgb)
        # Without a recorded best iteration, count all boosted rounds instead.
        avg_iters_xgb.append(best_it_xgb if best_it_xgb is not None else model_xgb.num_boosted_rounds())
        oof_col_xgb[va] = xgb_predict_compat(model_xgb, xva)
        fold_models_xgb.append(model_xgb)

    oof_lgb[:,h] = oof_col_lgb
    oof_xgb[:,h] = oof_col_xgb
    models_cv_lgb.append((fold_models_lgb, Xh.columns.tolist(), cat_cols))
    models_cv_xgb.append((fold_models_xgb, Xh.columns.tolist()))
    gc.collect()

# ---------------- OOF dynamic blend α(h,DOW) + q(h,DOW) ----------------
# Tunes, per (horizon, day-of-week) bucket, the XGBoost share α of the LGB/XGB blend and a
# quantile level q, both from out-of-fold predictions on positive targets.

# Store label for every element of Y.flatten(): each row's store repeated 7 times in a row.
stores_rep = np.repeat(stores_for_rows, 7)

alpha_hd = {}  # (h,d) -> alpha for XGB
for h in range(7):
    y   = Y[:, h].astype(float)
    pL  = oof_lgb[:, h].astype(float)
    pX  = oof_xgb[:, h].astype(float)
    dws = horizon_calendar(anchor, h+1)[f'dow_h{h+1}'].values
    for d in range(7):
        m = (y>0) & (dws==d)
        # Buckets with fewer than 300 positive samples keep the global default.
        if m.sum() < 300:
            continue
        best_s, best_a = 1e9, OOB_BLEND
        a_grid = [0.30,0.35,0.40,0.45,0.50,0.55,0.60,0.65]
        for a in a_grid:
            if a > MAX_AX_SUM: continue
            pr = pL[m]*(1-a) + pX[m]*a
            s  = smape_store_weighted(y[m], pr, stores_for_rows[m])
            if s < best_s:
                best_s, best_a = s, a
        # Stored value is the midpoint between the default and the grid optimum.
        alpha_hd[(h,d)] = 0.5*OOB_BLEND + 0.5*best_a  # smoothing

# Blend the OOF predictions with the tuned α (default OOB_BLEND for untuned buckets).
oof_blend = np.zeros_like(oof_lgb)
for h in range(7):
    pL  = oof_lgb[:, h].astype(float)
    pX  = oof_xgb[:, h].astype(float)
    dws = horizon_calendar(anchor, h+1)[f'dow_h{h+1}'].values
    a_arr = np.array([alpha_hd.get((h, int(dd)), OOB_BLEND) for dd in dws], dtype=float)
    a_arr = np.clip(a_arr, 0.0, MAX_AX_SUM)
    oof_blend[:, h] = pL*(1.0 - a_arr) + pX*a_arr

# q(h,d): 0.60 when the blend under-predicts in the bucket (median actual/pred > 1.03),
# otherwise 0.50. Inference uses it as the quantile taken across the seed predictions.
Q_HD = {}
for h in range(7):
    y  = Y[:, h].astype(float)
    p  = oof_blend[:, h].astype(float)
    dws= horizon_calendar(anchor, h+1)[f'dow_h{h+1}'].values
    for d in range(7):
        m = (y>0) & (dws==d)
        if m.sum() < 300:
            continue
        r = np.median(y[m] / (p[m] + 1e-6))
        Q_HD[(h,d)] = 0.60 if r > 1.03 else 0.50

# Overall OOF score of the blend; Y, oof_blend and stores_rep are flattened in the same order.
val_score = smape_store_weighted(Y.flatten(), oof_blend.flatten(), stores_rep)
print(f"[OOF] weighted SMAPE (LGB+XGB dyn α, OOB_BLEND={OOB_BLEND}) ≈ {val_score:.4f}")
print(f"       tuned α buckets: {len(alpha_hd)} | q buckets: {len(Q_HD)}")

# ---------------- α(h,d,season) + q(h,d,season) tuning (additional) ----------------
# Finer version of the (h, d) tuning: one α and one q per (horizon, day-of-week, season)
# bucket that has at least 220 positive samples. Inference looks these up first and falls
# back to the (h, d) maps.
alpha_hds = {}   # (h,d,season) -> alpha
Q_HDS = {}       # (h,d,season) -> quantile
for h in range(7):
    y   = Y[:, h].astype(float); pL = oof_lgb[:, h]; pX = oof_xgb[:, h]
    cal = horizon_calendar(anchor, h+1)
    dws = cal[f'dow_h{h+1}'].values
    sez = cal[f'season_id_h{h+1}'].values  # 0 spring, 1 summer, 2 autumn, 3 winter
    for d in range(7):
        for g in range(4):
            m = (y>0) & (dws==d) & (sez==g)
            if m.sum() < 220:
                continue
            best_s, best_a = 1e9, OOB_BLEND
            for a in [0.30,0.35,0.40,0.45,0.50,0.55,0.60,0.65]:
                if a > MAX_AX_SUM: continue
                pr = pL[m]*(1-a) + pX[m]*a
                s  = smape_store_weighted(y[m], pr, stores_for_rows[m])
                if s < best_s:
                    best_s, best_a = s, a
            # Stored α is smoothed toward the default ...
            alpha_hds[(h,d,g)] = 0.5*OOB_BLEND + 0.5*best_a
            # ... while q is derived from the blend at the unsmoothed grid optimum.
            blend_m = pL[m]*(1-best_a) + pX[m]*best_a
            r = np.median(y[m]/(blend_m + 1e-6))
            Q_HDS[(h,d,g)] = 0.60 if r > 1.03 else 0.50
print(f"[Tune] α(h,d,season): {len(alpha_hds)}  q(h,d,season): {len(Q_HDS)}")

# ---------------- Horizon gamma/beta from blended OOF ----------------
# gamma: multiplicative bias correction (clipped median of actual / blended OOF prediction).
# beta : share of a naive lag value mixed into the corrected prediction, chosen on a grid
#        to minimise the store-weighted SMAPE.
# Both exist per horizon (h_gamma / h_beta, used as fallback) and per (horizon, dow) bucket.
print("[Calibrate] gamma(h,dow) & beta(h,dow) from blended OOF ...")
eps=1e-6
h_gamma = np.ones(7, dtype=float)
h_beta  = np.zeros(7, dtype=float)
gamma_hd = {}
beta_hd  = {}

for h in range(7):
    y  = Y[:,h].astype(float)
    p  = oof_blend[:,h].astype(float)
    nz = y>0
    if np.any(nz):
        ratio = np.median(y[nz]/(p[nz]+eps))
        h_gamma[h] = float(np.clip(ratio, 0.90, 1.08))
    # Naive reference for this horizon.
    # NOTE(review): presumably lag_{7-h} is the observation 7 days before the target day
    # (inference uses history28[-(7-h)]); confirm against the lag feature definition.
    naive = X_feat[f'lag_{7-h}'].values.astype(float)
    best_s, best_b = 1e9, 0.0
    for b in [0.00,0.02,0.04,0.06,0.08,0.10,0.12,0.15,0.18,0.20]:
        pr = p*h_gamma[h]*(1-b) + naive*b
        s = smape_store_weighted(y, pr, stores_for_rows)
        if s < best_s:
            best_s, best_b = s, b
    h_beta[h] = best_b

    # Per day-of-week refinement, only for buckets with at least 200 positive samples.
    dows = horizon_calendar(anchor, h+1)[f'dow_h{h+1}'].values
    for d in range(7):
        m = (y>0) & (dows==d)
        if m.sum() >= 200:
            r = np.median(y[m]/(p[m]+eps))
            gamma_hd[(h,d)] = float(np.clip(r, 0.88, 1.10))
            n = naive[m]
            # Start from the horizon-level beta; the grid here stops at 0.18.
            best_s, best_b = 1e9, h_beta[h]
            for b in [0.00,0.02,0.04,0.06,0.08,0.10,0.12,0.15,0.18]:
                pr = p[m]*gamma_hd[(h,d)]*(1-b) + n*b
                s = smape_store_weighted(y[m], pr, stores_for_rows[m])
                if s < best_s:
                    best_s, best_b = s, b
            beta_hd[(h,d)] = best_b

print("  gamma_h (fallback):", np.round(h_gamma,3))
print("  beta_h  (fallback):", np.round(h_beta,3))
print(f"  gamma_hd keys: {len(gamma_hd)}  beta_hd keys: {len(beta_hd)}")

# ---------------- OOF-based calibration maps (store/*) ----------------
# Every map holds a clipped median(actual / blended OOF prediction) computed on the
# positive-target samples of one bucket. All per-sample arrays built here must follow the
# order of Y.flatten(), i.e. row-major: sample 0 with horizons 0..6, then sample 1, ...
# That is also the order of stores_rep (each row's store repeated 7 times in a row).
print("[Calibrate] store / (store,dow) / (store,h) ratios from blended OOF ...")
all_dows=[]; h_idx=[]
for h in range(7):
    d = horizon_calendar(anchor, h+1)[f'dow_h{h+1}'].values
    all_dows.append(d)
    h_idx.append(np.full(Y.shape[0], h, dtype=int))
# Stack the per-horizon columns into (n_samples, 7) and ravel. Concatenating them
# end-to-end would give horizon-major order and pair each target with the calendar
# flags of a different sample/horizon.
all_dows = np.stack(all_dows, axis=1).ravel()
h_idx    = np.stack(h_idx, axis=1).ravel()

y_flat = Y.flatten(); p_flat = oof_blend.flatten()
mask = y_flat>0
y_pos, p_pos = y_flat[mask], p_flat[mask]
s_pos = stores_rep[mask]
dow_pos = all_dows[mask]
h_pos = h_idx[mask]

# store -> ratio (needs >= 50 positive samples, otherwise neutral 1.0)
store_corr = {}; store_dow_corr = {}; store_h_corr = {}
for s in np.unique(s_pos):
    m = (s_pos==s)
    if m.sum()>=50:
        r = np.median(y_pos[m]/(p_pos[m]+eps))
        store_corr[s] = float(np.clip(r, 0.75, 1.35))
    else:
        store_corr[s] = 1.0

# (store, day-of-week) -> ratio (needs >= 30 positive samples; missing keys mean "no correction")
for s in np.unique(s_pos):
    for d in range(7):
        m = (s_pos==s) & (dow_pos==d)
        if m.sum()>=30:
            r = np.median(y_pos[m]/(p_pos[m]+eps))
            store_dow_corr[(s,d)] = float(np.clip(r, 0.80, 1.30))

# (store, horizon) -> ratio
for s in np.unique(s_pos):
    for hh in range(7):
        m = (s_pos==s) & (h_pos==hh)
        if m.sum()>=30:
            r = np.median(y_pos[m]/(p_pos[m]+eps))
            store_h_corr[(s,hh)] = float(np.clip(r, 0.80, 1.25))

print("[Calibrate] (store, winter_flag) & (store, weekend_or_holiday) ratios ...")
winter_flags=[]; wk_or_h_flags=[]
for h in range(7):
    cal = horizon_calendar(anchor, h+1)
    winter_flags.append(cal[f'is_winter_h{h+1}'].values)
    wk_or_h_flags.append((cal[f'is_weekend_h{h+1}'].values | cal[f'is_holiday_h{h+1}'].values).astype(int))
# Same row-major layout as y_flat (see above).
winter_flags = np.stack(winter_flags, axis=1).ravel()
wk_or_h_flags = np.stack(wk_or_h_flags, axis=1).ravel()

winter_pos = winter_flags[mask]
wk_pos  = wk_or_h_flags[mask]

# (store, winter flag) -> ratio
store_winter_corr = {}
for s in np.unique(s_pos):
    for flag in [0,1]:
        m = (s_pos==s) & (winter_pos==flag)
        if m.sum()>=30:
            r = np.median(y_pos[m]/(p_pos[m]+eps))
            store_winter_corr[(s,int(flag))] = float(np.clip(r, 0.80, 1.25))

# (store, weekend-or-holiday flag) -> ratio
store_weekend_corr = {}
for s in np.unique(s_pos):
    for flag in [0,1]:
        m = (s_pos==s) & (wk_pos==flag)
        if m.sum()>=30:
            r = np.median(y_pos[m]/(p_pos[m]+eps))
            store_weekend_corr[(s,int(flag))] = float(np.clip(r, 0.80, 1.25))

# ---------------- (store×season) OOF calibration ----------------
print("[Calibrate] (store, season_id) ratios from blended OOF ...")
all_season_ids=[]
for h in range(7):
    all_season_ids.append(horizon_calendar(anchor, h+1)[f'season_id_h{h+1}'].values)
# Same row-major layout as y_flat (see above).
season_ids = np.stack(all_season_ids, axis=1).ravel()
g_pos = season_ids[mask]

store_season_corr = {}
for s in np.unique(s_pos):
    for g in [0,1,2,3]:  # 0spring 1summer 2autumn 3winter
        m = (s_pos==s) & (g_pos==g)
        if m.sum()>=30:
            r = np.median(y_pos[m]/(p_pos[m]+eps))
            store_season_corr[(s,int(g))] = float(np.clip(r, 0.85, 1.20))
print("  calibration maps ready.")

# ---------------- Winsorized caps & positive floors ----------------
# Builds the lookup tables read by get_floor() and apply_cap():
#   floor_sm[(store, menu)] / floor_s[store] / GLOBAL_POS_Q05 : 5% quantile of positive y
#   caps[(store, dow, weekend_or_holiday_flag)]               : (95%, 99%) quantiles of y
print("[Calibrate] winsor caps & positive floors ...")
caps = {}
floor_sm = {}
floor_s  = {}
GLOBAL_POS_Q05 = 0.0

# Work on a copy so the helper columns do not leak into train_full.
train_full2 = train_full.copy()
train_full2['dow'] = pd.to_datetime(train_full2['date']).dt.dayofweek
train_full2['is_weekend'] = (train_full2['dow']>=5).astype(int)
# NOTE(review): is_holiday() is defined elsewhere; its result is OR-ed with is_weekend
# below, so it is presumably an int/bool flag per row. Confirm.
train_full2['is_hol'] = is_holiday(train_full2['date'])

# Global floor: stays 0.0 when there is no positive observation at all.
pos_all = train_full2.loc[train_full2['y']>0, 'y'].values
if pos_all.size>0:
    GLOBAL_POS_Q05 = float(np.quantile(pos_all, 0.05))

# Item-level floor, only for (store, menu) pairs with at least 20 positive observations.
for (s,m), g in train_full2.groupby(['store','menu']):
    pos = g.loc[g['y']>0, 'y'].values
    if pos.size>=20:
        floor_sm[(s,m)] = float(np.quantile(pos, 0.05))

# Store-level floor, same 20-observation requirement.
for s, g in train_full2.groupby('store'):
    pos = g.loc[g['y']>0, 'y'].values
    if pos.size>=20:
        floor_s[s] = float(np.quantile(pos, 0.05))

# Caps use all rows of the bucket (not only positive ones) and need at least 50 of them.
for s, g1 in train_full2.groupby('store'):
    for d, g2 in g1.groupby('dow'):
        for flag, g3 in g2.groupby((g2['is_weekend'] | g2['is_hol']).astype(int)):
            arr = g3['y'].values.astype(float)
            if len(arr)>=50:
                q95 = float(np.quantile(arr, 0.95))
                q99 = float(np.quantile(arr, 0.99))
                caps[(s,d,int(flag))] = (q95, q99)

def get_floor(store, menu):
    """Return the most specific positive floor available for an item.

    Lookup order: ``floor_sm[(store, menu)]``, then ``floor_s[store]``, then
    the global ``GLOBAL_POS_Q05``.
    """
    item_key = (store, menu)
    if item_key in floor_sm:
        return floor_sm[item_key]
    return floor_s.get(store, GLOBAL_POS_Q05)

def apply_cap(store, dow, is_weekend, is_holiday, yhat):
    """Softly cap a prediction at the bucket's 99% quantile.

    The bucket is ``(store, dow, flag)`` with ``flag = 1`` on weekends or
    holidays. The ceiling is the stored q99 (raised by 5% when ``flag`` is
    set). A prediction above it is pulled back to
    ``0.7 * ceiling + 0.3 * yhat`` rather than hard-clipped.

    Returns:
        ``yhat`` unchanged when the bucket has no cap or the ceiling is not
        exceeded; otherwise the shrunk value as a Python float.
    """
    flag = int(is_weekend == 1 or is_holiday == 1)
    bounds = caps.get((store, dow, flag))
    if bounds is None:
        return yhat

    _q95, q99 = bounds
    scale = 1.05 if flag else 1.00
    upper = scale * q99
    if not (yhat > upper):
        return yhat
    return float(0.7*upper + 0.3*yhat)

# ---------------- Retrain on FULL with multi-seed bagging (LGB+XGB) ----------------
# Refits every horizon head on all training rows, once per seed in SEEDS, using sample
# weights built exactly as in the CV loop. Results:
#   final_models_lgb[h] : list of (model, feature columns, categorical columns), one per seed
#   final_models_xgb[h] : list of (model, feature columns), one per seed
print("[Retrain] full data with multi-seed bagging (LGB+XGB) ...")
# No early stopping here: train for 5% more rounds than the mean best iteration seen in CV
# (1500 is the base if CV recorded nothing).
num_rounds_lgb = int(1.05*(np.mean(avg_iters_lgb) if len(avg_iters_lgb)>0 else 1500))
num_rounds_xgb = int(1.05*(np.mean(avg_iters_xgb) if len(avg_iters_xgb)>0 else 1500))

SEEDS = [42, 202, 404]

final_models_lgb = []
final_models_xgb = []

for h in range(7):
    yh = Y[:,h].astype(float).clip(min=0.0)
    Xh = add_h_feats(X_feat, anchor, h, META_store_series)
    cal_h = horizon_calendar(anchor, h+1)
    is_weekend = cal_h[f'is_weekend_h{h+1}'].values
    mon = cal_h[f'month_h{h+1}'].values
    is_winter = cal_h[f'is_winter_h{h+1}'].values

    # Recency weight: 0.5 for anchors 240+ days before the latest one, rising quadratically to 1.3.
    max_anchor = pd.to_datetime(anchor).max()
    days_from_edge = (pd.to_datetime(anchor) - (max_anchor - pd.Timedelta(days=240))).dt.days.clip(lower=0, upper=240).values
    rec = 0.5 + 0.8*(days_from_edge/240.0)**2

    # 10% bonus for small positive targets.
    pos = yh[yh>0]
    thr_low = np.quantile(pos, 0.25) if pos.size>50 else (pos.mean() if pos.size>0 else 0.0)
    lowpos_bonus = np.where((yh>0) & (yh<=thr_low), 1.10, 1.0)

    sw = np.array([store_weight(s) for s in stores_for_rows]) * rec * lowpos_bonus
    sw = sw * (1.17**(is_weekend)) * (1.10**(pd.Series(mon).isin([12,1,2]).astype(int).values))
    sw = sw * (1.12**(is_winter)) * (1.06**(is_winter & is_weekend))
    # Rows with a zero target get zero weight.
    sw = sw * (yh>0)

    # Hwadam peak-season / summer weighting
    spring_autumn_act = cal_h[f'spring_autumn_active_h{h+1}'].values.astype(int)
    summer_fam = cal_h[f'summer_family_h{h+1}'].values.astype(int)
    is_hwadam_arr = np.array([is_hwadam_store(s) for s in stores_for_rows]).astype(int)
    summer_aff = np.array([STORE_SEASON_AFFINITY.get((s,'summer'),1.0) for s in stores_for_rows])
    sw = sw * (1.05 ** (is_hwadam_arr & spring_autumn_act))
    sw = sw * np.where((summer_fam==1) & (summer_aff>1.0), 1.03, 1.0)

    cat_cols = get_cat_cols(Xh.columns.tolist())

    # LGB multi-seed
    models_h_lgb=[]
    for sd in SEEDS:
        params = dict(lgb_params)
        # lgb.train() lets an iteration-count alias found in params (lgb_params sets
        # n_estimators) replace the num_boost_round argument. Drop such aliases so the
        # CV-derived num_rounds_lgb is what actually gets trained.
        for alias in ('n_estimators', 'num_iterations', 'num_boost_round'):
            params.pop(alias, None)
        # `random_state` is only an alias of `seed` and is ignored when `seed` is present,
        # and explicitly set bagging / feature-fraction seeds take priority over `seed`.
        # Set all three so that each bag member is really trained with its own seed.
        params.pop('random_state', None)
        params['seed'] = sd
        params['bagging_seed'] = sd
        params['feature_fraction_seed'] = sd
        dtr = lgb.Dataset(Xh, label=yh, weight=sw, categorical_feature=cat_cols)
        model = lgb.train(params, dtr, num_boost_round=num_rounds_lgb, feval=lgb_wsmape_feval,
                          callbacks=[lgb.log_evaluation(250)])
        models_h_lgb.append((model, Xh.columns.tolist(), cat_cols))
    final_models_lgb.append(models_h_lgb)

    # XGB multi-seed
    models_h_xgb=[]
    for sd in SEEDS:
        params = dict(xgb_params_base); params['seed']=sd
        dtr = xgb.DMatrix(Xh, label=yh, weight=sw, feature_names=Xh.columns.tolist())
        model = xgb.train(params, dtr, num_boost_round=num_rounds_xgb, verbose_eval=250)
        models_h_xgb.append((model, Xh.columns.tolist()))
    final_models_xgb.append(models_h_xgb)
    gc.collect()

# ---------------- Inference helpers ----------------
def weekly_naive_from_history(history28, h):
    """Return ``history28[h - 7]`` as a float.

    For horizon index ``h`` (0..6) this is the entry ``7 - h`` positions from
    the end of the history window.
    """
    offset_from_end = h - 7
    value = history28[offset_from_end]
    return float(value)

def infer_single_item(history28_raw, store_name, menu_name, last_date,
                      history_dates=None, store_total_lookup=None,
                      site_df_28=None, group_df_28=None, group_store_map=None):
    """Predict the 7 horizons for one (store, menu) item from its history window.

    Args:
        history28_raw: the item's raw daily values (the caller passes 28 values sorted
            by date). Negative values are clipped to 0 for most computations; the raw
            array is additionally handed to ``build_feature_from_window``.
        store_name: store label of the item.
        menu_name: menu label of the item.
        last_date: anchor date used for the per-horizon calendar/profile features.
        history_dates: dates of the history rows. When ``None``, store/peer and meta
            features are filled with neutral defaults.
        store_total_lookup: store totals passed to ``_lookup_store_total_series``;
            store/peer features are only computed when this and ``history_dates``
            are both given.
        site_df_28: site-level meta frame forwarded to ``_meta_feats_from_dates``
            (an empty frame is used when ``None``).
        group_df_28: group meta frame forwarded to ``_meta_feats_from_dates``.
        group_store_map: mapping forwarded to ``_meta_feats_from_dates``.

    Returns:
        ``np.ndarray`` of 7 floats, each >= 0, one per horizon.
    """
    history28 = np.clip(np.asarray(history28_raw, dtype=float), 0.0, None)
    feats = build_feature_from_window(history28, np.asarray(history28_raw, dtype=float))

    feats['store_prior'] = store_prior_index(store_name)
    if (history_dates is not None) and (store_total_lookup is not None):
        # st28: store total per history day; peer28: store total minus this item.
        st28 = _lookup_store_total_series(store_name, pd.to_datetime(history_dates), store_total_lookup)
        peer28 = np.clip(st28 - history28, 0.0, None)
        feats['store_r7_mean']  = float(np.mean(st28[-7:]))
        feats['store_r28_mean'] = float(np.mean(st28[-28:]))
        feats['peer_r7_mean']   = float(np.mean(peer28[-7:]))
        feats['peer_r28_mean']  = float(np.mean(peer28[-28:]))
        feats['share_in_store_r7']  = float(history28[-7:].sum()/(st28[-7:].sum()+1e-6))
        feats['share_in_store_r28'] = float(history28[-28:].sum()/(st28[-28:].sum()+1e-6))
        # Concurrent store momentum: last 7 days relative to the 7 days before them
        def _mom(a,b): return float(a/(b+1e-6))
        feats['store_mom_7_7'] = _mom(st28[-7:].mean(),  st28[-14:-7].mean())
        feats['peer_mom_7_7']  = _mom(peer28[-7:].mean(), peer28[-14:-7].mean())
    else:
        # Neutral defaults: zero levels/shares, momentum of 1.
        feats['store_r7_mean']=feats['store_r28_mean']=0.0
        feats['peer_r7_mean']=feats['peer_r28_mean']=0.0
        feats['share_in_store_r7']=feats['share_in_store_r28']=0.0
        feats['store_mom_7_7']=feats['peer_mom_7_7']=1.0

    # Meta 28-day rolling features (TEST uses only the 28 days of each file)
    if history_dates is not None:
        mf = _meta_feats_from_dates(
            dates_28=pd.to_datetime(history_dates),
            store_name=store_name,
            site_df=site_df_28 if site_df_28 is not None else pd.DataFrame(),
            group_df=group_df_28,
            group_store_map=group_store_map,
            prefix='meta'
        )
        feats.update(mf)
    else:
        for c in ['site_group','site_hwadam','site_room','site_ski','temp_mean','temp_range','rain_sum','group_store']:
            feats[f'meta_{c}_r7_mean']=0.0; feats[f'meta_{c}_r28_mean']=0.0; feats[f'meta_{c}_last']=0.0

    # Price features: item price, falling back to the store mean and then the global mean
    key_name = f"{store_name}_{menu_name}"
    p = PRICE_MAP.get(key_name, STORE_PRICE_MEAN.get(store_name, GLOBAL_PRICE_MEAN))
    mu_s = STORE_PRICE_MEAN.get(store_name, GLOBAL_PRICE_MEAN if GLOBAL_PRICE_MEAN>0 else 1.0)
    feats['avg_price']   = float(p if p is not None else 0.0)
    # price_index is 1.0 whenever either the price or the store mean is missing/zero.
    feats['price_index'] = float((p/(mu_s+1e-6)) if p and mu_s else 1.0)

    feats['store_name']=store_name; feats['menu_name']=menu_name
    row = pd.DataFrame([feats])
    row = add_kw(row)
    # Labels unseen by the encoders are mapped to "<UNK>".
    # NOTE(review): this assumes "<UNK>" is one of the fitted classes; confirm.
    s_in = store_name if store_name in le_store.classes_ else "<UNK>"
    m_in = menu_name  if menu_name  in le_menu.classes_  else "<UNK>"
    row['store_le'] = le_store.transform([s_in])[0]
    row['menu_le']  = le_menu.transform([m_in])[0]
    row = row.drop(columns=['store_name','menu_name'])

    anchor_date = pd.to_datetime(last_date)
    preds=[]
    for h in range(7):
        # Feature row of this horizon = item features + calendar block + profile block.
        cal = horizon_calendar(pd.Series([anchor_date]), h+1)
        prof = _profile_block(pd.Series([anchor_date]), h, pd.Series([store_name]))
        Xh = pd.concat([row.reset_index(drop=True), cal.reset_index(drop=True), prof.reset_index(drop=True)], axis=1)

        dow = int(cal[f'dow_h{h+1}'].iloc[0])
        is_wk = int(cal[f'is_weekend_h{h+1}'].iloc[0])
        is_h  = int(cal[f'is_holiday_h{h+1}'].iloc[0])
        is_winter= int(cal[f'is_winter_h{h+1}'].iloc[0])
        g_sid = int(cal[f'season_id_h{h+1}'].iloc[0])

        # Quantile level across seeds: season bucket first, then (h, dow), default 0.50.
        q = Q_HDS.get((h, dow, g_sid), Q_HD.get((h, dow), 0.50))

        # One prediction per bagged seed model; columns missing from Xh are filled with 0
        # and negative outputs are clipped to 0.
        seed_preds_lgb=[]; seed_preds_xgb=[]
        for (model, cols, cat_cols) in final_models_lgb[h]:
            Xuse = Xh.reindex(columns=cols, fill_value=0)
            yhat = float(model.predict(Xuse)[0])
            seed_preds_lgb.append(max(0.0, yhat))
        for (model, cols) in final_models_xgb[h]:
            Xuse = Xh.reindex(columns=cols, fill_value=0)
            dm = xgb.DMatrix(Xuse, feature_names=cols)
            yhat = float(xgb_predict_compat(model, dm)[0])
            seed_preds_xgb.append(max(0.0, yhat))

        y_lgb = float(np.quantile(seed_preds_lgb, q)) if seed_preds_lgb else 0.0
        y_xgb = float(np.quantile(seed_preds_xgb, q)) if seed_preds_xgb else 0.0

        # Blend LGB/XGB with the tuned XGB share (season bucket → (h, dow) → ALPHA_XGB).
        a = alpha_hds.get((h, dow, g_sid), alpha_hd.get((h, dow), ALPHA_XGB))
        a = float(np.clip(a, 0.0, MAX_AX_SUM))
        yhat  = (1.0-a)*y_lgb + a*y_xgb

        # gamma: bias multiplier (passed through _shrink); beta: share of the weekly naive value.
        g = _shrink(gamma_hd.get((h,dow), h_gamma[h]), TAU['gamma'])
        b = beta_hd.get((h,dow), h_beta[h])

        naive = weekly_naive_from_history(history28, h)
        # Do not mix in a (near-)zero naive value.
        if naive <= 1e-6: b = 0.0

        # Trust the naive value less when the last non-zero day in the window is 14+/21+ days old.
        nz_idx = np.where(np.asarray(history28) > 0)[0]
        days_since_last_nz = 28 - (nz_idx[-1]+1) if len(nz_idx)>0 else 28
        if days_since_last_nz >= 21: b *= 0.75
        elif days_since_last_nz >= 14: b *= 0.90

        yhat = yhat * g * (1.0 - b) + naive * b

        # Season / affinity correction; the combined multiplier is clipped to [0.90, 1.15]
        m_season = _shrink(store_season_corr.get((store_name, g_sid), 1.0), TAU['season'])
        g_name = {0:'spring',1:'summer',2:'autumn',3:'winter'}[g_sid]
        m_aff = _shrink(STORE_SEASON_AFFINITY.get((store_name, g_name), 1.0), TAU['aff'])
        if is_hwadam_store(store_name) and int(cal[f'spring_autumn_active_h{h+1}'].iloc[0])==1:
            m_season *= 1.02
        m_season = float(np.clip(m_season * m_aff, 0.90, 1.15))
        yhat *= m_season

        # Store-level OOF ratio corrections; a missing bucket contributes a neutral 1.0
        # before shrinking.
        m_store = _shrink(store_corr.get(store_name, 1.0), TAU['store'])
        m_sdow  = _shrink(store_dow_corr.get((store_name, dow), 1.0), TAU['sdow'])
        m_sh    = _shrink(store_h_corr.get((store_name, h), 1.0), TAU['sh'])
        m_winter   = _shrink(store_winter_corr.get((store_name, is_winter), 1.0), TAU['winter'])
        m_wk    = _shrink(store_weekend_corr.get((store_name, int(is_wk or is_h)), 1.0), TAU['wk'])
        mult = m_store * m_sdow * m_sh * m_winter * m_wk
        yhat = yhat * mult

        # Lift to the positive floor first, then apply the soft upper cap.
        floor_val = get_floor(store_name, menu_name)
        if floor_val > 0: yhat = max(yhat, floor_val)
        yhat = apply_cap(store_name, dow, is_wk, is_h, yhat)

        preds.append(max(0.0, yhat))
    return np.array(preds, dtype=float)

# ---------------- Build submission ----------------
# Each TEST file is handled independently: every item in it gets 7 predictions from
# infer_single_item(), collected as long-format rows and pivoted to wide format afterwards.
print("[Predict] build submission from TEST files ...")
pred_rows=[]
for tfp in TEST_FILES:
    # Numeric id taken from the file name "TEST_<id>.csv".
    tid = int(re.findall(r'TEST_(\d+)\.csv', os.path.basename(tfp))[0])
    tdf = pd.read_csv(tfp)
    tdf = _clean_columns(tdf); tdf = _force_kor_cols(tdf); tdf = _normalize_key_cols(tdf)
    tdf = tdf.rename(columns={"영업일자":"date","영업장명_메뉴명":"key","매출수량":"y_raw"})
    tdf["date"] = pd.to_datetime(tdf["date"])
    tdf["store"], tdf["menu"] = zip(*tdf["key"].map(split_store_menu))

    # Peer aggregate: per-(store, date) totals of the clipped sales inside this TEST file
    tdf['_yc'] = tdf['y_raw'].clip(lower=0.0)
    test_store_total_ser = tdf.groupby(['store','date'])['_yc'].sum().sort_index()

    # TEST meta loader: only the 28 days belonging to this file
    TMB = _load_test_meta_bundle(tid)
    site_df_28 = TMB['site_df'] if (TMB and ('site_df' in TMB)) else pd.DataFrame()
    group_df_28= TMB['group_df'] if (TMB and ('group_df' in TMB)) else None

    # group_store_map is fixed to the TRAIN one (matched by name)
    group_store_map = TRAIN_META['group_store_map']

    for name,g in tdf.groupby("key"):
        g = g.sort_values("date")
        vals_raw = g["y_raw"].values.astype(float)
        dates_28 = g["date"].values
        # Every item is expected to have exactly 28 rows in a TEST file.
        assert len(vals_raw)==28, f"{name} in {tfp} is not 28 days"
        store,menu = split_store_menu(name)
        preds = infer_single_item(
            vals_raw, store, menu, g["date"].max(),
            history_dates=dates_28,
            store_total_lookup=test_store_total_ser,
            site_df_28=site_df_28,
            group_df_28=group_df_28,
            group_store_map=group_store_map
        )
        # One long-format row per horizon; the date label is "TEST_<id, 2 digits>+<k>일".
        for h in range(7):
            pred_rows.append({
                "영업일자": f"TEST_{tid:02d}+{h+1}일",
                "영업장명_메뉴명": name,
                "매출수량": float(preds[h])
            })

pred_df = pd.DataFrame(pred_rows)
pred_df = _clean_columns(pred_df)
pred_df = _force_kor_cols(pred_df)
pred_df = _normalize_key_cols(pred_df)

# Long → wide: one row per date label, one column per item key.
pred_wide_clean = pred_df.pivot_table(index='영업일자',
                                      columns='영업장명_메뉴명',
                                      values='매출수량',
                                      aggfunc='first').reset_index()

# The sample submission defines the required row labels and the exact column names/order.
raw_sample = pd.read_csv(SAMPLE_SUB_PATH)
raw_cols = raw_sample.columns.tolist()
raw_dates = raw_sample['영업일자'].astype(str)

def _canon(s: str) -> str:
    return (unicodedata.normalize('NFKC', str(s))
            .replace('\ufeff','').replace('\u200b','').replace('\xa0','').strip())
# Target column layout in canonical form, plus the mapping back to the sample's raw names.
clean_cols_target = ['영업일자'] + [_canon(c) for c in raw_cols if c != '영업일자']
clean_to_raw = { _canon(c): c for c in raw_cols }

# Start from the sample's date labels so the row order matches the sample file.
submission_clean = pd.DataFrame({'영업일자': raw_dates})
submission_clean = submission_clean.merge(pred_wide_clean, on='영업일자', how='left')

# Items required by the sample but never predicted get a 0.0 column.
for c in clean_cols_target:
    if c not in submission_clean.columns and c != '영업일자':
        submission_clean[c] = 0.0
# Predicted items that the sample does not list are dropped.
extra_cols = [c for c in submission_clean.columns if c not in clean_cols_target]
if extra_cols:
    submission_clean = submission_clean.drop(columns=extra_cols, errors='ignore')
submission_clean = submission_clean[clean_cols_target]

# Restore the sample's original column names and order.
submission_final = submission_clean.rename(columns=clean_to_raw)
submission_final = submission_final[raw_cols]
# Date labels without any prediction row surface as NaN after the left merge.
if submission_final.isna().any().any():
    print("[Warn] NaN detected in submission; filling 0.0")
    submission_final = submission_final.fillna(0.0)

submission_final.to_csv(OUT_PATH, index=False, encoding='utf-8-sig')
print(f"[DONE] saved: {OUT_PATH}")
print(submission_final.head(3))

# Sanity checks against a fresh read of the sample submission.
s2 = pd.read_csv(SAMPLE_SUB_PATH)
print('[Check] same columns? ', set(submission_final.columns) == set(s2.columns))
print('[Check] same order?   ', list(submission_final.columns) == list(s2.columns))
print('[Check] same dates?   ', submission_final['영업일자'].tolist() == s2['영업일자'].tolist())
print('[Check] NaNs count:   ', int(submission_final.isna().sum().sum()))
print('[Check] shape:        ', submission_final.shape, ' / sample:', s2.shape)