<a href="https://colab.research.google.com/github/EastHuni/lg-aimers/blob/main/ltsf_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# =================== 기본 설정 ===================
from google.colab import drive
drive.mount('/content/drive')

import os, glob, random, re, zipfile
import pandas as pd
import numpy as np
import holidays
from tqdm import tqdm

from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn

# 시드 고정 함수
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

# 하이퍼파라미터
LOOKBACK, PREDICT, BATCH_SIZE, EPOCHS = 28, 7, 16, 50
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# =================== 데이터 로드 및 압축 해제 ===================
zip_path = '/content/drive/MyDrive/LG/open.zip'
extract_path = '/content/LG_data'
os.makedirs(extract_path, exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

train = pd.read_csv('/content/LG_data/train/train.csv')
kr_holidays = holidays.KR(years=range(2023, 2026))

# =================== 파생변수 생성 함수 ===================
def make_features(df):
    df = df.sort_values(['영업장명_메뉴명', '영업일자']).copy()
    df['영업일자'] = pd.to_datetime(df['영업일자'])
    df['요일'] = df['영업일자'].dt.dayofweek
    df['공휴일'] = df['영업일자'].isin(kr_holidays).astype(int)
    df['lag_1'] = df.groupby('영업장명_메뉴명')['매출수량'].shift(1)
    df['rolling_7_mean'] = (
        df.groupby('영업장명_메뉴명')['매출수량']
        .shift(1).rolling(7, min_periods=1).mean()
        .reset_index(0, drop=True)
    )

    feature_cols = ['lag_1', 'rolling_7_mean']
    df[feature_cols] = df.groupby('영업장명_메뉴명')[feature_cols].ffill()
    rolling_avg = (
        df.groupby('영업장명_메뉴명')['매출수량']
        .transform(lambda x: x.shift(1).rolling(7, min_periods=1).mean())
    )
    for col in feature_cols:
        df[col] = df[col].fillna(rolling_avg)
    df[feature_cols] = df[feature_cols].fillna(0)

    selected_cols = ['영업일자', '영업장명_메뉴명', '매출수량', '요일', '공휴일'] + feature_cols
    return df[selected_cols]

train = make_features(train).copy()

# =================== LTSF-Linear 모델 정의 ===================
class NLinear(nn.Module):
    def __init__(self, input_len=28, output_len=7, input_dim=5):
        super(NLinear, self).__init__()
        self.linear = nn.Linear(input_len, output_len)

    def forward(self, x):
        x = x.permute(0, 2, 1)  # (B, C, T)
        x = self.linear(x)      # (B, C, output_len)
        return x.permute(0, 2, 1)[:, :, 0]  # (B, output_len)

# =================== 학습 함수 ===================
def train_ltsf_linear(train_df, seed):
    set_seed(seed)
    trained_models = {}

    for store_menu, group in tqdm(train_df.groupby(['영업장명_메뉴명']), desc=f'Training LTSF-Linear (seed={seed})'):
        store_train = group.sort_values('영업일자').copy()
        if len(store_train) < LOOKBACK + PREDICT:
            continue

        features = ['매출수량', '요일', '공휴일', 'lag_1', 'rolling_7_mean']
        scaler = MinMaxScaler()
        store_train[features] = scaler.fit_transform(store_train[features])
        train_vals = store_train[features].values

        X_train, y_train = [], []
        for i in range(len(train_vals) - LOOKBACK - PREDICT + 1):
            X_train.append(train_vals[i:i+LOOKBACK])
            y_train.append(train_vals[i+LOOKBACK:i+LOOKBACK+PREDICT, 0])

        X_train = torch.tensor(X_train).float().to(DEVICE)
        y_train = torch.tensor(y_train).float().to(DEVICE)

        model = NLinear(input_len=LOOKBACK, output_len=PREDICT, input_dim=5).to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.MSELoss()

        model.train()
        for epoch in range(EPOCHS):
            idx = torch.randperm(len(X_train))
            for i in range(0, len(X_train), BATCH_SIZE):
                batch_idx = idx[i:i+BATCH_SIZE]
                X_batch, y_batch = X_train[batch_idx], y_train[batch_idx]
                output = model(X_batch)
                loss = criterion(output, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        trained_models[store_menu] = {
            'model': model.eval(),
            'scaler': scaler,
            'last_sequence': train_vals[-LOOKBACK:]
        }

    return trained_models

# =================== 예측 함수 ===================
def predict_ltsf_linear(test_df, trained_models, test_prefix: str):
    results = []
    for store_menu, store_test in test_df.groupby(['영업장명_메뉴명']):
        if store_menu not in trained_models:
            continue
        model = trained_models[store_menu]['model']
        scaler = trained_models[store_menu]['scaler']
        store_test_sorted = store_test.sort_values('영업일자')
        recent_vals = store_test_sorted[['매출수량', '요일', '공휴일', 'lag_1', 'rolling_7_mean']].values[-LOOKBACK:]
        if len(recent_vals) < LOOKBACK:
            continue
        recent_vals = scaler.transform(recent_vals)
        x_input = torch.tensor([recent_vals]).float().to(DEVICE)
        with torch.no_grad():
            pred_scaled = model(x_input).squeeze().cpu().numpy()

        restored = []
        for i in range(PREDICT):
            dummy = np.zeros((1, 5))
            dummy[0, 0] = pred_scaled[i]
            restored_val = scaler.inverse_transform(dummy)[0, 0]
            restored.append(max(restored_val, 0))

        pred_dates = [f"{test_prefix}+{i+1}일" for i in range(PREDICT)]
        for d, val in zip(pred_dates, restored):
            results.append({
                '영업일자': d,
                '영업장명_메뉴명': store_menu,
                '매출수량': val
            })
    return pd.DataFrame(results)

# =================== 변환 함수 ===================
def convert_to_submission_format(pred_df: pd.DataFrame, sample_submission: pd.DataFrame):
    pred_dict = dict(zip(
      zip(pred_df['영업일자'], pred_df['영업장명_메뉴명'].apply(lambda x: x[0] if isinstance(x, tuple) else x)),
      pred_df['매출수량']
    ))

    final_df = sample_submission.copy()
    for row_idx in final_df.index:
        date = final_df.loc[row_idx, '영업일자']
        for cols in final_df.columns[1:]:
            final_df.loc[row_idx, cols] = pred_dict.get((date, cols,), 0)
    return final_df

# =================== 앙상블 파이프라인 ===================
test_files = sorted(glob.glob('/content/LG_data/test/TEST_*.csv'))
seed_list = [42, 43, 2004]
submission_list = []
submission_weights = [0.33, 0.33, 0.34]

for seed in seed_list:
    print(f"\n🚀 Seed {seed} 학습 및 예측 시작")
    trained_models = train_ltsf_linear(train, seed=seed)

    all_preds = []
    for path in test_files:
        test_df = pd.read_csv(path)
        test_df = make_features(test_df)
        filename = os.path.basename(path)
        test_prefix = re.search(r'(TEST_\d+)', filename).group(1)
        pred_df = predict_ltsf_linear(test_df, trained_models, test_prefix)
        all_preds.append(pred_df)

    full_pred_df = pd.concat(all_preds, ignore_index=True)
    sample_submission = pd.read_csv('/content/LG_data/sample_submission.csv')
    submission_df = convert_to_submission_format(full_pred_df, sample_submission)

    submission_df.to_csv(f'seed_{seed}_submission.csv', index=False, encoding='utf-8-sig')
    submission_list.append(submission_df)

# ✅ 최종 앙상블
ensemble_df = submission_list[0].copy()
for col in ensemble_df.columns[1:]:
    ensemble_df[col] = 0
    for sub_df, w in zip(submission_list, submission_weights):
        ensemble_df[col] += sub_df[col] * w

ensemble_df.to_csv('ensemble_submission.csv', index=False, encoding='utf-8-sig')
print("\n✅ 최종 앙상블 저장 완료 → 'ensemble_submission.csv'")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  df['공휴일'] = df['영업일자'].isin(kr_holidays).astype(int)



🚀 Seed 42 학습 및 예측 시작


Training LTSF-Linear (seed=42): 100%|██████████| 193/193 [05:43<00:00,  1.78s/it]
  df['공휴일'] = df['영업일자'].isin(kr_holidays).astype(int)
  df['공휴일'] = df['영업일자'].isin(kr_holidays).astype(int)
  df['공휴일'] = df['영업일자'].isin(kr_holidays).astype(int)
  df['공휴일'] = df['영업일자'].isin(kr_holidays).astype(int)
  df['공휴일'] = df['영업일자'].isin(kr_holidays).astype(int)
  df['공휴일'] = df['영업일자'].isin(kr_holidays).astype(int)
  df['공휴일'] = df['영업일자'].isin(kr_holidays).astype(int)
  df['공휴일'] = df['영업일자'].isin(kr_holidays).astype(int)
  df['공휴일'] = df['영업일자'].isin(kr_holidays).astype(int)
  df['공휴일'] = df['영업일자'].isin(kr_holidays).astype(int)
  final_df.loc[row_idx, cols] = pred_dict.get((date, cols,), 0)
  final_df.loc[row_idx, cols] = pred_dict.get((date, cols,), 0)
  final_df.loc[row_idx, cols] = pred_dict.get((date, cols,), 0)
  final_df.loc[row_idx, cols] = pred_dict.get((date, cols,), 0)
  final_df.loc[row_idx, cols] = pred_dict.get((date, cols,), 0)
  final_df.loc[row_idx, cols] = pred_dict.get((d


🚀 Seed 43 학습 및 예측 시작


Training LTSF-Linear (seed=43):   3%|▎         | 5/193 [00:09<05:48,  1.85s/it]


KeyboardInterrupt: 

In [None]:
# --- 기존 LSTM 앙상블까지 수행 후 ensemble_df가 생성되어 있다고 가정 ---

import pandas as pd

# 1. ARIMA 제출 결과 불러오기 (경로는 직접 바꾸세요)
arima_submission = pd.read_csv('/content/LG/arima_submission.csv')

# 2. LSTM 앙상블 결과와 ARIMA 결과를 55:45 비율로 앙상블
final_ensemble = ensemble_df.copy()
for col in final_ensemble.columns[1:]:
    final_ensemble[col] = ensemble_df[col] * 0.55 + arima_submission[col] * 0.45

# 3. 최종 앙상블 제출 파일 저장 (경로/파일명 변경 가능)
final_ensemble.to_csv('/content/LG_data/final_ensemble_submission.csv', index=False, encoding='utf-8-sig')

print("✅ LSTM + ARIMA 최종 앙상블 제출 파일 생성 완료: final_ensemble_submission.csv")
