In [37]:
import os
import random
import glob
import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn
from tqdm import tqdm

import matplotlib.pyplot as plt
from korean_lunar_calendar import KoreanLunarCalendar


plt.rcParams['font.family'] = 'AppleGothic'  # macOS


#Fixed Random Seed  & Setting Hyperparameter
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


set_seed(42)

LOOKBACK, PREDICT, BATCH_SIZE, EPOCHS = 28, 7, 16, 40
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
MONTH_SCALE = 12
SEASON_SCALE = 3 
#winter : 0
#spring : 1
#summer : 2
#fall : 3
MIN_SEQUENCE_COUNT = 10

def get_lunar_to_solar(years, lunar_month, lunar_day, span=1):
    calendar = KoreanLunarCalendar()
    dates = []
    for year in years:
        for offset in range(-span, span+1):
            try:
                calendar.setLunar(year, lunar_month, lunar_day + offset, False)
                dates.append(calendar.SolarIsoFormat())
            except:
                pass  # 예외 처리: 음력 마지막날 초과
    return dates
# 예시: 2023 ~ 2025
years = [2023, 2024, 2025]
lunar_solar_dates = []
lunar_solar_dates += get_lunar_to_solar(years, 1, 1, span=1)   # 설날 ±1
lunar_solar_dates += get_lunar_to_solar(years, 8, 15, span=1)  # 추석 ±1

solar_md_holidays = [
    (1, 1),   # 신정
    (3, 1),   # 삼일절
    (5, 5),   # 어린이날
    (6, 6),   # 현충일
    (8, 15),  # 광복절
    (10, 3),  # 개천절
    (10, 9),  # 한글날
    (12, 25), # 크리스마스
]

def generate_combined_holiday_list(df, solar_md_list, lunar_solar_list):
    df = df.copy()
    df['영업일자'] = pd.to_datetime(df['영업일자'])

    # 양력 기반 holiday 판별
    df['is_solar_holiday'] = df['영업일자'].apply(
        lambda x: (x.month, x.day) in solar_md_list
    )

    # 음력 변환된 holiday 포함
    lunar_set = set(pd.to_datetime(lunar_solar_list))
    df['is_lunar_holiday'] = df['영업일자'].isin(lunar_set)

    # 최종 통합
    df['is_holiday'] = (df['is_solar_holiday'] | df['is_lunar_holiday']).astype(int)
    df = df.drop(columns=['is_solar_holiday', 'is_lunar_holiday'])
    return df

def remove_leading_zeros_before_sales(df, min_zero_days=90):
    """
    매출 시작 전 연속된 0이 일정 기간 이상이면, 그 전 구간은 제거
    """
    sales_started = df['매출수량'] > 0
    if not sales_started.any():
        return df  # 매출이 전혀 없는 경우 그대로 반환

    first_sale_idx = sales_started.idxmax()

    # 매출 시작 전 구간이 충분히 긴 0으로 구성되어 있다면 제거
    df_before = df.loc[:first_sale_idx - 1]
    if len(df_before) >= min_zero_days and (df_before['매출수량'] == 0).all():
        return df.loc[first_sale_idx:]  # 매출 시작부터 반환
    else:
        return df  # 그대로 반환

def filter_all_menus_by_leading_zeros(train_df, min_zero_days=90):
    """
    모든 메뉴에 대해 remove_leading_zeros_before_sales 함수 적용
    """
    filtered_df = (
        train_df.groupby('영업장명_메뉴명')
        .apply(lambda g: remove_leading_zeros_before_sales(g, min_zero_days))
        .reset_index(drop=True)
    )
    return filtered_df

#Data load
train = pd.read_csv('./train/train.csv')
train = generate_combined_holiday_list(train, solar_md_holidays, lunar_solar_dates)
train = filter_all_menus_by_leading_zeros(train, min_zero_days=90)

#Define Model
class MultiOutputLSTM(nn.Module):
    def __init__(self, input_dim=6, hidden_dim=64, num_layers=2, output_dim=7):
        super(MultiOutputLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        out, _ = self.lstm(x)
        return self.fc(out[:, -1, :])  # (B, output_dim)

def clip_iqr(series):
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    upper = q3 + 1.5 * iqr
    return np.clip(series, None, upper)

def compute_iqr_lower_bounds(train_df):
    lower_bounds = {}
    for menu, group in train_df.groupby('영업장명_메뉴명'):
        q1 = group['매출수량'].quantile(0.25)
        q3 = group['매출수량'].quantile(0.75)
        iqr = q3 - q1
        lower = max(q1 - 1.5 * iqr, 0)
        menu_key = menu[0] if isinstance(menu, tuple) else menu
        lower_bounds[menu_key] = lower
    return lower_bounds

#Train
def train_lstm(train_df):
    trained_models = {}

    for store_menu, group in tqdm(train_df.groupby(['영업장명_메뉴명']), desc ='Training LSTM'):
        #시계열 정렬 , preprocessing
        key = store_menu[0] if isinstance(store_menu, tuple) else store_menu
        store_train = group.sort_values('영업일자').copy()
        store_train['영업일자'] = pd.to_datetime(store_train['영업일자'])
        store_train['weekday'] = store_train['영업일자'].dt.dayofweek
        store_train['month'] = store_train['영업일자'].dt.month
        store_train['season'] = store_train['month'].map({
            12:0,1:0,2:0, #winter
            3:1,4:1,5:1, #spring
            6:2,7:2,8:2, #summer
            9:3,10:3,11:3 #fall
        })  
        #month,season normalize
        store_train['month'] /= MONTH_SCALE
        store_train['season'] /= SEASON_SCALE
        #rolling_mean_7
        store_train['rolling_mean_7'] = store_train['매출수량'].rolling(window=7, min_periods=1).mean()

        #data clipping
        store_train['clipped_SQ'] = clip_iqr(store_train['매출수량'])
        
        #train/validation split을 안전하게 하기 위해 여유분 데이터를 확보하는 목적 -> MIN_SEQUENCE_COUNT
        if len(store_train) < LOOKBACK + PREDICT + MIN_SEQUENCE_COUNT:
            continue

        features = ['clipped_SQ','weekday','month','season', 'is_holiday', 'rolling_mean_7']
        scaler = MinMaxScaler()
        ##매출수량만 정규화, 요일정보 x
        store_train[['clipped_SQ','rolling_mean_7']] = scaler.fit_transform(store_train[['clipped_SQ', 'rolling_mean_7']])
        train_vals = store_train[features].values  # shape: (N, 5)

        # 시퀀스 생성
        X, y = [], []
        for i in range(len(train_vals) - LOOKBACK - PREDICT + 1):
            X.append(train_vals[i:i + LOOKBACK])
            y.append(train_vals[i + LOOKBACK:i + LOOKBACK + PREDICT, 0])  # 매출수량만 예측

        X = torch.tensor(X).float()
        y = torch.tensor(y).float()

        # ✅ Train/Validation split (80% / 20%)
        split_idx = int(len(X) * 0.8)
        X_train, X_val = X[:split_idx], X[split_idx:]
        y_train, y_val = y[:split_idx], y[split_idx:]

        X_train, y_train = X_train.to(DEVICE), y_train.to(DEVICE)
        X_val, y_val = X_val.to(DEVICE), y_val.to(DEVICE)

        model = MultiOutputLSTM(input_dim=6, output_dim=PREDICT).to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.MSELoss()

        #loss 저장 리스트
        train_losses = []
        val_losses = []

        model.train()
        for epoch in range(EPOCHS):
            model.train()
            total_loss = 0  # 전체 에폭의 평균 loss 계산용
            idx = torch.randperm(len(X_train))  # 데이터 섞기
            
            for i in range(0, len(X_train), BATCH_SIZE):
                batch_idx = idx[i:i+BATCH_SIZE]
                X_batch, y_batch = X_train[batch_idx], y_train[batch_idx]
                
                output = model(X_batch)
                loss = criterion(output, y_batch)
                
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                
                total_loss += loss.item()
            
            avg_loss = total_loss / (len(X_train) // BATCH_SIZE + 1)
            
            train_losses.append(avg_loss)  # loss 기록
            # 검증 손실 계산
            model.eval()
            with torch.no_grad():
                val_output = model(X_val)
                val_loss = criterion(val_output, y_val).item()
                val_losses.append(val_loss)

        #loss 시각화
        visualize_loss(train_losses, val_losses, key, save=True)

        trained_models[store_menu] = {
            'model': model.eval(),
            'scaler': scaler,
            'last_sequence': train_vals[-LOOKBACK:]  # (28, 6)
        }

    return trained_models

def visualize_loss(train_losses, val_losses, store_menu, save=False, out_dir="./loss_plots"):
    plt.figure()
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title(f"[{store_menu}] Train vs Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.grid(True)
    
    if save:
        os.makedirs(out_dir, exist_ok=True)
        safe_name = re.sub(r'[^\w\-_.]', '_', store_menu)  # 파일명 안전하게 처리
        path = os.path.join(out_dir, f"{safe_name}.png")
        plt.savefig(path)
    else:
        plt.show()

    plt.close()

#Prediction
def predict_lstm(test_df, trained_models, test_prefix: str, lower_bound_dict: dict):
    results = []

    for store_menu, store_test in test_df.groupby(['영업장명_메뉴명']):
        key = store_menu
        if key not in trained_models:
            continue

        model = trained_models[key]['model']
        scaler = trained_models[key]['scaler']

        store_test_sorted = store_test.sort_values('영업일자')
        store_test_sorted['영업일자'] = pd.to_datetime(store_test_sorted['영업일자'])
        store_test_sorted['weekday'] = store_test_sorted['영업일자'].dt.dayofweek
        store_test_sorted['month'] = store_test_sorted['영업일자'].dt.month
        store_test_sorted['season'] = store_test_sorted['month'].map({
            12:0,1:0,2:0, #winter
            3:1,4:1,5:1, #spring
            6:2,7:2,8:2, #summer
            9:3,10:3,11:3 #fall
        })  

        #month, season 정규화
        store_test_sorted['month'] /= MONTH_SCALE
        store_test_sorted['season'] /= SEASON_SCALE
        # holiday feature 추가
        store_test_sorted = generate_combined_holiday_list(store_test_sorted, solar_md_holidays, lunar_solar_dates)

        # rolling_mean_7 계산
        store_test_sorted['rolling_mean_7'] = store_test_sorted['매출수량'].rolling(window=7, min_periods=1).mean()
        
        feature_cols = ['매출수량','weekday','month','season','is_holiday','rolling_mean_7']
        recent_vals = store_test_sorted[feature_cols].values[-LOOKBACK:]

        if len(recent_vals) < LOOKBACK:
            recent_vals = trained_models[key]['last_sequence']  # (28, 6)
        
        # 매출수량만 정규화
        recent_vals[:, [0, 5]] = scaler.transform(recent_vals[:, [0, 5]])

        x_input = torch.tensor([recent_vals]).float().to(DEVICE)

        with torch.no_grad():
            pred_scaled = model(x_input).squeeze().cpu().numpy()

        lower_bound = lower_bound_dict.get(key, 0)

        # 역변환
        restored = []
        for i in range(PREDICT):
            val = pred_scaled[i]
            dummy_input = np.zeros((1, 2))
            dummy_input[0, 0] = val  # clipped_SQ만 역변환
            restored_val = scaler.inverse_transform(dummy_input)[0][0]
            restored.append(max(restored_val, lower_bound))



        # 예측일자: TEST_00+1일 ~ TEST_00+7일
        pred_dates = [f"{test_prefix}+{i+1}일" for i in range(PREDICT)]

        for d, val in zip(pred_dates, restored):
            results.append({
                '영업일자': d,
                '영업장명_메뉴명': store_menu[0],
                '매출수량': val
            })
    

    return pd.DataFrame(results)

def convert_to_submission_format(pred_df: pd.DataFrame, sample_submission: pd.DataFrame):
    # (영업일자, 메뉴) → 매출수량 딕셔너리로 변환
    pred_dict = dict(zip(
        zip(pred_df['영업일자'], pred_df['영업장명_메뉴명']),
        pred_df['매출수량']
    ))

    final_df = sample_submission.copy()

    for row_idx in final_df.index:
        date = final_df.loc[row_idx, '영업일자']
        for col in final_df.columns[1:]:  # 메뉴명들
            final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)

    return final_df


  .apply(lambda g: remove_leading_zeros_before_sales(g, min_zero_days))


In [31]:
# 학습
trained_models = train_lstm(train)

Training LSTM: 100%|██████████| 193/193 [16:51<00:00,  5.24s/it]


In [34]:
all_preds = []

# 모든 test_*.csv 순회
test_files = sorted(glob.glob('./test/TEST_*.csv'))
df = pd.read_csv('./train/train.csv')
lower_bound_dict = compute_iqr_lower_bounds(df)
for path in test_files:
    test_df = pd.read_csv(path)
    # 파일명에서 접두어 추출 (예: TEST_00)
    filename = os.path.basename(path)
    test_prefix = re.search(r'(TEST_\d+)', filename).group(1)

    pred_df = predict_lstm(test_df, trained_models, test_prefix, lower_bound_dict)
    all_preds.append(pred_df)
    
full_pred_df = pd.concat(all_preds, ignore_index=True)



In [35]:
sample_submission = pd.read_csv('./sample_submission.csv')
submission = convert_to_submission_format(full_pred_df, sample_submission)
submission.to_csv('./Prediction/6th.csv', index=False, encoding='utf-8-sig')
result = pd.read_csv('./Prediction/6th.csv')
display(result.head())

  final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)
  final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)
  final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)
  final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)
  final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)
  final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)
  final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)
  final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)
  final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)
  final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)
  final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)
  final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)
  final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)
  final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)
  final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)
  final_df.loc[row_idx, col] = pred_dict.get((date, col), 0)
  final_df.loc[row_idx, 

Unnamed: 0,영업일자,느티나무 셀프BBQ_1인 수저세트,느티나무 셀프BBQ_BBQ55(단체),"느티나무 셀프BBQ_대여료 30,000원","느티나무 셀프BBQ_대여료 60,000원","느티나무 셀프BBQ_대여료 90,000원","느티나무 셀프BBQ_본삼겹 (단품,실내)",느티나무 셀프BBQ_스프라이트 (단체),느티나무 셀프BBQ_신라면,느티나무 셀프BBQ_쌈야채세트,...,화담숲주막_스프라이트,화담숲주막_참살이 막걸리,화담숲주막_찹쌀식혜,화담숲주막_콜라,화담숲주막_해물파전,화담숲카페_메밀미숫가루,화담숲카페_아메리카노 HOT,화담숲카페_아메리카노 ICE,화담숲카페_카페라떼 ICE,화담숲카페_현미뻥스크림
0,TEST_00+1일,10.96898,0.0,9.285786,3.740235,1.193645,0.55961,0.0,2.568079,0.956581,...,6.144767,30.623226,12.785031,11.654114,61.236351,35.227487,5.413418,29.628756,5.539185,17.561629
1,TEST_00+2일,0.0,0.0,1.909325,0.929242,0.143671,0.600511,3.691389,1.414145,0.163802,...,5.500705,18.281163,6.987962,11.198621,32.182485,16.290788,3.848147,22.238572,5.034027,10.148675
2,TEST_00+3일,0.454125,2.144058,2.892087,1.339023,0.126873,0.384338,5.070331,0.960608,0.058272,...,3.67112,10.304148,5.21463,7.87453,19.737272,13.706752,3.465967,20.316115,3.376217,9.090602
3,TEST_00+4일,4.583421,11.086167,2.433054,1.379726,0.056286,0.167746,7.576319,2.147124,0.0,...,3.247884,10.863139,8.017961,7.948702,26.126058,17.501001,2.340297,15.337673,4.179523,8.697896
4,TEST_00+5일,8.533413,27.62652,3.033821,1.907385,0.090239,0.16074,14.096834,5.103465,0.0,...,4.896689,13.537412,10.926623,10.271109,35.677871,11.989799,4.385347,20.231477,6.385289,9.557989
