#Kaggle的Store Sales - Time Series Forecasting任务

In [1]:
!unzip store-sales-time-series-forecasting.zip && rm -rf store-sales-time-series-forecasting.zip

Archive:  store-sales-time-series-forecasting.zip
  inflating: holidays_events.csv     
  inflating: oil.csv                 
  inflating: sample_submission.csv   
  inflating: stores.csv              
  inflating: test.csv                
  inflating: train.csv               
  inflating: transactions.csv        


In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

def simple_preprocess(df):
    """Clean the raw sales frame before feature engineering.

    Operates on a copy of ``df``: parses ``date`` into datetimes, discards
    rows whose ``sales`` value is missing or negative, and orders the rows by
    store, product family and date.
    """
    cleaned = df.copy()
    cleaned['date'] = pd.to_datetime(cleaned['date'])

    # Remove rows without a sales figure first, then the negative ones.
    cleaned = cleaned[cleaned['sales'].notna()]
    cleaned = cleaned[cleaned['sales'].ge(0)]

    return cleaned.sort_values(by=['store_nbr', 'family', 'date'])

def create_simple_features(df):
    """Add calendar, lag, rolling-window and promotion features.

    Works on a copy of ``df`` (which must already hold a datetime ``date``
    column). Rows are re-sorted by store, family and date and re-indexed so
    that the per-group rolling results, which come back in group order, line
    up positionally with the frame. Any NaN left by the lags or by the
    single-observation standard deviations is replaced with 0.
    """
    frame = df.copy()
    group_keys = ['store_nbr', 'family']

    # Calendar features.
    when = frame['date'].dt
    frame['month'] = when.month
    frame['dayofweek'] = when.dayofweek
    frame['day'] = when.day
    frame['quarter'] = when.quarter
    frame['is_weekend'] = frame['dayofweek'].ge(5).astype(int)
    frame['is_month_start'] = frame['day'].le(5).astype(int)
    frame['is_month_end'] = frame['day'].ge(25).astype(int)

    # Promotion count: missing means no promotion.
    frame['onpromotion'] = frame['onpromotion'].fillna(0)

    # Sorting by the group keys keeps the positional ``.values`` assignments
    # below aligned with the grouped results.
    frame = frame.sort_values(group_keys + ['date']).reset_index(drop=True)

    # Lagged sales within each (store, family) series.
    for lag in (1, 7, 14, 28):
        frame[f'sales_lag{lag}'] = frame.groupby(group_keys)['sales'].shift(lag)

    # Rolling mean / standard deviation of sales (window includes the current row).
    for window in (7, 14, 28):
        roller = frame.groupby(group_keys)['sales'].rolling(window, min_periods=1)
        window_mean = roller.mean().values
        window_std = roller.std().values
        frame[f'sales_rolling_mean_{window}'] = window_mean
        frame[f'sales_rolling_std_{window}'] = window_std

    # Promotion history features.
    promo = frame.groupby(group_keys)['onpromotion']
    promo_previous = promo.shift(1)
    promo_week_total = promo.rolling(7, min_periods=1).sum().values
    frame['promo_lag1'] = promo_previous
    frame['promo_rolling_sum_7'] = promo_week_total

    return frame.fillna(0)

def create_sequences_simple(df, seq_len=14, pred_len=15):
    """Slice every (store, family) series into sliding training windows.

    Each sample pairs ``seq_len`` consecutive rows of features with the
    ``pred_len`` sales values that immediately follow them. Series shorter
    than ``seq_len + pred_len`` rows are skipped.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays built from the collected windows.
    """
    feature_cols = ['month', 'dayofweek', 'day', 'quarter', 'is_weekend', 'is_month_start', 'is_month_end',
                   'onpromotion', 'sales_lag1', 'sales_lag7', 'sales_lag14', 'sales_lag28',
                   'sales_rolling_mean_7', 'sales_rolling_mean_14', 'sales_rolling_mean_28',
                   'sales_rolling_std_7', 'sales_rolling_std_14', 'sales_rolling_std_28',
                   'promo_lag1', 'promo_rolling_sum_7']
    span = seq_len + pred_len
    inputs, targets = [], []

    for _, series in df.groupby(['store_nbr', 'family']):
        series = series.sort_values('date')
        n_rows = len(series)
        if n_rows < span:
            continue

        feature_matrix = series[feature_cols].values
        sales_values = series['sales'].values

        # One window per admissible start offset.
        starts = range(n_rows - span + 1)
        inputs.extend(feature_matrix[s:s + seq_len] for s in starts)
        targets.extend(sales_values[s + seq_len:s + span] for s in starts)

    return np.array(inputs), np.array(targets)

def create_dataloaders_simple(X, y, batch_size=32):
    """Scale the sequences and wrap them in train / hold-out DataLoaders.

    The first 80% of the samples (in the order given) form the training split
    and the remainder the hold-out split. Both scalers are fitted on the
    training split only, so that hold-out statistics do not leak into the
    normalisation, and are then applied to every sample.

    Args:
        X: Array whose shape unpacks to (n_samples, seq_len, n_features).
        y: 2-D array of targets, one row per sample.
        batch_size: Batch size used by both loaders.

    Returns:
        Tuple ``(train_loader, test_loader, scaler_X, scaler_y)``.

    Raises:
        ValueError: If there are too few samples to form a training split.
    """
    n_samples, seq_len, n_features = X.shape
    split = int(0.8 * n_samples)
    if split == 0:
        raise ValueError(
            f"Need at least 2 samples to build a training split, got {n_samples}"
        )

    # Fit on the training slice only; transform everything with those statistics.
    scaler_X = StandardScaler()
    scaler_X.fit(X[:split].reshape(-1, n_features))
    X_scaled = scaler_X.transform(X.reshape(-1, n_features))
    X_scaled = X_scaled.reshape(n_samples, seq_len, n_features)

    scaler_y = StandardScaler()
    scaler_y.fit(y[:split])
    y_scaled = scaler_y.transform(y)

    X_train, X_test = X_scaled[:split], X_scaled[split:]
    y_train, y_test = y_scaled[:split], y_scaled[split:]

    train_set = TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
    test_set = TensorDataset(torch.FloatTensor(X_test), torch.FloatTensor(y_test))

    # Only the training loader is shuffled; the hold-out order stays fixed.
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_set, batch_size=batch_size)

    return train_loader, test_loader, scaler_X, scaler_y

def process_data_simple(df):
    """Run the full training-data pipeline and report the resulting sizes.

    Chains preprocessing, feature engineering, window creation and loader
    construction, prints a short summary, and returns
    ``(train_loader, test_loader, n_features, pred_len, scaler_X, scaler_y)``.
    """
    featured = create_simple_features(simple_preprocess(df))
    X, y = create_sequences_simple(featured)
    loaders_and_scalers = create_dataloaders_simple(X, y)
    train_loader, test_loader, scaler_X, scaler_y = loaders_and_scalers

    seq_len, n_features = X.shape[1], X.shape[2]
    pred_len = y.shape[1]

    print(f"训练样本: {len(train_loader.dataset)}")
    print(f"测试样本: {len(test_loader.dataset)}")
    print(f"输入特征数: {n_features}")
    print(f"输入序列长度: {seq_len}天")
    print(f"预测序列长度: {pred_len}天")

    return train_loader, test_loader, n_features, pred_len, scaler_X, scaler_y

def process_test_data(test_df, train_df, scaler_X, scaler_y, seq_len=14):
    """Build one scaled model input window per (store, family) in the test set.

    Calendar features come from the test dates. Sales lag / rolling features
    are not observable for the test period, so they are filled with per-series
    statistics of ``train_df`` (latest, mean and standard deviation of sales).

    NOTE(review): create_sequences_simple pairs each target horizon with the
    ``seq_len`` days *preceding* it, whereas the window built here is taken
    from the first ``seq_len`` days of the forecast period itself with the
    sales features held constant. Confirm this mismatch is intended.

    Args:
        test_df: Test rows with ``id``, ``date``, ``store_nbr``, ``family``
            and ``onpromotion`` columns.
        train_df: Training rows with ``date``, ``store_nbr``, ``family`` and
            ``sales`` columns.
        scaler_X: Fitted feature scaler exposing ``transform``.
        scaler_y: Unused here; kept for interface compatibility.
        seq_len: Number of rows in each input window (must match training).

    Returns:
        Tuple ``(X_test_scaled, all_predictions_info)`` where the array has
        shape (n_series, seq_len, n_features) and the list holds, per series,
        a dict with ``store``, ``family`` and the test ``ids`` in date order.
    """
    test_df = test_df.copy()
    test_df['date'] = pd.to_datetime(test_df['date'])
    test_df['onpromotion'] = test_df['onpromotion'].fillna(0)

    # Calendar features.
    test_df['month'] = test_df['date'].dt.month
    test_df['dayofweek'] = test_df['date'].dt.dayofweek
    test_df['day'] = test_df['date'].dt.day
    test_df['quarter'] = test_df['date'].dt.quarter
    test_df['is_weekend'] = (test_df['date'].dt.dayofweek >= 5).astype(int)
    test_df['is_month_start'] = (test_df['date'].dt.day <= 5).astype(int)
    test_df['is_month_end'] = (test_df['date'].dt.day >= 25).astype(int)

    # Order the history by date (stable sort) so that 'last' really is the
    # most recent observation, whatever row order the caller passed in.
    history = train_df.sort_values('date', kind='mergesort')
    last_stats = (
        history.groupby(['store_nbr', 'family'])['sales']
        .agg(['last', 'mean', 'std'])
        .reset_index()
    )
    last_stats.columns = ['store_nbr', 'family', 'sales_lag1', 'sales_mean', 'sales_std']

    test_df = test_df.merge(last_stats, on=['store_nbr', 'family'], how='left')

    # Longer lags are approximated by the most recent known sales value.
    for lag in [7, 14, 28]:
        test_df[f'sales_lag{lag}'] = test_df['sales_lag1']

    # Rolling statistics are approximated by the whole-history statistics.
    for window in [7, 14, 28]:
        test_df[f'sales_rolling_mean_{window}'] = test_df['sales_mean']
        test_df[f'sales_rolling_std_{window}'] = test_df['sales_std']

    # Promotion features: heuristic stand-ins for the lag and 7-day sum.
    test_df['promo_lag1'] = test_df['onpromotion']
    test_df['promo_rolling_sum_7'] = test_df['onpromotion'] * 3

    # Series unseen in training (and single-row std) end up as 0.
    test_df = test_df.fillna(0)

    feature_cols = ['month', 'dayofweek', 'day', 'quarter', 'is_weekend', 'is_month_start', 'is_month_end',
                   'onpromotion', 'sales_lag1', 'sales_lag7', 'sales_lag14', 'sales_lag28',
                   'sales_rolling_mean_7', 'sales_rolling_mean_14', 'sales_rolling_mean_28',
                   'sales_rolling_std_7', 'sales_rolling_std_14', 'sales_rolling_std_28',
                   'promo_lag1', 'promo_rolling_sum_7']
    X_test = []
    all_predictions_info = []

    for (store, family), group in test_df.groupby(['store_nbr', 'family']):
        group = group.sort_values('date')

        if len(group) >= seq_len:
            features = group[feature_cols].iloc[:seq_len].values
        else:
            # Too few rows for a full window: fall back to an all-zero input.
            print(f"警告: 店铺{store}的{family}类别数据不足")
            features = np.zeros((seq_len, len(feature_cols)))

        X_test.append(features)
        all_predictions_info.append({
            'store': store,
            'family': family,
            'ids': group['id'].values
        })

    if not X_test:
        # Empty test frame: return a correctly shaped empty batch.
        return np.empty((0, seq_len, len(feature_cols))), all_predictions_info

    X_test = np.array(X_test)
    n_samples, window_len, n_features = X_test.shape
    X_test_scaled = scaler_X.transform(X_test.reshape(-1, n_features))
    X_test_scaled = X_test_scaled.reshape(n_samples, window_len, n_features)

    return X_test_scaled, all_predictions_info

class LSTM(nn.Module):
    """Stacked LSTM followed by a linear head for multi-step forecasting.

    The network reads a batch-first sequence and maps the hidden output of the
    last time step to ``output_size`` values.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # Inter-layer dropout is only meaningful with more than one layer.
            dropout=0.2 if num_layers > 1 else 0
        )

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return predictions of shape (batch, output_size) for ``x``.

        ``x`` is expected as (batch, seq_len, input_size).
        """
        batch_size = x.size(0)
        state_shape = (self.num_layers, batch_size, self.hidden_size)

        # Allocate the zero initial states from the input so they always share
        # its device and dtype (any CUDA index, float64, ...), instead of
        # creating float32 CPU tensors and special-casing ``.cuda()``.
        h_0 = x.new_zeros(state_shape)
        c_0 = x.new_zeros(state_shape)

        lstm_out, _ = self.lstm(x, (h_0, c_0))
        last_output = lstm_out[:, -1, :]
        return self.fc(last_output)

def train_and_evaluate(train_loader, test_loader, n_features, pred_len, device, num_epochs=12):
    """Train an LSTM forecaster with MSE loss and report the hold-out loss.

    A two-layer, 64-unit LSTM is optimised with Adam (lr=1e-4) for
    ``num_epochs`` passes over ``train_loader``. Progress is printed every 500
    batches and at the end of each epoch; afterwards the mean batch loss on
    ``test_loader`` is printed. Returns the trained model.
    """
    net = LSTM(input_size=n_features, hidden_size=64, num_layers=2, output_size=pred_len).to(device)
    loss_fn = nn.MSELoss()
    adam = torch.optim.Adam(net.parameters(), lr=0.0001)

    # Training phase.
    net.train()
    for epoch in range(num_epochs):
        running_loss = 0
        for step, (inputs, targets) in enumerate(train_loader, start=1):
            inputs = inputs.to(device)
            targets = targets.to(device)

            batch_loss = loss_fn(net(inputs), targets)

            adam.zero_grad()
            batch_loss.backward()
            adam.step()

            batch_value = batch_loss.item()
            running_loss += batch_value

            if step % 500 == 0:
                print(f"Epoch {epoch+1}, Batch {step}, Loss: {batch_value:.4f}")

        print(f"Epoch {epoch+1}, Average Loss: {running_loss/len(train_loader):.4f}")

    # Evaluation phase: no gradients, dropout disabled.
    net.eval()
    held_out_loss = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            held_out_loss += loss_fn(net(inputs), targets).item()

    print(f"Test Loss: {held_out_loss/len(test_loader):.4f}")

    return net

if __name__ == "__main__":
    # End-to-end run: train on train.csv, predict test.csv, write submission.csv.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load the raw competition files.
    data = pd.read_csv('train.csv')
    test_data = pd.read_csv('test.csv')

    # Preprocess and build the loaders.
    (train_loader, test_loader, n_features,
     pred_len, scaler_X, scaler_y) = process_data_simple(data)

    # Train and evaluate.
    model = train_and_evaluate(train_loader, test_loader, n_features, pred_len, device, num_epochs=10)

    # Build the test windows and predict them in a single forward pass.
    X_test, predictions_info = process_test_data(test_data, data, scaler_X, scaler_y)
    X_test_tensor = torch.FloatTensor(X_test).to(device)
    model.eval()
    with torch.no_grad():
        predictions_scaled = model(X_test_tensor)

    # Undo the target scaling.
    predictions = scaler_y.inverse_transform(predictions_scaled.cpu().numpy())

    # Map every test id to a non-negative prediction; ids beyond the forecast
    # horizon reuse the last predicted value.
    submission_data = {}
    for i, info in enumerate(predictions_info):
        pred_15days = predictions[i]
        ids = info['ids']
        horizon = len(pred_15days)

        for j, test_id in enumerate(ids):
            chosen = pred_15days[j] if j < horizon else pred_15days[-1]
            submission_data[test_id] = max(0, chosen)

    test_original = pd.read_csv('test.csv')
    submission_df = test_original[['id']].copy()
    mapped_sales = submission_df['id'].map(submission_data)
    submission_df['sales'] = mapped_sales
    submission_df['sales'] = submission_df['sales'].fillna(0)

    submission_df.to_csv('submission.csv', index=False)
    print(f"提交文件已生成: {len(submission_df)} 行")
    print(f"空值数量: {submission_df['sales'].isna().sum()}")

Epoch 10, Batch 34000, Loss: 0.1242
Epoch 10, Batch 34500, Loss: 0.1262
Epoch 10, Batch 35000, Loss: 0.0115
Epoch 10, Batch 35500, Loss: 0.0088
Epoch 10, Batch 36000, Loss: 0.0082
Epoch 10, Batch 36500, Loss: 0.0077
Epoch 10, Batch 37000, Loss: 0.0055
Epoch 10, Batch 37500, Loss: 0.0481
Epoch 10, Batch 38000, Loss: 0.0720
Epoch 10, Batch 38500, Loss: 0.0014
Epoch 10, Batch 39000, Loss: 0.0069
Epoch 10, Batch 39500, Loss: 0.0459
Epoch 10, Batch 40000, Loss: 0.0165
Epoch 10, Batch 40500, Loss: 0.0056
Epoch 10, Batch 41000, Loss: 0.0071
Epoch 10, Batch 41500, Loss: 0.0132
Epoch 10, Batch 42000, Loss: 0.0084
Epoch 10, Batch 42500, Loss: 0.0090
Epoch 10, Batch 43000, Loss: 0.0021
Epoch 10, Batch 43500, Loss: 0.0026
Epoch 10, Batch 44000, Loss: 0.1029
Epoch 10, Batch 44500, Loss: 0.0155
Epoch 10, Batch 45000, Loss: 0.1135
Epoch 10, Batch 45500, Loss: 0.0017
Epoch 10, Batch 46000, Loss: 0.0559
Epoch 10, Batch 46500, Loss: 0.0059
Epoch 10, Batch 47000, Loss: 0.0095
Epoch 10, Batch 47500, Loss:

In [10]:
#尝试融合外部数据，效果不佳需要尝试别的模型
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

def load_and_merge_data():
    """Load train/test CSVs and merge the transactions and store metadata.

    Reads ``train.csv`` and ``test.csv`` from the working directory and, for
    each of them, left-joins ``transactions.csv`` (on date and store) and
    ``stores.csv`` (on store). If an auxiliary file is missing, the columns it
    would have produced are filled with 0 instead.

    Categorical store columns are label-encoded on ``stores.csv`` itself
    before merging, so train and test always receive identical codes even if
    one of them does not contain every store. Rows whose store is absent from
    ``stores.csv`` get the code -1.

    Returns:
        Tuple ``(train_df, test_df)`` of merged DataFrames.
    """
    print("加载数据...")
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')

    def merge_external(df, is_train=True):
        """Attach transactions and store features to one frame."""
        # --- transactions ---
        try:
            trans_df = pd.read_csv('transactions.csv')
        except FileNotFoundError:
            print("✗ transactions.csv 未找到")
            df['transactions'] = df['avg_basket_size'] = 0
        else:
            trans_df['date'] = pd.to_datetime(trans_df['date'])
            df['date'] = pd.to_datetime(df['date'])
            df = df.merge(trans_df, on=['date', 'store_nbr'], how='left')
            df['transactions'] = df['transactions'].fillna(0)
            if is_train:
                # +1 avoids division by zero on days without transactions.
                df['avg_basket_size'] = df['sales'] / (df['transactions'] + 1)
            else:
                # Sales are unknown for the test period.
                df['avg_basket_size'] = 0
            print("✓ transactions.csv 融合成功")

        # --- stores ---
        try:
            stores_df = pd.read_csv('stores.csv')
        except FileNotFoundError:
            print("✗ stores.csv 未找到")
            for col in ['city_encoded', 'state_encoded', 'type_encoded', 'cluster']:
                df[col] = 0
        else:
            # Encode on the store table so the codes do not depend on which
            # stores happen to appear in the frame being merged.
            encoded_cols = []
            for col in ['city', 'state', 'type']:
                if col in stores_df.columns:
                    le = LabelEncoder()
                    # Fill before casting: astype(str) would turn NaN into 'nan'.
                    labels = stores_df[col].fillna('unknown').astype(str)
                    stores_df[f'{col}_encoded'] = le.fit_transform(labels)
                    encoded_cols.append(f'{col}_encoded')

            df = df.merge(stores_df, on='store_nbr', how='left')

            # Stores missing from stores.csv have no code after the left join.
            for col in encoded_cols:
                df[col] = df[col].fillna(-1).astype(int)
            df['cluster'] = df['cluster'].fillna(0)
            print("✓ stores.csv 融合成功")

        return df

    train_df = merge_external(train_df, True)
    test_df = merge_external(test_df, False)

    return train_df, test_df

def create_features(df, is_train=True):
    """Add calendar, promotion, transaction and (for training) sales features.

    Works on a copy of ``df``. Rows are sorted by store, family and date and
    re-indexed so the grouped rolling results can be assigned positionally.
    Remaining NaN values are replaced with 0.

    ``transactions_lag7`` is always created (all zeros when no transaction
    data is available) so that downstream code selecting it by name works
    whether or not ``transactions.csv`` could be merged.

    Args:
        df: Frame with ``date``, ``store_nbr``, ``family``, ``onpromotion``
            and, when ``is_train`` is true, ``sales``.
        is_train: When true, invalid sales rows are dropped and the sales lag
            and rolling features are computed.

    Returns:
        The feature-augmented DataFrame.
    """
    print("特征工程中...")
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])

    if is_train:
        df = df.dropna(subset=['sales'])
        df = df[df['sales'] >= 0]

    # Calendar features.
    time_features = {
        'month': df['date'].dt.month,
        'dayofweek': df['date'].dt.dayofweek,
        'day': df['date'].dt.day,
        'quarter': df['date'].dt.quarter,
        'is_weekend': (df['date'].dt.dayofweek >= 5).astype(int),
        'is_month_start': (df['date'].dt.day <= 5).astype(int),
        'is_month_end': (df['date'].dt.day >= 25).astype(int)
    }
    for name, values in time_features.items():
        df[name] = values

    df['onpromotion'] = df['onpromotion'].fillna(0)
    # Sorting by the group keys keeps the ``.values`` assignments aligned.
    df = df.sort_values(['store_nbr', 'family', 'date']).reset_index(drop=True)

    group_keys = ['store_nbr', 'family']

    # Sales history features only exist where sales are known.
    if is_train:
        for lag in [1, 7, 14, 28]:
            df[f'sales_lag{lag}'] = df.groupby(group_keys)['sales'].shift(lag)

        for window in [7, 14, 28]:
            rolling = df.groupby(group_keys)['sales'].rolling(window, min_periods=1)
            df[f'sales_rolling_mean_{window}'] = rolling.mean().values
            df[f'sales_rolling_std_{window}'] = rolling.std().values

    # Promotion features.
    df['promo_lag1'] = df.groupby(group_keys)['onpromotion'].shift(1)
    df['promo_rolling_sum_7'] = df.groupby(group_keys)['onpromotion'].rolling(7, min_periods=1).sum().values

    # Transaction feature: created unconditionally. With an all-zero
    # transactions column the shifted values are zero after the fill below,
    # which matches the zero default used when the column is absent.
    if 'transactions' in df.columns:
        df['transactions_lag7'] = df.groupby(group_keys)['transactions'].shift(7)
    else:
        df['transactions_lag7'] = 0

    df = df.fillna(0)

    feature_count = len([col for col in df.columns if col not in ['id', 'date', 'store_nbr', 'family', 'sales']])
    print(f"特征工程完成，特征数: {feature_count}")

    return df

def create_sequences(df, seq_len=14, pred_len=15):
    """Slice every (store, family) series into sliding training windows.

    Each sample pairs ``seq_len`` consecutive rows of the 27 feature columns
    with the ``pred_len`` sales values that immediately follow. Series with
    fewer than ``seq_len + pred_len`` rows are skipped. The resulting array
    shapes are printed before returning ``(X, y)``.
    """
    print("时序处理中...")

    feature_cols = [
        'month', 'dayofweek', 'day', 'quarter', 'is_weekend', 'is_month_start', 'is_month_end',
        'onpromotion', 'sales_lag1', 'sales_lag7', 'sales_lag14', 'sales_lag28',
        'sales_rolling_mean_7', 'sales_rolling_mean_14', 'sales_rolling_mean_28',
        'sales_rolling_std_7', 'sales_rolling_std_14', 'sales_rolling_std_28',
        'promo_lag1', 'promo_rolling_sum_7', 'transactions', 'avg_basket_size',
        'transactions_lag7', 'city_encoded', 'state_encoded', 'type_encoded', 'cluster'
    ]

    span = seq_len + pred_len
    inputs, targets = [], []

    for _, series in df.groupby(['store_nbr', 'family']):
        series = series.sort_values('date')
        n_rows = len(series)
        if n_rows < span:
            continue

        feature_matrix = series[feature_cols].values
        sales_values = series['sales'].values

        # One window per admissible start offset.
        starts = range(n_rows - span + 1)
        inputs.extend(feature_matrix[s:s + seq_len] for s in starts)
        targets.extend(sales_values[s + seq_len:s + span] for s in starts)

    X = np.array(inputs)
    y = np.array(targets)
    print(f"时序处理完成，序列形状: X{X.shape}, y{y.shape}")

    return X, y

def create_dataloaders(X, y, batch_size=32):
    """Scale the sequences and wrap them in train / validation DataLoaders.

    The first 80% of the samples (in the order given) form the training split
    and the remainder the validation split. Both scalers are fitted on the
    training split only, so that validation statistics do not leak into the
    normalisation, and are then applied to every sample.

    Args:
        X: Array whose shape unpacks to (n_samples, seq_len, n_features).
        y: 2-D array of targets, one row per sample.
        batch_size: Batch size used by both loaders.

    Returns:
        Tuple ``(train_loader, val_loader, scaler_X, scaler_y)``.

    Raises:
        ValueError: If there are too few samples to form a training split.
    """
    print("批量化处理中...")

    n_samples, seq_len, n_features = X.shape
    split = int(0.8 * n_samples)
    if split == 0:
        raise ValueError(
            f"Need at least 2 samples to build a training split, got {n_samples}"
        )

    # Feature scaling: statistics from the training slice only.
    scaler_X = StandardScaler()
    scaler_X.fit(X[:split].reshape(-1, n_features))
    X_scaled = scaler_X.transform(X.reshape(-1, n_features))
    X_scaled = X_scaled.reshape(n_samples, seq_len, n_features)

    # Target scaling: same rule.
    scaler_y = StandardScaler()
    scaler_y.fit(y[:split])
    y_scaled = scaler_y.transform(y)

    X_train, X_val = X_scaled[:split], X_scaled[split:]
    y_train, y_val = y_scaled[:split], y_scaled[split:]

    train_dataset = TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
    val_dataset = TensorDataset(torch.FloatTensor(X_val), torch.FloatTensor(y_val))

    # Only the training loader is shuffled; validation order stays fixed.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    print(f"批量化处理完成，训练批次: {len(train_loader)}, 验证批次: {len(val_loader)}")

    return train_loader, val_loader, scaler_X, scaler_y

class LSTM(nn.Module):
    """Stacked LSTM followed by a linear head for multi-step forecasting.

    The network reads a batch-first sequence and maps the hidden output of the
    last time step to ``output_size`` values.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per sequence.
    """

    def __init__(self, input_size, hidden_size=64, num_layers=2, output_size=15):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # Inter-layer dropout is only meaningful with more than one layer.
            dropout=0.2 if num_layers > 1 else 0
        )

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return predictions of shape (batch, output_size) for ``x``.

        ``x`` is expected as (batch, seq_len, input_size).
        """
        batch_size = x.size(0)
        state_shape = (self.num_layers, batch_size, self.hidden_size)

        # Allocate the zero initial states from the input so they always share
        # its device and dtype (any CUDA index, float64, ...), instead of
        # creating float32 CPU tensors and special-casing ``.cuda()``.
        h_0 = x.new_zeros(state_shape)
        c_0 = x.new_zeros(state_shape)

        lstm_out, _ = self.lstm(x, (h_0, c_0))
        last_output = lstm_out[:, -1, :]
        return self.fc(last_output)

def train_and_evaluate(train_loader, val_loader, input_size, device, num_epochs=10):
    """Train an LSTM forecaster and validate it after every epoch.

    The model is built with the LSTM class defaults, optimised with Adam
    (lr=1e-4) against MSE loss, and evaluated on ``val_loader`` at the end of
    each epoch. Progress is printed every 500 batches together with the
    per-epoch mean train / validation batch losses. Returns the trained model.
    """
    print("开始训练模型...")

    net = LSTM(input_size=input_size).to(device)
    loss_fn = nn.MSELoss()
    adam = torch.optim.Adam(net.parameters(), lr=0.0001)

    for epoch in range(num_epochs):
        # --- training pass ---
        net.train()
        running_train = 0
        for step, (inputs, targets) in enumerate(train_loader, start=1):
            inputs = inputs.to(device)
            targets = targets.to(device)

            batch_loss = loss_fn(net(inputs), targets)

            adam.zero_grad()
            batch_loss.backward()
            adam.step()

            batch_value = batch_loss.item()
            running_train += batch_value

            if step % 500 == 0:
                print(f"Epoch {epoch+1}, Batch {step}, Loss: {batch_value:.4f}")

        avg_train_loss = running_train / len(train_loader)

        # --- validation pass: no gradients, dropout disabled ---
        net.eval()
        running_val = 0
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs = inputs.to(device)
                targets = targets.to(device)
                running_val += loss_fn(net(inputs), targets).item()

        avg_val_loss = running_val / len(val_loader)
        print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    print("模型训练完成")
    return net

def prepare_test_data(test_df, train_df, scaler_X, scaler_y, seq_len=14):
    """Build one scaled model input window per (store, family) in the test set.

    Calendar features come from the test dates. Sales lag / rolling features
    are not observable for the test period, so they are filled with per-series
    statistics of ``train_df`` (latest, mean and standard deviation of sales).
    Promotion features already present on ``test_df`` (for example those
    computed by create_features) are kept; heuristic stand-ins are only used
    when the columns are missing. Other missing external columns default to 0.

    NOTE(review): create_sequences pairs each target horizon with the
    ``seq_len`` days *preceding* it, whereas the window built here is taken
    from the first ``seq_len`` days of the forecast period itself with the
    sales features held constant. Confirm this mismatch is intended.

    Args:
        test_df: Test rows with at least ``id``, ``date``, ``store_nbr``,
            ``family`` and ``onpromotion`` columns.
        train_df: Training rows with ``date``, ``store_nbr``, ``family`` and
            ``sales`` columns.
        scaler_X: Fitted feature scaler exposing ``transform``.
        scaler_y: Unused here; kept for interface compatibility.
        seq_len: Number of rows in each input window (must match training).

    Returns:
        Tuple ``(X_test_scaled, predictions_info)`` where the array has shape
        (n_series, seq_len, n_features) and the list holds, per series, a dict
        with ``store``, ``family`` and the test ``ids`` in date order.
    """
    print("准备测试数据...")

    test_df = test_df.copy()
    test_df['date'] = pd.to_datetime(test_df['date'])
    test_df['onpromotion'] = test_df['onpromotion'].fillna(0)

    # Calendar features.
    time_features = {
        'month': test_df['date'].dt.month,
        'dayofweek': test_df['date'].dt.dayofweek,
        'day': test_df['date'].dt.day,
        'quarter': test_df['date'].dt.quarter,
        'is_weekend': (test_df['date'].dt.dayofweek >= 5).astype(int),
        'is_month_start': (test_df['date'].dt.day <= 5).astype(int),
        'is_month_end': (test_df['date'].dt.day >= 25).astype(int)
    }
    for name, values in time_features.items():
        test_df[name] = values

    # Order the history by date (stable sort) so that 'last' really is the
    # most recent observation, whatever row order the caller passed in.
    history = train_df.sort_values('date', kind='mergesort')
    train_stats = history.groupby(['store_nbr', 'family'])['sales'].agg(['last', 'mean', 'std']).reset_index()
    train_stats.columns = ['store_nbr', 'family', 'sales_last', 'sales_mean', 'sales_std']
    test_df = test_df.merge(train_stats, on=['store_nbr', 'family'], how='left')

    # Every sales lag is approximated by the most recent known sales value.
    for lag in [1, 7, 14, 28]:
        test_df[f'sales_lag{lag}'] = test_df['sales_last']

    # Rolling statistics are approximated by the whole-history statistics.
    for window in [7, 14, 28]:
        test_df[f'sales_rolling_mean_{window}'] = test_df['sales_mean']
        test_df[f'sales_rolling_std_{window}'] = test_df['sales_std']

    # Promotions are known for the test period, so features computed upstream
    # with the training definition are preferred over these approximations.
    if 'promo_lag1' not in test_df.columns:
        test_df['promo_lag1'] = test_df['onpromotion']
    if 'promo_rolling_sum_7' not in test_df.columns:
        test_df['promo_rolling_sum_7'] = test_df['onpromotion'] * 3

    for col in ['transactions', 'avg_basket_size', 'transactions_lag7', 'city_encoded', 'state_encoded', 'type_encoded', 'cluster']:
        if col not in test_df.columns:
            test_df[col] = 0

    # Series unseen in training (and single-row std) end up as 0.
    test_df = test_df.fillna(0)

    feature_cols = [
        'month', 'dayofweek', 'day', 'quarter', 'is_weekend', 'is_month_start', 'is_month_end',
        'onpromotion', 'sales_lag1', 'sales_lag7', 'sales_lag14', 'sales_lag28',
        'sales_rolling_mean_7', 'sales_rolling_mean_14', 'sales_rolling_mean_28',
        'sales_rolling_std_7', 'sales_rolling_std_14', 'sales_rolling_std_28',
        'promo_lag1', 'promo_rolling_sum_7', 'transactions', 'avg_basket_size',
        'transactions_lag7', 'city_encoded', 'state_encoded', 'type_encoded', 'cluster'
    ]

    X_test = []
    predictions_info = []

    for (store, family), group in test_df.groupby(['store_nbr', 'family']):
        group = group.sort_values('date')
        if len(group) >= seq_len:
            X_test.append(group[feature_cols].iloc[:seq_len].values)
        else:
            # Too few rows for a full window: fall back to an all-zero input.
            X_test.append(np.zeros((seq_len, len(feature_cols))))

        predictions_info.append({
            'store': store,
            'family': family,
            'ids': group['id'].values
        })

    if not X_test:
        # Empty test frame: return a correctly shaped empty batch.
        return np.empty((0, seq_len, len(feature_cols))), predictions_info

    X_test = np.array(X_test)
    n_samples, window_len, n_features = X_test.shape
    X_test_scaled = scaler_X.transform(X_test.reshape(-1, n_features))
    X_test_scaled = X_test_scaled.reshape(n_samples, window_len, n_features)

    return X_test_scaled, predictions_info

if __name__ == "__main__":
    # End-to-end run with external data: train, predict test.csv, write submission.csv.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # 1. Load and merge the data sources.
    train_data, test_data = load_and_merge_data()

    # 2. Feature engineering on the training frame.
    train_featured = create_features(train_data, is_train=True)

    # 3. Sliding windows.
    X, y = create_sequences(train_featured)

    # 4. Scaling and batching.
    train_loader, val_loader, scaler_X, scaler_y = create_dataloaders(X, y)

    # 5. Training with per-epoch validation.
    n_input_features = X.shape[2]
    model = train_and_evaluate(train_loader, val_loader, n_input_features, device)

    # 6. Test-set features and input windows.
    test_featured = create_features(test_data, is_train=False)
    X_test, predictions_info = prepare_test_data(test_featured, train_featured, scaler_X, scaler_y)

    # 7. Predict all series in one forward pass and undo the target scaling.
    print("生成预测...")
    X_test_tensor = torch.FloatTensor(X_test).to(device)
    model.eval()
    with torch.no_grad():
        predictions_scaled = model(X_test_tensor)

    predictions = scaler_y.inverse_transform(predictions_scaled.cpu().numpy())

    # 8. Map every test id to a non-negative prediction; ids beyond the
    #    forecast horizon reuse the last predicted value.
    submission_data = {}
    for i, info in enumerate(predictions_info):
        pred_15days = predictions[i]
        horizon = len(pred_15days)
        for j, test_id in enumerate(info['ids']):
            chosen = pred_15days[j] if j < horizon else pred_15days[-1]
            submission_data[test_id] = max(0, chosen)

    submission_df = pd.read_csv('test.csv')[['id']].copy()
    mapped_sales = submission_df['id'].map(submission_data)
    submission_df['sales'] = mapped_sales.fillna(0)
    submission_df.to_csv('submission.csv', index=False)

    print(f"完成！生成 {len(submission_df)} 行预测结果")
    print(f"预测统计: {submission_df['sales'].describe()}")

Using device: cuda
加载数据...
✓ transactions.csv 融合成功
✓ stores.csv 融合成功
✓ transactions.csv 融合成功
✓ stores.csv 融合成功
🎯 特征工程中...
✅ 特征工程完成，特征数: 30
⏰ 时序处理中...
✅ 时序处理完成，序列形状: X(2950992, 14, 27), y(2950992, 15)
🔄 批量化处理中...
✅ 批量化处理完成，训练批次: 73775, 验证批次: 18444
🔥 开始训练模型...
Epoch 1, Batch 500, Loss: 0.0243
Epoch 1, Batch 1000, Loss: 0.0489
Epoch 1, Batch 1500, Loss: 0.0317
Epoch 1, Batch 2000, Loss: 0.0821
Epoch 1, Batch 2500, Loss: 0.0959
Epoch 1, Batch 3000, Loss: 0.0243
Epoch 1, Batch 3500, Loss: 0.1579
Epoch 1, Batch 4000, Loss: 0.0285
Epoch 1, Batch 4500, Loss: 0.0516
Epoch 1, Batch 5000, Loss: 0.0256
Epoch 1, Batch 5500, Loss: 0.1020
Epoch 1, Batch 6000, Loss: 0.0273
Epoch 1, Batch 6500, Loss: 0.0351
Epoch 1, Batch 7000, Loss: 0.0129
Epoch 1, Batch 7500, Loss: 0.0070
Epoch 1, Batch 8000, Loss: 0.1554
Epoch 1, Batch 8500, Loss: 0.0453
Epoch 1, Batch 9000, Loss: 0.0583
Epoch 1, Batch 9500, Loss: 0.2252
Epoch 1, Batch 10000, Loss: 0.0559
Epoch 1, Batch 10500, Loss: 0.0514
Epoch 1, Batch 11000, Loss