In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
import os

DATA_PATH = ".\data\opsd_building.csv"

In [2]:
def load_single_file(path):
    """加载并解析单个数据文件"""
    df = pd.read_csv(
        path,
        usecols=[1,2,3,4],  # 直接读取所有4列
        header=0,           # 第一行为表头，自动跳过
        names=['load1', 'load2', 'load3', 'price'],
        parse_dates=False   # 无时间列时关闭自动解析
    )
    # 生成时间序列（假设数据按时间顺序排列）
    start_time = pd.to_datetime('2020-01-01 00:00:00')  # 根据实际情况修改
    df['timestamp'] = start_time + pd.to_timedelta(df.index*15, 'm')
    df.set_index('timestamp', inplace=True)
    # 验证数据完整性
    assert len(df)%96 == 0, "数据天数不完整"
    print(f"成功加载{len(df)/96:.1f}天的数据")
    return df

raw_df = load_single_file(DATA_PATH)
display(raw_df.head())

成功加载661.0天的数据


Unnamed: 0_level_0,load1,load2,load3,price
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-01 00:00:00,0.079,0.187,0.039,29.93
2020-01-01 00:15:00,0.078,0.0,0.031,29.93
2020-01-01 00:30:00,0.08,0.0,0.075,29.62
2020-01-01 00:45:00,0.11,0.0,0.176,29.62
2020-01-01 01:00:00,0.107,0.135,0.541,29.62


In [5]:
def generate_daily_samples(df, lookback_days=3):
    """
    生成每日样本数据（含历史窗口）
    参数：
    lookback_days : 历史天数 (0表示仅当天数据)
    返回：
    形状为 (num_days, 96, features) 的样本数组
    """
    day_length = 96
    total_days = len(df) // day_length
    print(f"数据总天数: {total_days}天")
    samples = []
    for day in range(lookback_days, total_days-1):  # -1保留最后一天作为目标
        # 获取历史窗口数据
        start_idx = (day - lookback_days) * day_length
        end_idx = (day + 1) * day_length  # +1包含当日
        # 提取特征矩阵 [时间步 × 特征]
        sample_data = df.iloc[start_idx:end_idx][['load1', 'load2', 'load3','price']]
        print(sample_data)
        # 验证样本完整性
        assert len(sample_data) == (lookback_days+1)*day_length, \
            "样本长度错误：第{}天".format(day)
        samples.append(sample_data.values.reshape(-1, day_length, 4))
    
    return np.vstack(samples)
# 生成3天历史窗口样本
daily_samples = generate_daily_samples(raw_df, lookback_days=3)
print("样本形状:", daily_samples.shape)  # (num_samples, 历史天数+1, 96时间步, 9特征)


数据总天数: 661天
                     load1  load2  load3  price
timestamp                                      
2020-01-01 00:00:00  0.079  0.187  0.039  29.93
2020-01-01 00:15:00  0.078  0.000  0.031  29.93
2020-01-01 00:30:00  0.080  0.000  0.075  29.62
2020-01-01 00:45:00  0.110  0.000  0.176  29.62
2020-01-01 01:00:00  0.107  0.135  0.541  29.62
...                    ...    ...    ...    ...
2020-01-04 22:45:00  0.103  0.000  0.022  21.09
2020-01-04 23:00:00  0.098  0.000  0.036  21.09
2020-01-04 23:15:00  0.070  0.000  0.041  21.09
2020-01-04 23:30:00  0.130  0.000  0.278  19.90
2020-01-04 23:45:00  0.099  0.000  0.378  19.90

[384 rows x 4 columns]
                     load1  load2  load3  price
timestamp                                      
2020-01-02 00:00:00  0.072   0.03  0.050  27.93
2020-01-02 00:15:00  0.090   0.01  0.059  27.93
2020-01-02 00:30:00  0.088   0.01  0.071  28.42
2020-01-02 00:45:00  0.112   0.00  0.042  28.42
2020-01-02 01:00:00  0.105   0.00  0.050  28.42
... 