# Workshop 5min 数据集探索
- 分布 / 缺失 / 对齐检查
- 请先生成 `workshop_5min_clean_all.csv` 再运行

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide presentation settings: seaborn grid theme and wide-frame display.
sns.set_theme(style="whitegrid")
pd.set_option("display.max_columns", 200)

# The cleaned 5-minute CSV is expected one directory above the working directory;
# fail early with a clear message if it has not been generated yet.
DATA_PATH = Path("../workshop_5min_clean_all.csv")
if not DATA_PATH.exists():
    raise FileNotFoundError(f"CSV not found: {DATA_PATH}")

# Parse the timestamp column on read, then order rows chronologically
# with a fresh 0..n-1 index.
df = (
    pd.read_csv(DATA_PATH, parse_dates=["ts_5min"])
    .sort_values("ts_5min")
    .reset_index(drop=True)
)
df.head()

In [None]:
# Basic overview: frame dimensions, per-column dtypes, and the covered time span.
print("shape:", df.shape)
display(df.dtypes)
ts_col = df["ts_5min"]
print("ts_5min min/max:", ts_col.min(), ts_col.max())

In [None]:
# Missing-value summary: absolute count and percentage of NaN per column,
# ordered from the most to the least incomplete column.
na_counts = df.isna().sum()
na_pct = na_counts.div(len(df)).mul(100).round(2)
na_report = pd.DataFrame(
    {"na_count": na_counts, "na_pct": na_pct}
).sort_values(by="na_pct", ascending=False)
display(na_report)

# Horizontal bar chart of the missing percentage for the 30 worst columns.
top_cols = na_report.head(30).index
fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(x=na_report.loc[top_cols, "na_pct"], y=top_cols, orient="h", ax=ax)
ax.set_title("Missing percentage (top 30)")
ax.set_xlabel("% of missing")
fig.tight_layout()
plt.show()

In [None]:
# Temporal alignment checks: duplicated timestamps, gaps on the 5-minute grid,
# and number of rows per calendar day.
t = df["ts_5min"]
dup_count = t.duplicated().sum()
print("Duplicated ts_5min:", dup_count)

# Build the full 5-minute grid between the first and last timestamp and list
# every grid point that has no row in the data.
full = pd.date_range(start=t.min(), end=t.max(), freq="5min")
missing_ts = full.difference(t)
print("Missing timestamps count:", len(missing_ts))
if len(missing_ts) > 0:
    # Show only the first 20 gaps to keep the output short.
    display(pd.Series(missing_ts[:20], name="missing_ts_sample"))

# Rows per calendar day; a complete day on a 5-minute grid has 288 rows.
# Group on a derived key rather than adding a scratch "date" column to `df`:
# the shared frame is never mutated, so a failure part-way through this cell
# cannot leave a stray column behind for the later numeric/correlation cells.
day_counts = (
    df.groupby(t.dt.date)
    .size()
    .rename("rows_per_day")
    .rename_axis("date")
)
display(day_counts.describe())
plt.figure(figsize=(10, 3))
sns.barplot(x=day_counts.index.astype(str), y=day_counts.values, color="steelblue")
plt.xticks(rotation=90)
plt.axhline(288, color="red", linestyle="--", label="expected 288")
plt.legend()
plt.title("Rows per day")
plt.tight_layout()
plt.show()

In [None]:
# Distributions of numeric columns. The timestamp column is skipped, and the
# calendar flags "dayofweek" / "is_weekend" are removed from the focus set
# (any other numeric column is kept).
num_cols = [
    name
    for name, dtype in df.dtypes.items()
    if name != "ts_5min" and pd.api.types.is_numeric_dtype(dtype)
]
focus_cols = [name for name in num_cols if name not in {"dayofweek", "is_weekend"}]

display(df[focus_cols].describe())

# Histogram + KDE for at most the first six focus columns, laid out on two rows.
n_show = min(6, len(focus_cols))
n_grid_cols = (n_show + 1) // 2
fig = plt.figure(figsize=(14, 8))
for pos, col in enumerate(focus_cols[:n_show], start=1):
    ax = fig.add_subplot(2, n_grid_cols, pos)
    sns.histplot(df[col].dropna(), bins=50, kde=True, ax=ax)
    ax.set_title(col)
fig.tight_layout()
plt.show()

In [None]:
# Correlation heatmap over the focus columns; needs at least two columns
# for a pairwise correlation to be meaningful.
if len(focus_cols) > 1:
    corr = df[focus_cols].corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    # Diverging palette centred on zero so the sign of the correlation is visible.
    sns.heatmap(corr, cmap="coolwarm", center=0, annot=False, ax=ax)
    ax.set_title("Correlation (numeric features)")
    fig.tight_layout()
    plt.show()

In [1]:
# ==== 1. Put the project root on sys.path so the project's own packages import ====
from pathlib import Path
import sys

# Assumes the notebook is run from project_root/notebooks/, so the parent of the
# working directory is the project root.
ROOT_DIR = Path("..").resolve()
_root_entry = str(ROOT_DIR)
if _root_entry not in sys.path:
    # Append only once so re-running the cell does not pile up duplicates.
    sys.path.append(_root_entry)

print("Project root:", ROOT_DIR)

Project root: D:\Python_Graduate_Project\workshop4_energy_prediction


In [2]:
import pandas as pd
from config.dataset_config import TRAIN_CSV

# Load the training split and list its column names.
# NOTE(review): this rebinds `df`, which the exploration cells earlier in the
# notebook bound to the full cleaned dataset read with parse_dates=["ts_5min"];
# here the timestamp column is read without date parsing.
df = pd.read_csv(TRAIN_CSV)
print(list(df.columns))


['ts_5min', 'bf_power', 'cold_power', 'cold_freq', 'exh_power', 'exh_freq', 'exh_voltage_v', 'exh_moto_temp', 'exh_ambient_temp', 'env_temp', 'env_hum', 'env_press', 'main_power', 'dayofweek', 'is_weekend', 'tod_sin', 'tod_cos']


In [5]:
import config.model_config as mc
from importlib import reload

# Force a reload first, to be safe, so the values printed below come from the
# current contents of the module rather than a previously imported copy.
mc = reload(mc)

# Echo the feature configuration and the file the module was loaded from.
for _label, _value in (
    ("NODE_LOCAL_FEATURES =", mc.NODE_LOCAL_FEATURES),
    ("TIME_FEATURES       =", mc.TIME_FEATURES),
    ("model_config.py from:", mc.__file__),
):
    print(_label, _value)


NODE_LOCAL_FEATURES = {'BF': ['bf_power'], 'Cold': ['cold_power', 'cold_freq'], 'Exhaust': ['exh_voltage_v', 'exh_moto_temp'], 'Env': ['env_temp', 'env_hum', 'env_press'], 'Main': ['main_power']}
TIME_FEATURES       = ['dayofweek', 'is_weekend', 'tod_sin', 'tod_cos']
model_config.py from: D:\Python_Graduate_Project\workshop4_energy_prediction\config\model_config.py


In [6]:
from pathlib import Path
import sys

# Same project-root bootstrap as earlier in the notebook, repeated so this cell
# can run on its own: resolve the parent directory and register it once.
ROOT_DIR = Path("..").resolve()
_root_entry = str(ROOT_DIR)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

import torch
from torch.utils.data import DataLoader

from config.dataset_config import TRAIN_CSV, VAL_CSV
from data.graph_dataset import GraphSequenceDataset, fit_scaler_from_csv

# Smoke test of the data pipeline: fit the scaler on the training split only,
# build the train/val datasets with that scaler, and inspect one batch.

# Window sizes in 5-minute steps; 288 steps correspond to one full day.
T_IN = 288
T_OUT = 288
BATCH_SIZE = 16
SEED = 42  # fixes the shuffle order so the sample batch printed below is reproducible

print("Train CSV:", TRAIN_CSV)
print("Val   CSV:", VAL_CSV)

# The scaler statistics come from the training CSV and are reused for validation.
scaler = fit_scaler_from_csv(str(TRAIN_CSV))
print("Scaler mean keys:", list(scaler["mean"].index)[:10])

# Both splits share the same windowing and the same pre-fitted scaler;
# keeping the arguments in one place prevents the two calls from drifting apart.
_dataset_kwargs = dict(
    t_in=T_IN,
    t_out=T_OUT,
    feature_scaler=scaler,
    fit_scaler=False,
)
train_ds = GraphSequenceDataset(csv_path=str(TRAIN_CSV), **_dataset_kwargs)
val_ds = GraphSequenceDataset(csv_path=str(VAL_CSV), **_dataset_kwargs)

print("Train samples:", len(train_ds))
print("Val   samples:", len(val_ds))

# Seed the sampler through a dedicated generator: with shuffle=True and no
# generator, every run would draw a different first batch.
loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
)
X, y = next(iter(loader))
print("X shape:", X.shape)  # expected (16, 288, 5, F_max)
print("y shape:", y.shape)  # expected (16, 288)

# Enforce the expected leading dimensions instead of only stating them in comments.
# Presumably X's second axis is the input window (t_in) — confirm against
# GraphSequenceDataset if T_IN and T_OUT ever differ.
_expected_batch = min(BATCH_SIZE, len(train_ds))
assert X.shape[:2] == (_expected_batch, T_IN), f"unexpected X shape: {tuple(X.shape)}"
assert y.shape == (_expected_batch, T_OUT), f"unexpected y shape: {tuple(y.shape)}"

print("Sample[0] node0 first 3 steps:\n", X[0, :3, 0, :])


Train CSV: D:\Python_Graduate_Project\workshop4_energy_prediction\workshop_5min_train.csv
Val   CSV: D:\Python_Graduate_Project\workshop4_energy_prediction\workshop_5min_val.csv
Scaler mean keys: ['bf_power', 'cold_freq', 'cold_power', 'env_hum', 'env_press', 'env_temp', 'exh_moto_temp', 'exh_voltage_v', 'main_power', 'dayofweek']
Train samples: 3457
Val   samples: 289
X shape: torch.Size([16, 288, 5, 7])
y shape: torch.Size([16, 288])
Sample[0] node0 first 3 steps:
 tensor([[ 0.5947,  0.5000, -0.6325, -0.6530, -1.2544,  0.0000,  0.0000],
        [ 0.5940,  0.5000, -0.6325, -0.6802, -1.2399,  0.0000,  0.0000],
        [ 0.5700,  0.5000, -0.6325, -0.7071, -1.2247,  0.0000,  0.0000]])


In [7]:
# Check whether PyTorch reports a usable CUDA device; the bare expression is
# left as the last line so its value is shown as the cell output.
# Relies on `torch` having been imported by an earlier cell.
torch.cuda.is_available()

True