In [33]:
# 读取数据
import glob
import pandas as pd
import json

# 匹配所有 train_plan_0*.csv
files = glob.glob("../data/train_plan_*.csv")
print("找到的文件:", files)

# 读入并合并
df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

print("总数据行数:", len(df))
print("df:\n", df.head())

#获取json字符串
plans_json = df['json']
print("plans_json:\n", plans_json.iloc[0])

#字符串转json
plans_dict = []
ExecutionTimes = []
idx = 0
for json_str in plans_json:
    idx += 1
    plan_dict = json.loads(json_str)
    plans_dict.append(plan_dict['Plan'])
    try:
        ExecutionTimes.append(plan_dict['Execution Time'])
    except:
        print(f"idx: {idx} 不存在Execution Time")
        print(plan_dict)
print("plans_dict:\n", plans_dict[0])



找到的文件: ['../data/train_plan_part17.csv', '../data/train_plan_part8.csv', '../data/train_plan_part6.csv', '../data/train_plan_part3.csv', '../data/train_plan_part19.csv', '../data/train_plan_part9.csv', '../data/train_plan_part11.csv', '../data/train_plan_part1.csv', '../data/train_plan_part0.csv', '../data/train_plan_part18.csv', '../data/train_plan_part10.csv', '../data/train_plan_part12.csv', '../data/train_plan_part16.csv', '../data/train_plan_part15.csv', '../data/train_plan_part2.csv', '../data/train_plan_part14.csv', '../data/train_plan_part5.csv', '../data/train_plan_part7.csv', '../data/train_plan_part13.csv', '../data/train_plan_part4.csv']
总数据行数: 100000
df:
       id                                               json
0  85000  {"Plan": {"Node Type": "Bitmap Heap Scan", "Pa...
1  85001  {"Plan": {"Node Type": "Gather", "Parallel Awa...
2  85002  {"Plan": {"Node Type": "Hash Join", "Parallel ...
3  85003  {"Plan": {"Node Type": "Gather", "Parallel Awa...
4  85004  {"Plan": {"No

In [34]:
import sys, os
sys.path.append(os.path.abspath(".."))  # 确保当前目录加入路径

# json -> PlanNode
from models.DataPreprocessor import PlanNode, DataPreprocessor
preprocessor = DataPreprocessor()
plans_tree = preprocessor.preprocess_all(plans_dict)

In [35]:
# PlanNode -> edges_list, extra_info_list
def tree_to_graph(root):
    edges_list, extra_info_list = [], []

    def dfs(node, parent_idx):
        idx = len(extra_info_list)
        extra_info_list.append(node.extra_info)
        edges_list.append((idx, idx))
        if parent_idx is not None:
            edges_list.append((parent_idx, idx))
        for ch in node.children:
            dfs(ch, idx)

    dfs(root, None)
    return edges_list, extra_info_list

edges_list, matrix_plans = [], []
for i in plans_tree:
    edges_matrix, extra_info_matrix = tree_to_graph(i)
    # if len(edges_matrix) == 0:
    #     print(i)
    #     assert False
    edges_list.append(edges_matrix)
    matrix_plans.append(extra_info_matrix)

type(matrix_plans[0][0])



dict

In [36]:
from models.Utils import StatisticsInfo

statisticsInfo = StatisticsInfo(matrix_plans, sample_threshold=100, sample_k=10).build()
statisticsInfo.pretty_print_report()



[Node Types] 13: ['Bitmap Heap Scan', 'Bitmap Index Scan', 'BitmapAnd', 'Gather', 'Gather Merge', 'Hash', 'Hash Join', 'Index Scan', 'Materialize', 'Merge Join', 'Nested Loop', 'Seq Scan', 'Sort']

[Global MUST keys] 10: ['Actual Loops', 'Actual Rows', 'Actual Startup Time', 'Actual Total Time', 'Node Type', 'Parallel Aware', 'Plan Rows', 'Plan Width', 'Startup Cost', 'Total Cost']

[Global ALL keys] 41: ['Actual Loops', 'Actual Rows', 'Actual Startup Time', 'Actual Total Time', 'Alias', 'Exact Heap Blocks', 'Filter', 'Hash Batches', 'Hash Buckets', 'Hash Cond', 'Index Cond', 'Index Name', 'Inner Unique', 'Join Filter', 'Join Type', 'Lossy Heap Blocks', 'Merge Cond', 'Node Type', 'Original Hash Batches', 'Original Hash Buckets', 'Parallel Aware', 'Parent Relationship', 'Peak Memory Usage', 'Plan Rows', 'Plan Width', 'Recheck Cond', 'Relation Name', 'Rows Removed by Filter', 'Rows Removed by Index Recheck', 'Rows Removed by Join Filter', 'Scan Direction', 'Single Copy', 'Sort Key', 'Sor

In [37]:
# NodeVectorizer
import re, math
from collections import defaultdict
import numpy as np
import torch
from typing import List

from models.Utils import process_join_cond_field, process_index_cond_field, load_column_stats

# -------- 词表 --------
class Vocab:
    def __init__(self): self.idx = {"<pad>":0, "<unk>":1}
    def add(self, s):
        if s not in self.idx: self.idx[s] = len(self.idx)
    def get(self, s): return self.idx.get(s, 1)
    @property
    def size(self): return len(self.idx)

NodeTypeVocab = ['Bitmap Heap Scan', 'Bitmap Index Scan', 'BitmapAnd', 'Gather', 'Gather Merge', 'Hash', 'Hash Join', 'Index Scan', 'Materialize', 'Merge Join', 'Nested Loop', 'Seq Scan', 'Sort']


def NodeVectorizer(matrix_plans : List[List[dict]]) -> List[List[List[List]]]:
    res = []
    for mp in matrix_plans:
        plan_matrix = []
        for node in mp:
            node_vector = [0] * 16
            
            # [0]-[12] node_type one-hot
            try:
                node_type_idx = NodeTypeVocab.index(node["Node Type"])
                node_vector[node_type_idx] = 1
            except:
                print(node)
                assert False
            
            # [7]-[8] plan_width & plan_rows
            node_vector[len(NodeTypeVocab)] = node["Plan Width"]
            node_vector[len(NodeTypeVocab)+1] = node["Plan Rows"]

            # [9]-[14] other_keys
            # TODO

            # [15]-[24] join
            # TODO

            # [25]-[34] scan
            # TODO

            # [35]-[44] sort
            # TODO

            # [45]-[54] group
            # TODO

            plan_matrix.append(node_vector)
        res.append(plan_matrix)
    return res

res = NodeVectorizer(matrix_plans)
res[0]


[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 1028173, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1028173, 0]]

In [38]:
import torch
import torch.nn as nn
from torch_geometric.data import Data, Batch

class NodeEncoder(nn.Module):
    """
    输入: data.x 形状 [N, F_in]
    输出: node_embs [N, d_node]
    """
    def __init__(self, in_dim: int, d_node: int):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Linear(in_dim, d_node),
            nn.ReLU(),
            nn.LayerNorm(d_node),
        )
    def forward(self, x):
        return self.proj(x)

# ---- 组合总模型 ----
class PlanCostModel(nn.Module):
    """
    NodeEncoder → GATTreeEncoder → PredictionHead
    """
    def __init__(self, nodecoder: nn.Module, treeencoder: nn.Module, predict_head: nn.Module):
        super().__init__()
        self.nodecoder = nodecoder
        self.treeencoder = treeencoder
        self.predict_head = predict_head

    def forward(self, data: Data | Batch):
        """
        期望 data 里至少有:
        - x: [N, F_in]
        - edge_index: [2, E]
        - batch: [N]  指示每个节点属于哪张图
        """
        x = self.nodecoder(data.x)                                   # [N, d_node]
        g = self.treeencoder(x, data.edge_index)         # [B, d_graph]
        y = self.predict_head(g)                                     # [B, out_dim]
        return y


from models.TreeEncoder import GATTreeEncoder
from models.PredictionHead import PredictionHead
# ---- 使用示例 ----
# 假设你的节点原始特征维度 F_in=64，节点隐层 d_node=128，图级维度 d_graph=256
F_in, d_node, d_graph = 16, 32, 64
nodecoder = NodeEncoder(F_in, d_node)
gatTreeEncoder = GATTreeEncoder(
    input_dim=d_node,      # 一定用实际特征维度
    hidden_dim=64,
    output_dim=d_graph,
    num_layers=3,
    num_heads=4,
    dropout=0.1,
    pooling="mean"
)
predict_head = PredictionHead(d_graph, out_dim=1)

model = PlanCostModel(nodecoder, gatTreeEncoder, predict_head)



In [39]:
print(type(ExecutionTimes))
print(type(res))
print(type(edges_list))


print(model)


<class 'list'>
<class 'list'>
<class 'list'>
PlanCostModel(
  (nodecoder): NodeEncoder(
    (proj): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): ReLU()
      (2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    )
  )
  (treeencoder): GATTreeEncoder(
    (convs): ModuleList(
      (0): GATConv(32, 64, heads=4)
      (1-2): 2 x GATConv(256, 64, heads=4)
    )
    (norms): ModuleList(
      (0-2): 3 x LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    )
    (output_proj): Linear(in_features=256, out_features=64, bias=True)
  )
  (predict_head): PredictionHead(
    (mlp): Sequential(
      (0): Linear(in_features=64, out_features=128, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.1, inplace=False)
      (3): Linear(in_features=128, out_features=64, bias=True)
      (4): ReLU()
      (5): Dropout(p=0.1, inplace=False)
      (6): Linear(in_features=64, out_features=1, bias=True)
    )
  )
)


In [40]:
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

def coerce_x_to_tensor(x_plan, in_dim: int):
    """
    x_plan: 很深的 list（最终行向量长度= in_dim）
    变成 [N, in_dim] 的 float32 Tensor
    """
    x = torch.tensor(x_plan, dtype=torch.float32)
    assert x.numel() % in_dim == 0, f"最后一维应为 {in_dim}，拿到形状 {tuple(x.shape)}"
    x = x.view(-1, in_dim)   # 拉平成 [N, in_dim]
    return x

def coerce_edge_index(ei_like):
    """
    ei_like: list/ndarray/tensor, 形状 [2,E] 或 [E,2]
    返回规范 [2,E] 的 long Tensor
    """
    ei = torch.as_tensor(ei_like, dtype=torch.long)
    if ei.ndim != 2:
        raise ValueError(f"edge_index 需要二维，拿到 {tuple(ei.shape)}")
    if ei.shape[0] != 2 and ei.shape[1] == 2:
        ei = ei.t().contiguous()
    elif ei.shape[0] != 2 and ei.shape[1] != 2:
        raise ValueError(f"edge_index 需为 [2,E] 或 [E,2]，拿到 {tuple(ei.shape)}")
    return ei.contiguous()

def build_dataset(res, edges_list, execution_times, in_dim=16, bidirectional=False):
    assert len(res) == len(edges_list) == len(execution_times), "长度必须一致"
    data_list = []
    for i, (x_plan, ei_like, y) in enumerate(zip(res, edges_list, execution_times)):
        x = coerce_x_to_tensor(x_plan, in_dim)      # [N, in_dim]
        edge_index = coerce_edge_index(ei_like)     # [2,E]
        N = x.size(0)

        # 边索引有效性检查
        if edge_index.numel() > 0:
            if int(edge_index.min()) < 0 or int(edge_index.max()) >= N:
                raise ValueError(f"plan[{i}] 的 edge_index 越界：节点数 N={N}，但 edge_index.max={int(edge_index.max())}")

        # 可选：做成双向图（若你的 edges 只有父->子）
        if bidirectional:
            edge_index = torch.cat([edge_index, edge_index.flip(0)], dim=1)

        y = torch.tensor([float(y)], dtype=torch.float32)  # 图级回归标签
        data_list.append(Data(x=x, edge_index=edge_index, y=y))
    return data_list


In [41]:
import math
import torch.nn.functional as F

# 假设你已有 model = PlanCostModel(nodecoder, treeencoder, predict_head)
in_dim = 16
dataset = build_dataset(res, edges_list, ExecutionTimes, in_dim=in_dim, bidirectional=False)

loader  = DataLoader(dataset, batch_size=32, shuffle=True)  # PyG 的 DataLoader 会自动做 Batch
device  = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model   = model.to(device)

optim = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

EPOCHS = 10
for ep in range(1, EPOCHS+1):
    model.train()
    mse_sum, n = 0.0, 0
    for batch in loader:
        batch = batch.to(device)                # .x / .edge_index / .batch / .y
        pred = model(batch).squeeze(-1)         # [B]
        y    = batch.y.view_as(pred)            # [B]
        loss = F.mse_loss(pred, y)

        optim.zero_grad()
        loss.backward()
        optim.step()

        mse_sum += loss.item() * y.numel()
        n += y.numel()
    print(f"[epoch {ep}] RMSE={math.sqrt(mse_sum/max(1,n)):.4f}")

# 简评估（用训练集演示）
model.eval()
with torch.no_grad():
    preds, gts = [], []
    for batch in loader:
        batch = batch.to(device)
        p = model(batch).squeeze(-1)
        preds.append(p.cpu()); gts.append(batch.y.view_as(p).cpu())
    preds = torch.cat(preds); gts = torch.cat(gts)
    rmse = torch.sqrt(F.mse_loss(preds, gts)).item()
    mae  = torch.mean(torch.abs(preds - gts)).item()
    print(f"[eval] RMSE={rmse:.4f}  MAE={mae:.4f}")


RuntimeError: shape '[]' is invalid for input of size 32