In [1]:
# Load the raw dataset; the 'json' column holds one JSON-encoded record per row.
import pandas as pd
import json
df = pd.read_csv('../data/demo_plan_01.csv')
print("df:\n", df.head())

# Extract the column of JSON strings.
plans_json = df['json']
print("plans_json:\n", plans_json.iloc[0])

# Decode every JSON string once, then split each decoded record into its
# 'Plan' subtree and its 'Execution Time' value (kept in the same row order).
parsed_records = [json.loads(raw_json) for raw_json in plans_json]
plans_dict = [record['Plan'] for record in parsed_records]
ExecutionTimes = [record['Execution Time'] for record in parsed_records]
print("plans_dict:\n", plans_dict[0])



df:
    id                                               json
0   0  {"Plan": {"Node Type": "Gather", "Parallel Awa...
1   1  {"Plan": {"Node Type": "Seq Scan", "Parallel A...
2   2  {"Plan": {"Node Type": "Seq Scan", "Parallel A...
3   3  {"Plan": {"Node Type": "Gather", "Parallel Awa...
4   4  {"Plan": {"Node Type": "Bitmap Heap Scan", "Pa...
plans_json:
 {"Plan": {"Node Type": "Gather", "Parallel Aware": false, "Startup Cost": 23540.58, "Total Cost": 154548.95, "Plan Rows": 567655, "Plan Width": 119, "Actual Startup Time": 386.847, "Actual Total Time": 646.972, "Actual Rows": 283812, "Actual Loops": 1, "Workers Planned": 2, "Workers Launched": 2, "Single Copy": false, "Plans": [{"Node Type": "Hash Join", "Parent Relationship": "Outer", "Parallel Aware": true, "Join Type": "Inner", "Startup Cost": 22540.58, "Total Cost": 96783.45, "Plan Rows": 236523, "Plan Width": 119, "Actual Startup Time": 369.985, "Actual Total Time": 518.487, "Actual Rows": 94604, "Actual Loops": 3, "Inner Uniqu

In [2]:
import sys
import os

# Make the parent directory importable so the `models` imports below resolve.
# os.path.dirname('.') is '', so this path evaluates to '..'.
# Guarded so that re-running the cell does not keep appending duplicates.
parent_dir = os.path.join(os.path.dirname('.'), '..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Preprocess the raw plan dicts into plan trees.
from models.DataPreprocessor import PlanNode, DataPreprocessor
preprocessor = DataPreprocessor()
plans_tree = preprocessor.preprocess_all(plans_dict)

# Preview: show at most the first five trees. The bound avoids an IndexError
# when the dataset holds fewer than five plans.
for i in range(min(5, len(plans_tree))):
    print(plans_tree[i])
print("--------------------------------")
# Pretty-print the first tree only when there is one.
if len(plans_tree) > 0:
    preprocessor.print_tree(plans_tree[0])


PlanNode(node_type='Gather', children=[PlanNode(node_type='Hash Join', children=[PlanNode(node_type='Seq Scan', children=[], extra_info={'Node Type': 'Seq Scan', 'Parent Relationship': 'Outer', 'Parallel Aware': True, 'Relation Name': 'title', 'Alias': 't', 'Startup Cost': 0.0, 'Total Cost': 49166.46, 'Plan Rows': 649574, 'Plan Width': 94, 'Actual Startup Time': 0.366, 'Actual Total Time': 147.047, 'Actual Rows': 514421, 'Actual Loops': 3, 'Filter': '(kind_id = 7)', 'Rows Removed by Filter': 328349, 'Workers': []}, node_vector=None), PlanNode(node_type='Hash', children=[PlanNode(node_type='Seq Scan', children=[], extra_info={'Node Type': 'Seq Scan', 'Parent Relationship': 'Outer', 'Parallel Aware': True, 'Relation Name': 'movie_info_idx', 'Alias': 'mi_idx', 'Startup Cost': 0.0, 'Total Cost': 15122.68, 'Plan Rows': 383592, 'Plan Width': 25, 'Actual Startup Time': 0.28, 'Actual Total Time': 54.382, 'Actual Rows': 306703, 'Actual Loops': 3, 'Filter': '(info_type_id > 99)', 'Rows Removed b

In [3]:
# NodeEncoder: encode the nodes of every plan tree.
from models.NodeEncoder import NodeEncoder
nodeEncoder = NodeEncoder()
from typing import Any, Dict, Iterable, List, Optional

# One entry per plan: collect that plan's nodes with method="dfs",
# then encode them in a single call.
nodeEncodedVectorsBox = [
    nodeEncoder.encode_nodes(nodeEncoder.collect_nodes(tree, method="dfs"))
    for tree in plans_tree
]

print("nodeEncodedVectorsBox:", len(nodeEncodedVectorsBox))
# Number of encoded vectors for the first plan, followed by its tree.
print(len(nodeEncodedVectorsBox[0]))
preprocessor.print_tree(plans_tree[0])

# Sanity check: report every position whose encoded vector is None.
# The inner loop stays index-based because only len() and indexing are
# known to be supported by the value encode_nodes returns.
for i, plan_vectors in enumerate(nodeEncodedVectorsBox):
    for j in range(len(plan_vectors)):
        if plan_vectors[j] is None:
            print(f"nodeEncodedVectorsBox[{i}][{j}] is None")




nodeEncodedVectorsBox: 5000
5
└── Gather (Total Cost: 154548.95, Startup Cost: 23540.58, Plan Rows: 567655, Plan Width: 119, Actual Total Time: 646.97, Actual Rows: 283812), node_vector_shape: torch.Size([64])
    └── Hash Join (Total Cost: 96783.45, Startup Cost: 22540.58, Plan Rows: 236523, Plan Width: 119, Actual Total Time: 518.49, Actual Rows: 94604, Join Type: Inner), node_vector_shape: torch.Size([64])
        ├── Seq Scan (Total Cost: 49166.46, Startup Cost: 0.00, Plan Rows: 649574, Plan Width: 94, Actual Total Time: 147.05, Actual Rows: 514421, Relation Name: title, Alias: t), node_vector_shape: torch.Size([64])
        └── Hash (Total Cost: 15122.68, Startup Cost: 15122.68, Plan Rows: 383592, Plan Width: 25, Actual Total Time: 103.55, Actual Rows: 306703), node_vector_shape: torch.Size([64])
            └── Seq Scan (Total Cost: 15122.68, Startup Cost: 0.00, Plan Rows: 383592, Plan Width: 25, Actual Total Time: 54.38, Actual Rows: 306703, Relation Name: movie_info_idx, Alias:

In [4]:
from models.TreeEncoder import GATTreeEncoder, TreeToGraphConverter
import torch

treeToGraphConverter = TreeToGraphConverter()
gatTreeEncoder = GATTreeEncoder(
    input_dim=64,      # must match the actual per-node feature dimension
    hidden_dim=64,
    output_dim=64,
    num_layers=3,
    num_heads=4,
    dropout=0.1,
    pooling="mean"
)

# Encode every plan tree into one embedding.
# This cell only runs forward passes, so autograd is disabled: otherwise each
# stored embedding keeps its whole computation graph alive for every plan.
# NOTE(review): dropout=0.1 is configured; if GATTreeEncoder is a torch
# nn.Module, consider calling gatTreeEncoder.eval() for deterministic
# embeddings -- confirm against models/TreeEncoder.
planEmbeddingBox = []
with torch.no_grad():
    for plan_tree in plans_tree:
        edge_index, x = treeToGraphConverter.tree_to_graph(plan_tree)
        # Stack the per-node features into a single float32 tensor.
        x = torch.stack(
            [torch.as_tensor(f, dtype=torch.float32) for f in x],
            dim=0
        )
        planEmbeddingBox.append(gatTreeEncoder(x, edge_index))

# Shapes of the LAST plan processed (x / edge_index are the loop leftovers).
print("x.shape, edge_index.shape:", x.shape, edge_index.shape)
# Preview at most five embedding shapes; bounded so that fewer than five
# plans does not raise IndexError.
for i in range(min(5, len(planEmbeddingBox))):
    print(planEmbeddingBox[i].shape)


x.shape, edge_index.shape: torch.Size([4, 64]) torch.Size([2, 6])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])


In [5]:

# PredictionHead: produce one prediction per plan embedding.
from models.PredictionHead import PredictionHead
predictionHead = PredictionHead()

predictionBox = [
    predictionHead.predict(planEmbedding) for planEmbedding in planEmbeddingBox
]

print("predictionBox:", len(predictionBox))
# Show up to five predictions next to the recorded 'Execution Time' values.
# The bound avoids an IndexError when fewer than five predictions exist.
for i in range(min(5, len(predictionBox))):
    print(predictionBox[i], ExecutionTimes[i])



predictionBox: 5000
0.9999457877129316 654.241
4.700297597795725 349.797
-0.3459443412721157 1699.24
1.3503599683754146 345.056
1.109332099556923 90.666
