In [330]:
import pandas as pd
import torch
import numpy as np
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import Data
import pickle

In [331]:
def saveData(data, path):
    """Serialize ``data`` to the file at ``path`` using pickle.

    Args:
        data: Any picklable Python object.
        path: Destination file path; it is opened in ``"wb"`` mode, so an
            existing file is overwritten.

    Raises:
        OSError: If ``path`` cannot be opened for writing.
        pickle.PicklingError: If ``data`` cannot be pickled.
    """
    # The ``with`` statement closes the file on exit; no explicit close() needed.
    with open(path, "wb") as file:
        pickle.dump(data, file)


def loadData(path):
    """Load and return the object pickled in the file at ``path``.

    Args:
        path: Path of a pickle file (opened in ``"rb"`` mode).

    Returns:
        The unpickled Python object.

    Raises:
        OSError: If ``path`` cannot be opened for reading.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    # The ``with`` statement closes the file on exit; no explicit close() needed.
    with open(path, "rb") as file:
        return pickle.load(file)

# 读取dataframe并且处理一下

读节点

In [332]:
# Load the training node table, discarding the columns listed in drop_c
# (described by the original author as all-zero columns).
drop_c = ["F_23", "F_27"]
node_df_train = pd.read_csv("data/raw/train_90.csv").drop(columns=drop_c)
# Distinct geohash ids in first-appearance order; a node's index is its
# position in this list.
node_ids = node_df_train["geohash_id"].drop_duplicates().tolist()

In [333]:
# Spot check: position of one specific geohash id inside node_ids.
node_ids.index("5324516fr")

1

In [334]:
# Standardize the node table: every column that is neither an id/date column
# nor a label column is treated as a feature.
id_and_date_columns = ["geohash_id", "date_id"]
label_columns = ["active_index", "consume_index"]
feature_columns = node_df_train.drop(
    columns=id_and_date_columns + label_columns
).columns

# Fit one scaler for the features and one for the labels, then write the
# standardized values back into the same columns.
feature_scaler = StandardScaler().fit(node_df_train[feature_columns])
label_scaler = StandardScaler().fit(node_df_train[label_columns])
for _scaler, _columns in (
    (feature_scaler, feature_columns),
    (label_scaler, label_columns),
):
    node_df_train.loc[:, _columns] = _scaler.transform(node_df_train[_columns])

读边

In [335]:
# Load the training edge table.
edge_df_train = pd.read_csv("data/raw/edge_90.csv")

In [336]:
# Standardize the two edge feature columns in place; the fitted scaler is kept
# so the same transformation can be applied to other edge tables.
edge_feature_columns = ["F_1", "F_2"]
edge_feature_scaler = StandardScaler().fit(edge_df_train[edge_feature_columns])
edge_df_train.loc[:, edge_feature_columns] = edge_feature_scaler.transform(
    edge_df_train[edge_feature_columns]
)

# 建图

每seq_len天的图连接起来，构成一个大图

连接方法：第t天的图和第t+1天的图之间，相同的节点加一条边

In [337]:
def buildGraph(node_df, edge_df, is_test_dataset=False):
    """Build one graph per ``date_id`` from the node and edge tables.

    Reads the notebook-level globals ``node_ids``, ``feature_columns``,
    ``label_columns`` and ``edge_feature_columns``.

    Args:
        node_df: Node table with a ``date_id`` column and the feature columns
            (plus the label columns unless ``is_test_dataset`` is True).
        edge_df: Edge table with ``date_id``, ``geohash6_point1``,
            ``geohash6_point2`` and the edge feature columns.
        is_test_dataset: When True, labels are not read and not returned.

    Returns:
        A list with one entry per ``date_id`` group, in ``groupby`` order.
        Each entry is ``[x, edge_index, edge_attr]`` for test data and
        ``[x, edge_index, edge_attr, y]`` otherwise, where ``edge_index`` has
        shape ``(2, num_edges)`` and ``edge_attr`` has shape
        ``(num_edges, len(edge_feature_columns))`` -- also when a day has no
        usable edges.
    """
    # Map each id to its first position in node_ids (same answer as
    # list.index, but O(1) per lookup instead of a linear scan per edge).
    node_index = {}
    for position, node_id in enumerate(node_ids):
        node_index.setdefault(node_id, position)

    graphs = []

    # One graph per date_id.
    for date_id, day_nodes in node_df.groupby("date_id"):
        # Nodes.
        # NOTE(review): x keeps the row order of node_df, while edge_index
        # refers to positions in node_ids. This presumes every day's rows are
        # ordered like node_ids -- confirm against the raw data.
        x = torch.tensor(day_nodes[feature_columns].values, dtype=torch.float)
        if not is_test_dataset:
            y = torch.tensor(day_nodes[label_columns].values, dtype=torch.float)

        # Edges of this day; an endpoint missing from node_ids maps to NaN.
        edge_day_df = edge_df[edge_df["date_id"] == date_id]
        sources = edge_day_df["geohash6_point1"].map(node_index)
        targets = edge_day_df["geohash6_point2"].map(node_index)
        # The edge table may list edges for unknown nodes; keep only edges
        # whose two endpoints are both known.
        known = sources.notna() & targets.notna()

        edge_index = torch.tensor(
            np.stack(
                [
                    sources[known].to_numpy(dtype=np.int64),
                    targets[known].to_numpy(dtype=np.int64),
                ]
            ),
            dtype=torch.long,
        )
        edge_attr = torch.tensor(
            edge_day_df.loc[known, edge_feature_columns].to_numpy(dtype=np.float32),
            dtype=torch.float,
        )

        # Assemble the per-day graph as a plain list.
        if is_test_dataset:
            graphs.append([x, edge_index, edge_attr])
        else:
            graphs.append([x, edge_index, edge_attr, y])

        print(date_id, "finished")

    return graphs

In [338]:
# graphs = buildGraph(node_df_train, edge_df_train)

In [339]:
# Change this path to wherever the cached per-day graph list (raw_graphs.pkl)
# is stored locally. Presumably this file holds the output of the
# commented-out buildGraph(node_df_train, edge_df_train) call -- confirm.
graphs = loadData("data/concat_graph/raw_graphs.pkl")

In [340]:
# Sanity check: number of per-day graphs and the number of components stored
# in the first and last graph.
len(graphs), len(graphs[0]), len(graphs[-1])

(90, 4, 4)

# 建图第二步

每seq_len天的图连接起来，构成一个大图

In [359]:
def concatGraph(graphs, seq_len=7, stride=3, is_test_dataset=False):
    """Join sliding windows of ``seq_len`` per-day graphs into large graphs.

    Inside a window the node blocks of consecutive days are stacked, each
    day's edges are shifted by that day's block offset, and every node is
    additionally linked to the node at the same position in the next day.

    The per-day node count is taken from the node tensors themselves rather
    than from the global ``node_ids`` list, so the result does not depend on
    what ``node_ids`` is currently bound to in the notebook.

    Args:
        graphs: Sequence of per-day graphs. Each item is indexable with
            ``item[0]`` = node features ``x``, ``item[1]`` = ``edge_index`` and,
            unless ``is_test_dataset`` is True, ``item[3]`` = labels ``y``.
        seq_len: Number of consecutive days per window (>= 1).
        stride: Step between the start days of consecutive windows.
        is_test_dataset: When True, no ``y`` is attached to the result.

    Returns:
        A list of ``Data`` objects, one per window, holding ``x``,
        ``edge_index`` and (for non-test data) ``y``. Edge attributes are
        dropped on purpose (the original author notes SAGE does not use them).

    Raises:
        ValueError: If ``seq_len`` is smaller than 1, or if the days of a
            window do not all have the same number of nodes.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")

    concat_graphs = []
    for start in range(0, len(graphs) - seq_len + 1, stride):
        print(start, start + seq_len)
        window = graphs[start : start + seq_len]

        # Block size used for all index offsets inside this window.
        num_nodes = window[0][0].shape[0]
        if any(day[0].shape[0] != num_nodes for day in window):
            raise ValueError(
                "all graphs in a window must have the same number of nodes"
            )

        x = torch.cat([day[0] for day in window], dim=0)
        if not is_test_dataset:
            y = torch.cat([day[3] for day in window], dim=0)

        # Original per-day edges, shifted into the node block of their day.
        spatial_edges = torch.cat(
            [day[1] + offset * num_nodes for offset, day in enumerate(window)],
            dim=1,
        )
        # New edges: each node -> the same node one day later.
        sources = torch.arange(num_nodes * (seq_len - 1), dtype=torch.long)
        temporal_edges = torch.stack([sources, sources + num_nodes], dim=0)

        edge_index = torch.cat([spatial_edges, temporal_edges], dim=1)

        if is_test_dataset:
            concat_graphs.append(Data(x=x, edge_index=edge_index))
        else:
            concat_graphs.append(Data(x=x, y=y, edge_index=edge_index))

    return concat_graphs

In [342]:
# Number of consecutive per-day graphs joined into one concatenated graph.
seq_len = 7

In [343]:
# Build the training windows with stride 1 (one window per possible start
# day), then show how many windows were produced.
concat_graphs = concatGraph(graphs, seq_len=seq_len, stride=1)
len(concat_graphs)

0 7
1 8
2 9
3 10
4 11
5 12
6 13
7 14
8 15
9 16
10 17
11 18
12 19
13 20
14 21
15 22
16 23
17 24
18 25
19 26
20 27
21 28
22 29
23 30
24 31
25 32
26 33
27 34
28 35
29 36
30 37
31 38
32 39
33 40
34 41
35 42
36 43
37 44
38 45
39 46
40 47
41 48
42 49
43 50
44 51
45 52
46 53
47 54
48 55
49 56
50 57
51 58
52 59
53 60
54 61
55 62
56 63
57 64
58 65
59 66
60 67
61 68
62 69
63 70
64 71
65 72
66 73
67 74
68 75
69 76
70 77
71 78
72 79
73 80
74 81
75 82
76 83
77 84
78 85
79 86
80 87
81 88
82 89
83 90


84

In [344]:
# Display the repr of every concatenated training graph (tensor shapes only).
concat_graphs

[Data(x=[7980, 33], edge_index=[2, 85761], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=[2, 85360], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=[2, 84850], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=[2, 84854], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=[2, 85480], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=[2, 85425], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=[2, 85565], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=[2, 85727], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=[2, 86086], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=[2, 86513], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=[2, 87617], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=[2, 87942], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=[2, 87962], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=[2, 88042], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=[2, 88100], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=[2, 87772], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=[2, 87398], y=[7980, 2]),
 Data(x=[7980, 33], edge_index=

In [345]:
# Persist the concatenated training graphs.
saveData(concat_graphs, "data/concat_graph/train_graphs_data.pkl")

In [346]:
# Save the node-id list tiled seq_len times, so that the saved list has one
# entry per node row of a concatenated graph (presumably position i is the
# geohash id of row i -- confirm against the consumer of this file).
saveData(node_ids * seq_len, "data/concat_graph/node_ids.pkl")

# 生成测试集图

节点

In [347]:
# Load the test node table and drop the same columns as for the training set.
node_df_test = pd.read_csv("data/raw/A榜/node_test_4_A.csv")
node_df_test.drop(drop_c, axis=1, inplace=True)
# node_ids.pkl was written as `node_ids * seq_len`, i.e. the id list tiled
# seq_len times. De-duplicate (keeping first-appearance order) so node_ids is
# again the per-day id list; otherwise len(node_ids) is seq_len times too
# large for any later code that uses it as the per-day node count.
node_ids = list(dict.fromkeys(loadData("data/concat_graph/node_ids.pkl")))

In [348]:
# Some geohash ids in the test set are malformed (they look like ids that were
# re-interpreted as numbers); map them back to the intended ids.
special_node = {
    "18377236": "018377236",
    "7.45E+07": "7449766e1",
    "9.80E+10": "9797336e4",
}
# Only the id column is rewritten, so a coincidental match in any other
# column can never be altered.
# NOTE(review): edge_df_test may contain the same malformed ids in
# geohash6_point1/geohash6_point2; buildGraph skips edges with unknown ids --
# confirm whether the edge table needs the same mapping.
node_df_test["geohash_id"] = node_df_test["geohash_id"].replace(special_node)

In [349]:
# Every geohash id in the test set must now be a known node id.
# `.all()` is used instead of `value_counts().item()`: the latter also passes
# when *no* id is known (a single False bucket whose count equals the row
# count) and raises ValueError instead of AssertionError on a mix.
_known_test_ids = node_df_test["geohash_id"].isin(node_ids)
assert _known_test_ids.all(), (
    f"{int((~_known_test_ids).sum())} test rows have a geohash_id "
    "that is not in node_ids"
)

In [350]:
# Standardize the test features with feature_scaler; only transform() is
# called here, so the scaler is not re-fitted on the test data.
node_df_test.loc[:, feature_columns] = feature_scaler.transform(
    node_df_test[feature_columns]
)

边

In [351]:
# Load the test edge table.
edge_df_test = pd.read_csv("data/raw/A榜/edge_test_4_A.csv")

In [352]:
# Standardize the test edge features with edge_feature_scaler; only
# transform() is called here, so the scaler is not re-fitted on the test data.
edge_df_test.loc[:, edge_feature_columns] = edge_feature_scaler.transform(
    edge_df_test[edge_feature_columns]
)

图

In [353]:
# test_graphs = buildGraph(node_df_test, edge_df_test, is_test_dataset=True)

In [354]:
# Load the cached per-day test graphs (presumably the output of the
# commented-out buildGraph(..., is_test_dataset=True) call -- confirm).
test_graphs = loadData("data/concat_graph/raw_test_graphs.pkl")

In [360]:
# Write the per-day test graphs to the cache file.
# NOTE(review): test_graphs is loaded from this very path in the preceding
# cell, so on a top-to-bottom run this rewrites the file with what was just
# read -- likely a leftover from when buildGraph was run directly; confirm.
saveData(test_graphs, "data/concat_graph/raw_test_graphs.pkl")

In [361]:
# Prepend the last (seq_len - 1) training days to the test days so that, with
# stride 1, every test day is the final day of one full window.
# The slice start is computed explicitly: `graphs[-seq_len + 1:]` turns into
# `graphs[0:]` (the whole training set) when seq_len == 1.
concat_test_graphs = concatGraph(
    graphs[max(len(graphs) - (seq_len - 1), 0) :] + test_graphs,
    seq_len=seq_len,
    stride=1,
    is_test_dataset=True,
)
len(concat_test_graphs)

0 7
1 8
2 9
3 10


4

In [362]:
# Persist the concatenated test graphs.
saveData(concat_test_graphs, "data/concat_graph/test_graphs_data.pkl")

# 保存所有标准化器

In [363]:
# saveData(feature_scaler, "saved/scaler/feature_scaler.pkl")
# saveData(label_scaler, "saved/scaler/label_scaler.pkl")
# saveData(edge_feature_scaler, "saved/scaler/edge_feature_scaler.pkl")