In [1]:
import pandas as pd
import torch
import numpy as np
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import Data
import pickle

  from .autonotebook import tqdm as notebook_tqdm


# 读取dataframe并且处理一下

读节点

In [2]:
# Columns reported as all-zero in the training data; they are removed below.
drop_c = ["F_23", "F_27"]
# Load the raw node table and drop the all-zero columns in one step.
node_df_train = pd.read_csv("data/raw/train_90.csv").drop(columns=drop_c)
# Distinct geohash ids in order of first appearance. A node's position in this
# list is the integer index buildGraph uses for it in edge_index.
node_ids = pd.unique(node_df_train["geohash_id"]).tolist()

In [3]:
# Spot check: show the integer index that node_ids assigns to one geohash id.
node_ids.index("5324516fr")

1

In [4]:
# Column roles: identifier/date columns, label columns, and every remaining
# column is treated as a node feature.
id_and_date_columns = ["geohash_id", "date_id"]
label_columns = ["active_index", "consume_index"]
non_feature_columns = id_and_date_columns + label_columns
feature_columns = node_df_train.drop(columns=non_feature_columns).columns

# Standardization of node features and labels is currently disabled
# (the graphs are saved under "*_nostd_*" file names).
# feature_scaler = StandardScaler()
# node_df_train.loc[:, feature_columns] = feature_scaler.fit_transform(
#     node_df_train[feature_columns]
# )
# label_scaler = StandardScaler()
# node_df_train.loc[:, label_columns] = label_scaler.fit_transform(
#     node_df_train[label_columns]
# )

读边

In [5]:
# Load the raw training edge table.
edge_df_train = pd.read_csv("data/raw/edge_90.csv")

In [6]:
# Standardize the edge features. The scaler is fitted on the training edges
# here and reused (transform only) for the test edges later in the notebook.
edge_feature_columns = ["F_1", "F_2"]
edge_feature_scaler = StandardScaler()
scaled_edge_features = edge_feature_scaler.fit_transform(
    edge_df_train[edge_feature_columns]
)
edge_df_train.loc[:, edge_feature_columns] = scaled_edge_features

# 建图

In [7]:
def buildGraph(node_df, edge_df, is_test_dataset=False):
    """Build one torch_geometric ``Data`` graph per ``date_id``.

    Relies on the notebook-level globals ``feature_columns``,
    ``label_columns``, ``edge_feature_columns`` and ``node_ids``.

    Args:
        node_df: DataFrame with a ``date_id`` column and ``feature_columns``
            (plus ``label_columns`` unless ``is_test_dataset`` is True).
        edge_df: DataFrame with ``date_id``, ``geohash6_point1``,
            ``geohash6_point2`` and ``edge_feature_columns``.
        is_test_dataset: When True the graphs are built without ``y``.

    Returns:
        A list of ``Data`` objects, one per ``date_id`` group, in the order
        ``node_df.groupby("date_id")`` yields them.

    Notes:
        * Edge endpoints are encoded as positions in the global ``node_ids``
          list, while the rows of ``x`` follow the row order of ``node_df``
          within each day. The two only agree when every day lists its nodes
          in exactly ``node_ids`` order; a warning is emitted otherwise
          (the data itself is left untouched).
        * Edges whose endpoints are not in ``node_ids`` are dropped.
        * A day without any valid edge yields ``edge_index`` of shape
          ``(2, 0)`` and ``edge_attr`` of shape ``(0, n_edge_features)``.
    """
    import warnings

    # Hash lookup instead of repeated list scans (list.index / `in list`).
    # setdefault keeps the first position, matching list.index semantics.
    node_index = {}
    for position, node_id in enumerate(node_ids):
        node_index.setdefault(node_id, position)
    expected_order = list(node_ids)

    graphs = []

    # One graph per date_id.
    for date_id, day_nodes in node_df.groupby("date_id"):
        # Surface (but do not silently "fix") a row order that would make the
        # global edge indices point at the wrong rows of x.
        if (
            "geohash_id" in day_nodes.columns
            and day_nodes["geohash_id"].tolist() != expected_order
        ):
            warnings.warn(
                f"date_id {date_id}: node rows are not in node_ids order; "
                "edge_index may not line up with the rows of x"
            )

        # Nodes
        x = torch.tensor(day_nodes[feature_columns].values, dtype=torch.float)
        if not is_test_dataset:
            y = torch.tensor(day_nodes[label_columns].values, dtype=torch.float)

        # Edges: map both endpoints to node indices in one vectorized pass.
        # Endpoints missing from node_ids map to NaN and are filtered out
        # (the edge table may contain more nodes than the node table).
        edge_day_df = edge_df[edge_df["date_id"] == date_id]
        src = edge_day_df["geohash6_point1"].map(node_index).to_numpy(dtype=np.float64)
        dst = edge_day_df["geohash6_point2"].map(node_index).to_numpy(dtype=np.float64)
        keep = ~(np.isnan(src) | np.isnan(dst))

        # Stacking the two endpoint rows gives shape (2, num_edges) directly,
        # including (2, 0) when nothing survives the filter.
        edge_index = torch.tensor(
            np.stack([src[keep], dst[keep]]).astype(np.int64), dtype=torch.long
        )
        edge_attr = torch.tensor(
            edge_day_df[edge_feature_columns].to_numpy(dtype=np.float32)[keep],
            dtype=torch.float,
        )

        # Graph
        if is_test_dataset:
            day_graph = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
        else:
            day_graph = Data(x=x, y=y, edge_index=edge_index, edge_attr=edge_attr)
        graphs.append(day_graph)

        print(date_id, "finished")

    return graphs

In [8]:
# Build one training graph (with labels) per date_id.
graphs = buildGraph(node_df_train, edge_df_train)

20230104 finished
20230105 finished
20230106 finished
20230107 finished
20230108 finished
20230109 finished
20230110 finished
20230111 finished
20230112 finished
20230113 finished
20230114 finished
20230115 finished
20230116 finished
20230117 finished
20230118 finished
20230119 finished
20230120 finished
20230121 finished
20230122 finished
20230123 finished
20230124 finished
20230125 finished
20230126 finished
20230127 finished
20230128 finished
20230129 finished
20230130 finished
20230131 finished
20230201 finished
20230202 finished
20230203 finished
20230204 finished
20230205 finished
20230206 finished
20230207 finished
20230208 finished
20230209 finished
20230210 finished
20230211 finished
20230212 finished
20230213 finished
20230214 finished
20230215 finished
20230216 finished
20230217 finished
20230218 finished
20230219 finished
20230220 finished
20230221 finished
20230222 finished
20230223 finished
20230224 finished
20230225 finished
20230226 finished
20230227 finished
20230228 f

In [9]:
def saveData(data, path):
    """Pickle ``data`` to ``path``.

    Missing parent directories of ``path`` are created first, so the call
    also works on a fresh checkout where e.g. ``saved/scaler/`` does not
    exist yet.

    Args:
        data: Any picklable object.
        path: Destination file path; an existing file is overwritten.
    """
    import os

    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, "wb") as file:
        pickle.dump(data, file)


# Persist the training graphs built from the non-standardized node features.
saveData(graphs, "data/train_graphs_nostd_data.pkl")

In [10]:
# Save the node_id -> index correspondence (a node's index is its position in node_ids).
saveData(node_ids, "data/node_ids.pkl")

# 生成测试集图

节点

In [11]:
# Load the test node table and drop the same columns that were removed from
# the training table.
node_df_test = pd.read_csv("data/raw/A榜/node_test_4_A.csv").drop(columns=drop_c)
# Reload the node ordering saved earlier so the test graphs use the same
# node_id -> index mapping as the training graphs.
with open("data/node_ids.pkl", "rb") as node_ids_file:
    node_ids = pickle.load(node_ids_file)

In [12]:
# Some node ids in the test set are malformed and must be mapped back to the
# real geohash ids (presumably a lost leading zero / scientific-notation
# rendering from a spreadsheet export -- TODO confirm).
special_node = {
    "18377236": "018377236",
    "7.45E+07": "7449766e1",
    "9.80E+10": "9797336e4",
}
node_df_test = node_df_test.replace(special_node)

In [13]:
# After the replacement every test geohash_id must be a known node id.
# The check is explicit rather than `isin(...).value_counts().item() == len(df)`:
# that expression also holds when *no* id is known (a single False bucket of
# size len(df)) and raises an opaque ValueError when ids are mixed.
_unknown_ids = node_df_test.loc[
    ~node_df_test["geohash_id"].isin(node_ids), "geohash_id"
]
assert (
    _unknown_ids.empty
), f"unknown geohash_id values in test set: {_unknown_ids.unique().tolist()}"

In [None]:
# # Standardization (disabled: feature_scaler is only created in commented-out code)
# node_df_test.loc[:, feature_columns] = feature_scaler.transform(
#     node_df_test[feature_columns]
# )

边

In [14]:
# Load the raw test edge table.
edge_df_test = pd.read_csv("data/raw/A榜/edge_test_4_A.csv")

In [15]:
# Standardize the test edge features with the scaler fitted on the training
# edges (transform only, no refit).
scaled_test_edge_features = edge_feature_scaler.transform(
    edge_df_test[edge_feature_columns]
)
edge_df_test.loc[:, edge_feature_columns] = scaled_test_edge_features

图

In [16]:
# Build one test graph per date_id; is_test_dataset=True means no y tensor is attached.
test_graphs = buildGraph(node_df_test, edge_df_test, is_test_dataset=True)

20230404 finished
20230405 finished
20230406 finished
20230407 finished


In [17]:
# Persist the test graphs built from the non-standardized node features.
saveData(test_graphs, "data/test_graphs_nostd_data.pkl")

# 保存所有标准化器

In [18]:
# Only the edge-feature scaler is saved; the feature/label scaler saves stay
# commented out because those scalers are not created in this notebook.
# saveData(feature_scaler, "saved/scaler/feature_scaler.pkl")
# saveData(label_scaler, "saved/scaler/label_scaler.pkl")
saveData(edge_feature_scaler, "saved/scaler/edge_feature_scaler.pkl")