# 将csv文件转化为易读的数据集

### 初始化文件路径

In [15]:
# --- Inputs: raw CSV tables read with pd.read_csv further down ---
node_file = "../val_data/node_test_4_A.csv"
edge_file = "../val_data/edge_test_4_A.csv"

# --- Intermediate artifacts written by this notebook ---
# Pickled dict: date_id -> graph_id.
graph_id_file = "../val_data/graph_id_map.pkl"
# Pickled list with one geohash -> node_id dict per graph.
node_id_file = "../val_data/node_id_maps.pkl"
# CSV with the columns graph_id, num_nodes.
graph_propertity_file = "../val_data/graph_propertity.csv"

# --- Final outputs: the input tables with the id columns added ---
output_node_file = "../val_data/nodes.csv"
output_edge_file = "../val_data/edges.csv"

## 1 获取graph_id的映射

In [16]:
def save_data(data, save_file):
    """Pickle ``data`` into the file at ``save_file``.

    The target is opened in binary write mode, so any existing file at
    that path is replaced.
    """
    import pickle

    with open(save_file, 'wb') as sink:
        pickle.Pickler(sink).dump(data)


def load_data(load_file):
    """Return the object unpickled from the file at ``load_file``.

    NOTE: unpickling can execute arbitrary code, so this should only be
    pointed at files this notebook produced itself (e.g. via save_data).
    """
    import pickle

    with open(load_file, 'rb') as source:
        return pickle.Unpickler(source).load()

In [17]:
import pandas as pd
# Load the node table and order its rows by ascending date.
nodes_df = pd.read_csv(node_file).sort_values("date_id")
# One group per date; each date is treated as one graph.
graph_nodes_dfs = nodes_df.groupby("date_id")
# date_id -> graph_id, numbered in the order the groupby yields the dates.
graph_id_map = {
    date_id: position
    for position, (date_id, _) in enumerate(graph_nodes_dfs)
}
# Persist the mapping so later steps can reload it.
graph_id_map_file = graph_id_file
save_data(graph_id_map, graph_id_map_file)
# Info
print("Size of graphs ", len(graph_nodes_dfs))
print("Graph mapping: ", graph_id_map)

Size of graphs  4
Graph mapping:  {20230404: 0, 20230405: 1, 20230406: 2, 20230407: 3}


## 2 获取node数据中每个图的结点映射

### 获取单个图的结点映射

In [18]:
def get_single_graph_node_map(graph_df):
    """Build the ``geohash_id -> node_id`` mapping for a single graph.

    Ids are handed out in row order, starting at 0, one per distinct value
    of the ``geohash_id`` column. A geohash that appears more than once
    keeps the id of its first occurrence, so the ids are always exactly
    ``0 .. len(mapping) - 1`` with no gaps. Callers that continue numbering
    from ``len(mapping)`` rely on that.

    Args:
        graph_df: DataFrame holding the node rows of one graph; must have
            a ``geohash_id`` column unless it has no rows.

    Returns:
        dict mapping each geohash id to its integer node id.
    """
    node_id_map = {}
    # A frame without rows yields an empty mapping even if the column is
    # missing, matching what a row-by-row walk would do.
    if len(graph_df) == 0:
        return node_id_map
    # Iterate the column directly; building a Series per row via iterrows
    # is much slower and not needed for a single column.
    for geohash_id in graph_df["geohash_id"]:
        if geohash_id not in node_id_map:
            node_id_map[geohash_id] = len(node_id_map)
    return node_id_map

### 获取全部图的结点映射关系

In [19]:
# One geohash_id -> node_id mapping per graph, in groupby iteration order.
node_id_maps = [
    get_single_graph_node_map(group_df)
    for _, group_df in graph_nodes_dfs
]
save_data(node_id_maps, node_id_file)

## 3 获取edge数据的每个图的结点映射

### 依据图序号对数据分组

In [20]:
import pandas as pd
# Load the edge table.
edges_df = pd.read_csv(edge_file)
# Order the rows by ascending date.
edges_df = edges_df.sort_values(by=['date_id'], ascending=[True])
# One group of edges per date.
graph_edges_dfs = edges_df.groupby(by="date_id")
# Info: report the number of edge groups built in this cell (previously the
# node group count was printed here, which says nothing about the edges).
print("Size of graphs ", len(graph_edges_dfs))
print("Graph mapping: ", graph_id_map)

Size of graphs  4
Graph mapping:  {20230404: 0, 20230405: 1, 20230406: 2, 20230407: 3}


### 依据src，dst补充结点映射关系

In [21]:
def add_single_graph_node_map(graph_df, node_id_map):
    """Extend a graph's ``geohash -> node_id`` mapping with edge endpoints.

    Every value in the ``geohash6_point1`` / ``geohash6_point2`` columns
    that is not yet a key of ``node_id_map`` receives a fresh id. Rows are
    processed in order, and within a row point1 is handled before point2.

    Fresh ids start one past the largest id already in the mapping (0 for
    an empty mapping). For a gap-free mapping this equals
    ``len(node_id_map)``; for a mapping with gaps it avoids handing out an
    id that is already taken.

    Args:
        graph_df: DataFrame with the edge rows of one graph.
        node_id_map: dict to extend; it is modified in place.

    Returns:
        The same ``node_id_map`` object, after the additions.
    """
    # Nothing to add for a frame without rows (also tolerates missing columns).
    if len(graph_df) == 0:
        return node_id_map
    next_id = max(node_id_map.values(), default=-1) + 1
    # Walk the two endpoint columns in lockstep instead of using iterrows,
    # which would build a Series object for every row.
    endpoints = zip(graph_df["geohash6_point1"], graph_df["geohash6_point2"])
    for point1, point2 in endpoints:
        for geohash_id in (point1, point2):
            if geohash_id not in node_id_map:
                node_id_map[geohash_id] = next_id
                next_id += 1
    return node_id_map

In [22]:
# Order the edge rows by ascending date.
edges_df = edges_df.sort_values(by=['date_id'], ascending=[True])
node_id_maps = load_data(node_id_file)
# One group of edges per date.
graph_edges_dfs = edges_df.groupby(by="date_id")
for date_id, graph_edges_df in graph_edges_dfs:
    # Resolve the graph through graph_id_map rather than the position of
    # the group: positions only line up with node_id_maps when the edge
    # file contains exactly the same dates as the node file.
    if date_id not in graph_id_map:
        raise KeyError(
            f"date_id {date_id!r} appears in the edge data but not in the node data"
        )
    graph_id = graph_id_map[date_id]
    node_id_maps[graph_id] = add_single_graph_node_map(
        graph_edges_df, node_id_maps[graph_id]
    )
# Distinct node counts across all graphs, as a quick sanity check.
node_nums = {len(node_id_map) for node_id_map in node_id_maps}
print(node_nums)
save_data(node_id_maps, node_id_file)


{1154, 1155}


## 4 获取每个图的结点数量

In [23]:
def save_graph_num_nodes(output_file,node_id_maps):
    """Print each graph's node count and write the counts to a CSV file.

    Args:
        output_file: destination path of the CSV; it gets the columns
            ``graph_id`` and ``num_nodes`` and no index column.
        node_id_maps: sequence of per-graph mappings; the position of a
            mapping is used as its graph id and its length as the count.
    """
    records = []
    for idx, mapping in enumerate(node_id_maps):
        size = len(mapping)
        print(f"graph {idx} num_nodes is {size}")
        records.append((idx, size))
    table = pd.DataFrame(records, columns=['graph_id', 'num_nodes'])
    table.to_csv(output_file, index=False)

In [24]:
# Write the per-graph node counts to the graph property CSV.
save_graph_num_nodes(
    output_file=graph_propertity_file,
    node_id_maps=node_id_maps,
)

graph 0 num_nodes is 1154
graph 1 num_nodes is 1154
graph 2 num_nodes is 1154
graph 3 num_nodes is 1155


## 5 新增src,dst,graph_id,node_id列数据

### 新增graph_id

In [25]:
# Tag every node row and every edge row with the graph id of its date.
for table in (nodes_df, edges_df):
    table['graph_id'] = table['date_id'].map(graph_id_map)

### 新增node_id

In [26]:
# 需要依据graph_id 分组映射
node_id_maps=load_data(node_id_file)

nodes_groupby_graph_id_df = nodes_df.groupby(by="graph_id")
result = []
for graph_id,nodes_groupby_df in nodes_groupby_graph_id_df:
    node_id_map = node_id_maps[graph_id]
    nodes_groupby_df['node_id'] = nodes_groupby_df['geohash_id'].map(node_id_map)
    result.append(nodes_groupby_df)
result = pd.concat(result)
print(result["node_id"])
result.to_csv(output_node_file,index=False)

0          0
3788       1
1404       2
3084       3
1400       4
        ... 
2411    1135
2407    1136
2403    1137
2395    1138
4559    1139
Name: node_id, Length: 4560, dtype: int64


### 新增src，dst的node_id

In [27]:
# Translate both edge endpoints into node ids, one graph at a time, using
# that graph's geohash -> node_id mapping.
edges_groupby_graph_id_df = edges_df.groupby(by="graph_id")
result = pd.concat([
    group_df.assign(
        src=group_df['geohash6_point1'].map(node_id_maps[graph_id]),
        dst=group_df['geohash6_point2'].map(node_id_maps[graph_id]),
    )
    for graph_id, group_df in edges_groupby_graph_id_df
])
print(result["src"])
print(result["dst"])
result.to_csv(output_edge_file, index=False)

42801     984
51790     751
51792     128
51793     742
51796     642
         ... 
18316     770
18311     588
84441     680
52534    1062
73466     796
Name: src, Length: 85604, dtype: int64


42801    548
51790    614
51792    886
51793     90
51796    748
        ... 
18316    831
18311    328
84441    879
52534    896
73466    158
Name: dst, Length: 85604, dtype: int64
