# 将csv文件转化为易读的数据集

### 初始化文件

In [79]:
# Every input and output of this notebook lives under one directory.
DATA_DIR = "../data"

# Raw inputs: edge table and node table.
edge_file = f"{DATA_DIR}/edge_90.csv"
node_file = f"{DATA_DIR}/train_90.csv"

# Pickled id mappings (date_id -> graph_id, and per-graph geohash -> node_id).
graph_id_file = f"{DATA_DIR}/graph_id_map.pkl"
node_id_file = f"{DATA_DIR}/node_id_maps.pkl"

# CSV outputs: per-graph properties, then the converted node and edge tables.
graph_propertity_file = f"{DATA_DIR}/graph_propertity.csv"
output_node_file = f"{DATA_DIR}/nodes.csv"
output_edge_file = f"{DATA_DIR}/edges.csv"

## 1 获取graph_id的映射

In [80]:
def save_data(data, save_file):
    """Pickle ``data`` and write it to the path ``save_file``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    import pickle
    with open(save_file, mode='wb') as sink:
        pickle.dump(data, sink)


def load_data(load_file):
    """Read the pickle stored at ``load_file`` and return the restored object.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at files this notebook produced itself.
    """
    import pickle
    with open(load_file, mode='rb') as source:
        return pickle.load(source)

In [81]:
import pandas as pd
# Load the node table and order its rows by ascending date.
nodes_df = pd.read_csv(node_file).sort_values(by=['date_id'], ascending=[True])
# One group per date; each group is handled as one graph.
graph_nodes_dfs = nodes_df.groupby(by="date_id")
# date_id -> graph_id, numbered in the order the groups are iterated.
graph_id_map = {
    date_id: graph_id
    for graph_id, (date_id, _) in enumerate(graph_nodes_dfs)
}
graph_id_map_file = graph_id_file
# Persist the date-to-graph-index mapping.
save_data(graph_id_map, graph_id_map_file)
# Info
print("Size of graphs ", len(graph_nodes_dfs))
print("Graph mapping: ", graph_id_map)

Size of graphs  90
Graph mapping:  {20230104: 0, 20230105: 1, 20230106: 2, 20230107: 3, 20230108: 4, 20230109: 5, 20230110: 6, 20230111: 7, 20230112: 8, 20230113: 9, 20230114: 10, 20230115: 11, 20230116: 12, 20230117: 13, 20230118: 14, 20230119: 15, 20230120: 16, 20230121: 17, 20230122: 18, 20230123: 19, 20230124: 20, 20230125: 21, 20230126: 22, 20230127: 23, 20230128: 24, 20230129: 25, 20230130: 26, 20230131: 27, 20230201: 28, 20230202: 29, 20230203: 30, 20230204: 31, 20230205: 32, 20230206: 33, 20230207: 34, 20230208: 35, 20230209: 36, 20230210: 37, 20230211: 38, 20230212: 39, 20230213: 40, 20230214: 41, 20230215: 42, 20230216: 43, 20230217: 44, 20230218: 45, 20230219: 46, 20230220: 47, 20230221: 48, 20230222: 49, 20230223: 50, 20230224: 51, 20230225: 52, 20230226: 53, 20230227: 54, 20230228: 55, 20230301: 56, 20230302: 57, 20230303: 58, 20230304: 59, 20230305: 60, 20230306: 61, 20230307: 62, 20230308: 63, 20230309: 64, 20230310: 65, 20230311: 66, 20230312: 67, 20230313: 68, 20230314

## 2 获取node数据每个图的结点映射

### 获取单个图的结点映射

In [82]:
def get_single_graph_node_map(graph_df):
    """Build the geohash_id -> node_id mapping for one graph.

    Args:
        graph_df: DataFrame holding the rows of a single graph; must have a
            ``geohash_id`` column.

    Returns:
        dict mapping each ``geohash_id`` value to the 0-based position of its
        row in ``graph_df``. If a value occurs more than once, the position of
        its last occurrence is kept.
    """
    # Read the column directly instead of using iterrows(): iterrows builds a
    # Series per row (slow) and coerces mixed-dtype rows to a common dtype.
    geohash_ids = graph_df["geohash_id"].tolist()
    return {geohash_id: node_id for node_id, geohash_id in enumerate(geohash_ids)}

### 获取全部图的结点映射关系

In [83]:
# One geohash_id -> node_id mapping per graph, in group iteration order.
node_id_maps = [
    get_single_graph_node_map(group_df) for _, group_df in graph_nodes_dfs
]
# Persist the list of mappings to node_id_file.
save_data(node_id_maps, node_id_file)

## 3 获取edge数据的每个图的结点映射

### 依据图序号对数据分组

In [84]:
import pandas as pd
# Load the edge table.
edges_df = pd.read_csv(edge_file)
# Order rows by ascending date.
edges_df = edges_df.sort_values(by=['date_id'], ascending=[True])
# One group of edges per date.
graph_edges_dfs = edges_df.groupby(by="date_id")
# Info: report the number of per-date edge groups built in this cell
# (not the node groups built earlier).
print("Size of graphs ", len(graph_edges_dfs))
print("Graph mapping: ", graph_id_map)

Size of graphs  90
Graph mapping:  {20230104: 0, 20230105: 1, 20230106: 2, 20230107: 3, 20230108: 4, 20230109: 5, 20230110: 6, 20230111: 7, 20230112: 8, 20230113: 9, 20230114: 10, 20230115: 11, 20230116: 12, 20230117: 13, 20230118: 14, 20230119: 15, 20230120: 16, 20230121: 17, 20230122: 18, 20230123: 19, 20230124: 20, 20230125: 21, 20230126: 22, 20230127: 23, 20230128: 24, 20230129: 25, 20230130: 26, 20230131: 27, 20230201: 28, 20230202: 29, 20230203: 30, 20230204: 31, 20230205: 32, 20230206: 33, 20230207: 34, 20230208: 35, 20230209: 36, 20230210: 37, 20230211: 38, 20230212: 39, 20230213: 40, 20230214: 41, 20230215: 42, 20230216: 43, 20230217: 44, 20230218: 45, 20230219: 46, 20230220: 47, 20230221: 48, 20230222: 49, 20230223: 50, 20230224: 51, 20230225: 52, 20230226: 53, 20230227: 54, 20230228: 55, 20230301: 56, 20230302: 57, 20230303: 58, 20230304: 59, 20230305: 60, 20230306: 61, 20230307: 62, 20230308: 63, 20230309: 64, 20230310: 65, 20230311: 66, 20230312: 67, 20230313: 68, 20230314

### 依据src，dst补充结点映射关系

In [85]:
def add_single_graph_node_map(graph_df, node_id_map):
    """Extend a graph's node mapping with endpoints that appear only in its edges.

    Args:
        graph_df: DataFrame with the edge rows of a single graph; must have
            ``geohash6_point1`` and ``geohash6_point2`` columns.
        node_id_map: dict geohash_id -> node_id. It is updated in place.

    Returns:
        The same ``node_id_map`` object. Each endpoint not yet present gets
        the next free id (the size of the mapping when it is inserted), in
        row order with point1 before point2.
    """
    # Read both endpoint columns directly instead of using iterrows():
    # iterrows builds a Series per row (slow) and coerces mixed-dtype rows.
    endpoints = zip(
        graph_df["geohash6_point1"].tolist(),
        graph_df["geohash6_point2"].tolist(),
    )
    for point1, point2 in endpoints:
        for geohash_id in (point1, point2):
            if geohash_id not in node_id_map:
                # Ids continue from the current size of the mapping.
                node_id_map[geohash_id] = len(node_id_map)
    return node_id_map

In [86]:
# Order rows by ascending date.
edges_df = edges_df.sort_values(by=['date_id'], ascending=[True])
node_id_maps = load_data(node_id_file)
# One group of edges per date.
graph_edges_dfs = edges_df.groupby(by="date_id")
for date_id, graph_edges_df in graph_edges_dfs:
    # Resolve the graph index through graph_id_map rather than the loop
    # position: if the edge table had a different set of dates than the node
    # table, positional indexing would silently extend the wrong graph's map.
    # An unknown date now fails loudly with a KeyError.
    graph_id = graph_id_map[date_id]
    node_id_maps[graph_id] = add_single_graph_node_map(
        graph_edges_df, node_id_maps[graph_id]
    )
# Distinct node counts across all graphs, printed as a quick sanity check.
node_nums = {len(node_id_map) for node_id_map in node_id_maps}
print(node_nums)
save_data(node_id_maps, node_id_file)


{1152, 1153, 1154, 1149, 1150, 1151}


## 4 获取每个图的结点数量

In [87]:
def save_graph_num_nodes(output_file, node_id_maps):
    """Print and save the number of nodes of every graph.

    Args:
        output_file: path of the CSV to write, with columns ``graph_id`` and
            ``num_nodes`` and no index column.
        node_id_maps: sequence of per-graph node mappings; a graph's id is its
            position in the sequence and its node count is the mapping's size.
    """
    rows = [[gid, len(mapping)] for gid, mapping in enumerate(node_id_maps)]
    for gid, num_nodes in rows:
        print(f"graph {gid} num_nodes is {num_nodes}")
    table = pd.DataFrame(rows, columns=['graph_id', 'num_nodes'])
    table.to_csv(output_file, index=False)

In [88]:
# Write the per-graph node counts to the graph property CSV.
save_graph_num_nodes(graph_propertity_file, node_id_maps)

graph 0 num_nodes is 1152
graph 1 num_nodes is 1150
graph 2 num_nodes is 1152
graph 3 num_nodes is 1152
graph 4 num_nodes is 1151
graph 5 num_nodes is 1152
graph 6 num_nodes is 1151
graph 7 num_nodes is 1153
graph 8 num_nodes is 1152
graph 9 num_nodes is 1151
graph 10 num_nodes is 1152
graph 11 num_nodes is 1151
graph 12 num_nodes is 1151
graph 13 num_nodes is 1151
graph 14 num_nodes is 1151
graph 15 num_nodes is 1151
graph 16 num_nodes is 1151
graph 17 num_nodes is 1151
graph 18 num_nodes is 1152
graph 19 num_nodes is 1150
graph 20 num_nodes is 1151
graph 21 num_nodes is 1152
graph 22 num_nodes is 1151
graph 23 num_nodes is 1152
graph 24 num_nodes is 1152
graph 25 num_nodes is 1151
graph 26 num_nodes is 1152
graph 27 num_nodes is 1153
graph 28 num_nodes is 1151
graph 29 num_nodes is 1151
graph 30 num_nodes is 1150
graph 31 num_nodes is 1150
graph 32 num_nodes is 1151
graph 33 num_nodes is 1150
graph 34 num_nodes is 1152
graph 35 num_nodes is 1151
graph 36 num_nodes is 1151
graph 37 nu

## 5 新增src,dst,graph_id,node_id列数据

### 新增graph_id

In [89]:
# Tag every node row and every edge row with the graph index of its date.
for frame in (nodes_df, edges_df):
    frame['graph_id'] = frame['date_id'].map(graph_id_map)

### 新增node_id

In [90]:
# node_id has to be mapped group by group: each graph has its own mapping.
# Use the shared node_id_file path instead of repeating the literal.
node_id_maps = load_data(node_id_file)

nodes_groupby_graph_id_df = nodes_df.groupby(by="graph_id")
mapped_groups = []
for graph_id, nodes_groupby_df in nodes_groupby_graph_id_df:
    node_id_map = node_id_maps[graph_id]
    # assign() returns a new frame, so no column is set on the sub-frame
    # handed out by groupby (avoids SettingWithCopy issues).
    mapped_groups.append(
        nodes_groupby_df.assign(
            node_id=nodes_groupby_df['geohash_id'].map(node_id_map)
        )
    )
result = pd.concat(mapped_groups)
print(result["node_id"])
result.to_csv(output_node_file, index=False)

102597       0
101845       1
101844       2
101843       3
101842       4
          ... 
760       1133
761       1134
762       1135
756       1136
0         1137
Name: node_id, Length: 102598, dtype: int64


### 新增src，dst的node_id

In [91]:
# src/dst have to be mapped group by group: each graph has its own mapping.
# Use the shared node_id_file path instead of repeating the literal.
node_id_maps = load_data(node_id_file)

edges_groupby_graph_id_df = edges_df.groupby(by="graph_id")
mapped_groups = []
for graph_id, edges_groupby_df in edges_groupby_graph_id_df:
    node_id_map = node_id_maps[graph_id]
    # assign() returns a new frame, so no column is set on the sub-frame
    # handed out by groupby (avoids SettingWithCopy issues).
    mapped_groups.append(
        edges_groupby_df.assign(
            src=edges_groupby_df['geohash6_point1'].map(node_id_map),
            dst=edges_groupby_df['geohash6_point2'].map(node_id_map),
        )
    )
result = pd.concat(mapped_groups)
print(result["src"])
print(result["dst"])
result.to_csv(output_edge_file, index=False)

938659     505
220256     876
986582     818
633506     348
895172     340
          ... 
848215     480
650400     883
23587     1048
168142     536
246317     404
Name: src, Length: 1048575, dtype: int64
938659     923
220256    1049
986582     619
633506     434
895172     467
          ... 
848215     791
650400     825
23587      601
168142     864
246317     828
Name: dst, Length: 1048575, dtype: int64
