# MAXP 2021初赛数据探索和处理-1

由于节点的Feature维度比较高，所以先处理节点的ID，并保存。Feature的处理放到第2部分。

In [1]:
import pandas as pd
import numpy as np
import os
import gc

In [2]:
# path
base_path = '..'
publish_path = 'dataset'

link_p1_path = os.path.join(base_path, publish_path, 'link_phase1.csv')
train_nodes_path = os.path.join(base_path, publish_path, 'train_nodes.csv')
val_nodes_path = os.path.join(base_path, publish_path, 'validation_nodes.csv')

In [3]:
link_p1_path

'../dataset/link_phase1.csv'

## 读取边列表并统计节点数量

In [5]:
edge_df = pd.read_csv(link_p1_path)
print(edge_df.shape)
edge_df.head()

(29168650, 3)


Unnamed: 0,paper_id,reference_paper_id,phase
0,f10da75ad1eaf16eb2ffe0d85b76b332,711ef25bdb2c2421c0131af77b3ede1d,phase1
1,9ac5a4327bd4f3dcb424c93ca9b84087,2d91c73304c5e8a94a0e5b4956093f71,phase1
2,9d91bfd4703e55dd814dfffb3d63fc33,33d4fdfe3967a1ffde9311bfe6827ef9,phase1
3,e1bdbce05528952ed6579795373782d4,4bda690abec912b3b7b228b01fb6819a,phase1
4,eb623ac4b10df96835921edabbde2951,c1a05bdfc88a73bf2830e705b2f39dbb,phase1


In [7]:
edge_df.phase.describe()

count     29168650
unique           1
top         phase1
freq      29168650
Name: phase, dtype: object

In [8]:
nodes = pd.concat([edge_df['paper_id'], edge_df['reference_paper_id']])
nodes = pd.DataFrame(nodes.drop_duplicates())
nodes.rename(columns={0:'paper_id'}, inplace=True)

print(nodes.shape)
nodes.head(4)

(3031367, 1)


Unnamed: 0,paper_id
0,f10da75ad1eaf16eb2ffe0d85b76b332
1,9ac5a4327bd4f3dcb424c93ca9b84087
2,9d91bfd4703e55dd814dfffb3d63fc33
3,e1bdbce05528952ed6579795373782d4


**在边列表，一共出现了3,031,367个节点(paper_id)**  
**存在一些游离的结点**

## 读取并查看train_nodes和validation_nodes里面的节点

train_nodes.csv的列：
- ID : 字符串类型
- Feature : list的字符串类型，eval($\cdot$)后为长度为300的列表，元素为浮点数
- Label  
共`3063061`个结点

validation_nodes.csv的列（列的含义同train_nodes.csv）：
- ID
- Feature  
共`591972`个结点

train和validation中结点总数为`3655033`，二者之间**无重复**

In [9]:
def process_node(line):
    nid, feat_json, label = line.strip().split('\"')
    
    feat_list = [float(feat[1:-1]) for feat in feat_json[1:-1].split(', ')]
    
    if len(feat_list) != 300:
        print('此行数据有问题 {}'.format(line))
    
    return nid[:-1], feat_list, label[1:]

In [10]:
# 先构建ID和Label的关系，保证ID的顺序和Feature的顺序一致即可
nid_list = []
label_list = []
tr_val_list = []  # 表示该节点是train还是val

with open(train_nodes_path, 'r') as f:
    i = 0
    
    for line in f:
        if i > 0:
            nid, _, label = process_node(line)
            nid_list.append(nid)
            label_list.append(label)
            tr_val_list.append(0)             # 0表示train的点
        i += 1
        if i % 100000 == 0:
            print('Processed {} train rows'.format(i))
    print(f"there are {i-1} rows in train data")

with open(val_nodes_path, 'r') as f:
    i = 0
    
    for line in f:
        if i > 0:
            nid, _, label = process_node(line)
            nid_list.append(nid)
            label_list.append(label)
            tr_val_list.append(1)             # 1表示validation的点
        i += 1
        if i % 100000 == 0:
            print('Processed {} validation rows'.format(i))
    print(f"there are {i-1} rows in val data")
            
nid_arr = np.array(nid_list)
label_arr = np.array(label_list)
tr_val_arr = np.array(tr_val_list)
    
nid_label_df = pd.DataFrame({'paper_id':nid_arr, 'Label': label_arr, 'Split_ID':tr_val_arr})  # Split_ID表示结点用于train还是validation

Processed 100000 train rows
Processed 200000 train rows
Processed 300000 train rows
Processed 400000 train rows
Processed 500000 train rows
Processed 600000 train rows
Processed 700000 train rows
Processed 800000 train rows
Processed 900000 train rows
Processed 1000000 train rows
Processed 1100000 train rows
Processed 1200000 train rows
Processed 1300000 train rows
Processed 1400000 train rows
Processed 1500000 train rows
Processed 1600000 train rows
Processed 1700000 train rows
Processed 1800000 train rows
Processed 1900000 train rows
Processed 2000000 train rows
Processed 2100000 train rows
Processed 2200000 train rows
Processed 2300000 train rows
Processed 2400000 train rows
Processed 2500000 train rows
Processed 2600000 train rows
Processed 2700000 train rows
Processed 2800000 train rows
Processed 2900000 train rows
Processed 3000000 train rows
there are 3063061 rows in train data
Processed 100000 validation rows
Processed 200000 validation rows
Processed 300000 validation rows
Pro

In [11]:
nid_label_df.reset_index(inplace=True)
nid_label_df.rename(columns={'index':'node_idx'}, inplace=True)
print(nid_label_df.shape)
nid_label_df.head(4)

(3655033, 4)


Unnamed: 0,node_idx,paper_id,Label,Split_ID
0,0,bfdee5ab86ef5e68da974d48a138c28e,S,0
1,1,78f43b8b62f040347fec0be44e5f08bd,,0
2,2,a971601a0286d2701aa5cde46e63a9fd,G,0
3,3,ac4b88a72146bae66cedfd1c13e1552d,,0


In [12]:
# 检查ID在Train和Validation是否有重复
ids = nid_label_df.paper_id.drop_duplicates()
ids.shape

(3655033,)

**train和validation一共有3,655,033个节点**

## 下面交叉比对边列表里的paper id和节点列表里的ID，检查是否有匹配不上的节点
边列表里的结点是所有连通分量的中的结点，但是在整个Graph中，有些结点是游离的，不与任何其他节点相连。同时，边列表中的结点有一部分没有出现在train+validation里，这部分结点是没有特征的。

In [13]:
inboth = nid_label_df.merge(nodes, on='paper_id', how='inner')
print(inboth.shape)

(3030948, 4)


In [14]:
nodes

Unnamed: 0,paper_id
0,f10da75ad1eaf16eb2ffe0d85b76b332
1,9ac5a4327bd4f3dcb424c93ca9b84087
2,9d91bfd4703e55dd814dfffb3d63fc33
3,e1bdbce05528952ed6579795373782d4
4,eb623ac4b10df96835921edabbde2951
...,...
29167229,cde1da11cecd3c87c79bb89bc0bea7be
29167820,ddc16ecd4b254963c17d507e4f6015a8
29168063,b85c56d5a5abd88891830c48656e1311
29168372,6419d0b0fb0fefeb46c859b4068ab09e


In [15]:
edge_node = nodes.merge(nid_label_df, on='paper_id', how='left')
print(edge_node.shape)
print('共有{}边列表的节点在给出的节点列表里没有对应，缺乏特征'.format(edge_node[edge_node.node_idx.isna()].shape[0]))
edge_node[edge_node.node_idx.isna()].head(4)

(3031367, 4)
共有419边列表的节点在给出的节点列表里没有对应，缺乏特征


Unnamed: 0,paper_id,node_idx,Label,Split_ID
1124,cc388eaec8838ce383d8a8792014fedb,,,
1184,5d899f41e52f751fef843cf7b1d05b4a,,,
14342,2b2004ec3c99a44b5cb6045ca547453e,,,
15803,d657c4451a9617f4eec96d3b2e6092c7,,,


In [16]:
edge_node.head(10)

Unnamed: 0,paper_id,node_idx,Label,Split_ID
0,f10da75ad1eaf16eb2ffe0d85b76b332,529879.0,,0.0
1,9ac5a4327bd4f3dcb424c93ca9b84087,410481.0,D,0.0
2,9d91bfd4703e55dd814dfffb3d63fc33,2196044.0,D,0.0
3,e1bdbce05528952ed6579795373782d4,2545623.0,,0.0
4,eb623ac4b10df96835921edabbde2951,165064.0,I,0.0
5,483366c9bf1eba07ccc3d22c140e4bc1,635911.0,,0.0
6,ac69e8bb64c395d87320d359cff70d21,1240427.0,D,0.0
7,1bee88cf5c39e00bea8d663986f28b16,24396.0,N,0.0
8,46c2fe0c44959d4e09b5b1b51a6d93d1,312247.0,,0.0
9,de82e62f49f4dd32c870559944e57cb6,1529690.0,,0.0


### 合并边列表里独特的节点和train和validation的节点到一起，构成全部节点列表

In [17]:
# 获取未能匹配上的节点，并构建新的节点DataFrame，然后和原有的Train/Validation节点Concat起来
diff_nodes = edge_node[edge_node.node_idx.isna()]
diff_nodes.ID = diff_nodes.paper_id
diff_nodes.Split_ID = 1  # 用于训练
diff_nodes.node_idx = 0
diff_nodes.reset_index(inplace=True)
diff_nodes.drop(['index'], axis=1, inplace=True)
diff_nodes.node_idx = diff_nodes.node_idx + diff_nodes.index + 3655033  # 这些结点在train+val后继续编号
diff_nodes = diff_nodes[['node_idx', 'paper_id', 'Label', 'Split_ID']]
diff_nodes.head(4)

  diff_nodes.ID = diff_nodes.paper_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,node_idx,paper_id,Label,Split_ID
0,3655033,cc388eaec8838ce383d8a8792014fedb,,1
1,3655034,5d899f41e52f751fef843cf7b1d05b4a,,1
2,3655035,2b2004ec3c99a44b5cb6045ca547453e,,1
3,3655036,d657c4451a9617f4eec96d3b2e6092c7,,1


In [18]:
diff_nodes

Unnamed: 0,node_idx,paper_id,Label,Split_ID
0,3655033,cc388eaec8838ce383d8a8792014fedb,,1
1,3655034,5d899f41e52f751fef843cf7b1d05b4a,,1
2,3655035,2b2004ec3c99a44b5cb6045ca547453e,,1
3,3655036,d657c4451a9617f4eec96d3b2e6092c7,,1
4,3655037,f02b02fac4a14cf1baa1c0b1b4c9f2f9,,1
...,...,...,...,...
414,3655447,ce9f6a2278633a2dd47838b5e62e15f1,,1
415,3655448,caed47d55d1e193ecb1fa97a415c13dd,,1
416,3655449,c82eb6be79a245392fb626b9a7e1f246,,1
417,3655450,926a31f6b378575204aae30b5dfa6dd3,,1


In [19]:
nid_label_df.shape

(3655033, 4)

In [21]:
# Concatenate这419个未匹配到的节点到总的node的最后，从而让nid能接上
nid_label_df = pd.concat([nid_label_df, diff_nodes])
nid_label_df.tail(4)

Unnamed: 0,node_idx,paper_id,Label,Split_ID
415,3655448,caed47d55d1e193ecb1fa97a415c13dd,,1
416,3655449,c82eb6be79a245392fb626b9a7e1f246,,1
417,3655450,926a31f6b378575204aae30b5dfa6dd3,,1
418,3655451,bbace2419c3f827158ea4602f3eb35fa,,1


In [22]:
nid_label_df.shape

(3655452, 4)

In [36]:
591972 + 419

592391

In [35]:
nid_label_df[nid_label_df['Split_ID'] == 1]

Unnamed: 0,node_idx,paper_id,Label,Split_ID
3063061,3063061,c39457cc34fa969b03819eaa4f9b7a52,,1
3063062,3063062,668b9d0c53e9b6e2c6b1093102f976b3,,1
3063063,3063063,ca5c7bc1b40c0ef3c3f864aed032ca90,,1
3063064,3063064,44f810c0c000cda27ce618add55e815f,,1
3063065,3063065,3c206335d88637d36d83c2942586be98,,1
...,...,...,...,...
414,3655447,ce9f6a2278633a2dd47838b5e62e15f1,,1
415,3655448,caed47d55d1e193ecb1fa97a415c13dd,,1
416,3655449,c82eb6be79a245392fb626b9a7e1f246,,1
417,3655450,926a31f6b378575204aae30b5dfa6dd3,,1


In [22]:
# 保存ID和Label到本地文件
nid_label_df.to_csv(os.path.join(base_path, publish_path, './IDandLabels.csv'), index=False)
# 保存未匹配上的节点用于feature的处理
diff_nodes.to_csv(os.path.join(base_path, publish_path, './diff_nodes.csv'), index=False)
