### 导入相关python库文件

In [1]:
import re
import os
import itertools
from collections import defaultdict
from itertools import islice, chain

import networkx as nx
import numpy as np
import pickle as pkl
from scipy.sparse import csr_matrix

from datetime import datetime
from datetime import timedelta
import dateutil.parser

In [2]:
def lines_per_n(f, n):
    """Yield the items of ``f`` concatenated in consecutive groups of ``n``.

    Args:
        f: Any iterable of strings (typically an open text file, which
            yields one line per item).
        n: Group size. The last group may contain fewer than ``n`` items
            when the input length is not a multiple of ``n``.

    Yields:
        str: The concatenation of up to ``n`` consecutive items.
    """
    # Pull both the first item of a group and the remaining n - 1 items from
    # the SAME iterator. Slicing ``f`` directly only works when ``f`` is its
    # own iterator (e.g. a file); a list would be re-read from its start.
    line_iter = iter(f)
    for first_line in line_iter:
        yield ''.join(chain([first_line], itertools.islice(line_iter, n - 1)))

def getDateTimeFromISO8601String(s):
    """Parse the date/time string ``s`` with ``dateutil.parser.parse``.

    Args:
        s: Textual timestamp to parse.

    Returns:
        Whatever ``dateutil.parser.parse`` returns for ``s``.
    """
    return dateutil.parser.parse(s)

### 处理节点信息

In [3]:
# Maps each node name to a tuple (id, idx); both tuple members are kept as strings.
node_data = defaultdict(lambda : ())
with open('vis.graph.nodeList.json') as f:
    for chunk in lines_per_n(f, 5):  # every node record is read as a group of five lines
        record_lines = chunk.split("\n")

        # id: the text between the first two double quotes that follow the
        # first colon on the record's second line.
        id_string = record_lines[1].split(":")[1]
        quote_pos = [m.start() for m in re.finditer('\"', id_string)]
        node_id = id_string[quote_pos[0]+1:quote_pos[1]]  # not named `id`, to keep the builtin usable

        # name: extracted the same way from the record's third line.
        name_string = record_lines[2].split(":")[1]
        quote_pos = [m.start() for m in re.finditer('\"', name_string)]
        name = name_string[quote_pos[0]+1:quote_pos[1]]

        # idx: the text between the first '(' and the first ')' after the
        # first colon on the record's fourth line.
        idx_string = record_lines[3].split(":")[1]
        open_paren = idx_string.find('(')
        close_paren = idx_string.find(')')
        idx = idx_string[open_paren+1:close_paren]

        # print("ID:{}, IDX:{:<4}, NAME:{}".format(node_id, idx, name))
        node_data[name] = (node_id, idx)  # store the node under its name

In [4]:
# Number of distinct node names stored in `node_data`.
len(node_data)

143

In [5]:
node_data # maps node name -> (id, index idx)

defaultdict(<function __main__.<lambda>()>,
            {'albert.meyers@enron.com': ('55098b62251497209062421f', '0'),
             'andrea.ring@enron.com': ('55098b622514972090624220', '1'),
             'andrew.lewis@enron.com': ('55098b622514972090624221', '2'),
             'andy.zipper@enron.com': ('55098b622514972090624222', '3'),
             'barry.tycholiz@enron.com': ('55098b622514972090624223', '4'),
             'benjamin.rogers@enron.com': ('55098b622514972090624224', '5'),
             'bill.rapp@enron.com': ('55098b622514972090624225', '6'),
             'bill.williams@enron.com': ('55098b622514972090624226', '7'),
             'brad.mckay@enron.com': ('55098b622514972090624227', '8'),
             'brenda.whitehead@enron.com': ('55098b622514972090624228', '9'),
             'cara.semperger@enron.com': ('55098b622514972090624229', '10'),
             'charles.weldon@enron.com': ('55098b62251497209062422a', '11'),
             'chris.dorland@enron.com': ('55098b6225149720

### 处理边信息

In [6]:
# `links` collects one (source, destination, timestamp) tuple per edge record;
# `ts` collects the timestamps alone.
links = []
ts = []
with open('vis.digraph.allEdges.json') as f:
    for record in lines_per_n(f, 5):  # every edge record is read as a group of five lines
        record_lines = record.split("\n")

        # Endpoints: the quoted text after the first colon on the third line,
        # split on "_" into source and destination.
        endpoints_field = record_lines[2].split(":")[1]
        quote_pos = [m.start() for m in re.finditer('\"', endpoints_field)]
        endpoints = endpoints_field[quote_pos[0]+1:quote_pos[1]]
        from_id, to_id = endpoints.split("_")

        # Timestamp: the quoted text that follows "ISODate" on the fourth line.
        time_field = record_lines[3].split("ISODate")[1]
        quote_pos = [m.start() for m in re.finditer('\"', time_field)]
        timestamp = getDateTimeFromISO8601String(time_field[quote_pos[0]+1:quote_pos[1]])

        ts.append(timestamp)
        links.append((from_id, to_id, timestamp))

In [7]:
# Inspect the second parsed edge: (source, destination, timestamp).
links[1]

('tracy.geaccone@enron.com',
 'rod.hayslett@enron.com',
 datetime.datetime(2001, 11, 20, 16, 52, 42, tzinfo=tzutc()))

In [10]:
 print ("# interactions", len(links))

# interactions 22784


In [9]:
# Earliest and latest timestamps among the parsed edges.
print(min(ts), max(ts))

1998-11-13 12:07:00+00:00 2002-06-21 22:40:19+00:00


In [11]:
# Sort the edges in place by their timestamp (third tuple field), earliest first.
links.sort(key=lambda link: link[2])

In [12]:
# Inspect the second edge again, now that `links` is ordered by timestamp.
links[1]

('mark.taylor@enron.com',
 'tana.jones@enron.com',
 datetime.datetime(1998, 11, 13, 12, 7, tzinfo=tzutc()))

### 制作时间切片上的图

In [13]:
# Parameters for splitting the edges into time slices.
SLICE_MONTHS = 2  # width of one slice, in months
margin = timedelta(days=200)  # offset applied inwards from both ends of the observed time range
START_DATE = min(ts) + margin  # start of the slicing window
END_DATE = max(ts) - margin  # end of the slicing window
print("Spliting Time Interval: \n Start Time : {}, End Time : {}".format(START_DATE, END_DATE))

Spliting Time Interval: 
 Start Time : 1999-06-01 12:07:00+00:00, End Time : 2001-12-03 22:40:19+00:00


In [14]:
# Build one MultiGraph snapshot per time slice, keyed by slice id.
# NOTE: `links` must already be sorted by timestamp, so that an earlier snapshot
# is complete before its nodes are copied into the next one.
slice_links = defaultdict(lambda: nx.MultiGraph())
# Months are approximated as 30-day periods. This is the largest month offset
# any edge can get: edges later than END_DATE are capped to it.
max_months_diff = (END_DATE - START_DATE).days // 30
for (src, dst, timestamp) in links:
    months_diff = min((timestamp - START_DATE).days // 30, max_months_diff)
    # Edges earlier than START_DATE give a negative offset; fold them into slice 0.
    slice_id = max(months_diff // SLICE_MONTHS, 0)

    if slice_id not in slice_links:  # first edge of a new slice: create its snapshot
        snapshot = nx.MultiGraph()
        # Carry the nodes of the most recent earlier snapshot forward so nodes
        # never disappear. Looking up existing keys (instead of reading
        # slice_links[slice_id - 1]) avoids the defaultdict silently inserting
        # an empty snapshot when there is a gap in the slice ids.
        earlier_ids = [existing_id for existing_id in slice_links if existing_id < slice_id]
        if earlier_ids:
            snapshot.add_nodes_from(slice_links[max(earlier_ids)].nodes(data=True))
            assert (len(snapshot.edges()) == 0)
        slice_links[slice_id] = snapshot
    slice_links[slice_id].add_edge(src, dst, date=timestamp)  # record the interaction with its time

In [15]:
# Number of snapshots that were created.
len(slice_links)

16

In [20]:
# Nodes contained in the first snapshot (slice id 0).
slice_links[0].nodes

NodeView(('mark.taylor@enron.com', 'tana.jones@enron.com', 'michelle.cash@enron.com', 'marie.heard@enron.com', 'richard.sanders@enron.com', 'elizabeth.sager@enron.com', 'dan.hyvl@enron.com', 'sara.shackleton@enron.com', 'jeffrey.hodge@enron.com', 'stacy.dickson@enron.com', 'louise.kitchen@enron.com', 'brenda.whitehead@enron.com', 'fletcher.sturm@enron.com', 'gerald.nemec@enron.com', 'susan.scott@enron.com', 'mike.mcconnell@enron.com', 'kevin.presto@enron.com', 'greg.whalley@enron.com'))

### 处理各切片图的节点标签，用idx替代

In [17]:
# Print per-snapshot statistics and collect every node that occurs in any
# snapshot, in order of first appearance.
used_nodes = []
seen_nodes = set()  # constant-time membership test; `used_nodes` keeps the order
for snapshot_id, snapshot in slice_links.items():
    print("In snapshoot {:<2}, #Nodes={:<5}, #Edges={:<5}".format(
        snapshot_id, snapshot.number_of_nodes(), snapshot.number_of_edges()))
    for node in snapshot.nodes():
        if node not in seen_nodes:
            seen_nodes.add(node)
            used_nodes.append(node)

In snapshoot 0 , #Nodes=18   , #Edges=237  
In snapshoot 1 , #Nodes=23   , #Edges=184  
In snapshoot 2 , #Nodes=24   , #Edges=216  
In snapshoot 3 , #Nodes=50   , #Edges=570  
In snapshoot 4 , #Nodes=66   , #Edges=649  
In snapshoot 5 , #Nodes=79   , #Edges=877  
In snapshoot 6 , #Nodes=98   , #Edges=1273 
In snapshoot 7 , #Nodes=110  , #Edges=2164 
In snapshoot 8 , #Nodes=117  , #Edges=2688 
In snapshoot 9 , #Nodes=125  , #Edges=2612 
In snapshoot 10, #Nodes=131  , #Edges=2601 
In snapshoot 11, #Nodes=135  , #Edges=2852 
In snapshoot 12, #Nodes=137  , #Edges=1448 
In snapshoot 13, #Nodes=138  , #Edges=581  
In snapshoot 14, #Nodes=141  , #Edges=2125 
In snapshoot 15, #Nodes=143  , #Edges=1707 


In [19]:
# Number of distinct nodes seen across all snapshots.
len(used_nodes)

143

In [21]:
# Relabel the nodes of every snapshot with integer indices that are consistent
# across snapshots: a node's index is its position in `used_nodes`.
nodes_consistent_map = {node: idx for idx, node in enumerate(used_nodes)}
# Iterate over a copy of the keys; each stored graph is replaced by the graph
# returned from nx.relabel_nodes.
for snapshot_id in list(slice_links):
    slice_links[snapshot_id] = nx.relabel_nodes(slice_links[snapshot_id], nodes_consistent_map)

In [22]:
# Nodes of the first snapshot after relabelling.
slice_links[0].nodes

NodeView((0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17))

### 利用one-hot编码，将节点属性进行编码

In [23]:
# One-hot features: row i is the feature vector of the node relabelled to index i.
# The matrix is sized from the relabelling map so every assigned index has a row,
# without assuming that the last snapshot happens to contain every node.
onehot = np.identity(len(nodes_consistent_map))

In [26]:
# Display the one-hot feature matrix.
onehot

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [27]:
# Shape of the one-hot feature matrix.
onehot.shape

(143, 143)

In [28]:
# Attach a sparse feature matrix to every snapshot and collect the snapshots in a list.
graphs = []
for snapshot in slice_links.values():
    # One one-hot row per node, in the order the snapshot iterates its nodes.
    node_features = [onehot[node] for node in snapshot.nodes()]
    snapshot.graph["feature"] = csr_matrix(node_features)  # stored as a graph-level attribute
    graphs.append(snapshot)

In [31]:
# Shape of the first snapshot's feature matrix, read from the sparse matrix
# directly instead of densifying it first with `.A`.
graphs[0].graph["feature"].shape

(18, 143)

In [32]:
# Shape of the last snapshot's feature matrix, read from the sparse matrix
# directly instead of densifying it first with `.A`.
graphs[-1].graph["feature"].shape

(143, 143)

### 保存处理结果

In [29]:
# Save the list of snapshot graphs with pickle.
save_path = "../../data/Enron/graph.pkl"
# Create the output directory if needed so a fresh checkout does not fail
# with FileNotFoundError when the file is opened for writing.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, "wb") as f:
    pkl.dump(graphs, f)
print("Processed Data Saved at {}".format(save_path))

Processed Data Saved at ../../data/Enron/graph.pkl
