forked from unknownuser13570/FODGE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
load_data.py
155 lines (139 loc) · 5.08 KB
/
load_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""
Loader functions for the datasets. You can add your own here.
The function requires 2 inputs:
- name: Name of dataset as the name of the file without suffix
- path: Path to where the file is located
The functions return 2 outputs:
- dict_snapshots: Dictionary of snapshots, where keys are time stamps and values are lists of edges occurring at that
time.
- dict_weights: Dictionary of weights matching dict_snapshots - keys are the time stamps and values are lists of
edge weights, in the same order as the edges in dict_snapshots.
Note that if you change the name of the function "data_loader", you have to change it as well in the file "main.py",
in line 51.
"""
from datetime import datetime
import os
import numpy as np
import calendar
def data_loader(name, path):
    """
    Data loader for all datasets used.

    For ``name == "dblp"`` the work is delegated to ``load_dblp``. For every other name the file
    ``<path>/<name>.txt`` is read line by line. Fields are separated by a single space, or by a comma when the
    line contains no space. A line is either ``node1 node2 time`` or ``node1 node2 weight time``.

    The time field is read as an integer month count: ``t`` maps to the month that lies ``t - 1`` months after
    the epoch date, and the snapshot key is the string ``"<month>.<year>"``. Edges whose time field is ``'0'``
    are stored under the integer key ``0``; that key never enters ``times`` and is therefore dropped from
    ``dict_snapshots`` by the final re-ordering (it stays in ``dict_weights``).

    :param name: Name of the file (str, without .txt)
    :param path: Path to where it is
    :return: Tuple ``(dict_snapshots, dict_weights)``. ``dict_snapshots`` maps a snapshot key to its list of
             ``(node1, node2)`` edges, ordered chronologically. ``dict_weights`` maps a snapshot key to the list of
             edge weights in the same per-key order; for non-dblp files it is left in insertion order and is not
             filtered.
    :raises KeyError: if ``name == "facebook-wosn-wall"`` and one of the 10/11/12.2004 snapshots is missing.
    """
    if name == "dblp":
        dict_snapshots, dict_weights = load_dblp(name, path)
    else:
        dict_snapshots, dict_weights, times = {}, {}, []
        # Reference date the month offsets are counted from (loop-invariant, so computed once).
        # NOTE(review): fromtimestamp(0) is the epoch expressed in *local* time, so west of UTC this is
        # 1969-12-31 and every snapshot label shifts by one month - confirm this is intended.
        epoch = datetime.fromtimestamp(0)
        with open(os.path.join(path, name + ".txt"), 'r') as filehandle:
            for line in filehandle:
                # Blank lines (e.g. a trailing newline at the end of the file) carry no edge.
                if not line.strip():
                    continue
                a = line.split(" ")
                if len(a) == 1:
                    a = line.split(",")
                # Lines whose first field contains '%' are skipped (presumably header/comment lines).
                if "%" in a[0]:
                    continue
                node1 = a[0]
                node2 = a[1]
                # Self loops are ignored.
                if node1 == node2:
                    continue
                try:
                    # Four-field layout: node1 node2 weight time
                    w = float(a[2])
                    t = a[3].split("\n")[0]
                except (IndexError, ValueError):
                    # Three-field layout: node1 node2 time -> unit weight
                    w = 1.
                    t = a[2].split("\n")[0]
                if t == '0':
                    date = int(t)
                else:
                    x = add_months(epoch, int(t) - 1)
                    times.append(x)
                    date = "{}.{}".format(x.month, x.year)
                dict_snapshots.setdefault(date, []).append((node1, node2))
                dict_weights.setdefault(date, []).append(w)
        # Chronological order of the datetimes decides the order of the snapshot keys.
        times.sort()
        sorted_times = sort_keys(times)
        dict_snapshots = sort_dict_snapshots(dict_snapshots, sorted_times)
    if name == "facebook-wosn-wall":
        # These three snapshots are removed for this dataset (only from dict_snapshots).
        del dict_snapshots["10.2004"]
        del dict_snapshots["11.2004"]
        del dict_snapshots["12.2004"]
    return dict_snapshots, dict_weights
def load_dblp(name, path, is_weighted=False):
    """
    DBLP dataset loader.

    Reads ``<path>/<name>.txt`` where every non-blank line is space separated: ``node1 node2 time`` or, when
    ``is_weighted`` is True, ``node1 node2 weight time``. Blank lines are skipped.

    :param name: Name of the file (str, without .txt)
    :param path: Path to where it is
    :param is_weighted: True if the third field of each line is the edge weight. Otherwise every edge gets
                        weight 1.
    :return: Tuple ``(dict_snapshots, dict_weights)``. Both are keyed by the time stamp string, with keys in
             sorted order. Values are the list of ``(node1, node2)`` edges and the list of matching weights.
    """
    time_index = 3 if is_weighted else 2
    edges_by_time = {}
    weights_by_time = {}
    with open(os.path.join(path, name + ".txt"), 'r') as filehandle:
        for line in filehandle:
            # A blank line (typically the last one in the file) has no fields to parse.
            if not line.strip():
                continue
            a = line.split(" ")
            node1 = a[0]
            node2 = a[1]
            w = float(a[2]) if is_weighted else 1.
            t = a[time_index].split("\n")[0]
            edges_by_time.setdefault(t, []).append((node1, node2))
            weights_by_time.setdefault(t, []).append(w)
    # Keys are strings, so this ordering is lexicographic (fine for equal-width stamps such as years).
    ordered_keys = sorted(edges_by_time)
    new_dict = {key: edges_by_time[key] for key in ordered_keys}
    dict_weights = {key: weights_by_time[key] for key in ordered_keys}
    return new_dict, dict_weights
def sort_keys(times):
    """
    Turn datetimes into unique "<month>.<year>" labels.

    The function does not sort anything itself: labels come out in the order in which they first appear in
    ``times``, so the caller has to pass an already sorted sequence to get chronological labels.

    :param times: Iterable of objects with ``month`` and ``year`` attributes (e.g. datetime)
    :return: List of distinct "<month>.<year>" strings, in first-seen order
    """
    labels = ("{}.{}".format(t.month, t.year) for t in times)
    # dict keys are unique and keep insertion order, so this de-duplicates in a single pass
    # instead of scanning the result list for every element.
    return list(dict.fromkeys(labels))
def sort_dict_snapshots(dict_snapshots, times):
    """
    Rebuild the snapshot dictionary so its keys follow the order given by ``times``.

    Only keys listed in ``times`` are kept; a key of ``times`` that is missing from ``dict_snapshots``
    raises KeyError.

    :param dict_snapshots: Dictionary mapping a time stamp to its snapshot
    :param times: Time stamps in the desired order
    :return: New dictionary holding the same values, keyed in the order of ``times``
    """
    ordered = {}
    for stamp in times:
        ordered[stamp] = dict_snapshots[stamp]
    return ordered
def load_embedding(path, file_name):
    """
    Read the embedding stored in ``<path>/<file_name>.npy`` and return the object it contains.

    :param path: Directory in which the file is saved.
    :param file_name: Name of the file, without the ".npy" suffix
    :return: The single object stored in the file (presumably the embedding dictionary written with np.save)
    """
    npy_path = os.path.join(path, '{}.npy'.format(file_name))
    # NOTE: allow_pickle=True unpickles the file content, so only load files from a trusted source.
    stored = np.load(npy_path, allow_pickle=True)
    # .item() unwraps the one stored element from the loaded array.
    return stored.item()
def add_months(sourcedate, months):
    """
    Shift a date by a whole number of months.

    The day is clamped to the last day of the target month, and the result carries only year, month and day
    (any time-of-day of ``sourcedate`` is dropped).

    :param sourcedate: Object with ``year``, ``month`` and ``day`` attributes (e.g. datetime)
    :param months: Number of months to add (int, may be negative)
    :return: datetime at midnight of the shifted date
    """
    # Work with a zero-based month index so that divmod yields the year carry directly.
    year_shift, month_index = divmod(sourcedate.month - 1 + months, 12)
    target_year = sourcedate.year + year_shift
    target_month = month_index + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    target_day = sourcedate.day if sourcedate.day <= last_day else last_day
    return datetime(target_year, target_month, target_day)