In [60]:
%%time
# Import useful packages; all of them are important, though not all are necessarily used in this notebook.
# Enable inline plotting in the Python notebook.
# Suppress warnings.

%pylab inline
import time
import csv
import random
from collections import defaultdict

Populating the interactive namespace from numpy and matplotlib
CPU times: user 2.98 ms, sys: 999 µs, total: 3.98 ms
Wall time: 3.91 ms


In [2]:
# Seed passed to random.seed() before the edge list is shuffled, so the
# train/test split is the same on every run.
RANDOM_SEED: int = 1

In [9]:
def write_edges_to_txt(fileName, dataset):
    '''
    Write each record of dataset as one tab-separated line of fileName.

    fileName: path of the file to create (an existing file is overwritten)
    dataset: iterable of row sequences, e.g. (source, target) or
             (source, target, label) tuples
    '''
    # newline='' hands line-ending control to the csv module, as its
    # documentation requires; otherwise Windows would emit '\r\r\n'.
    with open(fileName, mode='w', newline='') as f:
        # The delimiter must be a single character: '\t' (tab). The previous
        # '/t' is two characters and makes csv.writer raise TypeError.
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(dataset)

In [26]:
def generate_negative_entries(allEdges, testEdges):
    '''
    Build a labelled test set by adding sampled non-edges to the test edges.

    For every source node that appears in testEdges, up to as many negative
    targets as that source has distinct positive test targets are drawn at
    random (without replacement) from the nodes that are NOT a target of that
    source in allEdges or testEdges.

    allEdges: iterable of (source, target) tuples for the whole graph
    testEdges: sequence of (source, target) tuples (positives); not modified

    Returns (edges, labels):
        edges  - new list: the positives in their original order, followed by
                 the sampled negatives grouped by source
        labels - parallel list, 1 for a positive edge and 0 for a negative
    '''
    targetsPerSource = defaultdict(set)
    # A dict is used as an insertion-ordered set so the candidate order (and
    # therefore the outcome of seeded sampling) does not depend on hash order.
    nodes = {}
    for source, target in allEdges:
        targetsPerSource[source].add(target)
        nodes[source] = None
        nodes[target] = None

    # Work on a copy: the caller's list must keep holding only the positives,
    # otherwise calling this twice would pile negatives onto negatives.
    edges = list(testEdges)
    labels = [1] * len(edges)

    targetsPerTestSource = defaultdict(set)
    for source, target in edges:
        targetsPerTestSource[source].add(target)

    for source, testTargets in targetsPerTestSource.items():
        # Exclude every known target of this source, not only the ones in the
        # test split; otherwise a real (training) edge could be labelled 0.
        known = targetsPerSource.get(source, set()) | testTargets
        # NOTE(review): the source itself stays a candidate unless the
        # self-loop is a known edge - confirm whether self-loops are wanted.
        candidates = [node for node in nodes if node not in known]
        neg_num = min(len(testTargets), len(candidates))
        rand_targets = random.sample(candidates, neg_num)
        edges.extend((source, rt) for rt in rand_targets)
        labels.extend([0] * neg_num)

    return edges, labels

In [96]:
# Read every edge from the space-delimited file as an (int, int) pair,
# then shuffle the list with a fixed seed so the split below is repeatable.
with open("edges.txt", 'r', newline='') as f:
    reader = csv.reader(f, delimiter=' ')
    # csv.reader yields an empty row for a blank line (e.g. a trailing
    # newline at the end of the file); skip those instead of failing on row[0].
    allEdges = [(int(row[0]), int(row[1])) for row in reader if row]

random.seed(RANDOM_SEED)
random.shuffle(allEdges)
# NOTE(review): this re-seeds the generator from system entropy, so any later
# random sampling in the notebook differs between runs - confirm that is intended.
random.seed()
edges_num = len(allEdges)

In [88]:
# Display the total number of edges that were read from the file.
len(allEdges)

289003

In [89]:
# split the data into training set and test set, train:test = 8:2
train_num = int(len(allEdges)*0.8)
print('training set:', train_num)
# The name `edge_num` is not defined anywhere in the notebook (the load cell
# defines `edges_num`), so take the size from the list itself; this keeps the
# cell working on a fresh kernel.
test_num = len(allEdges) - train_num
print('test set:', test_num)
trainEdges = allEdges[:train_num]
testEdges = allEdges[train_num:]
# The two slices must partition the shuffled edge list exactly.
assert len(trainEdges) == train_num and len(testEdges) == test_num

training set: 231202
test set: 57801


In [90]:
# generate_negative_entries may extend the list it is given in place; passing
# a copy keeps testEdges as the positives-only split and makes this cell safe
# to re-run without accumulating extra negatives.
testEdgesWithNeg, labels = generate_negative_entries(allEdges, list(testEdges))

In [97]:
# Report how many distinct (source, target) pairs the augmented test set holds.
distinct_test_edges = set(testEdgesWithNeg)
print('test set with negative entries:', len(distinct_test_edges))

test set with negative entries: 115602


In [93]:
# Pair every test edge with its label, giving (source, target, label) rows
# whose node ids are plain ints.
testSet = []
for edge, label in zip(testEdgesWithNeg, labels):
    testSet.append((int(edge[0]), int(edge[1]), label))

In [95]:
# Persist both splits, each sorted so the output files have a stable row order.
for out_path, rows in (('trainEdges.txt', trainEdges),
                       ('testEdgesWithLabel.txt', testSet)):
    write_edges_to_txt(out_path, sorted(rows))