In [1]:
import networkx as nx
import json 
import numpy as np
import pandas as pd
from collections import defaultdict, Counter

## Creating EdgeList

In [2]:
# Build the undirected citation graph as {paper ID: set of neighbouring paper IDs}.
# The input holds one JSON record per line; sets collapse duplicate references.
adjList = defaultdict(set)
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as paperFile:
    for rawRecord in paperFile:
        paper = json.loads(rawRecord)
        citingID = paper['id']
        for citedID in paper.get('references', []):
            # Store the edge in both directions so the adjacency is symmetric.
            adjList[citingID].add(citedID)
            adjList[citedID].add(citingID)

# Only papers that take part in at least one citation edge end up as nodes.
IDsInGraph = list(adjList)
IDtoIndex = {paperID: index for index, paperID in enumerate(IDsInGraph)}

# Persist the node ordering so the ID <-> index mapping can be rebuilt later.
with open("./data/fscnmfIDsInGraph.json", 'w') as idFile:
    json.dump(IDsInGraph, idFile)

In [None]:
# Reload the persisted node ordering and rebuild the paper ID -> node index
# mapping without re-parsing the raw dataset.
with open("./data/fscnmfIDsInGraph.json", 'r') as idFile:
    IDsInGraph = json.load(idFile)
IDtoIndex = {paperID: index for index, paperID in enumerate(IDsInGraph)}

In [None]:
# Re-key the adjacency from paper IDs to integer node indices.
# The result is stored under a new name instead of overwriting `adjList`:
# rebinding `adjList` in place made this cell fail with a KeyError when it was
# run a second time (its keys were already indices, not paper IDs).
indexedAdjList = {
    IDtoIndex[paperID]: [IDtoIndex[neighbour] for neighbour in neighbours]
    for paperID, neighbours in adjList.items()
}
G = nx.from_dict_of_lists(indexedAdjList)

In [None]:
# Dump the index-based graph as a comma-separated edge list, one edge per line,
# without any edge attribute columns.
path = './data/edgelist.csv'
nx.write_edgelist(
    G,
    path,
    delimiter=',',
    comments='#',
    data=False,
)

## Features for each Paper

In [None]:
def normalize(lst):
    """Scale the values of ``lst`` so that they sum to 1.

    :param lst: iterable of numbers
    :return: new list of floats ``value / sum(lst)``; when the values sum to
        zero (e.g. an all-zero vector) there is nothing to scale by, so a copy
        of the input is returned unchanged instead of raising
        ``ZeroDivisionError``
    """
    values = list(lst)
    total = sum(values)
    if total == 0:
        return values
    return [float(value) / total for value in values]

In [None]:
# Top-level AI fields of study tracked per paper. The list order is significant:
# it fixes the column order of the feature vectors and of the CSV header.
AITopLevelTopics = [
    'Artificial intelligence',
    'Computer vision',
    'Data mining',
    'Data science',
    'Machine learning',
    'Natural language processing',
    'Pattern recognition',
    'Speech recognition',
]
# Same topics as a set, for constant-time membership tests.
topicSet = set(AITopLevelTopics)

In [None]:
# Build one feature row per graph node: [node index, normalised topic weights...].
featureList = []
tmp = []                                  # IDs of papers skipped because they are not nodes of the graph
IDsInGraphSet = set(IDsInGraph)
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        if data['id'] not in IDsInGraphSet:
            tmp.append(data['id'])
            continue
        # Weight of every tracked topic present on this paper (last entry wins on duplicates).
        topicWeights = {fos['name']: fos['w']
                        for fos in data.get('fos', [])
                        if fos['name'] in topicSet}
        feature = [topicWeights.get(topic, 0) for topic in AITopLevelTopics]
        try:
            feature = normalize(feature)
        except ZeroDivisionError:
            # No tracked topic carries weight: keep the all-zero vector.
            # Only this case is tolerated; any other error is a real problem
            # and must surface instead of being silently swallowed.
            pass
        feature.insert(0, IDtoIndex[data['id']])
        featureList.append(feature)
featureList.sort()                                           ## Sort by Node IDs for the FSC NMF package

In [None]:
import csv
# newline='' lets the csv module control line endings itself; without it every
# row is followed by a blank line on platforms that translate '\n' to '\r\n'.
with open('./data/features.csv', "w", newline='') as file:
    writer = csv.writer(file)
    header = ['NodeID', *AITopLevelTopics]
    writer.writerow(header)
    # Each row already starts with its node index, so no separate counter is needed.
    writer.writerows(featureList)

## Label Classification

In [3]:
def average(lis):
    """Return the arithmetic mean of the numbers in ``lis``.

    Raises ``ZeroDivisionError`` when ``lis`` is empty.
    """
    total = sum(lis)
    count = len(lis)
    return total / count

In [9]:
# Assign each paper one coarse label ('Data', 'NLP' or 'CV') based on its
# fields of study; papers carrying none of the tracked fields are left out.
# Papers are not filtered by graph membership here.
IDList = []
labelList = []
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        paperID = data['id']
        groupWeights = defaultdict(int)
        for fos in data.get('fos', []):
            topic = fos['name']
            if topic in ('Data mining', 'Data science'):
                groupWeights['Data'] = max(groupWeights['Data'], fos['w'])
            if topic in ('Natural language processing', 'Speech recognition'):
                groupWeights['NLP'] = max(groupWeights['NLP'], fos['w'])
            if topic == 'Computer vision':
                groupWeights['CV'] = fos['w']
        if groupWeights:
            IDList.append(paperID)
            # Highest-weighted group wins; on ties the group seen first is kept.
            heaviest = max(groupWeights.items(), key=lambda item: item[1])
            labelList.append(heaviest[0])
assert len(labelList) == len(IDList)

In [26]:
embeddingDict = dict()                   ## mapping from paperID (not index) to embedding
IndextoID = {val: key for key, val in IDtoIndex.items()}
import csv
# The original 'rU' mode has been deprecated since Python 3.4 and was removed in
# Python 3.11 (open() raises ValueError). Plain text mode with newline='' is
# what the csv module expects.
with open('./data/fscNMFEmbeddings.csv', 'r', newline='') as file:
    rows = csv.reader(file, delimiter=',')
    next(rows, None)                     # skip the header row
    for row in rows:
        # First column is the node index (possibly written as a float, e.g. "12.0");
        # the remaining columns are the embedding components.
        embeddingDict[IndextoID[int(float(row[0]))]] = [float(f) for f in row[1:]]

  after removing the cwd from sys.path.


In [12]:
# Align embeddings with IDList; labelled papers without an embedding get a zero vector.
embeddingDim = 32                        # length of the zero fallback -- presumably the embedding width, TODO confirm
embeddings = [embeddingDict.get(paperID, [0] * embeddingDim) for paperID in IDList]
# NOTE: embeddingDict is deliberately NOT deleted here. The link-prediction
# cell further down still reads it (and deletes it when done); deleting it at
# this point made the notebook fail with a NameError on Restart & Run All.
assert len(embeddings) == len(labelList)

In [13]:
from sklearn import preprocessing

# Feature matrix: one embedding row per labelled paper.
X = np.asarray(embeddings)
del embeddings                           # the list copy is no longer needed

# Encode the string labels as integer class indices.
le = preprocessing.LabelEncoder()
Y = le.fit_transform(labelList)

In [14]:
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier


# (display name, estimator) pairs evaluated by the cross-validation cells.
namedClassifiers = [
    ("Random Forest", RandomForestClassifier(verbose=True, n_jobs=-1)),
    ("Neural Net", MLPClassifier(verbose=True, early_stopping=True)),
    ("AdaBoost", AdaBoostClassifier()),
    ("Linear SVC", OneVsRestClassifier(BaggingClassifier(LinearSVC(), n_jobs=-1))),
]

# Parallel lists, kept because the evaluation loops zip them together.
names = [label for label, model in namedClassifiers]
classifiers = [model for label, model in namedClassifiers]



In [15]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn import svm
from sklearn.model_selection import StratifiedKFold, KFold


def crossValidate(X, Y, names, classifiers, n_splits=5):
    """Run shuffled k-fold cross-validation for every classifier and print averaged scores.

    :param X: feature matrix, indexable by arrays of row indices
    :param Y: target vector aligned with ``X``
    :param names: display names, parallel to ``classifiers``
    :param classifiers: estimators exposing ``fit`` and ``predict``
    :param n_splits: number of folds
    :return: None; for each classifier the weighted precision, recall and F1
        averaged over the folds are printed
    """
    kfold = KFold(n_splits=n_splits, shuffle=True)
    for name, clf in zip(names, classifiers):
        precScores = []
        recallScores = []
        f1Scores = []
        for count, (train_index, test_index) in enumerate(kfold.split(X, Y), start=1):
            print('Fitting: ', count)
            clf.fit(X[train_index], Y[train_index])
            print('count ', count)
            y_pred = clf.predict(X[test_index])
            prec, recall, fscore, _ = precision_recall_fscore_support(Y[test_index], y_pred, average='weighted')
            precScores.append(prec)
            recallScores.append(recall)
            f1Scores.append(fscore)
        print('Name', name,'. Avg Precision: ', average(precScores), '. Avg Recall: ', average(recallScores), '. Avg F-1 Score: ', average(f1Scores) )


crossValidate(X, Y, names, classifiers)


Fitting:  1


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.7s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 100 out of 100 | elapsed:    0.1s finished


count  1
Fitting:  2


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.9s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.


count  2


[Parallel(n_jobs=48)]: Done 100 out of 100 | elapsed:    0.1s finished


Fitting:  3


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.7s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.


count  3


[Parallel(n_jobs=48)]: Done 100 out of 100 | elapsed:    0.1s finished


Fitting:  4


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.9s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 100 out of 100 | elapsed:    0.1s finished


count  4
Fitting:  5


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.0s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 100 out of 100 | elapsed:    0.1s finished


count  5
Name Random Forest . Avg Precision:  0.9913456520057522 . Avg Recall:  0.9912923211169847 . Avg F-1 Score:  0.9912997362567912
Fitting:  1
Iteration 1, loss = 0.37350758
Validation score: 0.987753
Iteration 2, loss = 0.04625790
Validation score: 0.987162
Iteration 3, loss = 0.03006978
Validation score: 0.987320
Iteration 4, loss = 0.02586067
Validation score: 0.988934
Iteration 5, loss = 0.02415536
Validation score: 0.988974
Iteration 6, loss = 0.02334396
Validation score: 0.988777
Iteration 7, loss = 0.02290932
Validation score: 0.989210
Iteration 8, loss = 0.02260781
Validation score: 0.989053
Iteration 9, loss = 0.02245857
Validation score: 0.989289
Iteration 10, loss = 0.02228938
Validation score: 0.989564
Iteration 11, loss = 0.02218753
Validation score: 0.989210
Iteration 12, loss = 0.02207763
Validation score: 0.989368
Iteration 13, loss = 0.02197393
Validation score: 0.989092
Iteration 14, loss = 0.02193024
Validation score: 0.990195
Iteration 15, loss = 0.02188653
Val

count  1
Fitting:  2
count  2
Fitting:  3
count  3
Fitting:  4
count  4
Fitting:  5
count  5
Name AdaBoost . Avg Precision:  0.9856940592049384 . Avg Recall:  0.9855302640455106 . Avg F-1 Score:  0.9854933968537871
Fitting:  1
count  1
Fitting:  2
count  2
Fitting:  3
count  3
Fitting:  4
count  4
Fitting:  5
count  5
Name Linear SVC . Avg Precision:  0.9881540904585124 . Avg Recall:  0.9880852252279514 . Avg F-1 Score:  0.9881050120621259


In [5]:
# Number of papers that received a class label.
len(IDList)

314740

In [8]:
# Number of node IDs in the citation graph.
len(IDsInGraph)

471633

In [16]:
import scipy.sparse as sp

"""
Disclaimer: the sparse_to_tuple, preprocess_graph and construct_feed_dict
functions below come from the original tkipf/gae repository on Graph
Autoencoders. Moreover, the mask_test_edges function is borrowed from
philipjackson's mask_test_edges pull request on this same repository.
"""

def sparse_to_tuple(sparse_mx):
    """Decompose a SciPy sparse matrix into its COO components.

    :param sparse_mx: SciPy sparse matrix (converted to COO format if needed)
    :return: ``(coords, values, shape)`` where ``coords`` is an ``(nnz, 2)``
        array of ``[row, col]`` pairs, ``values`` holds the matching stored
        entries and ``shape`` is the matrix shape
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    coords = np.column_stack((coo.row, coo.col))
    return coords, coo.data, coo.shape

def preprocess_graph(adj):
    """Add self-loops to ``adj`` and scale it by the inverse square root of its row sums.

    With ``A_ = adj + I`` and ``D`` the diagonal matrix of row sums of ``A_``,
    the result is ``(A_ . D^-1/2)^T . D^-1/2``.

    :param adj: adjacency matrix, anything accepted by ``scipy.sparse.coo_matrix``
    :return: the normalised matrix as the ``(coords, values, shape)`` tuple
        produced by ``sparse_to_tuple``
    """
    adjacency = sp.coo_matrix(adj)
    with_self_loops = adjacency + sp.eye(adjacency.shape[0])
    row_sums = np.array(with_self_loops.sum(1))
    inv_sqrt_degree = sp.diags(np.power(row_sums, -0.5).flatten())
    scaled_columns = with_self_loops.dot(inv_sqrt_degree)
    normalized = scaled_columns.transpose().dot(inv_sqrt_degree)
    return sparse_to_tuple(normalized)

def construct_feed_dict(adj_normalized, adj, features, placeholders):
    """Map the model placeholders to the values fed for one run.

    :param adj_normalized: value for ``placeholders['adj']``
    :param adj: value for ``placeholders['adj_orig']``
    :param features: value for ``placeholders['features']``
    :param placeholders: dict providing the 'features', 'adj' and 'adj_orig' placeholders
    :return: dict from placeholder to value
    """
    return {
        placeholders['features']: features,
        placeholders['adj']: adj_normalized,
        placeholders['adj_orig']: adj,
    }

def _percent_of_edges(num_edges, percent):
    """Number of edges corresponding to ``percent`` % of ``num_edges`` (0 for a non-positive percentage)."""
    if percent <= 0:
        return 0
    return int(np.floor(num_edges / (100. / percent)))


def _sample_negative_edges(num_nodes, forbidden_idx, num_samples):
    """Draw ``num_samples`` distinct node pairs ``(i, j)`` with ``i < j`` that are not forbidden.

    :param num_nodes: number of nodes of the graph
    :param forbidden_idx: int64 array of linear indices ``i * num_nodes + j``
        (``i < j``) of the pairs that must not be returned, i.e. the true edges
    :param num_samples: number of pairs to draw; the caller must make sure that
        many non-forbidden pairs exist
    :return: int64 array of shape ``(num_samples, 2)``
    """
    sampled_idx = np.empty((0,), dtype='int64')
    while len(sampled_idx) < num_samples:
        missing = num_samples - len(sampled_idx)
        # Oversample (with replacement, which is cheap) so that enough
        # candidates usually survive the filtering below.
        pairs = np.random.randint(0, num_nodes, size=(2 * missing, 2), dtype='int64')
        pairs = pairs[pairs[:, 0] != pairs[:, 1]]   # no self-loops
        pairs.sort(axis=1)                          # upper triangle: i < j
        # int64 linear indices: num_nodes ** 2 does not fit into int32 for large graphs.
        idx = np.unique(pairs[:, 0] * num_nodes + pairs[:, 1])
        idx = idx[~np.isin(idx, forbidden_idx)]     # drop true edges
        idx = idx[~np.isin(idx, sampled_idx)]       # drop pairs drawn in earlier rounds
        np.random.shuffle(idx)                      # np.unique returned them sorted
        sampled_idx = np.append(sampled_idx, idx[:missing])
    return np.vstack((sampled_idx // num_nodes, sampled_idx % num_nodes)).transpose()


def mask_test_edges(adj, test_percent=1., val_percent=0.):
    """ Randomly removes some edges from original graph to create
    test and validation sets for link prediction task.

    Adapted from the mask_test_edges pull request on tkipf/gae. Every sampled
    negative edge is a distinct node pair ``(i, j)``, ``i < j``, that is not an
    edge of ``adj``; the test and validation negatives do not overlap.

    :param adj: complete sparse adjacency matrix of the graph (assumed
        symmetric; only its upper triangle is used to enumerate edges)
    :param test_percent: percentage of edges in test set
    :param val_percent: percentage of edges in validation set
    :return: ``(adj_train, val_edges, val_edges_false, test_edges, test_edges_false)``:
        the symmetric training adjacency without the held-out edges, followed
        by ``(k, 2)`` index arrays of positive / negative validation and test edges
    :raises ValueError: if the graph has fewer non-edges than negative samples required
    """
    num_nodes = adj.shape[0]

    # Remove diagonal elements
    adj = adj - sp.dia_matrix((adj.diagonal()[None, :], [0]), shape=adj.shape)
    adj.eliminate_zeros()

    coo = adj.tocoo()
    coords = np.vstack((coo.row, coo.col)).transpose()
    # Keep each undirected edge once (upper triangle of the adjacency matrix).
    edges_positive = coords[coords[:, 1] > coords[:, 0], :]
    num_edges = edges_positive.shape[0]

    # number of positive (and negative) edges in test and val sets:
    num_test = _percent_of_edges(num_edges, test_percent)
    num_val = _percent_of_edges(num_edges, val_percent)

    # sample positive edges for test and val sets:
    edges_positive_idx = np.arange(num_edges)
    np.random.shuffle(edges_positive_idx)
    val_edge_idx = edges_positive_idx[:num_val]
    test_edge_idx = edges_positive_idx[num_val:(num_val + num_test)]
    test_edges = edges_positive[test_edge_idx]  # positive test edges
    val_edges = edges_positive[val_edge_idx]    # positive val edges
    train_edges = np.delete(edges_positive, np.hstack([test_edge_idx, val_edge_idx]), axis=0)  # positive train edges

    # Linear indices (i * n + j, i < j) of every true edge. The coordinates are
    # widened to int64 first: COO indices are typically int32 and i * n
    # overflows as soon as the graph has more than ~46k nodes.
    canonical = np.sort(coords.astype('int64'), axis=1)
    positive_idx = np.unique(canonical[:, 0] * num_nodes + canonical[:, 1])

    num_false = len(test_edges) + len(val_edges)
    num_available = num_nodes * (num_nodes - 1) // 2 - len(positive_idx)
    if num_false > num_available:
        raise ValueError(
            'Cannot sample %d negative edges: the graph only has %d non-edges.'
            % (num_false, num_available))

    # Draw all negatives in one go and split them, so that the test and
    # validation negatives cannot overlap.
    edges_false = _sample_negative_edges(num_nodes, positive_idx, num_false)
    test_edges_false = edges_false[:len(test_edges)]
    val_edges_false = edges_false[len(test_edges):]

    # Re-build adj matrix
    data = np.ones(train_edges.shape[0])
    adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape)
    adj_train = adj_train + adj_train.T
    return adj_train, val_edges, val_edges_false, test_edges, test_edges_false

In [17]:
# Rebuild the undirected citation graph keyed by raw paper ID.
adjList = defaultdict(set)                          # Convert set to list later for node2vec, set: to handle duplicates
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        paperID = data['id']
        for referencedPaper in data.get('references', []):
            adjList[paperID].add(referencedPaper)
            adjList[referencedPaper].add(paperID)

adjList = {key: list(values) for key, values in adjList.items()}
G = nx.from_dict_of_lists(adjList)

nnodes = G.number_of_nodes()
avgDegree = sum(d for n, d in G.degree()) / float(nnodes)
print('Number of nodes: ', nnodes, '. Number of edges: ', G.number_of_edges(), '. Avg Degree: ', avgDegree)

# nx.to_scipy_sparse_matrix / nx.from_scipy_sparse_matrix were removed in
# NetworkX 3.0 in favour of the *_array variants; use whichever is available so
# the cell runs on both old and new NetworkX versions.
if hasattr(nx, 'to_scipy_sparse_array'):
    adj_sparse = sp.csr_matrix(nx.to_scipy_sparse_array(G))
else:
    adj_sparse = nx.to_scipy_sparse_matrix(G)
# Perform train-test split
adj_train, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj_sparse)
print('Constructing new graph')
# new graph object with only non-hidden edges
if hasattr(nx, 'from_scipy_sparse_array'):
    G_train = nx.from_scipy_sparse_array(adj_train)
else:
    G_train = nx.from_scipy_sparse_matrix(adj_train)



Number of nodes:  471633 . Number of edges:  5464345 . Avg Degree:  23.172021465843144
Constructing new graph


In [25]:
# Node labels (paper IDs) in the graph's iteration order; used below to turn
# the integer indices of sampled edges back into paper IDs.
nodes = list(G)

In [18]:
def getEdgeEmbedding(embedding1, embedding2, policy='Hadamard'):
    """Combine the embeddings of an edge's two endpoints into one edge embedding.

    :param embedding1: embedding of the first endpoint (supports ``*``, ``+`` and ``/``, e.g. a NumPy array)
    :param embedding2: embedding of the second endpoint, same shape as ``embedding1``
    :param policy: ``'Hadamard'`` for the element-wise product or ``'Avg'`` for
        the element-wise mean
    :return: the combined embedding
    :raises ValueError: for an unknown ``policy`` (previously ``None`` was
        returned silently, which only failed later when the features were stacked)
    """
    if policy == 'Hadamard':
        return embedding1 * embedding2
    if policy == 'Avg':
        return (embedding1 + embedding2) / 2
    raise ValueError("Unknown edge embedding policy: %r (expected 'Hadamard' or 'Avg')" % (policy,))
def average(lis):
    """Return the arithmetic mean of ``lis`` (same helper as in the Label Classification section).

    Raises ``ZeroDivisionError`` when ``lis`` is empty.
    """
    total = sum(lis)
    count = len(lis)
    return total / count

In [27]:
# Link-prediction data set: one edge embedding per sampled positive / negative
# test edge, labelled 1 for true edges and 0 for sampled non-edges.
zeroEmbedding = [0] * 32                 # fallback for nodes that have no embedding
X = []
edges = [*test_edges, *test_edges_false]
for edge in edges:
    # Translate node indices back to paper IDs before looking up embeddings.
    u, v = nodes[edge[0]], nodes[edge[1]]
    embedding1 = np.asarray(embeddingDict.get(u, zeroEmbedding))
    embedding2 = np.asarray(embeddingDict.get(v, zeroEmbedding))
    X.append(getEdgeEmbedding(embedding1, embedding2))

Y = np.asarray([1] * len(test_edges) + [0] * len(test_edges_false))

del embeddingDict
X = np.asarray(X)
    

In [28]:

# 5-fold cross-validation of every classifier on the link-prediction data set;
# same procedure as the evaluation loop of the Label Classification section.
kfold = KFold(n_splits=5, shuffle=True)
for name, clf in zip(names, classifiers):
    precScores, recallScores, f1Scores = [], [], []
    for count, (train_index, test_index) in enumerate(kfold.split(X, Y), start=1):
        print('Fitting: ', count)
        clf.fit(X[train_index], Y[train_index])
        print('count ', count)
        y_test = Y[test_index]
        y_pred = clf.predict(X[test_index])
        prec, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
        precScores.append(prec)
        recallScores.append(recall)
        f1Scores.append(fscore)
    print('Name', name,'. Avg Precision: ', average(precScores), '. Avg Recall: ', average(recallScores), '. Avg F-1 Score: ', average(f1Scores) )


Fitting:  1


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.6s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 100 out of 100 | elapsed:    0.1s finished


count  1
Fitting:  2


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.6s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.


count  2
Fitting:  3


[Parallel(n_jobs=48)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.7s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 100 out of 100 | elapsed:    0.1s finished


count  3
Fitting:  4


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.4s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 100 out of 100 | elapsed:    0.1s finished


count  4
Fitting:  5


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.6s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 100 out of 100 | elapsed:    0.1s finished


count  5
Name Random Forest . Avg Precision:  0.740860822956358 . Avg Recall:  0.7387222328644621 . Avg F-1 Score:  0.7381424303229813
Fitting:  1
Iteration 1, loss = 0.69369339
Validation score: 0.500515
Iteration 2, loss = 0.69273241
Validation score: 0.499485
Iteration 3, loss = 0.69249914
Validation score: 0.500515
Iteration 4, loss = 0.69212476
Validation score: 0.734073
Iteration 5, loss = 0.69156321
Validation score: 0.504289
Iteration 6, loss = 0.69083401
Validation score: 0.596248
Iteration 7, loss = 0.68988860
Validation score: 0.732929
Iteration 8, loss = 0.68865197
Validation score: 0.728354
Iteration 9, loss = 0.68690612
Validation score: 0.593275
Iteration 10, loss = 0.68486385
Validation score: 0.625529
Iteration 11, loss = 0.68232982
Validation score: 0.700332
Iteration 12, loss = 0.67935097
Validation score: 0.732357
Iteration 13, loss = 0.67586542
Validation score: 0.731328
Iteration 14, loss = 0.67208213
Validation score: 0.700446
Iteration 15, loss = 0.66814190
Vali

In [24]:
# Peek at the sampled positive test edges (pairs of node indices).
test_edges

array([[143157, 421449],
       [ 86911, 145379],
       [ 98634, 307101],
       ...,
       [152374, 389145],
       [ 59286, 470841],
       [206475, 243476]], dtype=int32)