In [1]:
import json
import numpy as np
import hnswlib
import networkx as nx
from collections import defaultdict

In [2]:
import scipy.sparse as sp

"""
Disclaimer: functions defined from lines 15 to 36 in this file come from 
tkipf/gae original repository on Graph Autoencoders. Moreover, the
mask_test_edges function is borrowed from philipjackson's mask_test_edges 
pull request on this same repository.
"""

def sparse_to_tuple(sparse_mx):
    """Decompose a SciPy sparse matrix into its coordinate representation.

    :param sparse_mx: SciPy sparse matrix; converted to COO format when it is
        not already a COO matrix
    :return: tuple ``(coords, values, shape)`` where ``coords`` is an
        (nnz, 2) array of [row, col] pairs, ``values`` holds the matching
        stored entries and ``shape`` is the matrix shape
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    # One [row, col] pair per stored entry.
    coords = np.column_stack((coo.row, coo.col))
    return coords, coo.data, coo.shape

def preprocess_graph(adj):
    """Add self-loops to ``adj`` and degree-normalize it.

    With ``A_ = adj + I`` and ``D`` the diagonal matrix of the row sums of
    ``A_``, this computes ``D^-1/2 * A_^T * D^-1/2`` (equal to the symmetric
    normalization ``D^-1/2 * A_ * D^-1/2`` when ``adj`` is symmetric).

    :param adj: square adjacency matrix (anything ``sp.coo_matrix`` accepts)
    :return: ``(coords, values, shape)`` tuple produced by ``sparse_to_tuple``
    """
    adj_coo = sp.coo_matrix(adj)
    # Self-loops guarantee every row sum of a non-negative matrix is >= 1.
    adj_with_loops = adj_coo + sp.eye(adj_coo.shape[0])
    row_sums = np.array(adj_with_loops.sum(1))
    inv_sqrt_degree = sp.diags(np.power(row_sums, -0.5).flatten())
    scaled_columns = adj_with_loops.dot(inv_sqrt_degree)
    normalized = scaled_columns.transpose().dot(inv_sqrt_degree)
    return sparse_to_tuple(normalized)

def construct_feed_dict(adj_normalized, adj, features, placeholders):
    """Build the feed dictionary that maps placeholders to their inputs.

    :param adj_normalized: value fed to ``placeholders['adj']``
    :param adj: value fed to ``placeholders['adj_orig']``
    :param features: value fed to ``placeholders['features']``
    :param placeholders: mapping with the keys 'features', 'adj' and 'adj_orig'
    :return: dict keyed by the three placeholder objects
    """
    return {
        placeholders['features']: features,
        placeholders['adj']: adj_normalized,
        placeholders['adj_orig']: adj,
    }

def _sample_negative_edges(num_nodes, num_samples, forbidden_idx):
    """Draw ``num_samples`` distinct node pairs (i < j) that are not edges.

    Pairs are sampled with replacement from the global ``np.random`` state,
    then canonicalized, de-duplicated and filtered; the process repeats until
    enough pairs have been collected.

    :param num_nodes: number of nodes (side length of the adjacency matrix)
    :param num_samples: number of negative edges to return
    :param forbidden_idx: int64 linear indices ``i * num_nodes + j`` (i < j)
        of the pairs that must not be returned
    :return: (num_samples, 2) int64 array of [i, j] pairs with i < j
    :raises ValueError: if fewer than ``num_samples`` admissible pairs exist
    """
    forbidden_idx = np.unique(np.asarray(forbidden_idx, dtype='int64'))
    # Without this guard the rejection loop below would never terminate.
    available = num_nodes * (num_nodes - 1) // 2 - len(forbidden_idx)
    if num_samples > available:
        raise ValueError('Cannot sample %d negative edges: only %d node pairs are available'
                         % (num_samples, available))

    chosen_idx = np.empty((0,), dtype='int64')
    while len(chosen_idx) < num_samples:
        missing = num_samples - len(chosen_idx)
        # 1. oversample random (i, j) pairs so that enough survive the filtering
        pairs = np.random.randint(0, num_nodes, size=(2 * missing, 2), dtype='int64')
        # 2. canonical orientation i <= j, then drop diagonal elements
        pairs.sort(axis=1)
        pairs = pairs[pairs[:, 0] != pairs[:, 1]]
        # 3. de-duplicate on int64 linear indices (int32 would overflow on large graphs)
        candidate_idx = np.unique(pairs[:, 0] * num_nodes + pairs[:, 1])
        # 4. remove true edges and pairs that were already selected
        candidate_idx = candidate_idx[~np.isin(candidate_idx, forbidden_idx)]
        candidate_idx = candidate_idx[~np.isin(candidate_idx, chosen_idx)]
        # 5. np.unique sorted the candidates; shuffle before truncating to avoid bias
        np.random.shuffle(candidate_idx)
        chosen_idx = np.concatenate((chosen_idx, candidate_idx[:missing]))

    return np.column_stack((chosen_idx // num_nodes, chosen_idx % num_nodes))


def mask_test_edges(adj, test_percent=1., val_percent=0.):
    """ Randomly removes some edges from original graph to create
    test and validation sets for link prediction task

    Positive edges are the upper-triangle (row < col) non-zero entries of
    ``adj`` once the diagonal has been removed. For every held-out positive
    edge one negative edge (a node pair with no entry in either direction)
    is sampled; negatives are distinct within and across the test and
    validation sets. All randomness comes from the global ``np.random`` state.

    :param adj: complete sparse adjacency matrix of the graph (square)
    :param test_percent: percentage (0-100) of edges in test set
    :param val_percent: percentage (0-100) of edges in validation set
    :return: tuple ``(adj_train, val_edges, val_edges_false, test_edges,
        test_edges_false)``: the symmetric train adjacency matrix without the
        held-out edges, followed by (k, 2) arrays of [i, j] node index pairs
    :raises ValueError: if a percentage is negative, if the percentages sum
        to more than 100, or if the graph has too few non-edges to sample
        the requested negatives
    """
    if test_percent < 0 or val_percent < 0 or test_percent + val_percent > 100:
        raise ValueError('test_percent and val_percent must be non-negative and sum to at most 100')

    # Remove diagonal elements
    adj = adj - sp.dia_matrix((adj.diagonal()[None, :], [0]), shape=adj.shape)
    adj.eliminate_zeros()

    num_nodes = adj.shape[0]
    adj_coo = adj.tocoo()
    # COO indices are usually int32; promote so that row * num_nodes cannot overflow.
    rows = adj_coo.row.astype('int64')
    cols = adj_coo.col.astype('int64')

    # Filtering out edges from lower triangle of adjacency matrix
    upper = cols > rows
    edges_positive = np.column_stack((rows[upper], cols[upper]))

    # number of positive (and negative) edges in test and val sets
    # (multiplying first avoids dividing by a zero percentage):
    num_edges = edges_positive.shape[0]
    num_test = int(np.floor(num_edges * test_percent / 100.))
    num_val = int(np.floor(num_edges * val_percent / 100.))

    # sample positive edges for val, test and train sets from one random permutation:
    shuffled = np.random.permutation(num_edges)
    val_edges = edges_positive[shuffled[:num_val]]                       # positive val edges
    test_edges = edges_positive[shuffled[num_val:(num_val + num_test)]]  # positive test edges
    train_edges = edges_positive[shuffled[(num_val + num_test):]]        # positive train edges

    # Every non-zero entry, in either direction, blocks its (min, max) node pair.
    positive_idx = np.minimum(rows, cols) * num_nodes + np.maximum(rows, cols)
    test_edges_false = _sample_negative_edges(num_nodes, num_test, positive_idx)
    # Validation negatives must also avoid the test negatives.
    test_false_idx = test_edges_false[:, 0] * num_nodes + test_edges_false[:, 1]
    val_edges_false = _sample_negative_edges(
        num_nodes, num_val, np.concatenate((positive_idx, test_false_idx)))

    # Re-build adj matrix
    data = np.ones(train_edges.shape[0])
    adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape)
    adj_train = adj_train + adj_train.T
    return adj_train, val_edges, val_edges_false, test_edges, test_edges_false

## Reading Embeddings and creating Indexer for NN search

In [3]:
# Read the embeddings file: one JSON object per line, each with a paper 'id' and its 'embedding'.
# IDList[i] and embeddings[i] describe the same paper, so the list position doubles as a row index.
IDList = []                                # List of paper IDs
NNList = []                                # List of list, NNList[i]: NNs to paper whose id is IDList[i]
                                           # NOTE(review): NNList is never filled or read in the visible cells
embeddings = []                            # Embeddings read from the input file

with open('./data/dblp_Abstract_2Thresholded_USE_Trans_Embeddings.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        paperID = data['id']
        embedding = data['embedding']
        IDList.append(paperID)
        embeddings.append(embedding)

In [4]:
# Sizes needed to load and query the nearest-neighbour index.
numElements = len(IDList)                  # number of papers read from the embeddings file
dimension = len(embeddings[0])             # embedding length, taken from the first record (IndexError if none was read)
embeddings = np.asarray(embeddings)        # list of lists -> array
data_labels = np.arange(numElements)       # 0..numElements-1; NOTE(review): deleted in the next cell without being used

In [5]:
# Load a previously saved hnswlib index and query the nearest neighbours of every embedding.
p = hnswlib.Index(space='cosine', dim=dimension)  # the space can be changed - keeps the data, alters the distance function.

# Increase the total capacity (max_elements), so that it will handle the new data
p.load_index("./models/USETranshnswlib.bin", max_elements = numElements)
# labels[i] holds the 4 neighbour labels returned for embeddings[i]; the second return value is discarded.
# NOTE(review): presumably each paper is returned as its own nearest neighbour (a later cell filters
# out index == own position), which would leave 3 real neighbours per paper — confirm.
labels, _ = p.knn_query(embeddings, k = 4)
# Release the index and the raw embeddings; the visible cells below only use `labels`.
del p
del embeddings
del data_labels

## Building Adjacency List for Node Embeddings

### Creating Citation Adjacency List

In [6]:
# Build an undirected citation graph: every reference adds the edge in both directions.
adjList = defaultdict(set)                          # Convert set to list later for node2vec, set: to handle duplicates
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        paperID = data['id']
        # Records without a 'references' key contribute no edges.
        references = data.get('references', [])
        for referencedPaper in references:
            # A referenced paper becomes a node even if it has no record of its own in this file.
            adjList[paperID].add(referencedPaper)
            adjList[referencedPaper].add(paperID)

### Augmenting Adj List with USE Embedding NNs

In [7]:
# Add an undirected edge between every paper and its nearest neighbours in embedding space.
nnToKeep = 3                                        # maximum number of neighbours linked to each paper
# enumerate() replaces the former hand-maintained counter named `id`, which shadowed the builtin id().
for paperIndex, label in enumerate(labels):
    paperID = IDList[paperIndex]
    # Translate neighbour labels to paper IDs, drop the paper itself, keep at most nnToKeep of them.
    label = [IDList[index] for index in label if index != paperIndex][:nnToKeep]
    for referencedPaper in label:
        adjList[paperID].add(referencedPaper)
        adjList[referencedPaper].add(paperID)

### Creating NetworkX Graph and reporting graph statistics

In [8]:
# Turn every neighbour set into a list, then build the NetworkX graph from the adjacency lists.
adjList = {node: list(neighbours) for node, neighbours in adjList.items()}
G = nx.from_dict_of_lists(adjList)

# Graph statistics; the average degree is the degree sum divided by the node count.
nnodes = G.number_of_nodes()
degreeTotal = sum(degree for _, degree in G.degree())
avgDegree = degreeTotal / float(nnodes)
print('Number of nodes: ', nnodes, '. Number of edges: ', G.number_of_edges(), '. Avg Degree: ', avgDegree)

Number of nodes:  475839 . Number of edges:  6760367 . Avg Degree:  28.414514152896253


In [9]:
# Sparse adjacency matrix of G.
# NOTE(review): rows/columns presumably follow the order of G.nodes() — later cells map integer
# indices back to paper IDs through list(G.nodes()); confirm against the networkx documentation.
# NOTE(review): to_scipy_sparse_matrix appears to have been removed in networkx 3.0
# (to_scipy_sparse_array is the replacement) — confirm before upgrading.
adj_sparse = nx.to_scipy_sparse_matrix(G)

In [None]:
# Display-only inspection of the values stored in the adjacency matrix; nothing is assigned here.
adj_sparse.data

In [10]:
# Perform train-test split
adj_train, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj_sparse)
print('Constructing new graph')
G_train = nx.from_scipy_sparse_matrix(adj_train) # new graph object with only non-hidden edges

Constructing new graph


In [11]:
# Sanity check: hiding edges must not change the node count (the recorded output is True).
G_train.number_of_nodes() == G.number_of_nodes()

True

In [12]:
# Position -> paper ID lookup; the word2VecMode branch of the edge-feature cell indexes it as nodes[edge[0]].
nodes = list(G.nodes())

In [None]:
# Scratch check: prints 'Found' when v is listed among the neighbours of u.
# NOTE(review): no earlier cell defines `u` or `v`; they are only assigned inside the edge-feature
# loop further down, so this cell raises NameError under Restart & Run All.
for adjV in adjList[u]:
    if adjV == v:
        print('Found')
    
## paperID mapping: the nodes of G are paper IDs; they map one-to-one onto the nodes of G_train, which are the integers 0 to |V| - 1
## test_edges and test_edges_false contain these integer node indices, not paper IDs

## Node2Vec Embeddings

In [None]:
from node2vec import Node2Vec
walkLength = 8                       # passed to Node2Vec as walk_length and embedded in the output file name
# Runs on the training graph only, so held-out test edges are never seen.
# NOTE(review): the recorded output shows the transition-probability step running at roughly 3-4 it/s
# with an ETA above 30 hours; the commented-out `workers` / `temp_folder` options are currently disabled.
node2vec = Node2Vec(G_train, walk_length = walkLength)#, workers = 12, temp_folder = './data/tmp_data')
          

Computing transition probabilities:   1%|          | 5509/475839 [22:03<43:16:25,  3.02it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Computing transition probabilities:   3%|▎         | 13006/475839 [43:04<32:26:49,  3.96it/s] 

In [None]:
# NOTE(review): later cells read `model.wv`, so fit() appears to return a gensim Word2Vec-style model
# whose vectors live under `.wv`, rather than the vectors object itself.
model = node2vec.fit()

In [None]:
# The file name records the walk length and the number of nearest neighbours used to build the graph.
outFileName = './models/node2vec_USE_2Citation_Embeddings_WL_' + str(walkLength) + '_NN_' + str(nnToKeep) + '.kv'
# NOTE(review): the vectors are written with save_word2vec_format despite the '.kv' extension,
# so they presumably need the matching word2vec-format loader when read back — confirm.
model.wv.save_word2vec_format(outFileName)

## Edge Prediction

In [None]:
def getEdgeEmbedding(embedding1, embedding2, policy='Hadamard'):
    """Combine the embeddings of two nodes into a single edge embedding.

    :param embedding1: embedding of the first endpoint
    :param embedding2: embedding of the second endpoint
    :param policy: 'Hadamard' for the element-wise product, 'Avg' for the
        element-wise mean
    :return: the combined embedding
    :raises ValueError: if ``policy`` is not one of the supported names
        (previously such a call silently returned None)
    """
    if policy == 'Hadamard':
        return embedding1 * embedding2
    if policy == 'Avg':
        return (embedding1 + embedding2) / 2
    raise ValueError("Unknown edge embedding policy %r; expected 'Hadamard' or 'Avg'" % (policy,))
def average(lis):
    """Return the arithmetic mean of the values in ``lis``.

    Raises ZeroDivisionError when ``lis`` is empty.
    """
    total = sum(lis)
    return total / len(lis)

In [None]:
# Feature source for the edge classifier: True reads per-paper embeddings from a JSON file keyed by
# paper ID, False uses the node2vec vectors in `model.wv`.
word2VecMode = False

In [None]:
# Build the edge-classification data set: one feature vector per test edge (label 1)
# and per sampled non-edge (label 0).
X = []

# paper ID -> text embedding; only filled when word2VecMode is on
embeddingDict = dict()

if (word2VecMode):
    embeddingFileName = './data/dblpAbstract_2Thresholded_FT_Embeddings.json'
    with open(embeddingFileName, 'r') as file:
        for line in file:
            data = json.loads(line)
            embeddingDict[data['id']] = data['embedding']
else:
    # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases expose it as `vocab`.
    if hasattr(model.wv, 'key_to_index'):
        knownNodes = model.wv.key_to_index
    else:
        knownNodes = model.wv.vocab
    # Nodes without a learned vector fall back to zeros of the model's own size
    # (this used to be hard-coded as 128).
    missingEmbedding = np.zeros(model.wv.vector_size)

edges = [*test_edges, *test_edges_false]
for edge in edges :
    if (word2VecMode):
        # Edge endpoints are integer positions; translate them to paper IDs first.
        paperID1 = nodes[edge[0]]
        paperID2 = nodes[edge[1]]
        embedding1 = np.asarray(embeddingDict[paperID1])
        embedding2 = np.asarray(embeddingDict[paperID2])
    else:
        # The node2vec vocabulary is looked up by the string form of the integer node index.
        u = str(edge[0])
        v = str(edge[1])
        embedding1 = np.asarray(model.wv[u]) if u in knownNodes else missingEmbedding
        embedding2 = np.asarray(model.wv[v]) if v in knownNodes else missingEmbedding
    edgeEmbedding =  getEdgeEmbedding(embedding1, embedding2)
    X.append(edgeEmbedding)

# Labels follow the order of `edges`: positives first, then negatives.
Y = np.asarray([1] * len(test_edges) + [0] * len(test_edges_false))

del embeddingDict
X = np.asarray(X)
    

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier


# Display names, index-aligned with `classifiers` below (the two lists are zipped in the evaluation loop).
names = [
 "Random Forest", "Neural Net", "Logistic Regression", "Linear SVC" ]

# One estimator per name, in the same order.
# NOTE(review): no random_state is set on any estimator, so scores will vary from run to run.
classifiers = [
    RandomForestClassifier(verbose=True, n_jobs = -1),
    MLPClassifier(verbose=True, early_stopping=True),
    LogisticRegression(n_jobs=-1),
    OneVsRestClassifier(BaggingClassifier(LinearSVC(),n_jobs = -1))]


In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn import svm
from sklearn.model_selection import StratifiedKFold, KFold

# 5-fold cross-validation of every classifier on the edge features.
# The splitter shuffles and is given no random_state.
kfold = KFold(n_splits=5, shuffle=True)
for name, clf in zip(names, classifiers):
    precScores, recallScores, f1Scores = [], [], []
    for count, (train_index, test_index) in enumerate(kfold.split(X, Y), start=1):
        X_train, y_train = X[train_index], Y[train_index]
        X_test, y_test = X[test_index], Y[test_index]
        print('Fitting: ', count)
        clf.fit(X_train, y_train)
        print('count ', count)
        y_pred = clf.predict(X_test)
        # 'weighted' averages the per-class scores; the fourth return value is not used.
        prec, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
        precScores.append(prec)
        recallScores.append(recall)
        f1Scores.append(fscore)
    # Report the mean of each metric over the folds.
    print('Name', name,'. Avg Precision: ', average(precScores), '. Avg Recall: ', average(recallScores), '. Avg F-1 Score: ', average(f1Scores) )
