In [1]:
import json
import numpy as np
import hnswlib
import networkx as nx
from collections import defaultdict
from tqdm.notebook import tqdm

## Reading Embeddings and Creating an Index for Nearest-Neighbour (NN) Search

In [2]:
# Containers filled while streaming the embeddings file (one JSON record per line).
IDList = []        # paper IDs, in file order
NNList = []        # intended as list of lists (NNList[i]: NNs of IDList[i]); not populated in this cell
embeddings = []    # embedding of each paper, aligned position-by-position with IDList

embeddingsPath = './data/dblp_Abstract_2Thresholded_USE_Trans_Embeddings.json'
with open(embeddingsPath, 'r') as file:
    for line in tqdm(file):
        data = json.loads(line)
        paperID = data['id']
        embedding = data['embedding']
        # Append in lock-step so that index i refers to the same paper in both lists.
        IDList.append(paperID)
        embeddings.append(embedding)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [3]:
# Sizes needed by the index: how many vectors there are and how long each one is.
numElements = len(IDList)
dimension = len(embeddings[0])

# Integer labels 0..numElements-1; label i corresponds to IDList[i].
data_labels = np.arange(numElements)
# Convert the list of vectors to a single array for bulk insertion and querying.
embeddings = np.asarray(embeddings)

In [4]:
# Build an HNSW approximate nearest-neighbour index over every paper embedding.
# `space` may be 'l2', 'cosine' or 'ip'.
p = hnswlib.Index(space='cosine', dim=dimension)

# The maximum number of elements has to be known when the index is initialised.
# NOTE(review): M is set to the embedding dimension; the hnswlib documentation
# describes 2-100 as the reasonable range for M - confirm this value is intended.
p.init_index(max_elements=numElements, ef_construction=200, M=dimension)

# Insert all vectors, labelled by their row position (add_items may be called repeatedly).
p.add_items(embeddings, data_labels)

# ef trades query speed for recall and should always be larger than k.
p.set_ef(50)

# For each paper fetch its k closest entries; knn_query returns two numpy arrays
# and only the labels are kept.
# Presumably each paper is returned as its own nearest neighbour, hence k = 5
# for 4 usable neighbours - verify.
labels, _ = p.knn_query(embeddings, k=5)

In [5]:
# Persist the index to disk so it can be reloaded without rebuilding.
index_path = './models/USETranshnswlibAbstract.bin'
print("Saving index to '%s'" % index_path)
p.save_index(index_path)

# The in-memory index is no longer needed once `labels` has been computed.
del p

Saving index to './models/USETranshnswlibAbstract.bin'


In [6]:
# Release the raw vectors and their labels; only `labels` and `IDList` are used from here on.
del embeddings, data_labels

In [None]:
# Alternative to rebuilding: reload the index written by the save cell and recompute `labels`.
# Fixes relative to the previous version of this cell:
#   * loads the same file that the save cell writes (the old path pointed at a different file);
#   * restores ef, which hnswlib does not store inside the index file;
#   * no longer needs `embeddings` / `data_labels`, which may already have been deleted -
#     the stored vectors are read back from the index itself;
#   * uses k = 5, matching the build cell, so nnToKeep = 4 neighbours remain after
#     dropping the paper itself.
index_path = './models/USETranshnswlibAbstract.bin'

p = hnswlib.Index(space='cosine', dim=dimension)  # the space can be changed - keeps the data, alters the distance function.
p.load_index(index_path, max_elements=numElements)
p.set_ef(50)  # ef should always be > k

# Query in batches so that the vectors pulled out of the index never have to be
# held in memory all at once.
# NOTE(review): for the cosine space hnswlib is expected to return the normalised
# stored vectors, which should rank neighbours the same as the raw ones - confirm.
batchSize = 10000
labelChunks = []
for start in range(0, numElements, batchSize):
    batchIDs = np.arange(start, min(start + batchSize, numElements))
    batchVectors = np.asarray(p.get_items(batchIDs), dtype=np.float32)
    batchLabels, _ = p.knn_query(batchVectors, k=5)
    labelChunks.append(batchLabels)
labels = np.vstack(labelChunks)

del p, labelChunks
# Free the raw vectors too if they are still around (they are when the build cells were skipped).
for staleName in ('embeddings', 'data_labels'):
    globals().pop(staleName, None)

## Examples of NN obtained using the Text Embeddings

In [None]:
# Paper titles in file order (one JSON record per line).
# NOTE(review): the examples that follow index `titles` with positions from the
# embeddings file - assumes both files list papers in the same order; confirm.
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as file:
    titles = [json.loads(line)['title'] for line in file]

In [None]:
# Show the first few papers next to the titles of their nearest neighbours.
count = 5
for i in range(count):
    print('Paper: ', titles[i])
    # Skip the paper itself when it shows up among its own neighbours.
    neighbourTitles = [titles[ind] for ind in labels[i] if ind != i]
    print('Nearest Papers: ', neighbourTitles)
    print('\n')

## Building Adjacency List for Node Embeddings

### Creating Citation Adjacency List

In [7]:
# Undirected citation graph as paper ID -> set of neighbouring paper IDs.
# Sets absorb duplicate edges; they are converted to lists later for node2vec.
adjList = defaultdict(set)

with open('./data/dblp_AIpapers2Thresholded.json', 'r') as file:
    for line in tqdm(file):
        data = json.loads(line)
        paperID = data['id']
        # Records without a 'references' field contribute no edges.
        for referencedPaper in data.get('references', []):
            # Store the edge in both directions.
            adjList[paperID].add(referencedPaper)
            adjList[referencedPaper].add(paperID)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




### Augmenting Adj List with USE (Universal Sentence Encoder) Embedding NNs

In [8]:
# Add text-similarity edges: link every paper to (at most) its nnToKeep nearest
# neighbours from `labels`, in both directions.
# Row i of `labels` holds the neighbour positions of IDList[i].
# Must run while adjList still maps to sets (i.e. before it is converted to lists).
nnToKeep = 4
for rowIndex, neighbourRow in enumerate(labels):   # enumerate avoids a counter named `id`, which shadowed the builtin
    paperID = IDList[rowIndex]
    # Drop the paper itself, translate positions to paper IDs, keep the closest nnToKeep.
    neighbourIDs = [IDList[index] for index in neighbourRow if index != rowIndex][:nnToKeep]
    for referencedPaper in neighbourIDs:
        adjList[paperID].add(referencedPaper)
        adjList[referencedPaper].add(paperID)

### Creating NetworkX Graph and reporting graph statistics

In [9]:
# Freeze the neighbour sets into lists and build the NetworkX graph from them.
adjList = {paper: list(neighbours) for paper, neighbours in adjList.items()}
G = nx.from_dict_of_lists(adjList)

# Report basic statistics: node count, edge count and mean degree.
nnodes = G.number_of_nodes()
totalDegree = sum(degree for _node, degree in G.degree())
avgDegree = totalDegree / float(nnodes)
print('Number of nodes: ', nnodes, '. Number of edges: ', G.number_of_edges(), '. Avg Degree: ', avgDegree)

Number of nodes:  475839 . Number of edges:  6857183 . Avg Degree:  28.821441706123288


## Node2Vec Embeddings

In [10]:
from node2vec import Node2Vec

# Precompute transition probabilities and generate the random walks over G.
# Optional arguments left disabled: workers = 12, temp_folder = './data/tmp_data'
walkLength = 8
node2vec = Node2Vec(
    G,
    walk_length=walkLength,
    p=1.5,
    q=0.4,
    num_walks=12,
)
          

Computing transition probabilities:  35%|███▌      | 168110/475839 [1:02:32<1:31:43, 55.92it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Computing transition probabilities:  50%|█████     | 238411/475839 [1:24:05<1:14:58, 52.78it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Computing transition probabilities:  69%|██████▉   | 327601/475839 [1:46:28<46:55, 52.65it/s]  IOPub message rate exceeded.
The notebook server will temporarily

In [11]:
model = node2vec.fit()  # returns the fitted gensim model; the node vectors are accessed through model.wv

In [12]:
# Write the node vectors in word2vec text format.
# The literal '2' after the NN count is part of the file name: with walkLength = 8
# and nnToKeep = 4 this yields '..._WL_8_NN_42.kv'.
outFileName = './models/node2vec_USE_Abstract_2Citation_Embeddings_WL_{}_NN_{}2.kv'.format(walkLength, nnToKeep)
model.wv.save_word2vec_format(outFileName)

In [13]:
from gensim import models

# Reload the saved node vectors from disk; the path is spelled out so this cell
# does not depend on the training cells having run in this session.
savedVectorsPath = './models/node2vec_USE_Abstract_2Citation_Embeddings_WL_8_NN_42.kv'
loadModel = models.keyedvectors.KeyedVectors.load_word2vec_format(savedVectorsPath)

In [14]:
# Parallel lists of titles and paper IDs read from the papers file, in file order.
# Note: IDList is rebound here, replacing any list of the same name built earlier.
titles, IDList = [], []

with open('./data/dblp_AIpapers2Thresholded.json', 'r') as file:
    for line in tqdm(file):
        data = json.loads(line)
        # Keep both lists in lock-step so titles[i] belongs to IDList[i].
        titles.append(data['title'])
        IDList.append(data['id'])

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
# Quick check: query the loaded vectors for the entries most similar to the paper at position 10 of IDList.
loadModel.most_similar(IDList[10])

In [20]:
# Print the title of one specific paper (first match only).
# The loop variable is named candidateID rather than `id` so the builtin id()
# is not rebound at module level.
for candidateID, title in zip(IDList, titles):
    if candidateID == '1614298861':
        print(title)
        break

Efficient Estimation of Word Representations in Vector Space


In [15]:
def ret(paperID, ids=None, paperTitles=None):
    """Return the title of the paper whose ID equals ``paperID``.

    Parameters
    ----------
    paperID : the paper ID to look up.
    ids : optional sequence of paper IDs to search; defaults to the notebook-level ``IDList``.
    paperTitles : optional sequence of titles aligned with ``ids``; defaults to the
        notebook-level ``titles``.

    Returns
    -------
    The title paired with the first matching ID, or ``None`` when no ID matches.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if ids is None:
        ids = IDList
    if paperTitles is None:
        paperTitles = titles
    # Linear scan; the loop variable avoids the name `id` so the builtin is not shadowed.
    for candidateID, title in zip(ids, paperTitles):
        if candidateID == paperID:
            return title
    return None
            

In [16]:
# Titles of the papers whose node vectors are most similar to paper '1614298861'.
[ret(neighbourID) for neighbourID, _score in loadModel.most_similar('1614298861')]

['Distributed Representations of Words and Phrases and their Compositionality',
 'Glove: Global Vectors for Word Representation',
 'Distributed Representations of Sentences and Documents',
 'Hierarchical Attention Networks for Document Classification',
 'Semantic Wide and Deep Learning for Detecting Crisis-Information Categories on Social Media',
 'Convolutional Neural Networks for Sentence Classification',
 'Joint Embedding of Hierarchical Categories and Entities for Concept Categorization and Dataless Classification.',
 'Man is to computer programmer as woman is to homemaker? debiasing word embeddings',
 'Deeper Attention to Abusive User Content Moderation',
 'Compositional Recurrent Neural Networks for Chinese Short Text Classification']

In [None]:
# Titles of the papers directly connected to paper '1614298861' in the adjacency list.
[ret(neighbourID) for neighbourID in adjList['1614298861']]

In [None]:
# Raw adjacency entry (neighbouring paper IDs) for paper '1614298861'.
adjList['1614298861']