In [24]:
import json
from elasticsearch import Elasticsearch
import requests
from tqdm import tqdm_notebook as tqdm
from collections import Counter, defaultdict
import numpy as np
from numpy import dot
from numpy.linalg import norm

In [39]:
import re
import string

def preprocess(s):
    """Return ``s`` without digits or ASCII punctuation, trimmed at both ends.

    Runs of digits are removed first, then every character listed in
    ``string.punctuation`` is dropped, and finally leading and trailing
    whitespace is stripped. Whitespace inside the string is kept as is.
    """
    digitsRemoved = re.sub(r'\d+', '', s)
    dropPunctuation = str.maketrans('', '', string.punctuation)
    return digitsRemoved.translate(dropPunctuation).strip()

def getSpotsAndEntities(text, rhoThreshold = 0.1, long_text = 0):
    """Annotate ``text`` with the TagMe web service and count the confident annotations.

    Sends one GET request to the TagMe ``tag`` endpoint and keeps only the
    annotations whose ``rho`` value is strictly greater than ``rhoThreshold``.

    Args:
        text: Text to annotate.
        rhoThreshold: Annotations with ``rho`` at or below this value are dropped.
        long_text: Forwarded unchanged as the ``long_text`` request parameter.

    Returns:
        ``(spots, entities)``: two lists of ``(name, count)`` tuples in
        first-seen order, built from the ``spot`` and ``title`` fields of the
        kept annotations respectively. Both lists are empty when the response
        carries no annotations.

    Raises:
        requests.RequestException: On an HTTP error status or when the request
            times out (``HTTPError`` / ``Timeout`` are subclasses).
    """
    import os  # local import so the block does not depend on the notebook's import cell

    url = 'https://tagme.d4science.org/tagme/tag'
    # NOTE(review): the fallback token below is committed in the notebook and should be
    # rotated and removed; set TAGME_GCUBE_TOKEN in the environment to override it.
    token = os.environ.get('TAGME_GCUBE_TOKEN', '42aa36f7-4770-4574-8ef8-45138f3ba072-843339462')
    params = {'lang': 'en', 'include_abstract': 'false', 'include_categories': 'false', 'gcube-token': token, 'text': text, 'long_text': long_text}

    # A timeout keeps a stalled service from hanging the notebook forever, and
    # raise_for_status surfaces HTTP errors instead of a confusing JSON/KeyError later.
    r = requests.get(url = url, params = params, timeout = 30)
    r.raise_for_status()

    annotations = r.json().get('annotations') or []
    confident = [annotation for annotation in annotations if annotation['rho'] > rhoThreshold]

    # Counter preserves first-seen order, so .items() matches the original list layout.
    spots = Counter(annotation['spot'] for annotation in confident)
    entities = Counter(annotation['title'] for annotation in confident)
    return list(spots.items()), list(entities.items())

In [11]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model, constructed once when this cell runs; embed() below reuses it on every call.
model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')
def embed(inputTexts):
    """Encode ``inputTexts`` with the module-level ``model`` and return its output unchanged.

    Callers in this notebook pass a list of strings and index the result by
    position, i.e. they expect one embedding per input text.
    """
    return model.encode(inputTexts)

## Entity-Similarity Matrix Search

In [15]:
def cosineSimilarity(a, b):
    """Return the cosine similarity ``dot(a, b) / (||a|| * ||b||)`` of two vectors.

    Args:
        a: Array-like vector.
        b: Array-like vector of the same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b``. When either vector has
        zero norm the similarity is undefined; 0.0 is returned instead of the
        nan (plus RuntimeWarning) that a division by zero would produce.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    denominator = norm(a) * norm(b)
    if denominator == 0:
        # A zero vector has no direction, so treat it as unrelated to everything.
        return 0.0
    return dot(a, b) / denominator

def l1similarity(a, b):
    """Turn the order-1 norm of ``a - b`` into a similarity: ``1 / (1 + norm)``.

    For 1-D inputs the norm is the sum of absolute differences, so identical
    vectors score 1 and the score shrinks as the vectors move apart.
    """
    difference = np.asarray(a) - np.asarray(b)
    distance = np.linalg.norm(difference, ord=1)
    return 1 / (1 + distance)



In [54]:
# Lookup tables replaced by loadEntityDict(); retrieveSpots/retrieveEntities index them as
# table[docID]['spots'] and table[docID]['entities'].
dictForTitles = {}
dictForBody = {}
# Set to True by retrieveSpots/retrieveEntities after the matching table has been loaded.
titleDictLoaded = bodyDictLoaded = False
# Per-query annotation cache filled by getQueryEntitiesAndSpots().
queryEntitySpotDict = {}
def getQueryEntitiesAndSpots(query, long_text = 0):
    """Return the annotated entities and spots of ``query``, calling the annotator at most once.

    Results are memoised in the module-level ``queryEntitySpotDict``. Entries for
    the default ``long_text == 0`` stay keyed by the bare query string; any other
    ``long_text`` value is cached under ``(query, long_text)`` so that different
    settings never share a result.

    Args:
        query: Query text to annotate.
        long_text: Forwarded to ``getSpotsAndEntities`` (it used to be ignored
            and a hard-coded 0 was sent instead).

    Returns:
        ``(entitiesWithFreq, spotsWithFreq)`` — note that entities come FIRST
        here, the reverse of the ``(spots, entities)`` order returned by
        ``getSpotsAndEntities``.
    """
    cacheKey = query if long_text == 0 else (query, long_text)
    cached = queryEntitySpotDict.get(cacheKey)
    if cached is None:
        spotsWithFreq, entitiesWithFreq = getSpotsAndEntities(query, long_text = long_text)
        cached = {'entities': entitiesWithFreq, 'spots': spotsWithFreq}
        queryEntitySpotDict[cacheKey] = cached
    return cached['entities'], cached['spots']
def loadEntityDict(method='title'):
    """Load the pre-computed per-paper annotations for ``method`` into its module-level dict.

    ``'title'`` fills ``dictForTitles`` and ``'body'`` fills ``dictForBody``; any
    other value does nothing. Every line of the file is parsed as JSON and
    assigned in turn, so the dict ends up holding the last line's content.
    """
    global dictForTitles, dictForBody
    if method == 'title':
        sourcePath = './data/Explicit_Semantic_Ranking_Dataset/TitleEntitiesPerPaper.json'
    elif method == 'body':
        sourcePath = './data/Explicit_Semantic_Ranking_Dataset/BodyEntitiesPerPaper.json'
    else:
        return

    with open(sourcePath, 'r') as handle:
        for rawLine in handle:
            parsed = json.loads(rawLine)
            if method == 'title':
                dictForTitles = parsed
            else:
                dictForBody = parsed
def retrieveSpots(docID, method='title'):
    '''Return the stored 'spots' entry of ``docID`` from the title or body table.

    The table for ``method`` is loaded through loadEntityDict() the first time it
    is needed. Each element is expected to be a (spot name, frequency) pair.
    Any ``method`` other than 'title' or 'body' yields None.
    '''
    global titleDictLoaded, bodyDictLoaded
    if method == 'title':
        if not titleDictLoaded:
            loadEntityDict(method='title')
            titleDictLoaded = True
        return dictForTitles[docID]['spots']
    if method == 'body':
        if not bodyDictLoaded:
            loadEntityDict(method='body')
            bodyDictLoaded = True
        return dictForBody[docID]['spots']
    return None
    
def retrieveEntities(docID, method='title'):
    '''Return the stored 'entities' entry of ``docID`` from the title or body table.

    The table for ``method`` is loaded through loadEntityDict() the first time it
    is needed. Each element is expected to be an (entity name, frequency) pair.
    Any ``method`` other than 'title' or 'body' yields None.
    '''
    global titleDictLoaded, bodyDictLoaded
    if method == 'title':
        if not titleDictLoaded:
            loadEntityDict(method='title')
            titleDictLoaded = True
        return dictForTitles[docID]['entities']
    if method == 'body':
        if not bodyDictLoaded:
            loadEntityDict(method='body')
            bodyDictLoaded = True
        return dictForBody[docID]['entities']
    return None
    
def retrieveEntityEmbedding(entity):
    """Return the embedding of ``entity``, preferring a pre-computed one.

    Looks ``entity`` up in the global ``entityEmbeddingDict`` and falls back to
    embedding it on the fly with ``embed`` when there is no cached vector.

    Only the two expected misses trigger the fallback: ``KeyError`` (entity not
    in the dict) and ``NameError`` (``entityEmbeddingDict`` is not defined in the
    part of the notebook shown here). Anything else, including
    KeyboardInterrupt, now propagates instead of being swallowed by a bare
    ``except``.
    """
    try:
        return entityEmbeddingDict[entity]
    except (KeyError, NameError):
        return embed([entity])[0]
    

def computeSimilarityMatrix(query, text, method = 'title'):
    """Build the clipped cosine-similarity matrix between document and query entities.

    The query is annotated through the cached ``getQueryEntitiesAndSpots`` and
    ``text`` through ``getSpotsAndEntities``. Entity names are cleaned with
    ``preprocess``, embedded with ``embed`` and compared pairwise; negative
    cosine values are clipped to 0.

    Args:
        query: Query string.
        text: Document text (the notebook passes paper titles).
        method: Accepted for signature compatibility; not used by this function.

    Returns:
        ``(simMatrix, queryEntityFrequencies, docEntityFrequencies)`` where
        ``simMatrix`` has shape ``(numDocEntities, numQueryEntities)`` and the
        two lists hold the frequency of each query / document entity. When
        either side has no entities the matrix is empty and nothing is embedded.
    """
    # getQueryEntitiesAndSpots returns (entities, spots). Take the ENTITIES so both
    # sides of the matrix compare entity titles; the previous `_, x = ...` unpacking
    # picked up the query's spots instead.
    queryEntitiesWithFreq, _ = getQueryEntitiesAndSpots(query, long_text = 0)   ## since query is expected to be short
    # getSpotsAndEntities returns (spots, entities), so here the second element is correct.
    _, docEntitiesWithFreq = getSpotsAndEntities(text)
    docEntityFrequencies = [entityTuple[1] for entityTuple in docEntitiesWithFreq]
    queryEntityFrequencies = [entityTuple[1] for entityTuple in queryEntitiesWithFreq]

    numDocEntities = len(docEntitiesWithFreq)
    numQueryEntities = len(queryEntitiesWithFreq)
    simMatrix = np.zeros((numDocEntities, numQueryEntities))
    if numDocEntities == 0 or numQueryEntities == 0:
        # Nothing to compare: skip the (expensive) embedding calls on empty input.
        return simMatrix, queryEntityFrequencies, docEntityFrequencies

    queryEntityEmbeddings = embed([preprocess(entityTuple[0]) for entityTuple in queryEntitiesWithFreq])
    docEntityEmbeddings = embed([preprocess(entityTuple[0]) for entityTuple in docEntitiesWithFreq])

    for i in range(numDocEntities):
        for j in range(numQueryEntities):
            simMatrix[i][j] = max(0, cosineSimilarity(docEntityEmbeddings[i], queryEntityEmbeddings[j]))
    return simMatrix,  queryEntityFrequencies, docEntityFrequencies

def reduceMatrix(simMatrix,  queryEntityFrequencies, docEntityFrequencies, axis = 'column', pooling = 'max'):
    """Pool a similarity matrix down to a vector along one axis.

    Args:
        simMatrix: 2-D array-like of similarities.
        queryEntityFrequencies: Accepted for signature compatibility; unused.
        docEntityFrequencies: Accepted for signature compatibility; unused.
        axis: ``'column'`` reduces over axis 0 (one value per matrix column);
            any other value reduces over axis 1 (one value per matrix row).
        pooling: Pooling operator; only ``'max'`` is implemented.

    Returns:
        The pooled vector, or ``np.zeros(1)`` when the matrix has nothing to
        pool along the requested axis (e.g. no document entities).

    Raises:
        ValueError: If ``pooling`` is not ``'max'``. The function used to
            return None silently here, which only failed later in the caller.
    """
    if pooling != 'max':
        raise ValueError("unsupported pooling %r; only 'max' is implemented" % (pooling,))

    axisIndex = 0 if axis == 'column' else 1
    matrix = np.asarray(simMatrix)
    # Explicit emptiness check instead of a bare `except:` around np.max, so that
    # genuine errors are no longer masked as an all-zero result.
    if matrix.ndim <= axisIndex or matrix.shape[axisIndex] == 0:
        return np.zeros(1)
    return np.max(matrix, axis = axisIndex)
    
def reduceVector(vector, reduction = 'avg'):
    """Collapse a pooled vector into a single score.

    Args:
        vector: 1-D array-like of pooled similarities (lists are accepted too).
        reduction: Reduction operator; only ``'avg'`` (arithmetic mean) is implemented.

    Returns:
        The mean of ``vector``, or 0 when ``vector`` is empty.

    Raises:
        ValueError: If ``reduction`` is not ``'avg'``. The function used to
            return None silently here, which would end up as a null score.
    """
    if reduction != 'avg':
        raise ValueError("unsupported reduction %r; only 'avg' is implemented" % (reduction,))

    vector = np.asarray(vector)
    if not vector.size:
        return 0
    return sum(vector) / len(vector)
    
def semanticScore(query, text, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'):
    """Score ``text`` against ``query`` in three steps and return the final number.

    1. ``computeSimilarityMatrix`` builds the similarity matrix and both frequency lists.
    2. ``reduceMatrix`` pools the matrix along ``axis`` with ``pooling``.
    3. ``reduceVector`` collapses the pooled vector with ``reduction``.
    """
    matrixAndFrequencies = computeSimilarityMatrix(query, text, method = method)
    pooled = reduceMatrix(*matrixAndFrequencies, axis = axis, pooling = pooling)
    return reduceVector(pooled, reduction = reduction)





def normalize(lis):
    """Min-max scale the values of ``lis`` into the range [0, 1].

    Args:
        lis: Iterable of numbers.

    Returns:
        A new list with ``(x - min) / (max - min)`` for every ``x``. An empty
        input gives an empty list, and an input whose values are all equal
        gives all zeros (both cases used to raise).
    """
    values = list(lis)
    if not values:
        return []
    _min = min(values)
    _max = max(values)
    span = _max - _min
    if span == 0:
        # Every value is identical; avoid dividing by zero.
        return [0.0 for _ in values]
    return [(x - _min) / span for x in values]


In [26]:
# Parallel lists filled in file order: titles[i] is the title of the paper whose id is IDList[i].
titles = []
IDList = []
with open('./data/Explicit_Semantic_Ranking_Dataset/s2_doc.json', 'r') as file:
    for line in tqdm(file):
        # Every line is parsed as one JSON record; only the first element of its 'title' field is kept.
        data = json.loads(line)
        titles.append(data['title'][0])
        IDList.append(data['docno'])
def ret(paperID):
    """Return the title paired with the first entry of ``IDList`` equal to ``paperID``.

    Scans ``IDList`` and ``titles`` in parallel; gives None when no id matches.
    """
    matchingTitles = (title for docID, title in zip(IDList, titles) if docID == paperID)
    return next(matchingTitles, None)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## Preparing Dataset

In [17]:
# docno -> (title, numCitedBy, numKeyCitations), built from every line of s2_doc.json.
# A docno that appears on several lines keeps the values of its last line.
records = dict()
with open('./data/Explicit_Semantic_Ranking_Dataset/s2_doc.json', 'r') as file:
    for line in tqdm(file):
        data = json.loads(line)
        # Each field is indexed with [0], i.e. only its first element is stored.
        records[data['docno']] = (data['title'][0], data['numCitedBy'][0], data['numKeyCitations'][0])
        
def getPaperDetails(docno):
    """Return the ``(title, numCitedBy, numKeyCitations)`` tuple stored in ``records`` for ``docno``.

    Raises:
        KeyError: If ``docno`` is not a key of ``records``.
    """
    return records[docno]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [21]:
# Read the query strings in file order, one JSON record per line.
queryList = []
with open('./data/Explicit_Semantic_Ranking_Dataset/s2_query.json') as file:
    for line in file:
        data = json.loads(line)
        queryList.append(data['query'])

# Cleaned entity names of every query, aligned with queryList. Annotating each
# query here also fills the cache used by getQueryEntitiesAndSpots.
queryEntityList = []
for query in tqdm(queryList):
    # getQueryEntitiesAndSpots returns (entities, spots): take the FIRST element.
    # The previous `_, queryEntitiesWithFreq = ...` unpacking stored the spots instead.
    queryEntitiesWithFreq, _ = getQueryEntitiesAndSpots(query, long_text = 0)
    queryEntities = [preprocess(entityTuple[0]) for entityTuple in queryEntitiesWithFreq]
    queryEntityList.append(queryEntities)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [33]:
# Parallel lists with one entry per qrel line whose document is known in `records`:
# the query index, the document's title / numCitedBy / numKeyCitations, and the label Y.
titleList = []
citationList =  []
keyCitationList = []
queryIDList = []
Y = []
with open('./data/Explicit_Semantic_Ranking_Dataset/s2.qrel') as file:
    for line in tqdm(file):
        # Whitespace-separated fields: first = query id, third = docno, last = relevance score.
        lineString = line.split()
        qid = int(lineString[0]) - 1    # - 1 to account for our 0 based indexing, while theirs is 1 based
        docno = lineString[2]
        relScore = int(lineString[-1])
        # Judgements for documents missing from `records` are skipped silently.
        if docno in records:
            title, citations, keyCitations = records[docno]
            queryIDList.append(qid)
            titleList.append(title)
            citationList.append(citations)
            keyCitationList.append(keyCitations)
            Y.append(relScore)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [55]:
# Semantic score of every judged (query, title) pair, aligned with queryIDList / titleList.
# Each semanticScore call annotates the title through getSpotsAndEntities (one HTTP request
# per pair); the query annotations come from the getQueryEntitiesAndSpots cache.
semScores = []
for qid, title in tqdm(zip(queryIDList, titleList)):
    semScores.append(semanticScore(queryList[qid], title, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
## semScores, citationList and keyCitationList are the input features; Y holds the relevance scores to predict

In [58]:
# One feature row per judged pair: [semantic score, numCitedBy, numKeyCitations].
X = [
    [score, citation, keyCitation]
    for score, citation, keyCitation in zip(semScores, citationList, keyCitationList)
]

In [60]:
# Write the features and labels as a single JSON object {'X': ..., 'Y': ...} followed by a newline.
with open('./data/Explicit_Semantic_Ranking_Dataset/trainData.json', 'w') as outfile:
    json.dump({'X': X, 'Y': Y}, outfile)
    outfile.write('\n')