In [2]:
import json
from elasticsearch import Elasticsearch
import requests
from tqdm import tqdm_notebook as tqdm
from collections import Counter, defaultdict
import numpy as np
from numpy import dot
from numpy.linalg import norm

In [39]:
import re
import string

def preprocess(s):
    s = re.sub(r'\d+', '', s)
    translator = str.maketrans('', '', string.punctuation) 
    s = s.translate(translator) 
    s = s.strip()
    return s

def getSpotsAndEntities(text, rhoThreshold = 0.1, long_text = 0):
    url = 'https://tagme.d4science.org/tagme/tag'
    params = {'lang': 'en', 'include_abstract': 'false', 'include_categories': 'false', 'gcube-token': '42aa36f7-4770-4574-8ef8-45138f3ba072-843339462', 'text': text, 'long_text': long_text}
    rhoThreshold = rhoThreshold
    entities = []
    spots = []
    r = requests.get(url = url, params = params) 
    data = r.json()
    for annotation in data['annotations']:
        if annotation['rho'] > rhoThreshold:
            entities.append(annotation['title'])
            spots.append(annotation['spot'])
    spots = Counter(spots)
    spots = [(s, spots[s]) for s in spots.keys()]
    entities = Counter(entities)
    entities = [(s, entities[s]) for s in entities.keys()]
    return spots, entities

In [11]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')
def embed(inputTexts):
    return model.encode(inputTexts)

## Entity -Similarity Matrix Search

In [15]:
def cosineSimilarity(a, b):
    a = np.asarray(a)
    b = np.asarray(b)
    return dot(a, b)/(norm(a)*norm(b))

def l1similarity(a, b):
    a = np.asarray(a)
    b = np.asarray(b)
    return 1 / ( 1+ np.linalg.norm((a - b), ord=1))



In [54]:
dictForTitles = dict()
dictForBody = dict()
titleDictLoaded = False
bodyDictLoaded = False
queryEntitySpotDict = dict()
def getQueryEntitiesAndSpots(query, long_text = 0):
    global queryEntitySpotDict 
    if query in queryEntitySpotDict:
        return queryEntitySpotDict[query]['entities'], queryEntitySpotDict[query]['spots']
    else:
        spotsWithFreq, entitiesWithFreq = getSpotsAndEntities(query, long_text = 0)
        queryEntitySpotDict[query]= {'entities': entitiesWithFreq,'spots' : spotsWithFreq}
        return entitiesWithFreq, spotsWithFreq
def loadEntityDict(method='title'):
    global dictForTitles
    global dictForBody
    if method == 'title':
        with open('./data/Explicit_Semantic_Ranking_Dataset/TitleEntitiesPerPaper.json', 'r') as file:
            for line in file:
                dictForTitles = json.loads(line)
    elif method == 'body':
        with open('./data/Explicit_Semantic_Ranking_Dataset/BodyEntitiesPerPaper.json', 'r') as file:
            for line in file:
                dictForBody = json.loads(line)
def retrieveSpots(docID, method='title'):
    '''Returns pre computed spot mentions for this docID, where each element is a tuple of (spot name, frequency)'''
    global titleDictLoaded 
    global bodyDictLoaded
    if titleDictLoaded == False and method == 'title':
        loadEntityDict(method='title')
        titleDictLoaded = True
    elif bodyDictLoaded == False and method == 'body':
        loadEntityDict(method='body')
        bodyDictLoaded = True
        
    if method == 'title':
        return dictForTitles[docID]['spots']
    elif method == 'body':
        return dictForBody[docID]['spots']
    
def retrieveEntities(docID, method='title'):
    '''Returns pre computed entities for this docID, where each element is a tuple of (entity name, frequency)'''
    global titleDictLoaded 
    global bodyDictLoaded
    if titleDictLoaded == False and method == 'title':
        loadEntityDict(method='title')
        titleDictLoaded = True
    elif bodyDictLoaded == False and method == 'body':
        loadEntityDict(method='body')
        bodyDictLoaded = True
        
    if method == 'title':
        return dictForTitles[docID]['entities']
    elif method == 'body':
        return dictForBody[docID]['entities']
    
def retrieveEntityEmbedding(entity):
    try:
        return entityEmbeddingDict[entity]
    except:
        return  embed([entity])[0]
    

def computeSimilarityMatrix(query, text, method = 'title'):
    _, queryEntitiesWithFreq = getQueryEntitiesAndSpots(query, long_text = 0)   ## since query is expected to be short
    _, docEntitiesWithFreq = getSpotsAndEntities(text)
    docEntityFrequencies = [entityTuple[1] for entityTuple in docEntitiesWithFreq]
    queryEntityFrequencies = [entityTuple[1] for entityTuple in queryEntitiesWithFreq]

    queryEntities = []
    for entityTuple in queryEntitiesWithFreq:
        queryEntities.append(preprocess(entityTuple[0]))
    queryEntityEmbeddings = embed(queryEntities)
    docEntityEmbeddings = embed([preprocess(entityTuple[0]) for entityTuple in docEntitiesWithFreq])

    
    numDocEntities = len(docEntitiesWithFreq)
    numQueryEntities = len(queryEntitiesWithFreq)
    simMatrix = np.zeros((numDocEntities, numQueryEntities))
    for i in range(numDocEntities):
        for j in range(numQueryEntities):
            simMatrix[i][j] = max(0, cosineSimilarity(docEntityEmbeddings[i], queryEntityEmbeddings[j]))
    return simMatrix,  queryEntityFrequencies, docEntityFrequencies

def reduceMatrix(simMatrix,  queryEntityFrequencies, docEntityFrequencies, axis = 'column', pooling = 'max'):
    if axis == 'column':
        axis = 0
    else:
        axis = 1
    if pooling == 'max':
        try:
            return np.max(simMatrix, axis = axis) # along columns
        except:
            return np.zeros(1)
    
def reduceVector(vector, reduction = 'avg'):
    if not  vector.size:
        return 0
    if reduction == 'avg':
        return sum(vector) / len(vector)
    
def semanticScore(query, text, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'):
    simMatrix,  queryEntityFrequencies, docEntityFrequencies = computeSimilarityMatrix(query, text, method = method)
    vector = reduceMatrix(simMatrix,  queryEntityFrequencies, docEntityFrequencies, axis = axis, pooling = pooling)
    score = reduceVector(vector, reduction = reduction)
    return score





def normalize(lis):
    _min = min(lis)
    _max = max(lis)
    lis  = [(x - _min)/(_max - _min) for x in lis]
    return lis


In [26]:
titles = []
IDList = []
with open('./data/Explicit_Semantic_Ranking_Dataset/s2_doc.json', 'r') as file:
    for line in tqdm(file):
        data = json.loads(line)
        titles.append(data['title'][0])
        IDList.append(data['docno'])
def ret(paperID):
    for id, title in zip(IDList, titles):
        if (id == paperID):
            return title

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## Preparing Dataset

In [18]:
records = dict()
with open('./data/Explicit_Semantic_Ranking_Dataset/s2_doc.json', 'r') as file:
    for line in tqdm(file):
        data = json.loads(line)
        records[data['docno']] = (data['title'][0], data['numCitedBy'][0], data['numKeyCitations'][0])
        
def getPaperDetails(docno):
    return records[docno]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [16]:
queryList = []
with open('./data/Explicit_Semantic_Ranking_Dataset/s2_query.json') as file:
    for line in file:
        data = json.loads(line)
        queryList.append(data['query'])

In [21]:
queryEntityList = []
for query in tqdm(queryList):
    _, queryEntitiesWithFreq = getQueryEntitiesAndSpots(query, long_text = 0) 
    queryEntities = []
    for entityTuple in queryEntitiesWithFreq:
        queryEntities.append(preprocess(entityTuple[0]))
    queryEntityList.append(queryEntities)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [19]:
titleList = []
citationList =  []
keyCitationList = []
queryIDList = []
Y = []
with open('./data/Explicit_Semantic_Ranking_Dataset/s2.qrel') as file:
    for line in tqdm(file):
        lineString = line.split()
        qid = int(lineString[0]) - 1    # - 1 to account for our 0 based indexing, while theirs is 1 based
        docno = lineString[2]
        relScore = int(lineString[-1])
        if docno in records:
            title, citations, keyCitations = records[docno]
            queryIDList.append(qid)
            titleList.append(title)
            citationList.append(citations)
            keyCitationList.append(keyCitations)
            Y.append(relScore)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [55]:
semScores = []
for qid, title in tqdm(zip(queryIDList, titleList)):
    semScores.append(semanticScore(queryList[qid], title, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
## semScores citationList and keyCitationList are the feature vectors, Y to predict

In [58]:
X = []
for score, citation, keyCitation in zip(semScores, citationList, keyCitationList):
    X.append([score, citation, keyCitation])

In [60]:
with open('./data/Explicit_Semantic_Ranking_Dataset/trainData.json', 'w') as outfile:
    tmp = dict()
    tmp['X'] = X
    tmp['Y'] = Y
    json.dump(tmp, outfile)
    outfile.write('\n')

## Loading the above data

In [13]:
X = []
Y = []
with open('./data/Explicit_Semantic_Ranking_Dataset/trainData.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        X = data['X']
        Y = data['Y']

In [23]:
X = np.asarray(X)
Y = np.asarray(Y)

In [7]:
from sklearn import preprocessing
mm_scaler = preprocessing.MinMaxScaler()
X = mm_scaler.fit_transform(X)

In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier


names = [
 "Random Forest", "Neural Net", "AdaBoost", "Linear SVC" ]

classifiers = [
    RandomForestClassifier(verbose=True, n_jobs = -1),
    MLPClassifier(verbose=True, early_stopping=True),
    AdaBoostClassifier(),
    OneVsRestClassifier(BaggingClassifier(LinearSVC(),n_jobs = -1))]

In [24]:
from sklearn import svm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import precision_recall_fscore_support

def average(lis):
    return sum(lis) / len(lis)

kfold = KFold(n_splits=5, shuffle=True)
for name, clf in zip(names, classifiers):
    precScores = []
    recallScores = []
    f1Scores = []
    count = 1
    for train_index, test_index in kfold.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        print('Fitting: ', count)
        clf.fit(X_train, y_train)
        print('count ', count)
        y_pred = clf.predict(X_test)
        prec, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
        precScores.append(prec)
        recallScores.append(recall)
        f1Scores.append(fscore)
        count += 1
    print('Name', name,'. Avg Precision: ', average(precScores), '. Avg Recall: ', average(recallScores), '. Avg F-1 Score: ', average(f1Scores) )

Fitting:  1


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


count  1
Fitting:  2


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s


count  2
Fitting:  3


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s


count  3
Fitting:  4


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s


count  4
Fitting:  5


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s


count  5
Name Random Forest . Avg Precision:  0.4543500384048433 . Avg Recall:  0.5083833085477822 . Avg F-1 Score:  0.4718505867079064
Fitting:  1
Iteration 1, loss = inf
Validation score: 0.251773
Iteration 2, loss = 7.55720958
Validation score: 0.546099
Iteration 3, loss = 5.92031146
Validation score: 0.340426
Iteration 4, loss = 2.79012216
Validation score: 0.503546
Iteration 5, loss = 2.40280621
Validation score: 0.514184
Iteration 6, loss = 1.72917018
Validation score: 0.553191
Iteration 7, loss = 1.42025555
Validation score: 0.453901


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


Iteration 8, loss = 1.40104929
Validation score: 0.507092
Iteration 9, loss = 1.48958578
Validation score: 0.510638
Iteration 10, loss = 1.29540101
Validation score: 0.556738
Iteration 11, loss = 1.36199916
Validation score: 0.517730
Iteration 12, loss = 1.28004453
Validation score: 0.560284
Iteration 13, loss = 1.30965289
Validation score: 0.503546
Iteration 14, loss = 1.24674873
Validation score: 0.489362
Iteration 15, loss = 1.25831572
Validation score: 0.531915
Iteration 16, loss = 1.21208737
Validation score: 0.553191
Iteration 17, loss = 1.25130266
Validation score: 0.468085
Iteration 18, loss = 1.32752573
Validation score: 0.556738
Iteration 19, loss = 1.20412401
Validation score: 0.556738
Iteration 20, loss = 1.28067435
Validation score: 0.574468
Iteration 21, loss = 1.23103942
Validation score: 0.553191
Iteration 22, loss = 1.25470570
Validation score: 0.496454
Iteration 23, loss = 1.32698958
Validation score: 0.549645
Iteration 24, loss = 1.35811087
Validation score: 0.446809

  _warn_prf(average, modifier, msg_start, len(result))


Iteration 8, loss = 1.55942045
Validation score: 0.510638
Iteration 9, loss = 1.31639635
Validation score: 0.549645
Iteration 10, loss = 1.34548840
Validation score: 0.546099
Iteration 11, loss = 1.38888211
Validation score: 0.507092
Iteration 12, loss = 1.31206225
Validation score: 0.528369
Iteration 13, loss = 1.38432008
Validation score: 0.507092
Iteration 14, loss = 1.26879006
Validation score: 0.553191
Iteration 15, loss = 1.27050075
Validation score: 0.549645
Iteration 16, loss = 1.35947195
Validation score: 0.478723
Iteration 17, loss = 1.38938739
Validation score: 0.539007
Iteration 18, loss = 1.42779408
Validation score: 0.553191
Iteration 19, loss = 1.56977012
Validation score: 0.549645
Iteration 20, loss = 2.02541527
Validation score: 0.361702
Iteration 21, loss = 1.60466002
Validation score: 0.500000
Iteration 22, loss = 1.43753049
Validation score: 0.539007
Iteration 23, loss = 1.27437860
Validation score: 0.528369
Iteration 24, loss = 1.29087814
Validation score: 0.553191

  _warn_prf(average, modifier, msg_start, len(result))


Iteration 14, loss = 1.31422661
Validation score: 0.514184
Iteration 15, loss = 1.37623651
Validation score: 0.542553
Iteration 16, loss = 1.34076714
Validation score: 0.539007
Iteration 17, loss = 1.45086597
Validation score: 0.556738
Iteration 18, loss = 1.34954359
Validation score: 0.553191
Iteration 19, loss = 1.38979100
Validation score: 0.521277
Iteration 20, loss = 1.43792396
Validation score: 0.443262
Iteration 21, loss = 1.40911255
Validation score: 0.539007
Iteration 22, loss = 1.39495887
Validation score: 0.553191
Iteration 23, loss = 1.22858851
Validation score: 0.556738
Iteration 24, loss = 1.22353711
Validation score: 0.503546
Iteration 25, loss = 1.36109716
Validation score: 0.556738
Iteration 26, loss = 1.57559025
Validation score: 0.450355
Iteration 27, loss = 1.40988502
Validation score: 0.542553
Iteration 28, loss = 1.44468334
Validation score: 0.546099
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
count  3
Fitting:  4
I

  _warn_prf(average, modifier, msg_start, len(result))


Iteration 2, loss = 5.84561753
Validation score: 0.531915
Iteration 3, loss = 4.67587797
Validation score: 0.468085
Iteration 4, loss = 3.14843456
Validation score: 0.404255
Iteration 5, loss = 2.09158184
Validation score: 0.489362
Iteration 6, loss = 1.89343067
Validation score: 0.581560
Iteration 7, loss = 1.83432283
Validation score: 0.485816
Iteration 8, loss = 1.68983197
Validation score: 0.521277
Iteration 9, loss = 1.53681557
Validation score: 0.563830
Iteration 10, loss = 1.41095601
Validation score: 0.546099
Iteration 11, loss = 1.31538008
Validation score: 0.592199
Iteration 12, loss = 1.40162333
Validation score: 0.595745
Iteration 13, loss = 1.61374687
Validation score: 0.411348
Iteration 14, loss = 1.50499774
Validation score: 0.443262
Iteration 15, loss = 1.43038855
Validation score: 0.570922
Iteration 16, loss = 1.38090427
Validation score: 0.546099
Iteration 17, loss = 1.39145766
Validation score: 0.517730
Iteration 18, loss = 1.34181574
Validation score: 0.581560
Itera

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation score: 0.574468
Iteration 11, loss = 1.39867342
Validation score: 0.489362
Iteration 12, loss = 1.41331768
Validation score: 0.478723
Iteration 13, loss = 1.29936838
Validation score: 0.503546
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
count  5
Name Neural Net . Avg Precision:  0.40207363787293815 . Avg Recall:  0.5626668983576877 . Avg F-1 Score:  0.4300249990181776
Fitting:  1
count  1
Fitting:  2
count  2
Fitting:  3


  _warn_prf(average, modifier, msg_start, len(result))


count  3
Fitting:  4
count  4
Fitting:  5
count  5
Name AdaBoost . Avg Precision:  0.4133169652514864 . Avg Recall:  0.5629380576748998 . Avg F-1 Score:  0.4353654910203583
Fitting:  1


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


count  1
Fitting:  2


  _warn_prf(average, modifier, msg_start, len(result))


count  2
Fitting:  3


  _warn_prf(average, modifier, msg_start, len(result))


count  3
Fitting:  4


  _warn_prf(average, modifier, msg_start, len(result))


count  4
Fitting:  5


  _warn_prf(average, modifier, msg_start, len(result))


count  5
Name Linear SVC . Avg Precision:  0.41553591404060075 . Avg Recall:  0.4836480020690548 . Avg F-1 Score:  0.4135348188974411


  _warn_prf(average, modifier, msg_start, len(result))


## Other similairity scores

In [21]:
from fuzzywuzzy import fuzz
fuzzyScores = []
for qid, title in tqdm(zip(queryIDList, titleList)):
    fuzzyScores.append(fuzz.token_sort_ratio(queryList[qid], title))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [22]:
X = []
for score, citation, keyCitation in zip(fuzzyScores, citationList, keyCitationList):
    X.append([score, citation, keyCitation])