## Indexing into ES

In [70]:
import json
from elasticsearch import Elasticsearch
import requests
from tqdm import tqdm_notebook as tqdm
from collections import Counter, defaultdict
import numpy as np
from numpy import dot
from numpy.linalg import norm

In [46]:
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

In [48]:
records = []
with open('./data/Explicit_Semantic_Ranking_Dataset/s2_doc.json', 'r') as file:
    for line in tqdm(file):
        data = json.loads(line)
        tmp = dict()
        tmp['keyPhrases'] = data.get('keyPhrases', [])
        tmp['paperAbstract'] = data['paperAbstract'][0]
        tmp['title'] = data['title'][0]
        tmp['id'] = data['docno']
        records.append(tmp)
        

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [49]:
records[0]

{'keyPhrases': ['Isomorphism',
  'Duality',
  'DUAL',
  'Undirected Graph',
  'Relational'],
 'paperAbstract': "We provide a correspondence between the subjects of duality and density in classes of finite relational structures. The purpose of du-ality is to characterise the structures C that do not admit a homo-morphism into a given target B by the existence of a homomorphism from a structure A into C. Density is the order-theoretic property of containing no covers (or 'gaps'). We show that the covers in the skeleton of a category of finite relational models correspond naturally to certain instances of duality statements, and we characterise these covers.",
 'title': 'Duality Theorems for Finite Structures (Characterising Gaps and Good Characterisations)',
 'id': '6e4eddf4d6671c37537bb5d1c9623353b62e8531'}

In [50]:
for record in tqdm(records):
    res = es.index(index='esr',doc_type='paper',id=record['id'],body=record)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=8541.0), HTML(value='')))




## Searching

In [61]:
import re
import string

def preprocess(s):
    s = re.sub(r'\d+', '', s)
    translator = str.maketrans('', '', string.punctuation) 
    s = s.translate(translator) 
    s = s.strip()
    return s

def getSpotsAndEntities(text, rhoThreshold = 0.1, long_text = 0):
    url = 'https://tagme.d4science.org/tagme/tag'
    params = {'lang': 'en', 'include_abstract': 'false', 'include_categories': 'true', 'gcube-token': '42aa36f7-4770-4574-8ef8-45138f3ba072-843339462', 'text': text, 'long_text': long_text}
    rhoThreshold = rhoThreshold
    entities = []
    spots = []
    r = requests.get(url = url, params = params) 
    data = r.json()
    for annotation in data['annotations']:
        if annotation['rho'] > rhoThreshold:
            entities.append(annotation['title'])
            spots.append(annotation['spot'])
    spots = Counter(spots)
    spots = [(s, spots[s]) for s in spots.keys()]
    entities = Counter(entities)
    entities = [(s, entities[s]) for s in entities.keys()]
    return spots, entities

In [21]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')
def embed(inputTexts):
    return model.encode(inputTexts)

In [15]:
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
def elasticSearch(queryList, index='esr', K = 10):
    queryBody = {
    "query": {
        "multi_match" : {
            "query" : "sentence embeddings",
            "fields" : ['title', 'paperAbstract']
        }
    }
}
    queryBody['size'] = K
    searchResults = []
    for query in queryList:
        queryBody['query']['multi_match']['query'] = query
        res= es.search(index=index,body=queryBody)
        searchResults.append([hit['_id'] for hit in res['hits']['hits']])
    return searchResults

In [22]:
queryList = []
with open('./data/Explicit_Semantic_Ranking_Dataset/s2_query.json') as file:
    for line in file:
        data = json.loads(line)
        queryList.append(data['query'])

In [23]:
queryList = queryList[:10]

In [51]:
esResults = elasticSearch(queryList, K = 100)

In [52]:
tmpDict = dict()
tmpDict['esResults'] = esResults
with open('./data/Explicit_Semantic_Ranking_Dataset/papersForEntity.json', 'w') as outfile:
    json.dump(tmpDict, outfile)

In [53]:
paperIDs = set()
for result in esResults:
    paperIDs.update(result)

In [54]:
records = dict()
papersFileName = './data/Explicit_Semantic_Ranking_Dataset/s2_doc.json'
with open(papersFileName, 'r') as file:
    for line in file:
        data = json.loads(line)
        if (data['docno'] in paperIDs):
            records[data['docno']] = {'title': data['title'][0], 'abstract': data['paperAbstract'][0]}

In [62]:
dictForTitles = dict() ## dict[paperId] = {'entities': entityCounterList,'spots': spotscounterList}
for key in tqdm(records.keys()):
    text = preprocess(records[key]['title'])
    spots, entities = getSpotsAndEntities(text, rhoThreshold = 0.1)
    dictForTitles[key] = {'entities': entities, 'spots': spots}
with open('./data/Explicit_Semantic_Ranking_Dataset/TitleEntitiesPerPaper.json', 'w') as outfile:
    json.dump(dictForTitles, outfile)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=963.0), HTML(value='')))




In [63]:
with open('./data/TitleEntitiesPerPaper.json', 'r') as file:
    for line in file:
        dictForTitles = json.loads(line)
entityList = [[entityTuple[0] for entityTuple in tmpDict['entities']] for tmpDict in  dictForTitles.values() ]
entitySet = set()
for entitySubList in entityList:
    entitySet.update(entitySubList)
entityList = list(entitySet)

n = 100     # block size
entities = [entityList[i:i + n] for i in range(0, len(entityList), n)]

In [None]:

count = 0
with open('./data/Explicit_Semantic_Ranking_Dataset/entity_Roberta_Embeddings.json', 'w') as outfile:
    for entitySubList in tqdm(entities):
        entitySubList = [preprocess(entity) for entity in entitySubList]
        embeddings = embed(entitySubList)#.numpy().tolist()
        for embedding, entity in zip(embeddings, entitySubList):
            outDict = dict()
            outDict['entity'] = entity
            outDict['embedding'] = embedding.tolist()
            count += 1
            json.dump(outDict, outfile)
            outfile.write('\n')
assert count == len(entityList)

## Entity -Similarity Matrix Search

In [72]:
def cosineSimilarity(a, b):
    a = np.asarray(a)
    b = np.asarray(b)
    return dot(a, b)/(norm(a)*norm(b))

def l1similarity(a, b):
    a = np.asarray(a)
    b = np.asarray(b)
    return 1 / ( 1+ np.linalg.norm((a - b), ord=1))



In [68]:
dictForTitles = dict()
dictForBody = dict()
titleDictLoaded = False
bodyDictLoaded = False
queryEntitySpotDict = dict()
def getQueryEntitiesAndSpots(query, long_text = 0):
    global queryEntitySpotDict 
    if query in queryEntitySpotDict:
        return queryEntitySpotDict[query]['entities'], queryEntitySpotDict[query]['spots']
    else:
        spotsWithFreq, entitiesWithFreq = getSpotsAndEntities(query, long_text = 0)
        queryEntitySpotDict[query]= {'entities': entitiesWithFreq,'spots' : spotsWithFreq}
        return entitiesWithFreq, spotsWithFreq
def loadEntityDict(method='title'):
    global dictForTitles
    global dictForBody
    if method == 'title':
        with open('./data/Explicit_Semantic_Ranking_Dataset/TitleEntitiesPerPaper.json', 'r') as file:
            for line in file:
                dictForTitles = json.loads(line)
    elif method == 'body':
        with open('./data/Explicit_Semantic_Ranking_Dataset/BodyEntitiesPerPaper.json', 'r') as file:
            for line in file:
                dictForBody = json.loads(line)
def retrieveSpots(docID, method='title'):
    '''Returns pre computed spot mentions for this docID, where each element is a tuple of (spot name, frequency)'''
    global titleDictLoaded 
    global bodyDictLoaded
    if titleDictLoaded == False and method == 'title':
        loadEntityDict(method='title')
        titleDictLoaded = True
    elif bodyDictLoaded == False and method == 'body':
        loadEntityDict(method='body')
        bodyDictLoaded = True
        
    if method == 'title':
        return dictForTitles[docID]['spots']
    elif method == 'body':
        return dictForBody[docID]['spots']
    
def retrieveEntities(docID, method='title'):
    '''Returns pre computed entities for this docID, where each element is a tuple of (entity name, frequency)'''
    global titleDictLoaded 
    global bodyDictLoaded
    if titleDictLoaded == False and method == 'title':
        loadEntityDict(method='title')
        titleDictLoaded = True
    elif bodyDictLoaded == False and method == 'body':
        loadEntityDict(method='body')
        bodyDictLoaded = True
        
    if method == 'title':
        return dictForTitles[docID]['entities']
    elif method == 'body':
        return dictForBody[docID]['entities']
    
def retrieveEntityEmbedding(entity):
    try:
        return entityEmbeddingDict[entity]
    except:
        return  embed([entity])[0]
    

def computeSimilarityMatrix(query, docID, method = 'title'):
    _, queryEntitiesWithFreq = getQueryEntitiesAndSpots(query, long_text = 0)   ## since query is expected to be short
    docEntitiesWithFreq = retrieveEntities(docID, method = method)
    docEntityFrequencies = [entityTuple[1] for entityTuple in docEntitiesWithFreq]
    queryEntityFrequencies = [entityTuple[1] for entityTuple in queryEntitiesWithFreq]

    queryEntities = []
    for entityTuple in queryEntitiesWithFreq:
        queryEntities.append(preprocess(entityTuple[0]))
    queryEntityEmbeddings = embed(queryEntities)
    docEntityEmbeddings = [retrieveEntityEmbedding(entityTuple[0]) for entityTuple in docEntitiesWithFreq]
    
    numDocEntities = len(docEntitiesWithFreq)
    numQueryEntities = len(queryEntitiesWithFreq)
    simMatrix = np.zeros((numDocEntities, numQueryEntities))
    for i in range(numDocEntities):
        for j in range(numQueryEntities):
            simMatrix[i][j] = max(0, cosineSimilarity(docEntityEmbeddings[i], queryEntityEmbeddings[j]))
    return simMatrix,  queryEntityFrequencies, docEntityFrequencies

def reduceMatrix(simMatrix,  queryEntityFrequencies, docEntityFrequencies, axis = 'column', pooling = 'max'):
    if axis == 'column':
        axis = 0
    else:
        axis = 1
    if pooling == 'max':
        try:
            return np.max(simMatrix, axis = axis) # along columns
        except:
            return np.zeros(1)
    
def reduceVector(vector, reduction = 'avg'):
    if reduction == 'avg':
        return sum(vector) / len(vector)
    
def semanticScore(query, docID, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'):
    simMatrix,  queryEntityFrequencies, docEntityFrequencies = computeSimilarityMatrix(query, docID, method = method)
    vector = reduceMatrix(simMatrix,  queryEntityFrequencies, docEntityFrequencies, axis = axis, pooling = pooling)
    score = reduceVector(vector, reduction = reduction)
    return score


def search(query, docIDList, K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'):
    simScores = [ semanticScore(query, docID, method = method, axis = axis, pooling = pooling, reduction = reduction) for docID in docIDList]
    IDsWithScore = [(score, ID) for score, ID in zip(simScores, docIDList)]
   
    IDsWithScore.sort(reverse=True)
    IDsWithScore = IDsWithScore[:K]                    ## Keep top-K documents only
    
    return [ID for _,ID in IDsWithScore]


def normalize(lis):
    _min = min(lis)
    _max = max(lis)
    lis  = [(x - _min)/(_max - _min) for x in lis]
    return lis


In [73]:
results1 = []
for i in tqdm(range(len(queryList))):
    results1.append(search(queryList[i], esResults[i], K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [81]:
## Store search results to file
with open('./data/Explicit_Semantic_Ranking_Dataset/entity_search_resultsSiamese.json', 'w') as outfile:

    outDict = dict()
    outDict['result'] = results1
    json.dump(outDict, outfile)
    outfile.write('\n')

In [17]:
titles = []
IDList = []
with open('./data/Explicit_Semantic_Ranking_Dataset/s2_doc.json', 'r') as file:
    for line in tqdm(file):
        data = json.loads(line)
        titles.append(data['title'][0])
        IDList.append(data['docno'])
def ret(paperID):
    for id, title in zip(IDList, titles):
        if (id == paperID):
            return title

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [120]:
[ret(id) for id   in results1[0]]

[['Unsupervised Feature Learning and Deep Learning: A Review and New Perspectives'],
 ['New types of deep neural network learning for speech recognition and related applications: an overview'],
 ['Deep Learning Architectures for Soil Property Prediction'],
 ['Privacy-Preserving Deep Learning'],
 ['Learning Hierarchical Representations for Video Analysis Using Deep Learning'],
 ['Deep Learning Algorithms with Applications to Video Analytics for A Smart City: A Survey'],
 ['Evaluation of Deep Learning based Pose Estimation for Sign Language'],
 ['Comparing Time and Frequency Domain for Audio Event Recognition Using Deep Learning'],
 ['Deep Learning using Support Vector Machines'],
 ['Kernel Methods for Deep Learning']]

In [74]:
queryList

['deep learning',
 'artificial intelligence',
 'information retrieval',
 'machine learning',
 'question answering',
 'noun phrases',
 'penn treebank',
 'speech recognition',
 'data mining',
 'computer vision']

In [58]:
len(paperIDs)

963

In [59]:
queryList

['deep learning',
 'artificial intelligence',
 'information retrieval',
 'machine learning',
 'question answering',
 'noun phrases',
 'penn treebank',
 'speech recognition',
 'data mining',
 'computer vision']

## Evaluation 

In [112]:
def dcg(relevanceScores, k = 10, method=0):
    """
    Returns discounted cumulative gain (dcg)
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain
    """
    relevanceScores = np.asfarray(relevanceScores)[:k]
    if relevanceScores.size:
        if method == 0:
            return relevanceScores[0] + np.sum(relevanceScores[1:] / np.log2(np.arange(2, relevanceScores.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0

def ndcgMax(relevanceScores, k=10, method=0):
    return dcg(sorted(relevanceScores, reverse=True), k, method)

def ndcg(relevanceScores, ndcgMax, k = 10, method=0):
    return dcg(relevanceScores, k, method) / ndcgMax
    
    

In [94]:
queryToIdx = {queryList[i]:i for i in range(len(queryList))}
annotationDict = [{} for i in range(len(queryList) + 1)]    # + 1 for 1 based indexing in
with open('./data/Explicit_Semantic_Ranking_Dataset/s2.qrel') as file:
    for line in file:
        lineString = line.split()
        qid = int(lineString[0])
        if qid > 10:
            break
        docno = lineString[2]
        relScore = int(lineString[-1])
        annotationDict[qid][docno] = relScore

In [98]:
relevanceScores= []

fileName = './data/Explicit_Semantic_Ranking_Dataset/entity_search_resultsSiamese.json'
with open(fileName, 'r') as file:
    for line in file:
        data = json.loads(line)
        result = data['result']
        for i in range(len(result)):
            result[i] = [annotationDict[i + 1].get(ID, 0) for ID in result[i]]
        relevanceScores.append(result)

In [99]:
relevanceScores

[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 1, 0, 0, 0, 0, 0, 2],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [2, 0, 0, 0, 0, 0, 4, 0, 0, 4],
  [0, 3, 2, 0, 0, 0, 0, 0, 0, 0]]]

In [110]:
[ret(ID) for ID in results1[3]]

[['Tuning Metaheuristics - A Machine Learning Perspective'],
 ['Supervised Machine Learning: A Review of Classification Techniques'],
 ['Machine Learning, Neural and Statistical Classification'],
 ['Introduction to machine learning for brain imaging'],
 ['Machine Learning - An Algorithmic Perspective'],
 ['Bioinformatics - the machine learning approach'],
 ['C4.5: Programs for Machine Learning'],
 ['Introduction to Machine Learning Introduction to Machine Learning Introduction to Machine Learning Introduction to Machine Learning Introduction to Machine Learning'],
 ['Softprop: Softmax Neural Network Backpropagation Learning'],
 ['Information Geometry and Information Theory in Machine Learning']]

In [101]:
annotationDict[1]

{'227759bc318163b2f2490690b828263f3f020cfb': 2,
 '373f76633cc1f6c7a421e31c989842021a52fca4': 4,
 '72d32c986b47d6b880dad0c3f155fe23d2939038': 3,
 '39f63dbdce9207b87878290c0e3983e84cfcecd9': 1,
 '5ca4abab527f6b0270e50548f0dea30638c9b86e': 0,
 '013cd20c0eaffb9cab80875a43086e0c3224fe20': 2,
 '2c03df8b48bf3fa39054345bafabfeff15bfd11d': 3,
 '12d1d070a53d4084d88a77b8b143bad51c40c38f': 0,
 '4728bac8f82149c844c50045fd62c550622b7a01': 0,
 '5352b7ca90cbe4938f8e71a25d49517e7f94670a': 0,
 '76e282712f35424d160d801a72e48372ab891a50': 0,
 'ad8c2721ef54c9326684762db7c9fc1378e83797': 0,
 'a62b58c267fddfa06545a7fc63a3c62ef7dc9e15': 0,
 '1d7705be75f4e29210373c2b40ee5cb6e46f0007': 0,
 '940b84cae7f8cef27351c7e0ff472cc3a80aff8c': 0,
 '1c40786dc5cc14efeb3a08e08bfdfdc52995b3ea': 0,
 '2315fc6c2c0c4abd2443e26a26e7bb86df8e24cc': 4,
 '0a8149fb5aa8a5684e7d530c264451a5cb9250f5': 0,
 '22ce15125f1c8fb466f241ce010e670d23dcc764': 0,
 'd26a48aff2abc3460c1018d5b410766f698d696c': 0,
 '553a6530b0802da9bec354d0a70fde254f6a5e

In [117]:
[(ret(ID), annotationDict[1][ID]) for ID in annotationDict[1].keys()]

[(['Convolutional deep belief networks for scalable unsupervised learning of hierarchical representations'],
  2),
 (['A Fast Learning Algorithm for Deep Belief Nets'], 4),
 (['Deep Learning of Representations: Looking Forward'], 3),
 (['Context-Dependent Pre-Trained Deep Neural Networks for Large-Vocabulary Speech Recognition'],
  1),
 (['Exact solutions to the nonlinear dynamics of learning in deep linear neural networks'],
  0),
 (['Representation Learning: A Review and New Perspectives'], 2),
 (['Deep Residual Learning for Image Recognition'], 3),
 (['Reinforcement Learning: A Survey'], 0),
 (['Building high-level features using large scale unsupervised learning'], 0),
 (['Scalable stacking and learning for building deep architectures'], 0),
 (None, 0),
 (['Unsupervised learning of hierarchical representations with convolutional deep belief networks'],
  0),
 (None, 0),
 (['Deep learning in speech synthesis'], 0),
 (None, 0),
 (None, 0),
 (['ImageNet Classification with Deep Convol

In [113]:
ndcgMaxPerQuery = []
for i in range(len(queryList)):
    ndcgMaxPerQuery.append(ndcgMax(list(annotationDict[i].values())))
ndcgMaxPerQuery[-1] = ndcgMaxPerQuery[0]

In [114]:
ndcgMaxPerQuery

[0,
 15.75169688297418,
 6.561606311644851,
 17.40255958161545,
 11.323465818787765,
 8.385424265341916,
 12.69009057132933,
 15.070595335185766,
 17.146988662861663,
 0]

In [115]:
ndcgMax([4] * 10)

21.01797804708183

In [119]:
[ret(ID) for ID in esResults[7][:10]]

[['Survey on speech emotion recognition: Features, classification schemes, and databases'],
 ['Efficient voice activity detection algorithms using long-term speech information'],
 ['The aurora experimental framework for the performance evaluation of speech recognition systems under noisy conditions'],
 ['Normalized amplitude modulation features for large vocabulary noise-robust speech recognition'],
 ['A coupled HMM for audio-visual speech recognition'],
 ['Speech production knowledge in automatic speech recognition.'],
 ['Convolutional Neural Networks for Speech Recognition'],
 ['Audio-visual continuous speech recognition using a coupled hidden Markov model'],
 ['Recent Development of Open-source Speech Recognition Engine Julius'],
 ['Speaker independent audio-visual continuous speech recognition']]