In [119]:
import json
from tqdm.notebook import tqdm
import re 
import numpy as np
from numpy import dot
from numpy.linalg import norm
from collections import Counter, defaultdict
import requests
from fuzzywuzzy import fuzz

In [19]:
import tensorflow as tf
import tensorflow_hub as hub
module_url = "./module/UnivTrans" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
def embed(inputText):
    return model(inputText)

In [25]:
entityEmbeddingDict = dict()
with open('./data/entity_USE_Embeddings.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        entityEmbeddingDict[data['entity']] = data['embedding']

In [20]:
import re
import string

def cosineSimilarity(a, b):
    a = np.asarray(a)
    b = np.asarray(b)
    return dot(a, b)/(norm(a)*norm(b))

def preprocess(s):
    s = re.sub(r'\d+', '', s)
    translator = str.maketrans('', '', string.punctuation) 
    s = s.translate(translator) 
    s = s.strip()
    return s

def getEntitiesAndSpots(text, rhoThreshold = 0.1, long_text = 0):
    url = 'https://tagme.d4science.org/tagme/tag'
    params = {'lang': 'en', 'include_abstract': 'false', 'include_categories': 'true', 'gcube-token': '42aa36f7-4770-4574-8ef8-45138f3ba072-843339462', 'text': text, 'long_text': long_text}
    rhoThreshold = rhoThreshold
    entities = []
    spots = []
    r = requests.get(url = url, params = params) 
    data = r.json()
    for annotation in data['annotations']:
        if annotation['rho'] > rhoThreshold:
            entities.append(annotation['title'])
            spots.append(annotation['spot'])
    spots = Counter(spots)
    spots = [(s, spots[s]) for s in spots.keys()]
    entities = Counter(entities)
    entities = [(s, entities[s]) for s in entities.keys()]
    return spots, entities

In [159]:

         
dictForTitles = dict()
dictForBody = dict()
titleDictLoaded = False
bodyDictLoaded = False
def loadEntityDict(method='title'):
    global dictForTitles
    global dictForBody
    if method == 'title':
        with open('./data/TitleEntitiesPerPaper.json', 'r') as file:
            for line in file:
                dictForTitles = json.loads(line)
    elif method == 'body':
        with open('./data/BodyEntitiesPerPaper.json', 'r') as file:
            for line in file:
                dictForBody = json.loads(line)
def retrieveSpots(docID, method='title'):
    '''Returns pre computed spot mentions for this docID, where each element is a tuple of (spot name, frequency)'''
    global titleDictLoaded 
    global bodyDictLoaded
    if titleDictLoaded == False and method == 'title':
        loadEntityDict(method='title')
        titleDictLoaded = True
    elif bodyDictLoaded == False and method == 'body':
        loadEntityDict(method='body')
        bodyDictLoaded = True
        
    if method == 'title':
        return dictForTitles[docID]['spots']
    elif method == 'body':
        return dictForBody[docID]['spots']
    
def retrieveEntities(docID, method='title'):
    '''Returns pre computed entities for this docID, where each element is a tuple of (entity name, frequency)'''
    global titleDictLoaded 
    global bodyDictLoaded
    if titleDictLoaded == False and method == 'title':
        loadEntityDict(method='title')
        titleDictLoaded = True
    elif bodyDictLoaded == False and method == 'body':
        loadEntityDict(method='body')
        bodyDictLoaded = True
        
    if method == 'title':
        return dictForTitles[docID]['entities']
    elif method == 'body':
        return dictForBody[docID]['entities']
    
def retrieveEntityEmbedding(entity):
    try:
        return entityEmbeddingDict[entity]
    except:
        return  embed([entity]).numpy().tolist()[0]
    
def computeFuzzySimilarityMatrix(query, docID, method = 'title'):
    querySpotsWithFreq, _ = getEntitiesAndSpots(query, long_text = 0)   ## since query is expected to be short
    docSpotsWithFreq = retrieveSpots(docID, method = method)
    docSpotFrequencies = [entityTuple[1] for entityTuple in docSpotsWithFreq]
    querySpotFrequencies = [entityTuple[1] for entityTuple in querySpotsWithFreq]

    querySpots = []
    for entityTuple in querySpotsWithFreq:
        querySpots.append(preprocess(entityTuple[0]))
    docSpots = [preprocess(entityTuple[0]) for entityTuple in docSpotsWithFreq]

    numDocSpots = len(docSpotsWithFreq)
    numQuerySpots = len(querySpotsWithFreq)
    simMatrix = np.zeros((numDocSpots, numQuerySpots))
    for i in range(numDocSpots):
        for j in range(numQuerySpots):
            simMatrix[i][j] = fuzz.token_sort_ratio(docSpots[i], querySpots[j])
    return simMatrix,  querySpotFrequencies, docSpotFrequencies

def computeSimilarityMatrix(query, docID, method = 'title'):
    _, queryEntitiesWithFreq = getEntitiesAndSpots(query, long_text = 0)   ## since query is expected to be short
    docEntitiesWithFreq = retrieveEntities(docID, method = method)
    docEntityFrequencies = [entityTuple[1] for entityTuple in docEntitiesWithFreq]
    queryEntityFrequencies = [entityTuple[1] for entityTuple in queryEntitiesWithFreq]

    queryEntities = []
    for entityTuple in queryEntitiesWithFreq:
        queryEntities.append(preprocess(entityTuple[0]))
    queryEntityEmbeddings = embed(queryEntities).numpy().tolist()
    docEntityEmbeddings = [retrieveEntityEmbedding(entityTuple[0]) for entityTuple in docEntitiesWithFreq]
    
    numDocEntities = len(docEntitiesWithFreq)
    numQueryEntities = len(queryEntitiesWithFreq)
    simMatrix = np.zeros((numDocEntities, numQueryEntities))
    for i in range(numDocEntities):
        for j in range(numQueryEntities):
            simMatrix[i][j] = max(0, cosineSimilarity(docEntityEmbeddings[i], queryEntityEmbeddings[j]))
    return simMatrix,  queryEntityFrequencies, docEntityFrequencies

def reduceMatrix(simMatrix,  queryEntityFrequencies, docEntityFrequencies, axis = 'column', pooling = 'max'):
    if axis == 'column':
        axis = 0
    else:
        axis = 1
    if pooling == 'max':
        try:
            return np.max(simMatrix, axis = axis) # along columns
        except:
            return np.zeros(1)
    
def reduceVector(vector, reduction = 'avg'):
    if reduction == 'avg':
        return sum(vector) / len(vector)
    
def semanticScore(query, docID, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'):
    simMatrix,  queryEntityFrequencies, docEntityFrequencies = computeSimilarityMatrix(query, docID, method = method)
    vector = reduceMatrix(simMatrix,  queryEntityFrequencies, docEntityFrequencies, axis = axis, pooling = pooling)
    score = reduceVector(vector, reduction = reduction)
    return score

def fuzzyScore(query, docID, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'):
    simMatrix,  queryEntityFrequencies, docEntityFrequencies = computeFuzzySimilarityMatrix(query, docID, method = method)
    vector = reduceMatrix(simMatrix,  queryEntityFrequencies, docEntityFrequencies, axis = axis, pooling = pooling)
    score = reduceVector(vector, reduction = reduction)
    return score

def search(query, docIDList, K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'):
    simScores = [ semanticScore(query, docID, method = method, axis = axis, pooling = pooling, reduction = reduction) for docID in docIDList]
    IDsWithScore = [(score, ID) for score, ID in zip(simScores, docIDList)]
   
    IDsWithScore.sort(reverse=True)
    IDsWithScore = IDsWithScore[:K]                    ## Keep top-K documents only
    
    return [ID for _,ID in IDsWithScore]

def searchFuzzy(query, docIDList, K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'):
    simScores = [ fuzzyScore(query, docID, method = method, axis = axis, pooling = pooling, reduction = reduction) for docID in docIDList]
    IDsWithScore = [(score, ID) for score, ID in zip(simScores, docIDList)]
   
    IDsWithScore.sort(reverse=True)
    IDsWithScore = IDsWithScore[:K]                    ## Keep top-K documents only
    
    return [ID for _,ID in IDsWithScore]

def normalize(lis):
    _min = min(lis)
    _max = max(lis)
    lis  = [(x - _min)/(_max - _min) for x in lis]
    return lis

def averageScores(scores1, scores2):
    array_1 = np.array(scores1)
    array_2 = np.array(scores2)

    weight_1 = 0.3
    weight_2 = 0.7
    meanArray = weight_1*array_1 + weight_2*array_2
    return meanArray.tolist()

def searchFusion(query, docIDList, K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'):
    simScoresFuzzy = [ fuzzyScore(query, docID, method = method, axis = axis, pooling = pooling, reduction = reduction) for docID in docIDList]
    simScoresFuzzy = normalize(simScoresFuzzy)
    simScoresSem = [ semanticScore(query, docID, method = method, axis = axis, pooling = pooling, reduction = reduction) for docID in docIDList]
    simScores = averageScores(simScoresFuzzy, simScoresSem)
    IDsWithScore = [(score, ID) for score, ID in zip(simScores, docIDList)]
   
    IDsWithScore.sort(reverse=True)
    IDsWithScore = IDsWithScore[:K]                    ## Keep top-K documents only
    
    return [ID for _,ID in IDsWithScore]

In [77]:
with open('./data/papersForEntity.json', 'r') as file:
    for line in file:
        data = json.loads(line)
embeddingResults = data['embeddingResults']
esResults = data['esResults']

In [91]:
queryList = ['converting text to speech', 'big data', 'efficient estimation of word representations in vector space', 'natural language interface', 'reinforcement learning in video game']

In [92]:
## ES ID List input, semantic search, title
results1 = []
for i in range(len(queryList)):
    results1.append(search(query, esResults[i], K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'))

In [93]:
## Embedding ID List input, semantic search, title
results2 = []
for i in range(len(queryList)):
    results2.append(search(query, embeddingResults[i], K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'))

In [116]:
## elastic search ID List input, semantic search, body
results3 = []
for i in range(len(queryList)):
    results3.append(search(query, esResults[i], K = 10, method = 'body', axis = 'column', pooling = 'max', reduction = 'avg'))

In [179]:
## elastic search ID List input, fuzzy search, body
results4 = []
for i in range(len(queryList)):
    results4.append(searchFuzzy(query, esResults[i], K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'))

In [160]:
## elastic search ID List input, fusion search, semantic weight 0.7, title
results5 = []
for i in range(len(queryList)):
    results5.append(searchFusion(query, esResults[i], K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'))

In [167]:
## elastic search ID List input, fusion search, semantic weight 0.7, body
results6 = []
for i in range(len(queryList)):
    results6.append(searchFusion(query, esResults[i], K = 10, method = 'body', axis = 'column', pooling = 'max', reduction = 'avg'))

In [None]:
## elastic search ID List input, fusion search, semantic weight 0.5, body
results6 = []
for i in range(len(queryList)):
    results6.append(searchFusion(query, esResults[i], K = 10, method = 'body', axis = 'column', pooling = 'max', reduction = 'avg'))

In [79]:
titles = []
IDList = []
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as file:
    for line in tqdm(file):
        data = json.loads(line)
        titles.append(data['title'])
        IDList.append(data['id'])
def ret(paperID):
    for id, title in zip(IDList, titles):
        if (id == paperID):
            return title

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [82]:
[ret(ID) for ID in tmpResults]

['Scribe: deep integration of human and machine intelligence to caption speech in real time',
 'Generating expressive speech for storytelling applications',
 'Vulnerability of speaker verification systems against voice conversion spoofing attacks: The case of telephone speech',
 'Converting dependency structures to phrase structures',
 'An unrestricted vocabulary Arabic speech synthesis system',
 'Visual signals in text comprehension: How to restore them when oralizing a text via a speech synthesis?',
 'Towards speech-to-text translation without speech recognition.',
 'Quaero Speech-to-Text and Text Translation Evaluation Systems',
 'Text-to-speech conversion technology',
 'Use of text syntactical structures in detection of document duplicates',
 'Segmenting unrestricted Chinese text into prosodic words instead of lexical words']

In [88]:
query = 'text to speech'
tmpResults = search(query, embeddingResults[0], K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg')

In [182]:
results3 = []
for i in range(len(queryList)):
    results3.append(search(query, esResults[i], K = 10, method = 'body', axis = 'row', pooling = 'max', reduction = 'avg'))


In [184]:
tmpResults = results3[2]
[ret(ID) for ID in tmpResults]

['Deriving Adjectival Scales from Continuous Space Word Representations',
 'Improving Vector Space Word Representations Using Multilingual Correlation',
 'Vector Space Representations of Documents in Classifying Finnish Social Media Texts',
 'Detecting Compositionality of Multi-Word Expressions using Nearest Neighbours in Vector Space Models',
 'Multi-Prototype Vector-Space Models of Word Meaning',
 'Co-learning of Word Representations and Morpheme Representations',
 'PART-OF-SPEECH INDUCTION FROM SCRATCH',
 'A Structured Vector Space Model for Word Meaning in Context',
 'Non-distributional Word Vector Representations',
 'Modelling Word Meaning using Efficient Tensor Representations']

In [181]:
[ret(ID) for ID in tmpResults]

['Multi-objective tree search approaches for general video game playing',
 'The Reinforcement Learning Competition 2014',
 'Learning to compete, compromise, and cooperate in repeated general-sum games',
 'ViZDoom: A Doom-based AI research platform for visual reinforcement learning',
 'An object-oriented approach to reinforcement learning in an action game',
 'Robust, Efficient, Globally-Optimized Reinforcement Learning with the Parti-Game Algorithm',
 'Learning character behaviors using agent modeling in games',
 'Deep Learning for Video Game Playing.',
 'Video summarization using reinforcement learning in eigenspace',
 'Learning and knowledge generation in General Games']

In [164]:
[ret(ID) for ID in results5[2]]

['VECTOR SPACE MODELS OF WORD MEANING AND PHRASE MEANING: A SURVEY',
 'PART-OF-SPEECH INDUCTION FROM SCRATCH',
 'Vector Space Representations of Documents in Classifying Finnish Social Media Texts',
 'Improving Vector Space Word Representations Using Multilingual Correlation',
 'A Generative Model of Vector Space Semantics',
 'Word Representations in Vector Space and their Applications for Arabic',
 'Modeling N400 amplitude using vector space models of word representation.',
 'Efficient Estimation of Word Representations in Vector Space',
 'A Structured Vector Space Model for Word Meaning in Context',
 'From Word to Sense Embeddings: A Survey on Vector Representations of Meaning.']

In [None]:
resutlIDList = []
for r1, r2, r3, r4, r5 in zip(results1, results2, results3, results4, results5):
    lis = list(set().union(r1, r2, r3, r4, r5))
    resutlIDList.append(lis)

from functools import reduce
import operator
IDs = set(reduce(operator.concat, resutlIDList))

PapersOutFileName = './data/es/dblp_AIpapers_v1.json'
i = 0
records = dict()
with open(PapersOutFileName, 'r') as file:
    for line in tqdm(file):
        if i % 2 != 0:
            data = json.loads(line)
            if (data['id'] in IDs):
                records[data['id']] = {'title': data['title'], 'abstract': data['abstract'], 'fos': ', '.join(data['fos'])}
        i += 1
        
import random

rows = []
for query, idSubList in tqdm(zip(queryList, resutlIDList)):
    for ID in idSubList:
        localDict = records[ID]
        rows.append([query, ID, localDict['title'], localDict['abstract'], localDict['fos'], 0])
random.shuffle(rows)
        
with open('./data/entityAnnotations.csv', "w") as file:
    writer = csv.writer(file)
    header = ['query', 'id', 'title', 'abstract', 'fos', 'score']
    writer.writerow(header)
    for row in rows:
            writer.writerow(row)

results = [results1, results2, results3, results4, results5]
len(results)

with open('./data/entity_search_results.json', 'w') as outfile:
    for i in tqdm(range(len(results))):
        outDict = dict()
        outDict['id'] = i
        outDict['result'] = results[i]
        json.dump(outDict, outfile)
        outfile.write('\n')