In [1]:
import json
from tqdm.notebook import tqdm
import re 
import numpy as np
from numpy import dot
from numpy.linalg import norm
from collections import Counter, defaultdict
import requests
from fuzzywuzzy import fuzz

In [2]:
## Embedding backend A: a TensorFlow-Hub module loaded from the local path in `module_url`.
## NOTE(review): a later cell rebinds `model`, `embed` and `entityEmbeddingDict` to a
## SentenceTransformer backend; whichever of the two cells was executed last is in effect.
import tensorflow as tf
import tensorflow_hub as hub
module_url = "./module/UnivTrans" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
def embed(inputText):
    '''Run `model` on `inputText` and return the result converted to nested Python lists.'''
    return model(inputText).numpy().tolist()
## Entity name -> embedding, read from a file holding one JSON object per line;
## every object supplies the keys 'entity' and 'embedding'.
entityEmbeddingDict = dict()
with open('./data/entity_USE_Embeddings.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        entityEmbeddingDict[data['entity']] = data['embedding']

In [2]:
## Embedding backend B: SentenceTransformer 'roberta-large-nli-stsb-mean-tokens'.
## NOTE(review): this cell reuses the names `model`, `embed` and `entityEmbeddingDict`
## that the TensorFlow-Hub cell also defines; run only the backend you intend to use.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')
def embed(inputText):
    '''Return `model.encode(inputText)` unchanged (no conversion to Python lists here).'''
    return model.encode(inputText)
## Entity name -> embedding, read from a file holding one JSON object per line;
## every object supplies the keys 'entity' and 'embedding'.
entityEmbeddingDict = dict()
with open('./data/entity_Roberta_Embeddings.json', 'r') as file:
    for line in file:
        data = json.loads(line)
        entityEmbeddingDict[data['entity']] = data['embedding']

In [3]:
import re
import string

def cosineSimilarity(a, b):
    '''Return the cosine similarity dot(a, b) / (|a| * |b|) of two vectors.

    Parameters
    ----------
    a, b : array-like
        Anything `np.asarray` accepts (lists, tuples, ndarrays).

    Returns
    -------
    float
        The cosine similarity, or 0.0 when either vector has zero norm.
        Without that guard the expression is 0/0 = NaN, and NaN scores make
        the tuple sort used for ranking order-dependent.
    '''
    a = np.asarray(a)
    b = np.asarray(b)
    denominator = norm(a) * norm(b)
    if denominator == 0:
        # A zero-length vector has no direction; treat it as "not similar".
        return 0.0
    return dot(a, b) / denominator

def l1similarity(a, b):
    '''Turn the order-1 norm of (a - b) into a similarity: 1 / (1 + norm).

    For 1-D inputs the order-1 norm is the sum of absolute differences, so
    identical vectors score 1 and the score shrinks towards 0 as they diverge.
    '''
    difference = np.asarray(a) - np.asarray(b)
    distance = np.linalg.norm(difference, ord=1)
    return 1 / (1 + distance)

def preprocess(s):
    '''Strip digits and ASCII punctuation from `s`, then trim surrounding whitespace.'''
    withoutDigits = re.sub(r'\d+', '', s)
    # Translation table that deletes every character in string.punctuation.
    dropPunctuation = str.maketrans('', '', string.punctuation)
    return withoutDigits.translate(dropPunctuation).strip()

def getSpotsAndEntities(text, rhoThreshold = 0.1, long_text = 0):
    '''Annotate `text` with the TagMe web service and count the mentions it finds.

    Parameters
    ----------
    text : str
        Text to annotate.
    rhoThreshold : float
        Only annotations whose 'rho' value is strictly greater than this are kept.
    long_text : int
        Passed through unchanged as the service's `long_text` parameter.

    Returns
    -------
    (spots, entities)
        Two lists of (name, count) pairs in first-seen order: `spots` counts the
        'spot' field of the kept annotations, `entities` counts their 'title' field.
        The one hard-coded query below returns lists of [name, count] lists instead.

    Raises
    ------
    requests.exceptions.RequestException
        On timeout, connection failure or a non-2xx HTTP status.
    '''
    import os   # function-scope import so this cell does not depend on the notebook's import cell

    if text == 'efficient estimation of word representations in vector space':              ## Error handling for inaccurate entity detector
        spots = [['estimation', 1], ['word representations', 1], ['vector space', 1]]
        entities = [['estimation', 1], ['word embedding', 1], ['vector space', 1]]
        return spots, entities

    url = 'https://tagme.d4science.org/tagme/tag'
    ## SECURITY: the token should come from the TAGME_TOKEN environment variable.
    ## The literal fallback only keeps existing runs working; rotate it and remove it.
    token = os.environ.get('TAGME_TOKEN', '42aa36f7-4770-4574-8ef8-45138f3ba072-843339462')
    params = {'lang': 'en', 'include_abstract': 'false', 'include_categories': 'true', 'gcube-token': token, 'text': text, 'long_text': long_text}

    ## Without a timeout a stalled connection blocks the notebook forever.
    r = requests.get(url = url, params = params, timeout = 30)
    ## Surface HTTP errors here instead of failing later while decoding an error page.
    r.raise_for_status()
    data = r.json()

    ## A response without an 'annotations' key is treated as "nothing found".
    kept = [annotation for annotation in data.get('annotations', []) if annotation['rho'] > rhoThreshold]
    spots = list(Counter(annotation['spot'] for annotation in kept).items())
    entities = list(Counter(annotation['title'] for annotation in kept).items())
    return spots, entities

In [4]:

         
## Module-level caches for the pre-computed annotations and their "already loaded" flags.
## The flags are maintained by retrieveSpots / retrieveEntities, not by loadEntityDict.
dictForTitles = {}
dictForBody = {}
titleDictLoaded = False
bodyDictLoaded = False
def loadEntityDict(method='title'):
    '''Fill the module-level annotation cache selected by `method`.

    'title' reads ./data/TitleEntitiesPerPaper.json into `dictForTitles`,
    'body' reads ./data/BodyEntitiesPerPaper.json into `dictForBody`.
    Each line of the file is parsed as JSON and replaces the cache, so for a
    file with several lines only the last one is kept. Any other `method`
    leaves both caches untouched.
    '''
    global dictForTitles, dictForBody
    if method == 'title':
        with open('./data/TitleEntitiesPerPaper.json', 'r') as handle:
            for rawLine in handle:
                dictForTitles = json.loads(rawLine)
        return
    if method == 'body':
        with open('./data/BodyEntitiesPerPaper.json', 'r') as handle:
            for rawLine in handle:
                dictForBody = json.loads(rawLine)
def retrieveSpots(docID, method='title'):
    '''Look up the pre-computed spot mentions of `docID`.

    The matching cache ('title' or 'body') is loaded through loadEntityDict the
    first time it is needed. Returns the stored 'spots' entry, a sequence of
    (spot name, frequency) pairs, or None for any other `method`.
    '''
    global titleDictLoaded, bodyDictLoaded
    if method == 'title':
        if not titleDictLoaded:
            loadEntityDict(method='title')
            titleDictLoaded = True
        return dictForTitles[docID]['spots']
    if method == 'body':
        if not bodyDictLoaded:
            loadEntityDict(method='body')
            bodyDictLoaded = True
        return dictForBody[docID]['spots']
    return None
    
def retrieveEntities(docID, method='title'):
    '''Look up the pre-computed entities of `docID`.

    The matching cache ('title' or 'body') is loaded through loadEntityDict the
    first time it is needed. Returns the stored 'entities' entry, a sequence of
    (entity name, frequency) pairs, or None for any other `method`.
    '''
    global titleDictLoaded, bodyDictLoaded
    if method == 'title':
        if not titleDictLoaded:
            loadEntityDict(method='title')
            titleDictLoaded = True
        return dictForTitles[docID]['entities']
    if method == 'body':
        if not bodyDictLoaded:
            loadEntityDict(method='body')
            bodyDictLoaded = True
        return dictForBody[docID]['entities']
    return None
    
def retrieveEntityEmbedding(entity):
    '''Return the embedding of `entity`.

    The pre-computed table `entityEmbeddingDict` is consulted first; an entity
    that is not in the table is embedded on the fly with `embed`.
    Only the "not in the table" case (KeyError) triggers the fallback; any
    other failure propagates instead of being masked by a bare `except`.
    '''
    try:
        return entityEmbeddingDict[entity]
    except KeyError:
        return embed([entity])[0]
    
def computeFuzzySimilarityMatrix(query, docID, method = 'title'):
    '''Score every document spot against every query spot with fuzzy string matching.

    Returns (matrix, queryFrequencies, docFrequencies). matrix[i][j] is
    fuzz.token_sort_ratio between the preprocessed i-th document spot and the
    preprocessed j-th query spot; the two lists hold the frequencies that came
    with the query and document spots.
    '''
    queryPairs, _ = getSpotsAndEntities(query, long_text = 0)   ## since query is expected to be short
    docPairs = retrieveSpots(docID, method = method)

    docCounts = [pair[1] for pair in docPairs]
    queryCounts = [pair[1] for pair in queryPairs]

    cleanedQuerySpots = [preprocess(pair[0]) for pair in queryPairs]
    cleanedDocSpots = [preprocess(pair[0]) for pair in docPairs]

    ## rows = document spots, columns = query spots
    scores = np.zeros((len(docPairs), len(queryPairs)))
    for row, docSpot in enumerate(cleanedDocSpots):
        for col, querySpot in enumerate(cleanedQuerySpots):
            scores[row][col] = fuzz.token_sort_ratio(docSpot, querySpot)
    return scores, queryCounts, docCounts

def computeSimilarityMatrix(query, docID, method = 'title'):
    '''Score every document entity against every query entity by embedding similarity.

    Returns (matrix, queryFrequencies, docFrequencies). matrix[i][j] is the
    cosine similarity, clipped below at 0, between the embedding of the i-th
    document entity and that of the j-th query entity. Query entity names are
    preprocessed and embedded with `embed`; document entities go through
    retrieveEntityEmbedding under their stored name.

    When the query yields no entities the encoder is not called at all: the
    matrix then has zero columns, and an empty batch is not something every
    embedding backend accepts.
    '''
    _, queryEntitiesWithFreq = getSpotsAndEntities(query, long_text = 0)   ## since query is expected to be short
    docEntitiesWithFreq = retrieveEntities(docID, method = method)
    docEntityFrequencies = [entityTuple[1] for entityTuple in docEntitiesWithFreq]
    queryEntityFrequencies = [entityTuple[1] for entityTuple in queryEntitiesWithFreq]

    queryEntities = [preprocess(entityTuple[0]) for entityTuple in queryEntitiesWithFreq]
    queryEntityEmbeddings = embed(queryEntities) if queryEntities else []
    docEntityEmbeddings = [retrieveEntityEmbedding(entityTuple[0]) for entityTuple in docEntitiesWithFreq]

    numDocEntities = len(docEntitiesWithFreq)
    numQueryEntities = len(queryEntitiesWithFreq)
    ## rows = document entities, columns = query entities
    simMatrix = np.zeros((numDocEntities, numQueryEntities))
    for i in range(numDocEntities):
        for j in range(numQueryEntities):
            simMatrix[i][j] = max(0, cosineSimilarity(docEntityEmbeddings[i], queryEntityEmbeddings[j]))
    return simMatrix,  queryEntityFrequencies, docEntityFrequencies

def reduceMatrix(simMatrix,  queryEntityFrequencies, docEntityFrequencies, axis = 'column', pooling = 'max'):
    '''Pool a similarity matrix down to a vector.

    Parameters
    ----------
    simMatrix : array-like
        Matrix with document items as rows and query items as columns.
    queryEntityFrequencies, docEntityFrequencies : list
        Accepted for interface compatibility; not used by max pooling.
    axis : str
        'column' pools over rows (numpy axis 0), giving one value per column;
        any other value pools over columns (numpy axis 1).
    pooling : str
        Only 'max' is implemented; any other value returns None.

    Returns
    -------
    numpy.ndarray or None
        The pooled vector. If the maximum cannot be taken because the pooled
        axis is empty or missing (numpy raises ValueError), a one-element zero
        vector is returned.
    '''
    numpyAxis = 0 if axis == 'column' else 1
    if pooling != 'max':
        return None
    try:
        return np.max(simMatrix, axis = numpyAxis)
    except ValueError:
        ## Zero-size reduction or axis out of range: nothing to pool.
        ## Other exceptions are real errors and are allowed to propagate.
        return np.zeros(1)
    
def reduceVector(vector, reduction = 'avg'):
    '''Collapse a pooled vector into one score.

    Only reduction == 'avg' (arithmetic mean) is implemented; any other value
    returns None. An empty vector averages to 0.0 instead of raising
    ZeroDivisionError; that case occurs when the query produced no entities
    and the pooled vector therefore has no elements.
    '''
    if reduction == 'avg':
        if len(vector) == 0:
            return 0.0
        return sum(vector) / len(vector)
    return None
    
def semanticScore(query, docID, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'):
    '''Embedding-based relevance of document `docID` to `query`.

    Chains computeSimilarityMatrix -> reduceMatrix -> reduceVector and returns
    whatever reduceVector produces.
    '''
    matrixAndFrequencies = computeSimilarityMatrix(query, docID, method = method)
    pooled = reduceMatrix(*matrixAndFrequencies, axis = axis, pooling = pooling)
    return reduceVector(pooled, reduction = reduction)

def fuzzyScore(query, docID, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'):
    '''String-matching relevance of document `docID` to `query`.

    Chains computeFuzzySimilarityMatrix -> reduceMatrix -> reduceVector and
    returns whatever reduceVector produces.
    '''
    matrixAndFrequencies = computeFuzzySimilarityMatrix(query, docID, method = method)
    pooled = reduceMatrix(*matrixAndFrequencies, axis = axis, pooling = pooling)
    return reduceVector(pooled, reduction = reduction)

def search(query, docIDList, K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'):
    '''Rank `docIDList` by semanticScore against `query` and return the best K IDs.

    Documents are ordered by descending (score, ID), so equal scores are
    broken by the larger ID.
    '''
    scores = [semanticScore(query, docID, method = method, axis = axis, pooling = pooling, reduction = reduction) for docID in docIDList]
    ranked = sorted(zip(scores, docIDList), reverse=True)
    ## Keep top-K documents only
    return [docID for _, docID in ranked[:K]]

def searchFuzzy(query, docIDList, K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'):
    '''Rank `docIDList` by fuzzyScore against `query` and return the best K IDs.

    Documents are ordered by descending (score, ID), so equal scores are
    broken by the larger ID.
    '''
    scores = [fuzzyScore(query, docID, method = method, axis = axis, pooling = pooling, reduction = reduction) for docID in docIDList]
    ranked = sorted(zip(scores, docIDList), reverse=True)
    ## Keep top-K documents only
    return [docID for _, docID in ranked[:K]]

def normalize(lis):
    '''Min-max scale a sequence of numbers into [0, 1].

    Returns a new list with (x - min) / (max - min) for every x.
    Two degenerate inputs used to raise and are now handled:
    an empty input returns [], and an input whose values are all equal
    (max == min, i.e. division by zero) returns all zeros.
    '''
    lis = list(lis)
    if not lis:
        return []
    _min = min(lis)
    _max = max(lis)
    span = _max - _min
    if span == 0:
        ## No spread to scale by: every score is equally (un)informative.
        return [0.0 for _ in lis]
    return [(x - _min) / span for x in lis]

def averageScores(scores1, scores2, weight_1 = 0.3, weight_2 = 0.7):
    '''Element-wise weighted sum of two equally long score lists.

    Parameters
    ----------
    scores1, scores2 : sequence of numbers
        Scores for the same documents in the same order.
    weight_1, weight_2 : float
        Weights applied to `scores1` and `scores2`. The defaults (0.3 / 0.7)
        are the values that used to be hard-coded, so existing calls are
        unaffected; other mixes no longer require editing the function.

    Returns
    -------
    list of float
        weight_1 * scores1[i] + weight_2 * scores2[i] for every position i.
    '''
    array_1 = np.array(scores1)
    array_2 = np.array(scores2)
    meanArray = weight_1*array_1 + weight_2*array_2
    return meanArray.tolist()

def searchFusion(query, docIDList, K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'):
    '''Rank `docIDList` by a blend of fuzzy and semantic scores and return the best K IDs.

    Fuzzy scores are passed through normalize() first; semantic scores are used
    as returned. The two lists are combined by averageScores and the documents
    are ordered by descending (score, ID).
    '''
    scoreOptions = dict(method = method, axis = axis, pooling = pooling, reduction = reduction)
    fuzzyScores = normalize([fuzzyScore(query, docID, **scoreOptions) for docID in docIDList])
    semanticScores = [semanticScore(query, docID, **scoreOptions) for docID in docIDList]
    fusedScores = averageScores(fuzzyScores, semanticScores)
    ranked = sorted(zip(fusedScores, docIDList), reverse=True)
    ## Keep top-K documents only
    return [docID for _, docID in ranked[:K]]

In [5]:
## Load the candidate ID lists. Each line of the file is parsed as JSON and
## overwrites `data`, so only the last line is used afterwards.
## esResults[i] is later used as the candidate list for queryList[i].
with open('./data/papersForEntity2.json', 'r') as file:
    for line in file:
        data = json.loads(line)
## NOTE(review): `embeddingResults` is disabled here but still read by the
## "Embedding ID List input" cell, which therefore fails on a fresh kernel.
#embeddingResults = data['embeddingResults']
esResults = data['esResults']

In [6]:
## Evaluation queries; position i pairs with esResults[i] in the ranking cells.
## The third string must stay identical to the literal special-cased inside
## getSpotsAndEntities, otherwise that hard-coded annotation is bypassed.
queryList = ['converting text to speech', 'big data', 'efficient estimation of word representations in vector space', 'natural language interface', 'reinforcement learning in video game']

In [12]:
## ES ID List input, semantic search, title
## results1[i] = top-10 re-ranking of esResults[i] for queryList[i].
results1 = []
for i, queryText in enumerate(queryList):
    results1.append(search(queryText, esResults[i], K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'))

In [12]:
## Embedding ID List input, semantic search, title
## NOTE(review): `embeddingResults` is only assigned in a commented-out line of the
## data-loading cell, so this cell raises NameError unless it is defined elsewhere.
results2 = []
for i, queryText in enumerate(queryList):
    results2.append(search(queryText, embeddingResults[i], K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'))

In [21]:
## elastic search ID List input, semantic search, body
## results3[i] = top-10 re-ranking of esResults[i] for queryList[i].
results3 = []
for i, queryText in enumerate(queryList):
    results3.append(search(queryText, esResults[i], K = 10, method = 'body', axis = 'column', pooling = 'max', reduction = 'avg'))

In [14]:
## elastic search ID List input, fuzzy search
## NOTE(review): this heading originally said "body", but the call below uses
## method = 'title'; decide which one is intended.
results4 = []
for i, queryText in enumerate(queryList):
    results4.append(searchFuzzy(queryText, esResults[i], K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'))

In [15]:
## elastic search ID List input, fusion search, semantic weight 0.7, title
## results5[i] = top-10 fused re-ranking of esResults[i] for queryList[i].
results5 = []
for i, queryText in enumerate(queryList):
    results5.append(searchFusion(queryText, esResults[i], K = 10, method = 'title', axis = 'column', pooling = 'max', reduction = 'avg'))

In [16]:
## elastic search ID List input, fusion search, semantic weight 0.7, body
## results6[i] = top-10 fused re-ranking of esResults[i] for queryList[i].
results6 = []
for i, queryText in enumerate(queryList):
    results6.append(searchFusion(queryText, esResults[i], K = 10, method = 'body', axis = 'column', pooling = 'max', reduction = 'avg'))

In [None]:
## elastic search ID List input, fusion search, semantic weight 0.5, body
#results7 = []
#for i in range(len(queryList)):
#    results7.append(searchFusion(queryList[i], esResults[i], K = 10, method = 'body', axis = 'column', pooling = 'max', reduction = 'avg'))

In [4]:
## Read the paper file (one JSON object per line) into two parallel lists:
## titles[k] and IDList[k] come from the same line.
titles = []
IDList = []
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as file:
    for line in tqdm(file):
        data = json.loads(line)
        titles.append(data['title'])
        IDList.append(data['id'])
def ret(paperID):
    '''Return the title stored for `paperID`, or None when the ID is not in IDList.

    Linear scan over the parallel lists IDList / titles; the first match wins.
    '''
    matches = (paperTitle for knownID, paperTitle in zip(IDList, titles) if knownID == paperID)
    return next(matches, None)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [24]:
## Annotation pool: for every query, the union of the top-K IDs of all six rankings.
resutlIDList = []
for r1, r2, r3, r4, r5, r6 in zip(results1, results2, results3, results4, results5, results6):
    lis = list(set().union(r1, r2, r3, r4, r5, r6))
    resutlIDList.append(lis)

from functools import reduce
import operator
## The [] initial value keeps reduce() from raising TypeError when resutlIDList is empty.
IDs = set(reduce(operator.concat, resutlIDList, []))
print('Number of papers to be annotated: ', len(IDs))

## Collect title / abstract / fields of study for every pooled paper.
PapersOutFileName = './data/es/dblp_AIpapers_v1.json'
i = 0
records = dict()
with open(PapersOutFileName, 'r') as file:
    for line in tqdm(file):
        ## Only lines at odd 0-based positions are parsed. Presumably the even ones are
        ## Elasticsearch bulk action lines -- TODO confirm against the file.
        if i % 2 != 0:
            data = json.loads(line)
            if (data['id'] in IDs):
                records[data['id']] = {'title': data['title'], 'abstract': data['abstract'], 'fos': ', '.join(data['fos'])}
        i += 1
        
import random
import csv

## One row per (query, paper) pair; the trailing 0 fills the 'score' column.
## A pooled ID that is missing from `records` raises KeyError here.
rows = []
for query, idSubList in tqdm(zip(queryList, resutlIDList)):
    for ID in idSubList:
        localDict = records[ID]
        rows.append([query, ID, localDict['title'], localDict['abstract'], localDict['fos'], 0])
## NOTE(review): unseeded shuffle -- the row order differs on every run.
random.shuffle(rows)
        
## newline='' is what the csv module asks for on files given to csv.writer; without it,
## platforms that translate '\n' write an extra '\r' per row.
with open('./data/entityAnnotations.csv', "w", newline='') as file:
    writer = csv.writer(file)
    header = ['query', 'id', 'title', 'abstract', 'fos', 'score']
    writer.writerow(header)
    writer.writerows(rows)

results = [results1, results2, results3, results4, results5, results6]

## One JSON object per line: {'id': index into `results`, 'result': that ranking}.
with open('./data/entity_search_results.json', 'w') as outfile:
    for i in tqdm(range(len(results))):
        outDict = dict()
        outDict['id'] = i
        outDict['result'] = results[i]
        json.dump(outDict, outfile)
        outfile.write('\n')

Number of papers to be annotated:  121


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




['Learning a Natural Language Interface with Neural Programmer',
 'Eviza: A Natural Language Interface for Visual Analysis',
 'A natural language interface for querying general and individual knowledge',
 'Interacting with data warehouse by using a natural language interface',
 'Constructing an interactive natural language interface for relational databases',
 'Accessing Touristic Knowledge Bases through a Natural Language Interface',
 'Conversation-Based Natural Language Interface to Relational Databases',
 'Towards a Natural Language Interface for CAD',
 'A natural language interface for performing database updates',
 'KID Designing a Knowledge-Based Natural Language Interface']

In [27]:
## elastic search ID List input, semantic search, body
## NOTE(review): same call and arguments as the cell that fills results3.
results7 = []
for i, queryText in enumerate(queryList):
    results7.append(search(queryText, esResults[i], K = 10, method = 'body', axis = 'column', pooling = 'max', reduction = 'avg'))

In [24]:
## Titles of the ranking in results3 for the third query (queryList[2]).
[ret(ID) for ID in results3[2]]

['Salience Estimation via Variational Auto-Encoders for Multi-Document Summarization.',
 'Vector Space Representations of Documents in Classifying Finnish Social Media Texts',
 'Efficient Non-parametric Estimation of Multiple Embeddings per Word in Vector Space',
 'Learning Latent Vector Spaces for Product Search',
 'Semantic visualization for spherical representation',
 'Compressing Neural Language Models by Sparse Word Representations',
 'Improving Language Estimation with the Paragraph Vector Model for Ad-hoc Retrieval',
 'Efficient Estimation of Word Representations in Vector Space',
 'A Generative Model of Vector Space Semantics',
 'Vector Space Models for Phrase-based Machine Translation']

In [18]:
## Annotation pool: for every query, the union of the top-K IDs of results1 and results3.
resutlIDList = []
for r1, r2 in zip(results1,  results3):
    lis = list(set().union(r1, r2))
    resutlIDList.append(lis)


from functools import reduce
import operator
## The [] initial value keeps reduce() from raising TypeError when resutlIDList is empty.
IDs = set(reduce(operator.concat, resutlIDList, []))
print('Number of papers to be annotated: ', len(IDs))

## Collect title / abstract / fields of study for every pooled paper.
PapersOutFileName = './data/es/dblp_AIpapers_v1.json'
i = 0
records = dict()
with open(PapersOutFileName, 'r') as file:
    for line in tqdm(file):
        ## Only lines at odd 0-based positions are parsed. Presumably the even ones are
        ## Elasticsearch bulk action lines -- TODO confirm against the file.
        if i % 2 != 0:
            data = json.loads(line)
            if (data['id'] in IDs):
                records[data['id']] = {'title': data['title'], 'abstract': data['abstract'], 'fos': ', '.join(data['fos'])}
        i += 1
        
import random
import csv

## One row per (query, paper) pair; the trailing 0 fills the 'score' column.
## A pooled ID that is missing from `records` raises KeyError here.
rows = []
for query, idSubList in tqdm(zip(queryList, resutlIDList)):
    for ID in idSubList:
        localDict = records[ID]
        rows.append([query, ID, localDict['title'], localDict['abstract'], localDict['fos'], 0])
## NOTE(review): unseeded shuffle -- the row order differs on every run.
random.shuffle(rows)
        
## newline='' is what the csv module asks for on files given to csv.writer; without it,
## platforms that translate '\n' write an extra '\r' per row.
with open('./data/entityAnnotationsSiamese2.csv', "w", newline='') as file:
    writer = csv.writer(file)
    header = ['query', 'id', 'title', 'abstract', 'fos', 'score']
    writer.writerow(header)
    writer.writerows(rows)

results = [results1, results3]

## One JSON object per line: {'id': index into `results`, 'result': that ranking}.
with open('./data/entity_search_resultsSiamese2.json', 'w') as outfile:
    for i in tqdm(range(len(results))):
        outDict = dict()
        outDict['id'] = i
        outDict['result'] = results[i]
        json.dump(outDict, outfile)
        outfile.write('\n')

Number of papers to be annotated:  50


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [2]:
## Reload the candidate ID lists (same statements as the earlier loading cell).
## Each line of the file is parsed as JSON and overwrites `data`, so only the
## last line is used afterwards.
with open('./data/papersForEntity2.json', 'r') as file:
    for line in file:
        data = json.loads(line)
#embeddingResults = data['embeddingResults']
esResults = data['esResults']

In [4]:
## Number of candidate IDs available for the first query.
len(esResults[0])

100

In [7]:
## Titles of the first ten candidates for the first query, in the order stored in esResults.
[ret(ID) for ID in esResults[0][:10]]

['Converting text into agent animations: assigning gestures to text',
 'Text-to-speech conversion technology',
 'The DEMOSTHeNES speech composer.',
 'Generating expressive speech for storytelling applications',
 'Visual signals in text comprehension: How to restore them when oralizing a text via a speech synthesis?',
 'Re-engineering letter-to-sound rules',
 'Rules and Algorithms for Phonetic Transcription of Standard Malay',
 'Converting numerical classification into text classification',
 'Semantator: Semantic Annotator for Converting Biomedical Text to Linked Data',
 'An unrestricted vocabulary Arabic speech synthesis system']

In [10]:
def rankList(query, docList, K=10):
    '''Rank `docList` by whole-title embedding similarity to `query`; return the best K IDs.

    The query and every title (looked up with ret) are embedded with `embed`
    and compared with cosineSimilarity. Documents are ordered by descending
    (score, ID).
    '''
    queryVector = embed([query])[0]
    titleVectors = embed([ret(ID) for ID in docList])
    scores = [cosineSimilarity(queryVector, titleVector) for titleVector in titleVectors]

    ranked = sorted(zip(scores, docList), reverse=True)
    ## Keep top-K documents only
    return [ID for _, ID in ranked[:K]]

In [11]:
## Whole-title embedding re-ranking of the Elasticsearch candidates.
## NOTE(review): this rebinds `results1`, which an earlier cell fills with the
## entity-based `search` rankings; later cells see whichever assignment ran last.
results1 = []
for i, queryText in enumerate(queryList):
    results1.append(rankList(queryText, esResults[i]))

In [15]:
## Titles of the ranking in results1 for the fourth query (queryList[3]).
[ret(ID) for ID in results1[3]]

['Natural-language interface',
 'Generation in a natural language interface',
 'Natural Language Understanding',
 'Natural language learning',
 'An Intelligent Query Interface with Natural Language Support.',
 'The role of natural language in a multimodal interface',
 'A natural language interface for querying general and individual knowledge',
 'Choice of words in the generation process of a natural language interface',
 'Lifer: a natural language interface facility',
 'Databases and Natural Language Interfaces.']

In [16]:
## Abstracts in file order. They line up with IDList only because IDList was
## built from the same file, read in the same order, in an earlier cell.
abstracts = []

with open('./data/dblp_AIpapers2Thresholded.json', 'r') as file:
    for line in tqdm(file):
        data = json.loads(line)
        abstracts.append(data['abstract'])
def retAbs(paperID):
    '''Return the abstract stored for `paperID`, or None when the ID is not in IDList.

    Linear scan over the parallel lists IDList / abstracts; the first match wins.
    '''
    matches = (abstract for knownID, abstract in zip(IDList, abstracts) if knownID == paperID)
    return next(matches, None)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [19]:
## Abstracts of the ranking in results1 for the fourth query (queryList[3]).
## NOTE(review): the recorded output shows each abstract as a list of words (or an
## empty list), so this dumps a very long output; consider displaying a slice instead.
[retAbs(ID) for ID in results1[3]]

[[],
 ['The',
  'PHRED',
  '(PHR',
  'asal',
  'English',
  'Diction)',
  'generator',
  'produces',
  'the',
  'natural',
  'language',
  'output',
  'of',
  "Berkeley's",
  'UNIX',
  'Consultant',
  'system',
  '(UC).',
  'shares',
  'its',
  'knowledge',
  'base',
  'with',
  'analyzer',
  'PHRAN',
  '(PHRasal',
  'ANalyser).',
  'parser',
  'and',
  'generator,',
  'together',
  'component',
  "UC's",
  'user',
  'interface,',
  'draw',
  'from',
  'database',
  'pattern-concept',
  'pairs',
  'where',
  'basic',
  'unit',
  'linguistic',
  'patterns',
  'is',
  'phrase.',
  'Both',
  'are',
  'designed',
  'to',
  'provide',
  'multilingual',
  'capabilities,',
  'facilitate',
  'paraphrases,',
  'be',
  'adaptable',
  'individual',
  "user's",
  'vocabulary',
  'knowledge.',
  'affords',
  'extensibility,simplicity,',
  'processing',
  'speed',
  'while',
  'performing',
  'task',
  'producing',
  'utterances',
  'conceptual',
  'representations',
  'using',
  'large',
  'base.',