In [1]:
import json
from elasticsearch import Elasticsearch
import requests
from tqdm.notebook import tqdm
import re
import fasttext 
import numpy as np
from numpy import dot
from numpy.linalg import norm
import hnswlib
from collections import defaultdict, Counter
import string

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
module_url = "./module/UnivTrans" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
def embed(inputText):
    return model(inputText)

In [2]:
fasttextModel = fasttext.load_model('crawl-300d-2M-subword.bin')




In [3]:
def getQueryEmbedding(query, embeddingType='fastText'):
    
    '''Get embedding for a single query. Query is pre-processed in this function itself'''

    translator = str.maketrans('', '', string.punctuation) 
    query.strip()
    query = query.translate(translator)
    query = ' '.join(query.split())
    
    if embeddingType == 'fastText':
        embedding = fasttextModel.get_word_vector(query)
    elif embeddingType == 'USE':
        embedding = embed([query])[0].numpy()
        
    return np.asarray(embedding)


def getQueryEmbeddings(queryList, embeddingType='fastText'):
    '''Get embedding list for a list of queries. Query is pre-processed in this function itself''' 
    embeddings = []
    
    ## Preprocessing
    translator = str.maketrans('', '', string.punctuation) 
    for i in range(len(queryList)):
            queryList[i].strip()
            queryList[i] = queryList[i].translate(translator)
            queryList[i] = ' '.join(queryList[i].split())
    
    if embeddingType == 'fastText':
        for query in queryList:
            embedding = fasttextModel.get_word_vector(query)
            embeddings.append(embedding)
    elif embeddingType == 'USE':
        embeddings = embed(queryList).numpy()
        
    return np.asarray(embeddings)

In [4]:
def getDocumentEmbeddings(docIDList, method='abstract', embeddingType='fastText'):
    docIDSet = set(docIDList)
    embeddingDictForDocs = dict()
    
    if method=='abstract':
        if embeddingType == 'fastText':
            filename = './data/dblpAbstract_2Thresholded_FT_Embeddings.json'
        elif embeddingType == 'USE':
            filename = './data/dblp_Abstract_2Thresholded_USE_Trans_Embeddings.json'
        with open(filename, 'r') as file:
                for line in file:
                    data = json.loads(line)
                    if data['id'] in docIDSet:
                        embeddingDictForDocs[data['id']] = data['embedding']
                        
    elif method=='title':
        if embeddingType == 'fastText':
            filename = './data/dblpTitle_2Thresholded_FT_Embeddings.json'
        elif embeddingType == 'USE':
            filename = './data/dblp_Title_2Thresholded_USE_Trans_Embeddings.json'
        with open(filename, 'r') as file:
                for line in file:
                    data = json.loads(line)
                    if data['id'] in docIDSet:
                        embeddingDictForDocs[data['id']] = data['embedding']
    
    elif method=='fos':
        records = []
        PapersOutFileName = './data/es/dblp_AIpapers_v1.json'
        i = 0

        with open(PapersOutFileName, 'r') as file:
            for line in file:
                if i % 2 != 0:
                    data = json.loads(line)
                    if data['id'] in docIDSet:
                        records.append(data)
                i += 1
        
        assert len(records) == len(docIDList)
        
        if embeddingType == 'fastText':
            fileName = './data/dblp_fos_FT_Phrase_embeddings.json'
        elif embeddingType == 'USE':
            fileName = './data/dblp_fos_USE_embeddings.json'
        
        embeddingDict = dict()
        with open(fileName, 'r') as file:
            for line in file:
                data = json.loads(line)
                embeddingDict[data['fos']] = np.asarray(data['embedding']) 
        
        for record in tqdm(records):
            recordEmbeddingList = []
            for fos in record['fos']:
                recordEmbeddingList.append(embeddingDict[fos])
            embeddingDictForDocs[record['id']] = np.mean(recordEmbeddingList, axis = 0)
           
            
    elif method=='fosIdf':
        records = []
        PapersOutFileName = './data/es/dblp_AIpapers_v1.json'
        i = 0

        with open(PapersOutFileName, 'r') as file:
            for line in file:
                if i % 2 != 0:
                    data = json.loads(line)
                    records.append(data)
                i += 1
        
        if embeddingType == 'fastText':
            fileName = './data/dblp_fos_FT_Phrase_embeddings.json'
        elif embeddingType == 'USE':
            filename = './data/dblp_fos_USE_embeddings.json'
        
        embeddingDict = dict()
        fosCount = dict()
        N = len(records)
        with open(filename, 'r') as file:
            for line in file:
                data = json.loads(line)
                embeddingDict[data['fos']] = np.asarray(data['embedding']) 
                fosCount[data['fos']] = data['count']
        
        for record in tqdm(records):
            recordEmbeddingList = []
            weightList = []
            for fos in record['fos']:
                recordEmbeddingList.append(embeddingDict[fos] * (N / fosCount[fos]))
                weightList.append((N / fosCount[fos]))
            embeddingDictForDocs[record['id']] = np.mean(recordEmbeddingList, axis = 0) / np.sum(weightList)
            
    embeddings = []
    for docID in docIDList:
        embeddings.append(embeddingDictForDocs[docID])
    assert len(embeddings) == len(docIDList)
    return np.asarray(embeddings)



def getAllDocumentEmbeddings(method='abstract', embeddingType='fastText'):
    
    embeddings = []
    
    if method=='abstract':
        if embeddingType == 'fastText':
            filename = './data/dblpAbstract_2Thresholded_FT_Embeddings.json'
        elif embeddingType == 'USE':
            filename = './data/dblp_Abstract_2Thresholded_USE_Trans_Embeddings.json'
        with open(filename, 'r') as file:
                for line in file:
                    data = json.loads(line)
                    embedding = data['embedding']
                    embeddings.append(embedding)
    elif method=='title':
        if embeddingType == 'fastText':
            filename = './data/dblpTitle_2Thresholded_FT_Embeddings.json'
        elif embeddingType == 'USE':
            filename = './data/dblp_Title_2Thresholded_USE_Trans_Embeddings.json'
        with open(filename, 'r') as file:
                for line in file:
                    data = json.loads(line)
                    embedding = data['embedding']
                    embeddings.append(embedding)
    
    elif method=='fos':
        records = []
        PapersOutFileName = './data/es/dblp_AIpapers_v1.json'
        i = 0

        with open(PapersOutFileName, 'r') as file:
            for line in file:
                if i % 2 != 0:
                    data = json.loads(line)
                    records.append(data)
                i += 1
        
        if embeddingType == 'fastText':
            fileName = './data/dblp_fos_FT_Phrase_embeddings.json'
        elif embeddingType == 'USE':
            filename = './data/dblp_fos_USE_embeddings.json'
        
        embeddingDict = dict()
        with open(fileName, 'r') as file:
            for line in file:
                data = json.loads(line)
                embeddingDict[data['fos']] = np.asarray(data['embedding']) 
        
        for record in tqdm(records):
            recordEmbeddingList = []
            for fos in record['fos']:
                recordEmbeddingList.append(embeddingDict[fos])
            embeddings.append(np.mean(recordEmbeddingList, axis = 0))
            
    elif method=='fosIdf':
        records = []
        PapersOutFileName = './data/es/dblp_AIpapers_v1.json'
        i = 0

        with open(PapersOutFileName, 'r') as file:
            for line in file:
                if i % 2 != 0:
                    data = json.loads(line)
                    records.append(data)
                i += 1
        
        if embeddingType == 'fastText':
            fileName = './data/dblp_fos_FT_Phrase_embeddings.json'
        elif embeddingType == 'USE':
            fileName = './data/dblp_fos_USE_embeddings.json'
        
        embeddingDict = dict()
        fosCount = dict()
        N = len(records)
        with open(fileName, 'r') as file:
            for line in file:
                data = json.loads(line)
                embeddingDict[data['fos']] = np.asarray(data['embedding']) 
                fosCount[data['fos']] = data['count']
        
        for record in tqdm(records):
            recordEmbeddingList = []
            weightList = []
            for fos in record['fos']:
                recordEmbeddingList.append(embeddingDict[fos] * (N / fosCount[fos]))
                weightList.append((N / fosCount[fos]))
            embeddings.append(np.mean(recordEmbeddingList, axis = 0) / np.sum(weightList))
    
    return np.asarray(embeddings)

## Search using Direct embedding Match

In [5]:
def buildIndexer(docEmbeddings):
    numElements = len(docEmbeddings)
    dimension = len(docEmbeddings[0])
    embeddings = np.asarray(docEmbeddings)
    data_labels = np.arange(numElements)
    p = hnswlib.Index(space = 'cosine', dim = dimension) # possible options are l2, cosine or ip
    p.init_index(max_elements = numElements, ef_construction = 200, M = dimension)
    p.add_items(embeddings, data_labels)
    p.set_ef(30) 
    
    return p

In [6]:
def loadIndexer(filepath, numElements):
    p = hnswlib.Index(space='cosine', dim=dimension)  # the space can be changed - keeps the data, alters the distance function.
    p.load_index("./models/fastTexthnswlib.bin", max_elements =numElements)

In [7]:
def searchWithEmbedding(queryList, K=10, method='abstract', embeddingType='fastText'):
    IDList = []
    with open('./data/dblp_AIpapers2Thresholded.json', 'r') as file:
        for line in file:
            data = json.loads(line)
            IDList.append(data['id'])

    docEmbeddings = getAllDocumentEmbeddings(method=method, embeddingType=embeddingType)
    queryEmbeddings = getQueryEmbeddings(queryList, embeddingType=embeddingType)
    p = buildIndexer(docEmbeddings)

    labels, _ = p.knn_query(queryEmbeddings, k = K )
    for i in range(len(labels)):
        labels[i] = [IDList[ind] for ind in labels[i]]
        
    return labels
    

In [8]:

from numpy import dot
from numpy.linalg import norm

def cosine_similarity(a, b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim
def weight(score, esScore):
    embeddingWeight = 0.8
    s = embeddingWeight * score + (1 - embeddingWeight) * esScore
    return s


def normalize(lis):
    _min = min(lis)
    _max = max(lis)
    lis  = [(x - _min)/(_max - _min) for x in lis]
    return lis

def rankList(query, docList, esScoreList=None, K=10):
    '''
    ReRank documents in the docList wrt the query

    Parameters: 
    query (str): query wrt. which the documents will be ranked

    doclist(list[str]): IDs of the documents to rerank, len(doclist >= K)

    esScoreList(list[int]): scores of the documents as returned by Elastic Search

    k(int): number of top documents to return after re-ranking

    Returns: 
    list[str]: re-ranked list of document IDs
    '''
    queryEmbedding = getQueryEmbedding(query, embeddingType='fastText')
    docEmbeddings = getDocumentEmbeddings(docList, method ='fos', embeddingType='fastText')
    docEmbeddings = docEmbeddings.tolist()
    cosineSimScores = [ cosine_similarity(queryEmbedding, np.asarray(docEmbedding)) for docEmbedding in docEmbeddings]
    if esScoreList is None:
        IDsWithScore = [(score, ID) for score, ID in zip(cosineSimScores, docList)]
    elif esScoreList is not None:
         IDsWithScore = [(weight(score, esScore), ID) for score, ID, esScore in zip(cosineSimScores, docList, esScoreList)]
    IDsWithScore.sort(reverse=True)
    IDsWithScore = IDsWithScore[:K + 1]                    ## Keep top-K documents only
    
    return [ID for _,ID in IDsWithScore]
    
    

In [9]:
## Search with Elastic Search
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

def elasticSearch(queryList, index='dblp_v1', K = 10):
    fields = ['id', 'title', 'venue', 'authors', 'year', 'abstract', 'fos']
    queryBody = {
    "query": {
        "multi_match" : {
            "query" : "sentence embeddings",
            "fields" : ['title', 'abstract', 'authors']
        }
    }
}
    queryBody['size'] = K
    searchResults = []
    for query in queryList:
        queryBody['query']['multi_match']['query'] = query
        res= es.search(index=index,body=queryBody)
        searchResults.append([hit['_id'] for hit in res['hits']['hits']])
    return searchResults

def rankedElasticSearch(queryList, index='dblp_v1', K = 10, includeEsScores=False, rerank=True):
    fields = ['id', 'title', 'venue', 'authors', 'year', 'abstract', 'fos']
    queryBody = {
    "query": {
        "multi_match" : {
            "query" : "sentence embeddings",
            "fields" : ['title^3', 'abstract', 'authors', 'fos^2']
        }
    }
}
    if rerank:
        queryBody['size'] = 2 * K
    else:
        queryBody['size'] = K
    queryBody['query']['multi_match']['fuzziness'] = 'AUTO'
    searchResults = []
    for query in queryList:
        queryBody['query']['multi_match']['query'] = query
        res= es.search(index=index,body=queryBody)
        initList = [hit['_id'] for hit in res['hits']['hits']]
        esScoreList = [hit['_score'] for hit in res['hits']['hits']]
        esScoreList = normalize(esScoreList)
        if rerank:
            if includeEsScores:
                searchResults.append(rankList(query, initList, esScoreList = esScoreList))
            else:
                searchResults.append(rankList(query, initList))
        else:
            searchResults.append(initList)
    return searchResults

def rankedMLTElasticSearch(queryList, index='dblp_v1', K = 10, includeEsScores=False):
    fields = ['id', 'title', 'venue', 'authors', 'year', 'abstract', 'fos']
    queryBody = {
    "query": {
        "more_like_this" : {
            "like" : "sentence embeddings",
            "fields" : ['title^3', 'abstract', 'authors', 'fos^2'],
            "min_term_freq" : 1
        }
    }
}    
    queryBody['size'] = 5 * K
    
    searchResults = []
    for query in queryList:
        queryBody['query']['more_like_this']['like'] = query
        res= es.search(index=index,body=queryBody)
        initList = [hit['_id'] for hit in res['hits']['hits']]
        esScoreList = [hit['_score'] for hit in res['hits']['hits']] 
        esScoreList = normalize(esScoreList)
        if includeEsScores:
            searchResults.append(rankList(query, initList, esScoreList = esScoreList))
        else:
            searchResults.append(rankList(query, initList))
        
        
    return searchResults
    

In [10]:
queryList = ['converting word to speech', 'Big data', 'efficient estimation of word representations in vector space', 'natural language interface', 'reinforcement learning in video game']

In [None]:
results1 = searchWithEmbedding(queryList, K=10, method='title', embeddingType='USE')
results1 = [[str(ID) for ID in sublist] for sublist in results1]

In [None]:
results2 = searchWithEmbedding(queryList, K=10, method='fos', embeddingType='fastText')
results2 = [[str(ID) for ID in sublist] for sublist in results2]

In [17]:
results3 = elasticSearch(queryList)

In [18]:
results4 = rankedElasticSearch(queryList, index='dblp_v1', K = 10, includeEsScores=False, rerank=True)

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




In [19]:
results5 = rankedMLTElasticSearch(queryList, index='dblp_v1', K = 10, includeEsScores=False)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [21]:
resutlIDList = []
for r1, r2, r3, r4, r5 in zip(results1, results2, results3, results4, results5):
    lis = list(set().union(r1, r2, r3, r4))
    resutlIDList.append(lis)

In [22]:
from functools import reduce
import operator
IDs = set(reduce(operator.concat, resutlIDList))

TypeError: reduce() of empty sequence with no initial value

In [None]:
PapersOutFileName = './data/es/dblp_AIpapers_v1.json'
i = 0
records = dict()
with open(PapersOutFileName, 'r') as file:
    for line in tqdm(file):
        if i % 2 != 0:
            data = json.loads(line)
            if (data['id'] in IDs):
                records[data['id']] = {'title': data['title'], 'abstract': data['abstrat'], 'fos': ', '.join(data['fos'])}
        i += 1

In [None]:
import csv
import random

rows = []
    for query, idSubList in tqdm(zip(queryList, resutlIDList)):
        for ID in idSubList:
            localDict = records[ID]
            rows.append([query, ID, localDict['title'], localDict['abstract'], localDict['fos'], 0])
random.shuffle(rows)
        
with open('./data/annotations.csv', "w") as file:
    writer = csv.writer(file)
    header = ['Query', 'id', 'title', 'abstract', 'fos', 'score']
    writer.writerow(header)
    for row in rows:
            writer.writerow(row)
