In [4]:
import json
from elasticsearch import Elasticsearch
import requests
from tqdm.notebook import tqdm
import re 
import numpy as np
from numpy import dot
from numpy.linalg import norm
import hnswlib
from collections import defaultdict, Counter
import string

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
module_url = "./module/UnivTrans" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
def embed(inputText):
    return model(inputText)

In [None]:
import fasttext
fasttextModel = fasttext.load_model('crawl-300d-2M-subword.bin')

In [5]:
def getQueryEmbedding(query, embeddingType='fastText'):
    
    '''Get embedding for a single query. Query is pre-processed in this function itself'''

    translator = str.maketrans('', '', string.punctuation) 
    query.strip()
    query = query.translate(translator)
    query = ' '.join(query.split())
    
    if embeddingType == 'fastText':
        embedding = fasttextModel.get_word_vector(query)
    elif embeddingType == 'USE':
        embedding = embed([query])[0].numpy()
        
    return np.asarray(embedding)


def getQueryEmbeddings(queryList, embeddingType='fastText'):
    '''Get embedding list for a list of queries. Query is pre-processed in this function itself''' 
    embeddings = []
    
    ## Preprocessing
    translator = str.maketrans('', '', string.punctuation) 
    for i in range(len(queryList)):
            queryList[i].strip()
            queryList[i] = queryList[i].translate(translator)
            queryList[i] = ' '.join(queryList[i].split())
    
    if embeddingType == 'fastText':
        for query in queryList:
            embedding = fasttextModel.get_word_vector(query)
            embeddings.append(embedding)
    elif embeddingType == 'USE':
        embeddings = embed(queryList).numpy()
        
    return np.asarray(embeddings)

In [35]:
def getDocumentEmbeddings(docIDList, method='abstract', embeddingType='fastText'):
    docIDSet = set(docIDList)
    embeddingDictForDocs = dict()
    
    if method=='abstract':
        if embeddingType == 'fastText':
            filename = './data/dblpAbstract_2Thresholded_FT_Embeddings.json'
        elif embeddingType == 'USE':
            filename = './data/dblp_Abstract_2Thresholded_USE_Trans_Embeddings.json'
        elif embeddingType == 'tfIdf':
            filename = './data/dblpAbstract_2Thresholded_TfIdfUni_Embeddings.json'
        with open(filename, 'r') as file:
                for line in file:
                    data = json.loads(line)
                    if data['id'] in docIDSet:
                        embeddingDictForDocs[data['id']] = data['embedding']
                        
    elif method=='title':
        if embeddingType == 'fastText':
            filename = './data/dblpTitle_2Thresholded_FT_Embeddings.json'
        elif embeddingType == 'USE':
            filename = './data/dblp_Title_2Thresholded_USE_Trans_Embeddings.json'
        with open(filename, 'r') as file:
                for line in file:
                    data = json.loads(line)
                    if data['id'] in docIDSet:
                        embeddingDictForDocs[data['id']] = data['embedding']
    
    elif method=='fos':
        records = []
        PapersOutFileName = './data/es/dblp_AIpapers_v1.json'
        i = 0

        with open(PapersOutFileName, 'r') as file:
            for line in file:
                if i % 2 != 0:
                    data = json.loads(line)
                    if data['id'] in docIDSet:
                        records.append(data)
                i += 1
        
        assert len(records) == len(docIDList)
        
        if embeddingType == 'fastText':
            fileName = './data/dblp_fos_FT_Phrase_embeddings.json'
        elif embeddingType == 'USE':
            fileName = './data/dblp_fos_USE_embeddings.json'
        
        embeddingDict = dict()
        with open(fileName, 'r') as file:
            for line in file:
                data = json.loads(line)
                embeddingDict[data['fos']] = np.asarray(data['embedding']) 
        
        for record in tqdm(records):
            recordEmbeddingList = []
            for fos in record['fos']:
                recordEmbeddingList.append(embeddingDict[fos])
            embeddingDictForDocs[record['id']] = np.mean(recordEmbeddingList, axis = 0)
           
            
    elif method=='fosIdf':
        records = []
        PapersOutFileName = './data/es/dblp_AIpapers_v1.json'
        i = 0

        with open(PapersOutFileName, 'r') as file:
            for line in file:
                if i % 2 != 0:
                    data = json.loads(line)
                    records.append(data)
                i += 1
        
        if embeddingType == 'fastText':
            fileName = './data/dblp_fos_FT_Phrase_embeddings.json'
        elif embeddingType == 'USE':
            filename = './data/dblp_fos_USE_embeddings.json'
        
        embeddingDict = dict()
        fosCount = dict()
        N = len(records)
        with open(filename, 'r') as file:
            for line in file:
                data = json.loads(line)
                embeddingDict[data['fos']] = np.asarray(data['embedding']) 
                fosCount[data['fos']] = data['count']
        
        for record in tqdm(records):
            recordEmbeddingList = []
            weightList = []
            for fos in record['fos']:
                recordEmbeddingList.append(embeddingDict[fos] * (N / fosCount[fos]))
                weightList.append((N / fosCount[fos]))
            embeddingDictForDocs[record['id']] = np.mean(recordEmbeddingList, axis = 0) / np.sum(weightList)
            
    embeddings = []
    for docID in docIDList:
        embeddings.append(embeddingDictForDocs[docID])
    assert len(embeddings) == len(docIDList)
    return np.asarray(embeddings)



def getAllDocumentEmbeddings(method='abstract', embeddingType='fastText'):
    
    embeddings = []
    
    if method=='abstract':
        if embeddingType == 'fastText':
            filename = './data/dblpAbstract_2Thresholded_FT_Embeddings.json'
        elif embeddingType == 'USE':
            filename = './data/dblp_Abstract_2Thresholded_USE_Trans_Embeddings.json'
        with open(filename, 'r') as file:
                for line in file:
                    data = json.loads(line)
                    embedding = data['embedding']
                    embeddings.append(embedding)
    elif method=='title':
        if embeddingType == 'fastText':
            filename = './data/dblpTitle_2Thresholded_FT_Embeddings.json'
        elif embeddingType == 'USE':
            filename = './data/dblp_Title_2Thresholded_USE_Trans_Embeddings.json'
        with open(filename, 'r') as file:
                for line in file:
                    data = json.loads(line)
                    embedding = data['embedding']
                    embeddings.append(embedding)
    
    elif method=='fos':
        records = []
        PapersOutFileName = './data/es/dblp_AIpapers_v1.json'
        i = 0

        with open(PapersOutFileName, 'r') as file:
            for line in file:
                if i % 2 != 0:
                    data = json.loads(line)
                    records.append(data)
                i += 1
        
        if embeddingType == 'fastText':
            fileName = './data/dblp_fos_FT_Phrase_embeddings.json'
        elif embeddingType == 'USE':
            filename = './data/dblp_fos_USE_embeddings.json'
        
        embeddingDict = dict()
        with open(fileName, 'r') as file:
            for line in file:
                data = json.loads(line)
                embeddingDict[data['fos']] = np.asarray(data['embedding']) 
        
        for record in tqdm(records):
            recordEmbeddingList = []
            for fos in record['fos']:
                recordEmbeddingList.append(embeddingDict[fos])
            embeddings.append(np.mean(recordEmbeddingList, axis = 0))
            
    elif method=='fosIdf':
        records = []
        PapersOutFileName = './data/es/dblp_AIpapers_v1.json'
        i = 0

        with open(PapersOutFileName, 'r') as file:
            for line in file:
                if i % 2 != 0:
                    data = json.loads(line)
                    records.append(data)
                i += 1
        
        if embeddingType == 'fastText':
            fileName = './data/dblp_fos_FT_Phrase_embeddings.json'
        elif embeddingType == 'USE':
            fileName = './data/dblp_fos_USE_embeddings.json'
        
        embeddingDict = dict()
        fosCount = dict()
        N = len(records)
        with open(fileName, 'r') as file:
            for line in file:
                data = json.loads(line)
                embeddingDict[data['fos']] = np.asarray(data['embedding']) 
                fosCount[data['fos']] = data['count']
        
        for record in tqdm(records):
            recordEmbeddingList = []
            weightList = []
            for fos in record['fos']:
                recordEmbeddingList.append(embeddingDict[fos] * (N / fosCount[fos]))
                weightList.append((N / fosCount[fos]))
            embeddings.append(np.mean(recordEmbeddingList, axis = 0) / np.sum(weightList))
    
    return np.asarray(embeddings)

## Search using Direct embedding Match

In [7]:
def buildIndexer(docEmbeddings):
    numElements = len(docEmbeddings)
    dimension = len(docEmbeddings[0])
    embeddings = np.asarray(docEmbeddings)
    data_labels = np.arange(numElements)
    p = hnswlib.Index(space = 'cosine', dim = dimension) # possible options are l2, cosine or ip
    p.init_index(max_elements = numElements, ef_construction = 200, M = dimension)
    p.add_items(embeddings, data_labels)
    p.set_ef(30) 
    
    return p

In [8]:
def loadIndexer(filepath, numElements):
    p = hnswlib.Index(space='cosine', dim=dimension)  # the space can be changed - keeps the data, alters the distance function.
    p.load_index("./models/fastTexthnswlib.bin", max_elements =numElements)

In [9]:
def searchWithEmbedding(queryList, K=10, method='abstract', embeddingType='fastText'):
    IDList = []
    with open('./data/dblp_AIpapers2Thresholded.json', 'r') as file:
        for line in file:
            data = json.loads(line)
            IDList.append(data['id'])

    docEmbeddings = getAllDocumentEmbeddings(method=method, embeddingType=embeddingType)
    queryEmbeddings = getQueryEmbeddings(queryList, embeddingType=embeddingType)
    p = buildIndexer(docEmbeddings)

    labels, _ = p.knn_query(queryEmbeddings, k = K )
    for i in range(len(labels)):
        labels[i] = [IDList[ind] for ind in labels[i]]
        
    return labels
    

In [10]:

from numpy import dot
from numpy.linalg import norm

def cosine_similarity(a, b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim
def weight(score, esScore):
    embeddingWeight = 0.8
    s = embeddingWeight * score + (1 - embeddingWeight) * esScore
    return s


def normalize(lis):
    _min = min(lis)
    _max = max(lis)
    lis  = [(x - _min)/(_max - _min) for x in lis]
    return lis

def rankList(query, docList, esScoreList=None, K=10,  method ='fos', embeddingType='fastText'):
    '''
    ReRank documents in the docList wrt the query

    Parameters: 
    query (str): query wrt. which the documents will be ranked

    doclist(list[str]): IDs of the documents to rerank, len(doclist >= K)

    esScoreList(list[int]): scores of the documents as returned by Elastic Search

    k(int): number of top documents to return after re-ranking

    Returns: 
    list[str]: re-ranked list of document IDs
    '''
    queryEmbedding = getQueryEmbedding(query, embeddingType=embeddingType)
    docEmbeddings = getDocumentEmbeddings(docList, method = method, embeddingType=embeddingType)
    docEmbeddings = docEmbeddings.tolist()
    cosineSimScores = [ cosine_similarity(queryEmbedding, np.asarray(docEmbedding)) for docEmbedding in docEmbeddings]
    if esScoreList is None:
        IDsWithScore = [(score, ID) for score, ID in zip(cosineSimScores, docList)]
    elif esScoreList is not None:
         IDsWithScore = [(weight(score, esScore), ID) for score, ID, esScore in zip(cosineSimScores, docList, esScoreList)]
    IDsWithScore.sort(reverse=True)
    IDsWithScore = IDsWithScore[:K + 1]                    ## Keep top-K documents only
    
    return [ID for _,ID in IDsWithScore]
    
    

In [11]:
## Search with Elastic Search
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

def elasticSearch(queryList, index='dblp_v1', K = 10):
    fields = ['id', 'title', 'venue', 'authors', 'year', 'abstract', 'fos']
    queryBody = {
    "query": {
        "multi_match" : {
            "query" : "sentence embeddings",
            "fields" : ['title', 'abstract', 'authors']
        }
    }
}
    queryBody['size'] = K
    searchResults = []
    for query in queryList:
        queryBody['query']['multi_match']['query'] = query
        res= es.search(index=index,body=queryBody)
        searchResults.append([hit['_id'] for hit in res['hits']['hits']])
    return searchResults

def rankedElasticSearch(queryList, index='dblp_v1', K = 10, includeEsScores=False, rerank=True):
    fields = ['id', 'title', 'venue', 'authors', 'year', 'abstract', 'fos']
    queryBody = {
    "query": {
        "multi_match" : {
            "query" : "sentence embeddings",
            "fields" : ['title^3', 'abstract', 'authors', 'fos^2']
        }
    }
}
    if rerank:
        queryBody['size'] = 2 * K
    else:
        queryBody['size'] = K
    queryBody['query']['multi_match']['fuzziness'] = 'AUTO'
    searchResults = []
    for query in queryList:
        queryBody['query']['multi_match']['query'] = query
        res= es.search(index=index,body=queryBody)
        initList = [hit['_id'] for hit in res['hits']['hits']]
        esScoreList = [hit['_score'] for hit in res['hits']['hits']]
        esScoreList = normalize(esScoreList)
        if rerank:
            if includeEsScores:
                searchResults.append(rankList(query, initList, esScoreList = esScoreList))
            else:
                searchResults.append(rankList(query, initList))
        else:
            searchResults.append(initList)
    return searchResults

def rankedMLTElasticSearch(queryList, index='dblp_v1', K = 10, includeEsScores=False, rerank = True):
    fields = ['id', 'title', 'venue', 'authors', 'year', 'abstract', 'fos']
    queryBody = {
    "query": {
        "more_like_this" : {
            "like" : "sentence embeddings",
            "fields" : ['title^3', 'abstract', 'authors', 'fos^2'],
            "min_term_freq" : 1
        }
    }
}    
    queryBody['size'] = 5 * K
    
    searchResults = []
    for query in queryList:
        queryBody['query']['more_like_this']['like'] = query
        res= es.search(index=index,body=queryBody)
        initList = [hit['_id'] for hit in res['hits']['hits']]
        esScoreList = [hit['_score'] for hit in res['hits']['hits']] 
        esScoreList = normalize(esScoreList)
        if rerank:
            if includeEsScores:
                searchResults.append(rankList(query, initList, esScoreList = esScoreList))
            else:
                searchResults.append(rankList(query, initList))
        else:
            searchResults.append(initList)
        
        
    return searchResults

def MLT(docID, index='dblp_v1', K = 10):
    fields = ['id', 'title', 'venue', 'authors', 'year', 'abstract', 'fos']
    queryBody = {
    "query": {
        "more_like_this" : {
            "like" : [
            {
                "_index" : index,
                "_id" : docID
            }
            ],
            "fields" : ['title', 'abstract', 'authors', 'fos'],
            "min_term_freq" : 1
        }
    }
}    
    queryBody['size'] = K
    res= es.search(index=index,body=queryBody)
    return [hit['_id'] for hit in res['hits']['hits']]

In [42]:
def rankWithSVM(paperID, paperEmbedding, docList, embeddings, K = 10, method='abstract', embeddingType='USE'):
    from sklearn import svm
    import random
    
    with open("./data/orderedPaperIDs.json", 'r') as f:
        IDList = json.load(f)
    negSampleSize = 2000
    excludeSet = set(docList)
    excludeSet.add(paperID)
    negSamples = []
    for id in IDList:
        if id not in excludeSet:
            negSamples.append(id)
    random.shuffle(negSamples)
    negSamples = negSamples[:negSampleSize]
    
    y = np.zeros(negSampleSize + 1, dtype=np.float32)
    y[-1] = 1
    trainEmbeddings =  getDocumentEmbeddings(negSamples , method=method, embeddingType=embeddingType).tolist()
    trainEmbeddings.append(paperEmbedding)
    trainEmbeddings = np.asarray(trainEmbeddings)
    
    c = list(zip(trainEmbeddings, y))
    random.shuffle(c)
    trainEmbeddings, y = zip(*c)
    
    clf = svm.LinearSVC(class_weight='balanced', verbose=True, tol=1e-4, C=0.1)
    print('Fitting SVM')
    clf.fit(trainEmbeddings, y)
    print('SVM fitted')
    s = clf.decision_function(np.asarray(embeddings))
    ix = np.argsort(s)[:-K-1:-1]
    return [docList[i] for i in ix]
    
    
def rankListWithCosine(paperID, paperEmbedding, docList, embeddings, K = 10, esScoreList=None):
    paperEmbedding = np.asarray(paperEmbedding)
    
    cosineSimScores = [ cosine_similarity(paperEmbedding, np.asarray(docEmbedding)) for docEmbedding in embeddings]
    if esScoreList is None:
        IDsWithScore = [(score, ID) for score, ID in zip(cosineSimScores, docList)]
    elif esScoreList is not None:
         IDsWithScore = [(weight(score, esScore), ID) for score, ID, esScore in zip(cosineSimScores, docList, esScoreList)]
    IDsWithScore.sort(reverse=True)
    IDsWithScore = IDsWithScore[:K + 1]                    ## Keep top-K documents only

    return [ID for _,ID in IDsWithScore]
    
        

def KClosestNodes(paperID, modelFile, K = 10, loadModel = None):
    '''Given a paper, returns the K closest nodes(as per the augmented citation graph)'''
    from gensim import models
    if loadModel is None:
        loadModel = models.keyedvectors.KeyedVectors.load_word2vec_format(modelFile)
    
    return [id for id, _ in loadModel.most_similar(paperID, topn=K)]

def KClosestReranked(paperID, modelFile,K = 10, rerankScheme='exemplar', embeddingType='USE', method='abstract', loadModel = None):
    '''Given a paper, reranks the closest nodes using text information of the papers'''
    initDocList = KClosestNodes(paperID,  modelFile, K=3*K, loadModel = loadModel)
    embeddings = getDocumentEmbeddings(initDocList + [paperID], method=method, embeddingType=embeddingType).tolist()
    paperEmbedding = embeddings[-1]
    embeddings.pop()
    if rerankScheme == 'exemplar':
        return rankWithSVM(paperID, paperEmbedding, initDocList, embeddings, K = 10, method=method, embeddingType=embeddingType)
    elif rerankScheme == 'cosine':
        return rankListWithCosine(paperID, paperEmbedding, initDocList, embeddings, K = 10, esScoreList=None)
    
    
    

In [None]:
# def esSeeder(queryList, index='dblp_v1', K = 10):
#     fields = ['id', 'title', 'venue', 'authors', 'year', 'abstract', 'fos']
#     queryBody = {
#     "query": {
#         "multi_match" : {
#             "query" : "sentence embeddings",
#             "fields" : ['title', 'abstract', 'authors', 'fos']
#         }
#     }
# }
#     queryBody['size'] = K
#     searchResults = []
#     esScores = []
#     for query in queryList:
#         queryBody['query']['multi_match']['query'] = query
#         res= es.search(index=index,body=queryBody)
#         searchResults.append([hit['_id'] for hit in res['hits']['hits']])
#         esScoreList = [hit['_score'] for hit in res['hits']['hits']]
#         esScoreList = normalize(esScoreList)
#         esScores.append(esScoreList)
#     return searchResults, esScores

# def generateSeeds(queryList,K = K, method = method, embeddingType = embeddingType, index = index, seeder='es'):
#     if seeder == 'es':
#         return esSeeder(queryList = K, index = index, K = K)

# def customMLT(queryList, K=10, method='title', embeddingType='USE', index='dblp_v1'):
#     seedDocs, seedScores = generateSeeds(queryList,K=K, method=method, embeddingType = embeddingType, index = index, seeder='es')
    

## Evaluating Query Search

In [None]:
queryList = ['converting word to speech', 'Big data', 'efficient estimation of word representations in vector space', 'natural language interface', 'reinforcement learning in video game']

In [None]:
results1 = searchWithEmbedding(queryList, K=10, method='title', embeddingType='USE')
results1 = [[str(ID) for ID in sublist] for sublist in results1]

In [None]:
results2 = searchWithEmbedding(queryList, K=10, method='fos', embeddingType='fastText')
results2 = [[str(ID) for ID in sublist] for sublist in results2]

In [None]:
results3 = elasticSearch(queryList)

In [None]:
results4 = rankedElasticSearch(queryList, index='dblp_v1', K = 10, includeEsScores=False, rerank=True)

In [None]:
results5 = rankedMLTElasticSearch(queryList, index='dblp_v1', K = 10, includeEsScores=False)

In [None]:
resutlIDList = []
for r1, r2, r3, r4, r5 in zip(results1, results2, results3, results4, results5):
    lis = list(set().union(r1, r2, r3, r4, r5))
    resutlIDList.append(lis)

from functools import reduce
import operator
IDs = set(reduce(operator.concat, resutlIDList))

In [None]:
del model
del fasttextModel

In [None]:
PapersOutFileName = './data/es/dblp_AIpapers_v1.json'
i = 0
records = dict()
with open(PapersOutFileName, 'r') as file:
    for line in tqdm(file):
        if i % 2 != 0:
            data = json.loads(line)
            if (data['id'] in IDs):
                records[data['id']] = {'title': data['title'], 'abstract': data['abstract'], 'fos': ', '.join(data['fos'])}
        i += 1

In [None]:
import csv
import random

rows = []
for query, idSubList in tqdm(zip(queryList, resutlIDList)):
    for ID in idSubList:
        localDict = records[ID]
        rows.append([query, ID, localDict['title'], localDict['abstract'], localDict['fos'], 0])
random.shuffle(rows)
        
with open('./data/annotations.csv', "w") as file:
    writer = csv.writer(file)
    header = ['query', 'id', 'title', 'abstract', 'fos', 'score']
    writer.writerow(header)
    for row in rows:
            writer.writerow(row)


In [None]:
for lis in resutlIDList:
    print(len(lis))

In [None]:
results = [results1, results2, results3, results4, results5]
len(results)

with open('./data/search_results.json', 'w') as outfile:
    for i in tqdm(range(len(results))):
        outDict = dict()
        outDict['id'] = i
        outDict['result'] = results[i]
        json.dump(outDict, outfile)
        outfile.write('\n')

In [None]:
for ID in results6:
        localDict = records[ID]
        print(localDict['title'], localDict['abstract'])

##  Evaluating Paper recommendation schemes

In [13]:
# sequence: papers representive(in the same order) as queries in query list
paperIDs = ['1583502834','2061556416', '1614298861', '168216068', '2123151547']

In [15]:
results1 = []
for paperID in paperIDs:
    results1.append(MLT(paperID, index='dblp_v1', K = 10))

In [19]:
from gensim import models
modelFile = './models/node2vec_USE_Abstract_2Citation_Embeddings_WL_8_NN_42.kv'
loadModel = models.keyedvectors.KeyedVectors.load_word2vec_format(modelFile)

In [22]:
results2 = []
for paperID in paperIDs:
    results2.append(KClosestNodes(paperID, modelFile, K = 10, loadModel = loadModel))

In [32]:
results3 = []
for paperID in tqdm(paperIDs):
    results3.append(KClosestReranked(paperID, modelFile,K = 10, rerankScheme='cosine', embeddingType='USE', method='abstract', loadModel = loadModel))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [43]:
results4 = []
for paperID in tqdm(paperIDs):
    results4.append(KClosestReranked(paperID, modelFile,K = 10, rerankScheme='exemplar', embeddingType='USE', method='abstract', loadModel = loadModel))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Fitting SVM
[LibLinear]SVM fitted
Fitting SVM
[LibLinear]SVM fitted
Fitting SVM
[LibLinear]SVM fitted
Fitting SVM
[LibLinear]SVM fitted
Fitting SVM
[LibLinear]SVM fitted



In [36]:
results5 = []
for paperID in tqdm(paperIDs):
    results5.append(KClosestReranked(paperID, modelFile,K = 10, rerankScheme='cosine', embeddingType='tfIdf', method='abstract', loadModel = loadModel))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

  """





In [45]:
results6 = []
for paperID in tqdm(paperIDs):
    results6.append(KClosestReranked(paperID, modelFile,K = 10, rerankScheme='exemplar', embeddingType='tfIdf', method='abstract',  loadModel = loadModel))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Fitting SVM
[LibLinear]SVM fitted
Fitting SVM
[LibLinear]SVM fitted
Fitting SVM
[LibLinear]SVM fitted
Fitting SVM
[LibLinear]SVM fitted
Fitting SVM
[LibLinear]SVM fitted



In [46]:
del loadModel

In [47]:
resutlIDList = []
for r1, r2, r3, r4, r5, r6 in zip(results1, results2, results3, results4, results5, results6):
    lis = list(set().union(r1, r2, r3, r4, r5, r6))
    resutlIDList.append(lis)

from functools import reduce
import operator
IDs = set(reduce(operator.concat, resutlIDList))
IDs.update(paperIDs)

In [48]:
PapersOutFileName = './data/es/dblp_AIpapers_v1.json'
i = 0
records = dict()
with open(PapersOutFileName, 'r') as file:
    for line in tqdm(file):
        if i % 2 != 0:
            data = json.loads(line)
            if (data['id'] in IDs):
                records[data['id']] = {'title': data['title'], 'abstract': data['abstract'], 'fos': ', '.join(data['fos'])}
        i += 1

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [49]:
import csv
import random

rows = []
for paperID, idSubList in tqdm(zip(paperIDs, resutlIDList)):
    for ID in idSubList:
        localDict = records[ID]
        rows.append([records[paperID]['title'], ID, localDict['title'], localDict['abstract'], localDict['fos'], 0])
random.shuffle(rows)
        
with open('./data/recommendationAnnotations.csv', "w") as file:
    writer = csv.writer(file)
    header = ['paper', 'id', 'title', 'abstract', 'fos', 'score']
    writer.writerow(header)
    for row in rows:
            writer.writerow(row)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [50]:
results = [results1, results2, results3, results4, results5, results6]
len(results)

with open('./data/recommendation_results.json', 'w') as outfile:
    for i in tqdm(range(len(results))):
        outDict = dict()
        outDict['id'] = i
        outDict['result'] = results[i]
        json.dump(outDict, outfile)
        outfile.write('\n')

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [16]:
titles = []
IDList = []
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as file:
    for line in tqdm(file):
        data = json.loads(line)
        titles.append(data['title'])
        IDList.append(data['id'])
def ret(paperID):
    for id, title in zip(IDList, titles):
        if (id == paperID):
            return title

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [18]:
[[ret(id) for id in sublist] for sublist in results1]

[['Novel-word pronunciation: a cross-language study',
  'A pronunciation-by-analogy module for the Festival text-to-speech synthesiser',
  'Improving pronunciation by analogy for text-to-speech applications.',
  'Evaluating the pronunciation component of text-to-speech systems for English: a performance comparison of different approaches',
  'A multistrategy approach to improving pronunciation by analogy',
  'Can syllabification improve pronunciation by analogy of English',
  'Text-to-speech conversion technology',
  'Comparative evaluation of letter-to-sound conversion techniques for English text-to-speech synthesis',
  'A Chinese text-to-speech system.',
  'Word and syllable models for German text-to-speech synthesis.'],
 ['A Parallel Distributed Weka Framework for Big Data Mining Using Spark',
  'Big Data Pre-processing: A Quality Framework',
  'Evaluating the Quality of Social Media Data in Big Data Architecture',
  'A Data Quality in Use model for Big Data',
  'Data quality: The o

In [23]:
[[ret(id) for id in sublist] for sublist in results2]

[['On the development of a name pronunciation system.',
  'A Self-Learning Approach to Transcription of Danish Proper Names',
  'Novel-word pronunciation: a cross-language study',
  'Improved surname pronunciations using decision trees.',
  'Phonetic transcription standards for european names (onomastica).',
  'Pronouncing text by analogy',
  'Learning Phonetic Rules in a Speech Recognition System',
  'Proper name pronunciations for speech technology applications',
  'Stochastic phonographic transduction for English',
  'Predicting name pronunciation for a reverse directory service.'],
 ['Low latency analytics for streaming traffic data with Apache Spark',
  'A Survey on Benchmarks for Big Data and Some More Considerations',
  'Apache hadoop goes realtime at Facebook',
  'CloudRank-D: benchmarking and ranking cloud computing systems for data processing applications',
  'Hourglass: A library for incremental processing on Hadoop',
  'Twitter Heron: Stream Processing at Scale',
  "Buildin

In [33]:
[[ret(id) for id in sublist] for sublist in results3]

[['Improving pronunciation by analogy for text-to-speech applications.',
  'Novel-word pronunciation: a cross-language study',
  'A bi-directional model of English pronunciation.',
  'On the development of a name pronunciation system.',
  'Proper name pronunciations for speech technology applications',
  'Multi-Lingual Testing of a Self-Learning Approach to Phonemic Transcription of Orthography',
  'Predicting name pronunciation for a reverse directory service.',
  'Phonetic transcription standards for european names (onomastica).',
  'Segmental duration modelling in a text-to-speech system for the galician language.',
  'Pmtools : A pronunciation modeling toolkit.',
  'The pronunciation of unfamiliar native and non-native town names.'],
 ['Big data as the new enabler in business and other intelligence',
  'Survey of real-time processing systems for big data',
  'A Brief Review on Leading Big Data Models',
  'SeqPig: simple and scalable scripting for large sequencing data sets in Hadoo

In [44]:
[[ret(id) for id in sublist] for sublist in results4]

[['On the development of a name pronunciation system.',
  'Improving pronunciation by analogy for text-to-speech applications.',
  'Predicting name pronunciation for a reverse directory service.',
  'A bi-directional model of English pronunciation.',
  'Proper name pronunciations for speech technology applications',
  'The pronunciation of unfamiliar native and non-native town names.',
  'Phonetic transcription standards for european names (onomastica).',
  'Novel-word pronunciation: a cross-language study',
  'Improved surname pronunciations using decision trees.',
  'Segmental duration modelling in a text-to-speech system for the galician language.'],
 ['Big data as the new enabler in business and other intelligence',
  'Hadoop at home: large-scale computing at a small college',
  'Apache hadoop goes realtime at Facebook',
  'SeqPig: simple and scalable scripting for large sequencing data sets in Hadoop',
  'Big Data Generation',
  'TPC-H Benchmark Analytics Scenarios and Performance

In [37]:
[[ret(id) for id in sublist] for sublist in results5]

[['On the development of a name pronunciation system.',
  'A Self-Learning Approach to Transcription of Danish Proper Names',
  'Novel-word pronunciation: a cross-language study',
  'Improved surname pronunciations using decision trees.',
  'Phonetic transcription standards for european names (onomastica).',
  'Pronouncing text by analogy',
  'Learning Phonetic Rules in a Speech Recognition System',
  'Proper name pronunciations for speech technology applications',
  'Stochastic phonographic transduction for English',
  'Predicting name pronunciation for a reverse directory service.',
  'Pmtools : A pronunciation modeling toolkit.'],
 ['A Brief Review on Leading Big Data Models',
  'Big Data Generation',
  'Big data as the new enabler in business and other intelligence',
  'Strategic Alignment of Cloud-Based Architectures for Big Data',
  'A fast and high throughput SQL query system for big data',
  'BDGS: A Scalable Big Data Generator Suite in Big Data Benchmarking',
  'Thoth: towards

In [54]:
[[ret(id) for id in sublist] for sublist in results6]

[['The pronunciation of unfamiliar native and non-native town names.',
  'Assigning phrase breaks from part-of-speech sequences.',
  'Predicting name pronunciation for a reverse directory service.',
  'Improving pronunciation by analogy for text-to-speech applications.',
  'Automating the design of compact linguistic corpora.',
  'A Self-Learning Approach to Transcription of Danish Proper Names',
  'Phonetic transcription standards for european names (onomastica).',
  'On the development of a name pronunciation system.',
  'Multi-Lingual Testing of a Self-Learning Approach to Phonemic Transcription of Orthography',
  'Proper name pronunciations for speech technology applications'],
 ['A Brief Review on Leading Big Data Models',
  'Big Data Generation',
  'Strategic Alignment of Cloud-Based Architectures for Big Data',
  'Big data as the new enabler in business and other intelligence',
  'A fast and high throughput SQL query system for big data',
  'BDGS: A Scalable Big Data Generator S

In [53]:
len(rows)

149