In [3]:
import json
import numpy as np
import csv

In [4]:
# Queries that were annotated; a query's position is its index into annotationDict.
queryList = ['converting text to speech', 'big data', 'efficient estimation of word representations in vector space', 'natural language interface', 'reinforcement learning in video game']
queryToIdx = {query: idx for idx, query in enumerate(queryList)}
# Alternative spellings mapped onto the first two queries
# (presumably the spellings used in the annotation CSV - TODO confirm).
queryToIdx['converting word to speech'] = 0
queryToIdx['Big data'] = 1
# annotationDict[queryIdx][ID] -> integer relevance score read from the CSV.
annotationDict = [{} for _ in queryList]
annotationFile = './data/entityAnnotations2.csv'
# newline='' lets the csv module do its own newline handling, as its documentation recommends.
with open(annotationFile, 'r', newline='') as csv_file:
    reader = csv.reader(csv_file)
    next(reader) # skip first row
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing one); nothing to record.
            continue
        # First column: query text, second column: ID, last column: relevance score.
        annotationDict[queryToIdx[row[0]]][row[1]] = int(row[-1])

In [5]:
def dcg(relevanceScores, k = 10, method=0):
    """
    Returns discounted cumulative gain (dcg)
    Args:
        relevanceScores: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider; only the first k scores are used
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain (0.0 when there are no scores)
    Raises:
        ValueError: if method is neither 0 nor 1 (only checked when at
            least one score is present)
    """
    # np.asfarray was removed in NumPy 2.0; asarray with dtype=float performs
    # the same conversion to a float64 array.
    scores = np.asarray(relevanceScores, dtype=float)[:k]
    if not scores.size:
        return 0.0
    if method == 0:
        # The first result is undiscounted; rank i >= 2 is divided by log2(i).
        return scores[0] + np.sum(scores[1:] / np.log2(np.arange(2, scores.size + 1)))
    if method == 1:
        # Rank i >= 1 is divided by log2(i + 1).
        return np.sum(scores / np.log2(np.arange(2, scores.size + 2)))
    raise ValueError('method must be 0 or 1.')

def ndcgMax(relevanceScores, k=10, method=0):
    """
    Returns the DCG of the given scores after ordering them from highest to
    lowest, i.e. the largest DCG these scores can produce.
    Args:
        relevanceScores: Relevance scores in any order
        k: Number of results to consider (passed on to dcg)
        method: Weighting scheme (passed on to dcg)
    """
    bestFirst = list(relevanceScores)
    bestFirst.sort(reverse=True)
    return dcg(bestFirst, k, method)

def ndcg(relevanceScores, ndcgMax, k = 10, method=0):
    """
    Returns the normalized discounted cumulative gain (ndcg)
    Args:
        relevanceScores: Relevance scores (list or numpy) in rank order
        ndcgMax: DCG value to normalize by. Note that inside this function
            the parameter hides the module-level ndcgMax function.
        k: Number of results to consider (passed on to dcg)
        method: Weighting scheme (passed on to dcg)
    Returns:
        dcg(relevanceScores, k, method) / ndcgMax, or 0.0 when ndcgMax is 0
    """
    if ndcgMax == 0:
        # Nothing to normalize against; report 0.0 instead of dividing by zero.
        return 0.0
    return dcg(relevanceScores, k, method) / ndcgMax
    
    

In [72]:
# Data directory and the results file evaluated in this cell.
# Original note: search_results.json 5, entity_search_results.json 6, entity_search_resultsSiamese.json 1
# (presumably the number of result lines in each file - TODO confirm)
filePath = './data/'
fileName = 'entity_search_results.json'
# Despite its name, annotationFile is the path of the results file read below.
annotationFile = filePath + fileName

# relevanceScores[lineIdx][queryIdx] -> relevance of each returned ID, in rank order.
relevanceScores = []
with open(annotationFile, 'r') as file:
    for line in file:
        rankedIDsPerQuery = json.loads(line)['result']
        # IDs without an annotation for that query count as relevance 0.
        relevanceScores.append([
            [annotationDict[queryIdx].get(ID, 0) for ID in rankedIDs]
            for queryIdx, rankedIDs in enumerate(rankedIDsPerQuery)
        ])

In [73]:
# Largest DCG obtainable from each query's annotated scores.
ndcgMaxPerQuery = [
    ndcgMax(list(annotationDict[queryIdx].values()))
    for queryIdx in range(len(queryList))
]
# NOTE(review): the last query's value is replaced by the first query's;
# the reason is not visible here - confirm this is intentional.
ndcgMaxPerQuery[-1] = ndcgMaxPerQuery[0]
print(ndcgMaxPerQuery)

[15.763483535311373, 15.763483535311373, 15.763483535311373, 15.763483535311373, 15.763483535311373]


In [74]:
# Use one fixed normalizer for all five queries. The constant equals the value
# printed by the previous cell (it matches the method-0 DCG of ten scores of 3).
ndcgMaxPerQuery = 5 * [15.763483535311373]
print(ndcgMaxPerQuery)

[15.763483535311373, 15.763483535311373, 15.763483535311373, 15.763483535311373, 15.763483535311373]


In [75]:
# Search methods: [searchWithEmbedding title USE, searchWithEmbedding fos FT, elasticSearch, rankedElasticSearch, rankedMLTElasticSearch]
# NOTE(review): five method names are listed above; the number of methods actually
# evaluated is len(relevanceScores) - confirm the list is complete.

# results[methodIdx][queryIdx] -> NDCG of that method on that query.
results = [
    [ndcg(scoresPerQuery[q], ndcgMaxPerQuery[q]) for q in range(len(queryList))]
    for scoresPerQuery in relevanceScores
]
# Mean NDCG over all queries, one value per method.
meanScores = [np.mean(perQueryNdcg) for perQueryNdcg in results]

In [76]:
# Display the per-method mean NDCG values collected in meanScores.
meanScores

[0.9762365622836594,
 0.9087934022664783,
 0.9555935717868485,
 0.8085255869022709,
 0.9336545127057235,
 0.9403742034003058]

In [77]:
# Print each method's per-query NDCG values, rounded to two decimals.
for perQueryNdcg in results:
    print([round(score, 2) for score in perQueryNdcg])

[0.93, 1.0, 1.0, 1.0, 0.95]
[0.84, 1.0, 0.72, 1.0, 0.98]
[0.94, 1.0, 1.0, 1.0, 0.84]
[0.32, 1.0, 0.97, 1.0, 0.75]
[0.74, 1.0, 0.98, 1.0, 0.95]
[0.81, 1.0, 0.98, 1.0, 0.91]


In [10]:
from tqdm.notebook import tqdm

# titles[n] and IDList[n] hold the 'title' and 'id' fields of the n-th line of the file.
titles, IDList = [], []
with open('./data/dblp_AIpapers2Thresholded.json', 'r') as file:
    # Each line is parsed as a separate JSON document.
    for line in tqdm(file):
        paper = json.loads(line)
        titles.append(paper['title'])
        IDList.append(paper['id'])
def ret(paperID):
    """
    Returns the title stored alongside the first entry of IDList equal to
    paperID, or None when no entry matches. IDList and titles are scanned
    in parallel from the start.
    """
    matchingTitles = (
        title for candidateID, title in zip(IDList, titles) if candidateID == paperID
    )
    return next(matchingTitles, None)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [11]:
fileName = 'entity_search_results.json'
annotationFile = filePath + fileName

# Translate the returned IDs of every query into titles via ret().
# `result` is overwritten for each line, so only the last line's titles remain afterwards.
with open(annotationFile, 'r') as file:
    for line in file:
        result = [
            [ret(ID) for ID in rankedIDs]
            for rankedIDs in json.loads(line)['result']
        ]
            


In [15]:
# Hand-picked list of IDs whose titles are looked up below.
res = [
    "2744714095", "2042439732", "2016945372", "2590481545", "2017980908",
    "2191395120", "2753316839", "1584307643", "2099100618", "2750605955",
]
list(map(ret, res))

['A Secure Mobile Crowdsensing Game With Deep Reinforcement Learning',
 'Automatic computer game balancing: a reinforcement learning approach',
 'Application of reinforcement learning to the game of Othello',
 'Position-based reinforcement learning biased MCTS for General Video Game Playing',
 'EXPERIMENTS WITH ONLINE REINFORCEMENT LEARNING IN REAL-TIME STRATEGY GAMES',
 'An object-oriented approach to reinforcement learning in an action game',
 'Deep Learning for Video Game Playing.',
 'High-level reinforcement learning in strategy games',
 'GENERAL GAME-PLAYING AND REINFORCEMENT LEARNING',
 'A Unified Game-Theoretic Approach to Multiagent Reinforcement Learning']

In [None]:
# Manual check: DCG of a hand-written score list divided by the first query's normalizer.
dcg([3, 3, 3, 3, 3, 3, 0, 3, 2, 3]) / ndcgMaxPerQuery[0]

In [None]:
fileName = 'entity_search_results.json'
annotationFile = filePath + fileName
# Translate the returned IDs of every query into annotated relevance (0 when unannotated).
# `result` is overwritten for each line, so only the last line's scores remain afterwards.
with open(annotationFile, 'r') as file:
    for line in file:
        result = [
            [annotationDict[queryIdx].get(ID, 0) for ID in rankedIDs]
            for queryIdx, rankedIDs in enumerate(json.loads(line)['result'])
        ]

In [14]:
# Look up the stored annotation of ID '2166159790' for the query at index 4.
annotationDict[4]['2166159790']

0

## Evaluating Recommendations

In [7]:
# Queries that were annotated; a query's position is its index into annotationDict.
queryList = ['Novel-word pronunciation within a text-to-speech system', 'Big Data Framework', 'Efficient Estimation of Word Representations in Vector Space', 'Natural Language Interface Using Shallow Parsing.', 'Reinforcement Learning in First Person Shooter Games']
queryToIdx = {query: idx for idx, query in enumerate(queryList)}
# annotationDict[queryIdx][ID] -> integer relevance score read from the CSV.
annotationDict = [{} for _ in queryList]

# newline='' lets the csv module do its own newline handling, as its documentation recommends.
with open('./data/recommendationAnnotations2.csv', 'r', newline='') as csv_file:
    reader = csv.reader(csv_file)
    next(reader) # skip first row
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing one); nothing to record.
            continue
        # Store the score as an int, as the entity-annotation cell does; left as
        # strings, the scores would be sorted lexicographically by ndcgMax.
        annotationDict[queryToIdx[row[0]]][row[1]] = int(row[-1])

In [8]:
fileName = './data/recommendation_results2.json'

# relevanceScores[lineIdx][queryIdx] -> relevance of each returned ID, in rank order.
relevanceScores = []
with open(fileName, 'r') as file:
    for line in file:
        rankedIDsPerQuery = json.loads(line)['result']
        # IDs without an annotation for that query count as relevance 0.
        relevanceScores.append([
            [annotationDict[queryIdx].get(ID, 0) for ID in rankedIDs]
            for queryIdx, rankedIDs in enumerate(rankedIDsPerQuery)
        ])

In [5]:
# Largest DCG obtainable from each query's annotated scores.
ndcgMaxPerQuery = [
    ndcgMax(list(annotationDict[queryIdx].values()))
    for queryIdx in range(len(queryList))
]

In [9]:
# Search methods: [MLT, K closest node embed direct, node rerankedcosine USEabstract, node rerankedExemplar USEabstract
##                 node rerankedcosine TfIdf, node rerankedExemplar TfIdf]
# NOTE(review): six method names are listed above but only three are evaluated - confirm which ones.
numSearchMethods = 3
assert numSearchMethods == len(relevanceScores)

# results[methodIdx][queryIdx] -> NDCG of that method on that query.
results = [
    [ndcg(relevanceScores[m][q], ndcgMaxPerQuery[q]) for q in range(len(queryList))]
    for m in range(numSearchMethods)
]
# Mean NDCG over all queries, one value per method.
meanScores = [np.mean(perQueryNdcg) for perQueryNdcg in results]

In [10]:
# Per-method mean NDCG, rounded to three decimals for display.
list(map(lambda methodMean: round(methodMean, 3), meanScores))

[0.635, 0.746, 0.734]

In [11]:
# Print each method's per-query NDCG values, rounded to three decimals.
for perQueryNdcg in results:
    print([round(score, 3) for score in perQueryNdcg])

[0.605, 1.0, 0.629, 0.285, 0.655]
[0.758, 1.0, 0.899, 0.352, 0.72]
[0.705, 1.0, 0.868, 0.281, 0.819]


In [None]:
# Display the whole annotation structure (one dict per query).
annotationDict

In [None]:
# Manual check: eleven scores are given, but k=10 means only the first ten are used.
# The divisor is the hard-coded normalizer that also appears in ndcgMaxPerQuery.
dcg([0, 3, 3, 3, 3, 3, 3, 3, 0, 0, 3] , 10) / 15.763483535311373

In [None]:
# Unnormalized DCG of the same hand-written list (k=10, so the eleventh score is ignored).
dcg([0, 3, 3, 3, 3, 3, 3, 3, 0, 0, 3] , 10)

In [6]:
# Display the per-query normalizers currently held in ndcgMaxPerQuery.
ndcgMaxPerQuery

[15.763483535311373,
 15.763483535311373,
 15.763483535311373,
 15.763483535311373,
 15.763483535311373]

## Evaluating topic suggestions

In [None]:
# Queries used for the topic-suggestion evaluation; the later cells only use their count.
queryList = [
    'chatbot',
    'heuristic search',
    'cnn',
    'word embedding',
    'activation function',
]

In [9]:
# Hand-entered 0/1 judgements: one row of four values per query, five rows per system.
# (presumably `ours`, `ss` and `arxlive` are the three systems being compared - TODO confirm)
ours = [
    [1, 1, 1, 0],
    [1, 1, 1, 1],
    [1, 1, 1, 1],
    [1, 1, 1, 1],
    [0, 1, 1, 1],
]
ss = [
    [0, 0, 0, 0],
    [1, 0, 0, 1],
    [0, 0, 0, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 0],
]
arxlive = [
    [0, 0, 1, 0],
    [0, 0, 0, 0],
    [0, 0, 1, 0],
    [1, 0, 0, 0],
    [0, 0, 1, 0],
]

In [10]:
# One score list per system, in the order ours, ss, arxlive.
relevanceScores = [ours, ss, arxlive]
# Every query uses the same normalizer: the best DCG of four judgements that are all 1.
ndcgMaxPerQuery = 5 * [ndcgMax([1, 1, 1, 1])]

In [12]:
numSearchMethods = 3

# results[methodIdx][queryIdx] -> NDCG of that method on that query.
results = [
    [ndcg(relevanceScores[m][q], ndcgMaxPerQuery[q]) for q in range(len(queryList))]
    for m in range(numSearchMethods)
]
# Mean NDCG over all queries, one value per method.
meanScores = [np.mean(perQueryNdcg) for perQueryNdcg in results]

In [13]:
# Per-method mean NDCG, rounded to two decimals for display.
list(map(lambda methodMean: round(methodMean, 2), meanScores))

[0.9, 0.14, 0.18]

In [14]:
# Print each method's per-query NDCG values, rounded to three decimals.
for perQueryNdcg in results:
    print([round(score, 3) for score in perQueryNdcg])

[0.84, 1.0, 1.0, 1.0, 0.681]
[0.0, 0.479, 0.0, 0.202, 0.0]
[0.202, 0.0, 0.202, 0.319, 0.202]
