In [1]:
import json
import numpy as np
import csv

In [32]:
queryList = ['converting word to speech', 'Big data', 'efficient estimation of word representations in vector space', 'natural language interface', 'reinforcement learning in video game']
queryToIdx = {queryList[i]:i for i in range(len(queryList))}
annotationDict = [{} for i in range(len(queryList))]


with open ('./data/annotated.csv','r') as csv_file:
    reader =csv.reader(csv_file)
    next(reader) # skip first row
    for row in reader:
        annotationDict[queryToIdx[row[0]]][row[1]] = int(row[-1])

In [33]:
def dcg(relevanceScores, k = 10, method=0):
    """
    Returns discounted cumulative gain (dcg)
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain
    """
    relevanceScores = np.asfarray(relevanceScores)[:k]
    if relevanceScores.size:
        if method == 0:
            return relevanceScores[0] + np.sum(relevanceScores[1:] / np.log2(np.arange(2, relevanceScores.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0

def ndcgMax(relevanceScores, k=10, method=0):
    return dcg(sorted(relevanceScores, reverse=True), k, method)

def ndcg(relevanceScores, ndcgMax, k = 10, method=0):
    return dcg(relevanceScores, k, method) / ndcgMax
    
    

In [34]:
relevanceScores= []
fileName = './data/search_results.json'
with open(fileName, 'r') as file:
    for line in file:
        data = json.loads(line)
        result = data['result']
        for i in range(len(result)):
            result[i] = [annotationDict[i].get(ID, 0) for ID in result[i]]
        relevanceScores.append(result)

In [35]:
ndcgMaxPerQuery = []
for i in range(len(queryList)):
    ndcgMaxPerQuery.append(ndcgMax(list(annotationDict[i].values())))

In [36]:
# Search methods: [searchWithEmbedding title USE, searchWithEmbedding fos FT, elasticSearch, rankedElasticSearch, rankedMLTElasticSearch] 
numSearchMethods = 5
assert numSearchMethods == len(relevanceScores)
results = []
meanScores = []

for i in range(numSearchMethods):
    relScoresForThisMethod = relevanceScores[i]
    ndcgScoresMethod = []
    for q in range(len(queryList)):
        ndcgScoresMethod.append(ndcg(relScoresForThisMethod[q], ndcgMaxPerQuery[q]))
    results.append(ndcgScoresMethod)
    meanScores.append(np.mean(ndcgScoresMethod))

In [37]:
meanScores

[0.8214071817871507,
 0.2459638247296175,
 0.8333983161011485,
 0.6310996696367052,
 0.11692473507876104]

In [38]:
for result in results:
    #result = [round(i, 2) for i in result]
    print(result)

[0.5131246265799582, 1.0, 0.8275409837116328, 0.9399628496939022, 0.82640744895026]
[0.293256878238304, 0.9365622454097836, 0.0, 0.0, 0.0]
[0.35390423999227316, 1.0, 1.0, 0.9799876165646341, 0.8330997239488355]
[0.26558154794628924, 1.0, 0.44200917087882935, 0.917465578420558, 0.5304420509378489]
[0.06938418372001053, 0.07362322034364345, 0.09515663188532462, 0.0, 0.3464596394448266]


## Evaluating Recommendations

In [39]:
queryList = ['Novel-word pronunciation within a text-to-speech system', 'Big Data Framework', 'Efficient Estimation of Word Representations in Vector Space', 'Natural Language Interface Using Shallow Parsing.', 'Reinforcement Learning in First Person Shooter Games']
queryToIdx = {queryList[i]:i for i in range(len(queryList))}
annotationDict = [{} for i in range(len(queryList))]

with open ('./data/recommendationAnnotations.csv','r') as csv_file:
    reader =csv.reader(csv_file)
    next(reader) # skip first row
    for row in reader:
        annotationDict[queryToIdx[row[0]]][row[1]] = row[-1]

In [40]:
relevanceScores= []
fileName = './data/recommendation_results.json'
with open(fileName, 'r') as file:
    for line in file:
        data = json.loads(line)
        result = data['result']
        for i in range(len(result)):
            result[i] = [annotationDict[i].get(ID, 0) for ID in result[i]]
        relevanceScores.append(result)

In [41]:
ndcgMaxPerQuery = []
for i in range(len(queryList)):
    ndcgMaxPerQuery.append(ndcgMax(list(annotationDict[i].values())))

In [42]:
# Search methods: [MLT, K closest node embed direct, node rerankedcosine USEabstract, node rerankedExemplar USEabstract
##                 node rerankedcosine TfIdf, node rerankedExemplar TfIdf] 
numSearchMethods = 6
assert numSearchMethods == len(relevanceScores)
results = []
meanScores = []

for i in range(numSearchMethods):
    relScoresForThisMethod = relevanceScores[i]
    ndcgScoresMethod = []
    for q in range(len(queryList)):
        ndcgScoresMethod.append(ndcg(relScoresForThisMethod[q], ndcgMaxPerQuery[q]))
    results.append(ndcgScoresMethod)
    meanScores.append(np.mean(ndcgScoresMethod))

In [48]:
[round(score, 3) for score in meanScores]

[0.941, 0.741, 0.81, 0.828, 0.766, 0.792]

In [45]:
for result in results:
    result = [round(i, 3) for i in result]
    print(result)

[1.0, 1.0, 1.0, 0.768, 0.939]
[0.773, 1.0, 0.805, 0.507, 0.618]
[0.958, 1.0, 0.897, 0.478, 0.72]
[0.955, 1.0, 0.935, 0.528, 0.724]
[0.773, 1.0, 0.921, 0.558, 0.578]
[0.824, 1.0, 0.939, 0.594, 0.605]


In [22]:
annotationDict

[{'2402998908': '3',
  '2171975580': '3',
  '1501108632': '3',
  '2056303394': '3',
  '2095214367': '3',
  '2035633532': '3',
  '195096782': '3',
  '39973858': '0',
  '2137926008': '3',
  '32907139': '3',
  '205185618': '3',
  '115447653': '3',
  '2086182527': '3',
  '2146566006': '1',
  '2122546880': '3',
  '2115441971': '3',
  '2295520768': '3',
  '1531367301': '3',
  '1970195330': '3',
  '1550693343': '3',
  '2400055890': '3',
  '2069828345': '3',
  '1501365369': '1',
  '1562308513': '1',
  '2603096619': '3',
  '1267119491': '3',
  '1529238635': '3'},
 {'1975912085': '3',
  '2075835333': '3',
  '28784159': '3',
  '1491282455': '3',
  '1495984639': '3',
  '2008503861': '3',
  '206771271': '3',
  '2072021226': '3',
  '2053525776': '3',
  '2192247565': '3',
  '1798754286': '3',
  '2109638884': '3',
  '2157954477': '3',
  '1912802097': '3',
  '2088734805': '3',
  '2195118355': '3',
  '2082839002': '3',
  '2117798581': '3',
  '2040263621': '3',
  '2243099907': '3',
  '1985419898': '3',
 