In [1]:
cd ..

/Users/mig/Desktop/pitt/projects/Group-Project-2140/cord_ir


In [2]:
from search.elastic_index_reader import IndexReader
from tqdm.notebook import trange, tqdm
import math

In [3]:
import xml.etree.ElementTree as ET
queryTree = ET.parse('../data/2020-07-16/eval/topics-rnd5.xml')

In [4]:
queryRoot = queryTree.getroot()
queries = []
for child in queryRoot:
    query = {
        'queryNo': child.attrib['number'],
        'query': child.find('query').text,
        'question': child.find('question').text,
        'narrative': child.find('narrative').text
    }
    queries.append(query)

In [5]:
reader = IndexReader()
for query in queries:
    query['query_tokens'] = [t['token'] for t in reader.tokenize(query['query'])['tokens']]
    query['question_tokens'] = [t['token'] for t in reader.tokenize(query['question'])['tokens']]
    query['narrative_tokens'] = [t['token'] for t in reader.tokenize(query['narrative'])['tokens']]



In [6]:
def getRetrievalResults(queries, field):
    results = {}
    for query in tqdm(queries):
        res = reader.search("cord_test", query[field], size=200, fields=[], highlight=False)
        results[query['queryNo']] = res['hits']['hits']
    return results

In [7]:
queryFields = ['query', 'question', 'narrative']
results = {f: getRetrievalResults(queries, f) for f in queryFields}

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [8]:
# load the judgments
judgments = {q['queryNo']: [] for q in queries}
with open('../data/2020-07-16/eval/qrels-covid_d5_j0.5-5.txt', 'r') as qrels:
    for line in qrels:
        [topicId, iteration, cordId, judgment] = line.strip('\n').split(' ')
        judgments[topicId].append({
            'iteration': iteration,
            'cordId': cordId,
            'judgment': judgment
        })

In [9]:
for field in queryFields:
    with open('../data/2020-07-16/eval/%s-result.txt'%field, 'w') as res_file:
        fieldResults = results[field]
        for query in tqdm(queries):
            for (index, result) in enumerate(fieldResults[query['queryNo']]):
                line = ' '.join([query['queryNo'], 'Q0', result['_id'], str(index + 1), str(result['_score']), field])
                res_file.write(line + '\n')

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [10]:
# use trec_eval to get evaluation
!../data/2020-07-16/eval/trec_eval ../data/2020-07-16/eval/qrels-covid_d5_j0.5-5.txt ../data/2020-07-16/eval/query-result.txt
!../data/2020-07-16/eval/trec_eval ../data/2020-07-16/eval/qrels-covid_d5_j0.5-5.txt ../data/2020-07-16/eval/question-result.txt
!../data/2020-07-16/eval/trec_eval ../data/2020-07-16/eval/qrels-covid_d5_j0.5-5.txt ../data/2020-07-16/eval/narrative-result.txt

runid                 	all	query
num_q                 	all	50
num_ret               	all	10000
num_rel               	all	26664
num_rel_ret           	all	3395
map                   	all	0.0791
gm_map                	all	0.0344
Rprec                 	all	0.1349
bpref                 	all	0.1264
recip_rank            	all	0.7509
iprec_at_recall_0.00  	all	0.7881
iprec_at_recall_0.10  	all	0.3815
iprec_at_recall_0.20  	all	0.1179
iprec_at_recall_0.30  	all	0.0225
iprec_at_recall_0.40  	all	0.0000
iprec_at_recall_0.50  	all	0.0000
iprec_at_recall_0.60  	all	0.0000
iprec_at_recall_0.70  	all	0.0000
iprec_at_recall_0.80  	all	0.0000
iprec_at_recall_0.90  	all	0.0000
iprec_at_recall_1.00  	all	0.0000
P_5                   	all	0.5960
P_10                  	all	0.5480
P_15                  	all	0.5147
P_20                  	all	0.5140
P_30                  	all	0.4953
P_100                 	all	0.4030
P_200                 	all	0.3395
P_500                 	all	0.1358
P_1000                	

In [11]:
# calculate in Python
ndcg_eval_at = [3, 5, 10, 15, 20, 25, 30, 40]
recall_eval_at = [5, 10, 20, 50, 100]
def getEvaluation(results, judgments):
    queryEval = []
    for queryNo in results:
        # doc to relevant score
        relMap = {}
        # number of docs in each relevant score
        relCount = {}
        totalRel = 0
        for j in judgments[queryNo]:
            score = int(j['judgment'])
            relMap[j['cordId']] = score
            relCount[score] = relCount.get(score, 0) + 1
            if score > 0:
                totalRel += 1
        truePositive = 0
        falsePositive = 0
        precisions = []
        reciprocalRank = 0
        cumulativeGain = 0
        discountedCumulativeGain = 0
        idealDCG = 0
        dcgAt = {}
        idcgAt = {}
        ndcgAt = {}
        index = 1
        recallAt = {}
        for score in sorted(relCount.keys(), reverse=True):
            for i in range(relCount[score]):
                idealDCG += (2 ** score - 1) / (math.log2(1 + index))
                if index in ndcg_eval_at:
                    idcgAt[index] = idealDCG
                index += 1
        for (index, doc) in enumerate(results[queryNo]):
            docId = doc['_id']
            cumulativeGain += relMap.get(docId, 0)
            discountedCumulativeGain += (2 ** (relMap.get(docId, 0)) - 1) / (math.log2(1 + (1 + i)))
            if relMap.get(docId, 0) > 0:
                truePositive += 1
                # recall increase
                precisions.append(truePositive / (truePositive + falsePositive))
                if reciprocalRank == 0:
                    reciprocalRank = 1 / (index+1)
            else:
                falsePositive += 1
            if index + 1 in ndcg_eval_at and (index + 1) in idcgAt:
                dcgAt[index + 1] = discountedCumulativeGain
                ndcgAt[index + 1] = dcgAt[index + 1] / idcgAt[index + 1]
            if index + 1 in recall_eval_at:
                recallAt[index + 1] = truePositive / totalRel
        queryEval.append({
            'AveragePrecision': sum(precisions) / totalRel,
            'ReciprocalRank': reciprocalRank,
            'CG': cumulativeGain,
            'DCG': discountedCumulativeGain,
            'IDCG': idealDCG,
            'nDCG': discountedCumulativeGain / idealDCG,
            'nDCGAt': ndcgAt,
            'recallAt': recallAt
        })
    return {
        'MeanAveragePrecisions': sum(map(lambda e: e['AveragePrecision'], queryEval)) / len(queryEval),
        'MeanReciprocalRank': sum(map(lambda e: e['ReciprocalRank'], queryEval)) / len(queryEval),
        'AverageNDCG': sum(map(lambda e: e['nDCG'], queryEval)) / len(queryEval),
        'AverageNDCGAt': {k: sum(map(lambda e: e['nDCGAt'][k], queryEval)) / len(queryEval) for k in ndcg_eval_at},
        'AverageRecallAt': {k: sum(map(lambda e: e['recallAt'][k], queryEval)) / len(queryEval) for k in recall_eval_at}
    }
            

In [12]:
getEvaluation(results['query'], judgments)

{'MeanAveragePrecisions': 0.07905817051715658,
 'MeanReciprocalRank': 0.7404582166199813,
 'AverageNDCG': 0.1203448947865941,
 'AverageNDCGAt': {3: 0.09962907856276683,
  5: 0.11286242159868014,
  10: 0.13322677065987917,
  15: 0.1471942045945804,
  20: 0.16160510771686865,
  25: 0.1670552303123782,
  30: 0.17365398038470478,
  40: 0.17473461145635494},
 'AverageRecallAt': {5: 0.006988752457588285,
  10: 0.012564758938439573,
  20: 0.023052673601248047,
  50: 0.04808866524500816,
  100: 0.0829858583204547}}

In [13]:
getEvaluation(results['question'], judgments)

{'MeanAveragePrecisions': 0.04220131446633107,
 'MeanReciprocalRank': 0.6208614718614719,
 'AverageNDCG': 0.07316280439784144,
 'AverageNDCGAt': {3: 0.06367230456523088,
  5: 0.06498192538934379,
  10: 0.07875649120708939,
  15: 0.08630291585790087,
  20: 0.0952219411193689,
  25: 0.1016283289161539,
  30: 0.1053778390263165,
  40: 0.10575790792898676},
 'AverageRecallAt': {5: 0.00481984842025312,
  10: 0.008675436958328335,
  20: 0.01548129642156619,
  50: 0.03196019574083703,
  100: 0.05562349721084239}}

In [14]:
getEvaluation(results['narrative'], judgments)

{'MeanAveragePrecisions': 0.030891875855406422,
 'MeanReciprocalRank': 0.5323157251473863,
 'AverageNDCG': 0.06652407617726913,
 'AverageNDCGAt': {3: 0.050821528421235815,
  5: 0.07163416628936221,
  10: 0.08641802127135723,
  15: 0.09762487472789444,
  20: 0.09956498480952469,
  25: 0.10190612240665096,
  30: 0.1016210409440072,
  40: 0.10664452683987584},
 'AverageRecallAt': {5: 0.0040163215737498,
  10: 0.007869489447461506,
  20: 0.014053326195406082,
  50: 0.027675681896049045,
  100: 0.047479684284937373}}

In [15]:
# ML Part 
from search.ml_rank import MlRanker
from search.data_loader import DataLoader
ranker = MlRanker("../data/models/ranker.joblib",
                  '../data/models/tfidf.joblib',
                  "../data/models/docMatrix.joblib")
loader = DataLoader('../data/2020-07-16')
loader.load_metadata_mappings(loader.load_metadata())

In [16]:
def getMlRetrievalResults(queries, field):
    results = {}
    for query in tqdm(queries):
        res = reader.search("cord_test", query[field], size=500, fields=[], highlight=False)
        results[query['queryNo']] = ranker.rank(query[field], res['hits']['hits'], loader)[:200]
#         results[query['queryNo']] = ranker.whole_rank(query[field], loader, size=3000)
    return results
queryFields = ['query', 'question', 'narrative']
mlResults = {f: getMlRetrievalResults(queries, f) for f in queryFields}

  0%|          | 0/50 [00:00<?, ?it/s]



  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [17]:
getEvaluation(mlResults['query'], judgments)

{'MeanAveragePrecisions': 0.08909665591062012,
 'MeanReciprocalRank': 0.8300108047690016,
 'AverageNDCG': 0.13354670835492266,
 'AverageNDCGAt': {3: 0.1556992130036726,
  5: 0.17691309388655238,
  10: 0.1995525814494245,
  15: 0.20724525686129394,
  20: 0.21690807718624938,
  25: 0.22144590377390178,
  30: 0.2184926267444479,
  40: 0.2208855089694549},
 'AverageRecallAt': {5: 0.009195267123481116,
  10: 0.015760882429161027,
  20: 0.027554322182414513,
  50: 0.056245261644712906,
  100: 0.09248961586569561}}

In [18]:
getEvaluation(mlResults['question'], judgments)

{'MeanAveragePrecisions': 0.06933904379137723,
 'MeanReciprocalRank': 0.8561666666666666,
 'AverageNDCG': 0.10303120234331954,
 'AverageNDCGAt': {3: 0.15797849180424106,
  5: 0.17271698662109178,
  10: 0.20880836437751832,
  15: 0.21957581371648605,
  20: 0.21444726159121721,
  25: 0.21415979617272654,
  30: 0.2139261334732043,
  40: 0.20944752188655283},
 'AverageRecallAt': {5: 0.0083475072404962,
  10: 0.015143784926980748,
  20: 0.02639191547580016,
  50: 0.05056268732852716,
  100: 0.0787716806081566}}

In [19]:
getEvaluation(mlResults['narrative'], judgments)

{'MeanAveragePrecisions': 0.052429968976860745,
 'MeanReciprocalRank': 0.7381911209358017,
 'AverageNDCG': 0.09222067945144731,
 'AverageNDCGAt': {3: 0.13121264337812696,
  5: 0.15057619250419707,
  10: 0.18374837193945076,
  15: 0.18598510844882654,
  20: 0.19580981009715237,
  25: 0.19875688139536013,
  30: 0.2035233749863616,
  40: 0.19535825583538038},
 'AverageRecallAt': {5: 0.007146272287410422,
  10: 0.012602879150521506,
  20: 0.021883227665868824,
  50: 0.04221740559730582,
  100: 0.06734230437865764}}

In [20]:
def get_diff(res1, res2):
    res = {}
    for key in res1.keys():
        if isinstance(res1[key], dict):
            res[key] = {}
            for key2 in res1[key].keys():
                res[key][key2] = str(((res2[key][key2] - res1[key][key2]) / res1[key][key2]) * 100) + '%'
        else:
            res[key] = str(((res2[key] - res1[key]) / res1[key]) * 100) + '%'
    return res

In [21]:
get_diff(getEvaluation(results['query'], judgments), getEvaluation(mlResults['query'], judgments))

{'MeanAveragePrecisions': '12.697593844882949%',
 'MeanReciprocalRank': '12.094212224128844%',
 'AverageNDCG': '10.969982226283173%',
 'AverageNDCGAt': {3: '56.27888488959706%',
  5: '56.75110579819535%',
  10: '49.784146580323245%',
  15: '40.79715803493297%',
  20: '34.22105294238055%',
  25: '32.558497785324%',
  30: '25.820684478645234%',
  40: '26.41199538457068%'},
 'AverageRecallAt': {5: '31.572368305834463%',
  10: '25.437205014284043%',
  20: '19.527663728005727%',
  50: '16.961577864861695%',
  100: '11.452261551048371%'}}

In [22]:
get_diff(getEvaluation(results['question'], judgments), getEvaluation(mlResults['question'], judgments))

{'MeanAveragePrecisions': '64.30541244561732%',
 'MeanReciprocalRank': '37.899790125436645%',
 'AverageNDCG': '40.82456678814697%',
 'AverageNDCGAt': {3: '148.111785623207%',
  5: '165.79235008234332%',
  10: '165.13162429806437%',
  15: '154.42455974259448%',
  20: '125.20782402701613%',
  25: '110.72844398476147%',
  30: '103.00865480813239%',
  40: '98.04431270254562%'},
 'AverageRecallAt': {5: '73.19024402136323%',
  10: '74.55933343441407%',
  20: '70.47613298738311%',
  50: '58.205186659482386%',
  100: '41.61583603700858%'}}

In [23]:
get_diff(getEvaluation(results['narrative'], judgments), getEvaluation(mlResults['narrative'], judgments))

{'MeanAveragePrecisions': '69.72089756629303%',
 'MeanReciprocalRank': '38.67543002443016%',
 'AverageNDCG': '38.62752367384029%',
 'AverageNDCGAt': {3: '158.18319018382704%',
  5: '110.20164022842529%',
  10: '112.62737706348425%',
  15: '90.50995862193393%',
  20: '96.66533417521357%',
  25: '95.03919558653342%',
  30: '100.27680596039377%',
  40: '83.186387172692%'},
 'AverageRecallAt': {5: '77.93077959986093%',
  10: '60.14862507486899%',
  20: '55.715645972995866%',
  50: '52.54332578281564%',
  100: '41.833934645647076%'}}