In [1]:
cd ..

/Users/mig/Desktop/pitt/projects/Group-Project-2140/cord_ir


In [2]:
from search.elastic_index_reader import IndexReader
from tqdm.notebook import trange, tqdm
import math

In [3]:
import xml.etree.ElementTree as ET
queryTree = ET.parse('../data/2020-07-16/eval/topics-rnd5.xml')

In [4]:
queryRoot = queryTree.getroot()
queries = []
for child in queryRoot:
    query = {
        'queryNo': child.attrib['number'],
        'query': child.find('query').text,
        'question': child.find('question').text,
        'narrative': child.find('narrative').text
    }
    queries.append(query)

In [5]:
reader = IndexReader()
for query in queries:
    query['query_tokens'] = [t['token'] for t in reader.tokenize(query['query'])['tokens']]
    query['question_tokens'] = [t['token'] for t in reader.tokenize(query['question'])['tokens']]
    query['narrative_tokens'] = [t['token'] for t in reader.tokenize(query['narrative'])['tokens']]



In [6]:
def getRetrievalResults(queries, field):
    results = {}
    for query in tqdm(queries):
        res = reader.search("cord_test", query[field], size=200, fields=[], highlight=False)
        results[query['queryNo']] = res['hits']['hits']
    return results

In [7]:
queryFields = ['query', 'question', 'narrative']
results = {f: getRetrievalResults(queries, f) for f in queryFields}

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [8]:
# load the judgments
judgments = {q['queryNo']: [] for q in queries}
with open('../data/2020-07-16/eval/qrels-covid_d5_j0.5-5.txt', 'r') as qrels:
    for line in qrels:
        [topicId, iteration, cordId, judgment] = line.strip('\n').split(' ')
        judgments[topicId].append({
            'iteration': iteration,
            'cordId': cordId,
            'judgment': judgment
        })

In [9]:
for field in queryFields:
    with open('../data/2020-07-16/eval/%s-result.txt'%field, 'w') as res_file:
        fieldResults = results[field]
        for query in tqdm(queries):
            for (index, result) in enumerate(fieldResults[query['queryNo']]):
                line = ' '.join([query['queryNo'], 'Q0', result['_id'], str(index + 1), str(result['_score']), field])
                res_file.write(line + '\n')

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [10]:
# use trec_eval to get evaluation
!../data/2020-07-16/eval/trec_eval ../data/2020-07-16/eval/qrels-covid_d5_j0.5-5.txt ../data/2020-07-16/eval/query-result.txt
!../data/2020-07-16/eval/trec_eval ../data/2020-07-16/eval/qrels-covid_d5_j0.5-5.txt ../data/2020-07-16/eval/question-result.txt
!../data/2020-07-16/eval/trec_eval ../data/2020-07-16/eval/qrels-covid_d5_j0.5-5.txt ../data/2020-07-16/eval/narrative-result.txt

runid                 	all	query
num_q                 	all	50
num_ret               	all	10000
num_rel               	all	26664
num_rel_ret           	all	3774
map                   	all	0.0938
gm_map                	all	0.0542
Rprec                 	all	0.1514
bpref                 	all	0.1417
recip_rank            	all	0.8236
iprec_at_recall_0.00  	all	0.8466
iprec_at_recall_0.10  	all	0.4202
iprec_at_recall_0.20  	all	0.1483
iprec_at_recall_0.30  	all	0.0284
iprec_at_recall_0.40  	all	0.0000
iprec_at_recall_0.50  	all	0.0000
iprec_at_recall_0.60  	all	0.0000
iprec_at_recall_0.70  	all	0.0000
iprec_at_recall_0.80  	all	0.0000
iprec_at_recall_0.90  	all	0.0000
iprec_at_recall_1.00  	all	0.0000
P_5                   	all	0.6640
P_10                  	all	0.6180
P_15                  	all	0.6067
P_20                  	all	0.5840
P_30                  	all	0.5640
P_100                 	all	0.4620
P_200                 	all	0.3774
P_500                 	all	0.1510
P_1000                	

In [11]:
# calculate in Python
ndcg_eval_at = [3, 5, 10, 15, 20, 25, 30, 40]
recall_eval_at = [5, 10, 20, 50, 100]
def getEvaluation(results, judgments):
    queryEval = []
    for queryNo in results:
        # doc to relevant score
        relMap = {}
        # number of docs in each relevant score
        relCount = {}
        totalRel = 0
        for j in judgments[queryNo]:
            score = int(j['judgment'])
            relMap[j['cordId']] = score
            relCount[score] = relCount.get(score, 0) + 1
            if score > 0:
                totalRel += 1
        truePositive = 0
        falsePositive = 0
        precisions = []
        reciprocalRank = 0
        cumulativeGain = 0
        discountedCumulativeGain = 0
        idealDCG = 0
        dcgAt = {}
        idcgAt = {}
        ndcgAt = {}
        index = 1
        recallAt = {}
        for score in sorted(relCount.keys(), reverse=True):
            for i in range(relCount[score]):
                idealDCG += (2 ** score - 1) / (math.log2(1 + index))
                if index in ndcg_eval_at:
                    idcgAt[index] = idealDCG
                index += 1
        for (index, doc) in enumerate(results[queryNo]):
            docId = doc['_id']
            cumulativeGain += relMap.get(docId, 0)
            discountedCumulativeGain += (2 ** (relMap.get(docId, 0)) - 1) / (math.log2(1 + (1 + i)))
            if relMap.get(docId, 0) > 0:
                truePositive += 1
                # recall increase
                precisions.append(truePositive / (truePositive + falsePositive))
                if reciprocalRank == 0:
                    reciprocalRank = 1 / (index+1)
            else:
                falsePositive += 1
            if index + 1 in ndcg_eval_at and (index + 1) in idcgAt:
                dcgAt[index + 1] = discountedCumulativeGain
                ndcgAt[index + 1] = dcgAt[index + 1] / idcgAt[index + 1]
            if index + 1 in recall_eval_at:
                recallAt[index + 1] = truePositive / totalRel
        queryEval.append({
            'AveragePrecision': sum(precisions) / totalRel,
            'ReciprocalRank': reciprocalRank,
            'CG': cumulativeGain,
            'DCG': discountedCumulativeGain,
            'IDCG': idealDCG,
            'nDCG': discountedCumulativeGain / idealDCG,
            'nDCGAt': ndcgAt,
            'recallAt': recallAt
        })
    return {
        'MeanAveragePrecisions': sum(map(lambda e: e['AveragePrecision'], queryEval)) / len(queryEval),
        'MeanReciprocalRank': sum(map(lambda e: e['ReciprocalRank'], queryEval)) / len(queryEval),
        'AverageNDCG': sum(map(lambda e: e['nDCG'], queryEval)) / len(queryEval),
        'AverageNDCGAt': {k: sum(map(lambda e: e['nDCGAt'][k], queryEval)) / len(queryEval) for k in ndcg_eval_at},
        'AverageRecallAt': {k: sum(map(lambda e: e['recallAt'][k], queryEval)) / len(queryEval) for k in recall_eval_at}
    }
            

In [12]:
getEvaluation(results['query'], judgments)

{'MeanAveragePrecisions': 0.0938413425805633,
 'MeanReciprocalRank': 0.8136031746031747,
 'AverageNDCG': 0.14186232674604926,
 'AverageNDCGAt': {3: 0.13364055103725428,
  5: 0.1418463555798589,
  10: 0.1680738131354743,
  15: 0.18464160353165723,
  20: 0.19572503874359487,
  25: 0.20166118601071548,
  30: 0.21199265201748066,
  40: 0.21832616459428522},
 'AverageRecallAt': {5: 0.007571936093659915,
  10: 0.014347370761125028,
  20: 0.025675868223038946,
  50: 0.05593293689518463,
  100: 0.09603243791764829}}

In [13]:
getEvaluation(results['question'], judgments)

{'MeanAveragePrecisions': 0.05846350714788746,
 'MeanReciprocalRank': 0.6930461107519932,
 'AverageNDCG': 0.10352932326145607,
 'AverageNDCGAt': {3: 0.09875194087296979,
  5: 0.11636770250055294,
  10: 0.13835908558004478,
  15: 0.1488085799127936,
  20: 0.15100867206756832,
  25: 0.15061045685942806,
  30: 0.15584868010479463,
  40: 0.16313228687390108},
 'AverageRecallAt': {5: 0.005727560253085045,
  10: 0.011274666322520223,
  20: 0.019611381677786005,
  50: 0.04029976874013658,
  100: 0.06874504517191328}}

In [14]:
getEvaluation(results['narrative'], judgments)

{'MeanAveragePrecisions': 0.0529449798520697,
 'MeanReciprocalRank': 0.6434796152314557,
 'AverageNDCG': 0.10658332315533475,
 'AverageNDCGAt': {3: 0.09286698794380803,
  5: 0.09907587042444932,
  10: 0.12023696099287039,
  15: 0.1276837496201119,
  20: 0.1381839021514124,
  25: 0.1423471423387641,
  30: 0.14730348405985721,
  40: 0.15610602983947353},
 'AverageRecallAt': {5: 0.005375328395812915,
  10: 0.00957656146663195,
  20: 0.01846469816821065,
  50: 0.03979425169564572,
  100: 0.06755924917514011}}

In [15]:
# ML Part 
from search.ml_rank import MlRanker
from search.data_loader import DataLoader
ranker = MlRanker("../data/models/ranker.joblib",
                  '../data/models/tfidf.joblib',
                  "../data/models/docMatrix.joblib")
loader = DataLoader('../data/2020-07-16')
loader.load_metadata_mappings(loader.load_metadata())

In [16]:
def getMlRetrievalResults(queries, field):
    results = {}
    for query in tqdm(queries):
        res = reader.search("cord_test", query[field], size=500, fields=[], highlight=False)
        results[query['queryNo']] = ranker.rank(query[field], res['hits']['hits'], loader)[:200]
#         results[query['queryNo']] = ranker.whole_rank(query[field], loader, size=3000)
    return results
queryFields = ['query', 'question', 'narrative']
mlResults = {f: getMlRetrievalResults(queries, f) for f in queryFields}

  0%|          | 0/50 [00:00<?, ?it/s]



  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [17]:
getEvaluation(mlResults['query'], judgments)

{'MeanAveragePrecisions': 0.09299413070207688,
 'MeanReciprocalRank': 0.9005555555555556,
 'AverageNDCG': 0.1510033853443577,
 'AverageNDCGAt': {3: 0.17076521192384952,
  5: 0.19119209607182744,
  10: 0.21897900980004373,
  15: 0.22704139463649414,
  20: 0.2320231137172044,
  25: 0.24043657606506613,
  30: 0.24109911691699082,
  40: 0.2507445424680255},
 'AverageRecallAt': {5: 0.009840972333750742,
  10: 0.016892999447137232,
  20: 0.028383983823326468,
  50: 0.0580874367030921,
  100: 0.09589634021026726}}

In [18]:
getEvaluation(mlResults['question'], judgments)

{'MeanAveragePrecisions': 0.07610951296701937,
 'MeanReciprocalRank': 0.8516190476190476,
 'AverageNDCG': 0.12900370650984513,
 'AverageNDCGAt': {3: 0.1602245720291471,
  5: 0.18375355205715801,
  10: 0.20812907892037352,
  15: 0.22309565854515345,
  20: 0.23516075022577718,
  25: 0.2418775318643262,
  30: 0.24517210827474578,
  40: 0.25250952833169743},
 'AverageRecallAt': {5: 0.008837038253178131,
  10: 0.015719471689018938,
  20: 0.027212891345120723,
  50: 0.05290005769237582,
  100: 0.08284294530613966}}

In [19]:
getEvaluation(mlResults['narrative'], judgments)

{'MeanAveragePrecisions': 0.07065652269282045,
 'MeanReciprocalRank': 0.8301515151515153,
 'AverageNDCG': 0.12665863872964597,
 'AverageNDCGAt': {3: 0.13615126628920626,
  5: 0.15431081439396427,
  10: 0.19047269842734457,
  15: 0.19891589421806594,
  20: 0.20955450993742747,
  25: 0.2161980320132762,
  30: 0.22200446992517914,
  40: 0.22872788542225314},
 'AverageRecallAt': {5: 0.007692296217285149,
  10: 0.014019619731797085,
  20: 0.02377836699468743,
  50: 0.04796005230575103,
  100: 0.08004549414918855}}

In [20]:
def get_diff(res1, res2):
    res = {}
    for key in res1.keys():
        if isinstance(res1[key], dict):
            res[key] = {}
            for key2 in res1[key].keys():
                res[key][key2] = str(((res2[key][key2] - res1[key][key2]) / res1[key][key2]) * 100) + '%'
        else:
            res[key] = str(((res2[key] - res1[key]) / res1[key]) * 100) + '%'
    return res

In [21]:
get_diff(getEvaluation(results['query'], judgments), getEvaluation(mlResults['query'], judgments))

{'MeanAveragePrecisions': '-0.902813040807773%',
 'MeanReciprocalRank': '10.68732075618939%',
 'AverageNDCG': '6.443612485414849%',
 'AverageNDCGAt': {3: '27.779488035967614%',
  5: '34.78816236782841%',
  10: '30.287405107860433%',
  15: '22.96329228833163%',
  20: '18.545442732629116%',
  25: '19.227988697978937%',
  30: '13.729940459025947%',
  40: '14.848599540958944%'},
 'AverageRecallAt': {5: '29.966394486487037%',
  10: '17.74282360437581%',
  20: '10.547318504530766%',
  50: '3.851933990065435%',
  100: '-0.14172055852392265%'}}

In [22]:
get_diff(getEvaluation(results['question'], judgments), getEvaluation(mlResults['question'], judgments))

{'MeanAveragePrecisions': '30.18294091474042%',
 'MeanReciprocalRank': '22.880575247005435%',
 'AverageNDCG': '24.605959399594727%',
 'AverageNDCGAt': {3: '62.24954224975998%',
  5: '57.907690973176074%',
  10: '50.426752278559086%',
  15: '49.92123349063232%',
  20: '55.726652652475%',
  25: '60.598099831860985%',
  30: '57.31420253921235%',
  40: '54.788198688640755%'},
 'AverageRecallAt': {5: '54.28974751366821%',
  10: '39.422943786997756%',
  20: '38.760704330919324%',
  50: '31.26640510889576%',
  100: '20.507514540097013%'}}

In [23]:
get_diff(getEvaluation(results['narrative'], judgments), getEvaluation(mlResults['narrative'], judgments))

{'MeanAveragePrecisions': '33.452733177418295%',
 'MeanReciprocalRank': '29.009761226533158%',
 'AverageNDCG': '18.835325246008153%',
 'AverageNDCGAt': {3: '46.608896555995436%',
  5: '55.7501475716376%',
  10: '58.4144316809861%',
  15: '55.78794859164602%',
  20: '51.64900301325408%',
  25: '51.88083755047111%',
  30: '50.71230075927258%',
  40: '46.52085230625486%'},
 'AverageRecallAt': {5: '43.103744568927624%',
  10: '46.39513128638381%',
  20: '28.777447527547157%',
  50: '20.520050666008164%',
  100: '18.481917911312458%'}}