In [1]:
cd ..

/Users/mig/Desktop/pitt/projects/Group-Project-2140/cord_ir


In [2]:
from search.elastic_index_reader import IndexReader
from tqdm.notebook import trange, tqdm
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from search.data_loader import DataLoader
import pandas as pd
from joblib import dump, load

In [3]:
import xml.etree.ElementTree as ET
queryTree = ET.parse('../data/2020-07-16/eval/topics-rnd5.xml')
queryRoot = queryTree.getroot()
queries = []
for child in queryRoot:
    query = {
        'queryNo': child.attrib['number'],
        'query': child.find('query').text,
        'question': child.find('question').text,
        'narrative': child.find('narrative').text
    }
    queries.append(query)

In [4]:
from pathlib import Path
Path("../data/models").mkdir(parents=True, exist_ok=True)
loader = DataLoader('../data/2020-07-16')
loader.load_metadata_mappings(loader.load_metadata())

In [5]:
reader = IndexReader()
# iterator for the training documents
class DocIter:
    def __init__(self, pbar=True):
        metadata = loader.load_metadata()
        # only use rows that have file info
        self.metadata = metadata[pd.notna(metadata['pmc_json_files']) | pd.notna(metadata['pdf_json_files'])]
        self.rows = self.metadata.shape[0]
        self.current = 0
        if pbar:
            self.pbar = tqdm(total=self.rows)

    def __iter__(self):
        return self
    
    def __len__(self):
        return self.rows

    def __next__(self): 
        if self.current < self.rows:
            row = self.metadata.iloc[self.current]
            self.current += 1
            docData = loader.load_paper_data(row)
            text = docData['data']['main_text']
#             tokens = [t['token'] for t in reader.tokenize(text)['tokens']]
#             text = ' '.join(tokens)
            if hasattr(self, 'pbar'):
                self.pbar.update(1)
            return text
        if hasattr(self, 'pbar'):
            self.pbar.close()
        raise StopIteration


In [None]:
vectorizer = TfidfVectorizer()
docIterator = DocIter()
vectorizer.fit(docIterator)
dump(vectorizer, '../data/models/tfidf.joblib') 

In [6]:
vectorizer = load('../data/models/tfidf.joblib') 

In [7]:
vocabulary = vectorizer.get_feature_names_out()
len(vocabulary)

1643295

In [8]:
judgments = {q['queryNo']: [] for q in queries}
with open('../data/2020-07-16/eval/qrels-covid_d5_j0.5-5.txt', 'r') as qrels:
    for line in qrels:
        [topicId, iteration, cordId, judgment] = line.strip('\n').split(' ')
        judgments[topicId].append({
            'iteration': iteration,
            'cordId': cordId,
            'judgment': max(0, int(judgment))
        })
# candidate set, select  non-relevant docs in results to add in training data
def getRetrievalResults(queries, field):
    results = {}
    for query in tqdm(queries):
        res = reader.search("cord_test", query[field], size=1000, fields=[], highlight=False)
        results[query['queryNo']] = res['hits']['hits']
    return results
candidates = getRetrievalResults(queries, 'question')


  0%|          | 0/50 [00:00<?, ?it/s]



In [9]:
# generate the training data
from scipy.sparse import coo_matrix, hstack, vstack
# feature X for each row is [[tfidf of document], [tfidf of query]]
# separate some queries to used in evaluation
def get_training_data(testQueries=5):
    X_train = coo_matrix((0, len(vocabulary) * 2))
    y_train = []
    X_test_query = []
    X_test_question = []
    y_test = []
    group_counts = []
    origin_data_train = {}
    origin_data_test = {}
    for (i, query) in tqdm(enumerate(queries), total=len(queries)):
        queryNo = query['queryNo']
        queryTfIdf = vectorizer.transform([query['query']])
        questionTfIdf = vectorizer.transform([query['question']])
        narrativeTfIdf = vectorizer.transform([query['narrative']])
        retrieved = candidates[queryNo]
        queryResults = list(judgments[queryNo])
        for doc in retrieved:
            docId = doc['_id']
            if len([e for e in queryResults if e['cordId'] == docId]) == 0:
                queryResults.append({
                    'cordId': docId,
                    'judgment': 0
                })
        text_list = []
        empty_data_index = set()
        for (k, item) in enumerate(queryResults):
            paper_data = loader.load_paper_data(item['cordId'])
            if paper_data:
                main_text = paper_data['data']['main_text']
                text_list.append(main_text)
            else:
                empty_data_index.add(k)
        queryResults = [queryResults[k] for k in range(len(queryResults)) if k not in empty_data_index]
        # batch transform
        textTfIdf = vectorizer.transform(text_list)
        queryTfIdf = vstack([queryTfIdf for i in range(len(queryResults))])
        questionTfIdf = vstack([questionTfIdf for i in range(len(queryResults))])
        narrativeTfIdf = vstack([narrativeTfIdf for i in range(len(queryResults))])
        queryRows = hstack([textTfIdf, queryTfIdf])
        questionRows = hstack([textTfIdf, questionTfIdf])
        narrativeRows = hstack([textTfIdf, narrativeTfIdf])
        if i < len(queries) - testQueries:
            y_train.extend(map(lambda e: e['judgment'], queryResults))
            y_train.extend(map(lambda e: e['judgment'], queryResults))
            y_train.extend(map(lambda e: e['judgment'], queryResults))
            X_train = vstack([X_train, queryRows, questionRows, narrativeRows])
            group_counts.append(len(queryResults) * 3)
            origin_data_train[queryNo] = queryResults
        else:
            y_test.append(list(map(lambda e: e['judgment'], queryResults)))
            query_vecs = coo_matrix((0, len(vocabulary) * 2))
            question_vecs = coo_matrix((0, len(vocabulary) * 2))
            query_vecs = vstack([query_vecs, queryRows])
            question_vecs = vstack([question_vecs, questionRows])
            X_test_query.append(query_vecs)
            X_test_question.append(question_vecs)
            origin_data_test[queryNo] = queryResults
    return (X_train, y_train, X_test_query, X_test_question, y_test, group_counts, origin_data_train, origin_data_test)
(X_train, y_train, X_test_query, X_test_question, y_test, group_counts, origin_data_train, origin_data_test) = get_training_data()


  0%|          | 0/50 [00:00<?, ?it/s]

In [10]:
dump(X_train, '../data/models/X_train.joblib')
dump(y_train, '../data/models/y_train.joblib')
dump(X_test_query, '../data/models/X_test_query.joblib')
dump(X_test_question, '../data/models/X_test_question.joblib')
dump(y_test, '../data/models/y_test.joblib')
dump(group_counts, '../data/models/group_counts.joblib')
dump(origin_data_train, '../data/models/origin_data_train.joblib')
dump(origin_data_test, '../data/models/origin_data_test.joblib')

['../data/models/origin_data_test.joblib']

In [11]:
import lightgbm as lgb
# import xgboost as xgb
def getRanker():
    ranker = lgb.LGBMRanker(
        num_leaves=63,
        n_estimators=1000,
        max_bin=511,
        objective="lambdarank",
        metric="ndcg",
        ndcg_eval_at=[1, 3, 5, 10, 15, 20, 25, 30, 50],
        learning_rate= .1,
        num_iterations=120,
        importance_type="gain")
    return ranker
# ranker = xgb.XGBRanker(objective='rank:ndcg',
#       learning_rate=0.1,
#       gamma=1.0,
#       min_child_weight=0.1,
#       max_depth=6,
#       verbose=2,
#       random_state=42)

In [12]:
ranker = getRanker()
ranker = ranker.fit(X_train, y_train, group=group_counts)



In [13]:
ndcg_eval_at = [3, 5, 10, 15, 20, 25, 30]
recall_eval_at = [5, 10, 20, 50, 100, 300]
def getEvaluation(results, judgments):
    queryEval = []
    for queryNo in results:
        # doc to relevant score
        relMap = {}
        # number of docs in each relevant score
        relCount = {}
        totalRel = 0
        for j in judgments[queryNo]:
            score = int(j['judgment'])
            relMap[j['cordId']] = score
            relCount[score] = relCount.get(score, 0) + 1
            if score > 0:
                totalRel += 1
        truePositive = 0
        falsePositive = 0
        precisions = []
        reciprocalRank = 0
        cumulativeGain = 0
        discountedCumulativeGain = 0
        idealDCG = 0
        dcgAt = {}
        idcgAt = {}
        ndcgAt = {}
        index = 1
        recallAt = {}
        for score in sorted(relCount.keys(), reverse=True):
            for i in range(relCount[score]):
                idealDCG += (2 ** score - 1) / (math.log2(1 + index))
                if index in ndcg_eval_at:
                    idcgAt[index] = idealDCG
                index += 1
        for (index, doc) in enumerate(results[queryNo]):
            docId = doc['cordId']
            cumulativeGain += relMap.get(docId, 0)
            discountedCumulativeGain += (2 ** (relMap.get(docId, 0)) - 1) / (math.log2(1 + (1 + i)))
            if relMap.get(docId, 0) > 0:
                truePositive += 1
                # recall increase
                precisions.append(truePositive / (truePositive + falsePositive))
                if reciprocalRank == 0:
                    reciprocalRank = 1 / (index+1)
            else:
                falsePositive += 1
            if index + 1 in ndcg_eval_at and (index + 1) in idcgAt:
                dcgAt[index + 1] = discountedCumulativeGain
                ndcgAt[index + 1] = dcgAt[index + 1] / idcgAt[index + 1]
            if index + 1 in recall_eval_at:
                recallAt[index + 1] = truePositive / totalRel
        queryEval.append({
            'AveragePrecision': sum(precisions) / totalRel,
            'ReciprocalRank': reciprocalRank,
            'CG': cumulativeGain,
            'DCG': discountedCumulativeGain,
            'IDCG': idealDCG,
            'nDCG': discountedCumulativeGain / idealDCG,
            'nDCGAt': ndcgAt,
            'recallAt': recallAt
        })
    return {
        'MeanAveragePrecisions': sum(map(lambda e: e['AveragePrecision'], queryEval)) / len(queryEval),
        'MeanReciprocalRank': sum(map(lambda e: e['ReciprocalRank'], queryEval)) / len(queryEval),
        'AverageNDCG': sum(map(lambda e: e['nDCG'], queryEval)) / len(queryEval),
        'AverageNDCGAt': {k: sum(map(lambda e: e['nDCGAt'][k], queryEval)) / len(queryEval) for k in ndcg_eval_at},
        'AverageRecallAt': {k: sum(map(lambda e: e['recallAt'][k], queryEval)) / len(queryEval) for k in recall_eval_at}
    }
            

In [14]:
query_results = {}
question_results = {}
for i in range(len(X_test_query)):
    queryNo = queries[len(queries) - 5 + i]['queryNo']
    origin = origin_data_test[queryNo]
    query_result_vec = ranker.predict(X_test_query[i])
    question_result_vec = ranker.predict(X_test_question[i])
    query_result_indexes = sorted(range(query_result_vec.shape[0]), key=lambda k: query_result_vec[k], reverse=True)
    question_result_indexes = sorted(range(question_result_vec.shape[0]), key=lambda k: question_result_vec[k], reverse=True)
    query_result = [{'cordId': origin[index]['cordId']} for index in query_result_indexes]
    question_result = [{'cordId': origin[index]['cordId']} for index in question_result_indexes]
    query_results[queryNo] = query_result
    question_results[queryNo] = question_result



In [15]:
getEvaluation(query_results, origin_data_test)

{'MeanAveragePrecisions': 0.28020915255982576,
 'MeanReciprocalRank': 0.53,
 'AverageNDCG': 0.487209377322056,
 'AverageNDCGAt': {3: 0.04257206767999946,
  5: 0.03938906903656014,
  10: 0.05658203396797654,
  15: 0.06826037269364724,
  20: 0.07352287735734579,
  25: 0.07788491722148343,
  30: 0.08333441807055461},
 'AverageRecallAt': {5: 0.00789045568569007,
  10: 0.016599118700500376,
  20: 0.0346930761177999,
  50: 0.08694460780535793,
  100: 0.17200189443065533,
  300: 0.4270305777269218}}

In [16]:
getEvaluation(question_results, origin_data_test)

{'MeanAveragePrecisions': 0.21130033721096364,
 'MeanReciprocalRank': 0.1788936627282492,
 'AverageNDCG': 0.487209377322056,
 'AverageNDCGAt': {3: 0.017892875669912107,
  5: 0.02186918185859056,
  10: 0.022651808116935755,
  15: 0.03439818944247965,
  20: 0.043447986463499136,
  25: 0.048028482736797835,
  30: 0.04834396735609978},
 'AverageRecallAt': {5: 0.004396637865161935,
  10: 0.00879327573032387,
  20: 0.02249007439217239,
  50: 0.058733777558250086,
  100: 0.10488699706699658,
  300: 0.35708902937820924}}

In [17]:
# use full data to train
(X_train, y_train, X_test_query, X_test_question, y_test, group_counts, origin_data_train, origin_data_test) = get_training_data(0)

  0%|          | 0/50 [00:00<?, ?it/s]

In [18]:
ranker = getRanker()
ranker = ranker.fit(X_train, y_train, group=group_counts)

In [19]:
dump(ranker, '../data/models/ranker.joblib')

['../data/models/ranker.joblib']

In [20]:
docIterator = DocIter()
docMatrix = vectorizer.transform(docIterator)

  0%|          | 0/84145 [00:00<?, ?it/s]

In [21]:
metadata = loader.load_metadata()
dump({
    'matrix': docMatrix,
    'cordIds': list(metadata[pd.notna(metadata['pmc_json_files']) | pd.notna(metadata['pdf_json_files'])]['cord_uid'])
}, '../data/models/docMatrix.joblib')

['../data/models/docMatrix.joblib']