In [1]:
# Move one directory up before anything else runs; presumably so the `search`
# package imported in the next cell resolves from the project root — confirm.
cd ..

/Users/mig/Desktop/pitt/projects/Group-Project-2140/cord_ir


In [2]:
from search.elastic_index_reader import IndexReader
from tqdm.notebook import trange, tqdm
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from search.data_loader import DataLoader
import pandas as pd
from joblib import dump, load

In [3]:
import xml.etree.ElementTree as ET

# Read the topics file and flatten every child of the root element into a
# plain dict holding its number plus the query / question / narrative texts.
queryTree = ET.parse('../data/2020-07-16/eval/topics-rnd5.xml')
queryRoot = queryTree.getroot()
queries = [
    {
        'queryNo': topic.attrib['number'],
        'query': topic.find('query').text,
        'question': topic.find('question').text,
        'narrative': topic.find('narrative').text,
    }
    for topic in queryRoot
]

In [4]:
from pathlib import Path

# Make sure the directory that receives the serialized models exists.
Path("../data/models").mkdir(exist_ok=True, parents=True)

# Create the dataset loader, read the metadata once and hand it to
# `load_metadata_mappings`.
loader = DataLoader('../data/2020-07-16')
metadata_table = loader.load_metadata()
loader.load_metadata_mappings(metadata_table)

In [5]:
# Search-index client; used later via `reader.search(...)` to fetch candidate documents.
reader = IndexReader()
# iterator for the training documents
class DocIter:
    """Single-pass iterator over the main text of papers that have a parsed file.

    Uses the notebook-level ``loader`` both to read the metadata table and to
    load each paper. Once exhausted the object stays exhausted (no rewind).
    """

    def __init__(self, pbar=True):
        """Load the metadata and keep only rows that reference a JSON file.

        pbar: when truthy, a tqdm progress bar sized to the number of kept
        rows is created; it is advanced once per yielded document.
        """
        all_rows = loader.load_metadata()
        # only use rows that have file info
        has_pmc = pd.notna(all_rows['pmc_json_files'])
        has_pdf = pd.notna(all_rows['pdf_json_files'])
        self.metadata = all_rows[has_pmc | has_pdf]
        self.rows = self.metadata.shape[0]
        self.current = 0
        if pbar:
            self.pbar = tqdm(total=self.rows)

    def __len__(self):
        """Total number of documents this iterator covers."""
        return self.rows

    def __iter__(self):
        """The object is its own iterator."""
        return self

    def __next__(self):
        """Return the next paper's main text, or raise StopIteration at the end."""
        has_bar = hasattr(self, 'pbar')

        # Exhausted: close the progress bar (if any) and signal the end.
        if self.current >= self.rows:
            if has_bar:
                self.pbar.close()
            raise StopIteration

        record = self.metadata.iloc[self.current]
        self.current += 1
        body = loader.load_paper_data(record)['data']['main_text']
        # Disabled alternative: re-tokenize through the index reader first.
        # tokens = [t['token'] for t in reader.tokenize(body)['tokens']]
        # body = ' '.join(tokens)
        if has_bar:
            self.pbar.update(1)
        return body


In [6]:
# TF-IDF vectorizer created with no arguments, i.e. scikit-learn's default settings.
vectorizer = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the documents streamed by DocIter (one main text per
# step; the default pbar=True shows a progress bar while fitting).
docIterator = DocIter()
vectorizer.fit(docIterator)

In [None]:
# Persist the fitted vectorizer under the models directory.
dump(vectorizer, '../data/models/tfidf.joblib') 

In [7]:
# Reload the persisted vectorizer. joblib.load unpickles the file, which can
# execute arbitrary code: only load files produced by a trusted run.
vectorizer = load('../data/models/tfidf.joblib') 

In [8]:
# Feature names (terms) of the fitted vectorizer; the length is the number of
# tf-idf columns produced by `vectorizer.transform`.
vocabulary = vectorizer.get_feature_names_out()
len(vocabulary)

1643295

In [9]:
# Relevance judgments grouped by topic number. Every topic starts with an
# empty list, so topics without judged documents are still present as keys.
judgments = {q['queryNo']: [] for q in queries}
with open('../data/2020-07-16/eval/qrels-covid_d5_j0.5-5.txt', 'r') as qrels:
    for line in qrels:
        # split() with no argument handles tabs and repeated spaces and drops
        # the trailing newline; split(' ') would yield empty fields instead.
        fields = line.split()
        if not fields:
            # tolerate blank lines (e.g. an empty line at the end of the file)
            continue
        # columns: topic id, iteration, document (cord) id, judgment
        topicId, iteration, cordId, judgment = fields
        judgments[topicId].append({
            'iteration': iteration,
            'cordId': cordId,
            'judgment': judgment
        })
# candidate set, select  non-relevant docs in results to add in training data
def getRetrievalResults(queries, field, index="cord_test", size=500):
    """Run one search per topic and collect the raw hits.

    Args:
        queries: iterable of topic dicts; each needs 'queryNo' and `field`.
        field: key of the topic text to search with (e.g. 'query', 'question').
        index: name of the index to search; defaults to "cord_test".
        size: number of hits requested per topic; defaults to 500.

    Returns:
        dict mapping each topic's 'queryNo' to the list stored under
        ``['hits']['hits']`` of the search response.
    """
    results = {}
    for query in tqdm(queries):
        res = reader.search(index, query[field], size=size, fields=[], highlight=False)
        results[query['queryNo']] = res['hits']['hits']
    return results
# Search hits per topic number, retrieved with each topic's 'question' text.
candidates = getRetrievalResults(queries, 'question')


  0%|          | 0/50 [00:00<?, ?it/s]



In [10]:
# generate the training data
from scipy.sparse import coo_matrix, hstack, vstack

# Feature row layout: [tf-idf of the query text | tf-idf of the document text],
# i.e. 2 * len(vocabulary) columns. Every (topic, document) pair contributes
# two rows: one built from the topic's 'query' field and one from 'question'.
feature_blocks = []  # per-topic sparse blocks, stacked once after the loop
y = []
group_counts = []
for query in tqdm(queries):
    queryNo = query['queryNo']
    queryTfIdf = vectorizer.transform([query['query']])
    questionTfIdf = vectorizer.transform([query['question']])

    # Judged documents first, then retrieved documents that have no judgment;
    # those are added with judgment 0. A set keeps the membership test O(1)
    # and also drops duplicate ids among the retrieved hits.
    retrieved = candidates[queryNo]
    queryResults = list(judgments[queryNo])
    seenIds = {e['cordId'] for e in queryResults}
    for doc in retrieved:
        docId = doc['_id']
        if docId not in seenIds:
            seenIds.add(docId)
            queryResults.append({
                'cordId': docId,
                'judgment': 0
            })

    docCount = len(queryResults)
    if docCount == 0:
        # No documents for this topic: vstack([]) below would raise.
        continue

    text_list = [
        loader.load_paper_data(item['cordId'])['data']['main_text']
        for item in queryResults
    ]

    # Labels must be integers for the ranker; judged entries may carry the
    # label as text while the entries added above use the int 0.
    labels = [int(e['judgment']) for e in queryResults]

    # batch transform
    textTfIdf = vectorizer.transform(text_list)
    queryRows = hstack([vstack([queryTfIdf] * docCount), textTfIdf])
    questionRows = hstack([vstack([questionTfIdf] * docCount), textTfIdf])

    # The rows form two consecutive ranking lists (query-based, then
    # question-based). Labels and group size are recorded once per list so
    # that len(y) == sum(group_counts) == number of rows in X.
    for rows in (queryRows, questionRows):
        feature_blocks.append(rows)
        y.extend(labels)
        group_counts.append(docCount)

# Stack once at the end instead of growing X inside the loop (which re-copies
# all previous rows on every iteration).
if feature_blocks:
    X = vstack(feature_blocks, format='csr')
else:
    X = coo_matrix((0, len(vocabulary) * 2)).tocsr()
    

  0%|          | 0/50 [00:00<?, ?it/s]

In [11]:
import lightgbm as lgb
# Learning-to-rank model trained with the LambdaRank objective.
# NOTE(review): LightGBM documents `task` as a CLI-only parameter; it is
# presumably ignored by the scikit-learn wrapper — confirm and drop if so.
ranker = lgb.LGBMRanker(task="train",
        objective="lambdarank")

In [None]:
# `group` lists how many consecutive rows of X belong to each ranking list;
# LightGBM requires sum(group_counts) to equal the number of rows in X.
ranker = ranker.fit(X, y, group=group_counts)