In [78]:
from __future__ import absolute_import, division, print_function
%reload_ext autoreload
%autoreload 2


import argparse
import logging
import os
import sys
import random
from tqdm import tqdm_notebook as tqdm
from tqdm import tnrange as trange

import numpy as np

import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from torch.nn import CrossEntropyLoss, MSELoss


if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle

from pytorch_transformers import BertForNextSentencePrediction, BertTokenizer

logger = logging.getLogger(__name__)


sys.path.append("../scripts/")
from run_classifier_dataset_utils import load_dataset

In [11]:
task_name = "msmarco"
do_lower_case = True
data_dir = "/ssd2/arthur/TREC2019/data/"
bert_model = "bert-base-uncased"
max_seq_length = 512 
output_dir = os.path.join(data_dir, "models")
eval_batch_size = 128

local_rank = -1

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [13]:
model = BertForNextSentencePrediction.from_pretrained(output_dir)
model = torch.nn.DataParallel(model)
tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=do_lower_case)
model.to(device)
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [74]:
!wc -l /ssd2/arthur/TREC2019/data/bm25_bert_docs.tsv

5191671 /ssd2/arthur/TREC2019/data/bm25_bert_docs.tsv


In [None]:
eval_dataloader, eval_examples = load_dataset(task_name, bert_model,max_seq_length,
                                              data_dir, tokenizer, eval_batch_size,
                                              eval=True, return_examples=True, force_reload=True, expected_len = 5191671)

/ssd2/arthur/TREC2019/data/dev_bert-base-uncased_512_msmarco


HBox(children=(IntProgress(value=0, description='Reading input tsv', max=5191671, style=ProgressStyle(descript…

HBox(children=(IntProgress(value=0, description='creating examples...', max=5191671, style=ProgressStyle(descr…

HBox(children=(IntProgress(value=0, description='Feature Extraction', max=5191671, style=ProgressStyle(descrip…

In [87]:
sum(outputs[0])

tensor(1.9360, device='cuda:0')

In [89]:
input_ids.shape

torch.Size([106, 512])

In [None]:
model.eval()
eval_loss = 0
nb_eval_steps = 0
preds = []
out_label_ids = None
scores = []
classes = []
evaluated_samples = 0 

softmax = torch.nn.Softmax(dim=1)
for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=segment_ids, next_sentence_label=label_ids)
        predictions = outputs[1]
        eval_loss += sum(outputs[0])
        
        scores += list(predictions[:, 0].cpu().detach().numpy())
        
        classes += list(torch.argmax(predictions, dim=1).cpu().numpy())

        nb_eval_steps+=1

HBox(children=(IntProgress(value=0, description='Evaluating', max=454, style=ProgressStyle(description_width='…

In [None]:
from IPython.core.debugger import set_trace

from collections import defaultdict
#load bm25 scores.
bm25_scores = {}
bm25_run_file = "/ssd2/arthur/insy/msmarco/data/results/dev/bm25_finetuned.run"
guids = []
last_topic = None
normalized_scores = []
ordered_topics = []
scores_per_topic = defaultdict(lambda:[])


with open(bm25_run_file, 'r') as inf:
    for counter, line in tqdm(enumerate(inf), desc="reading run file"):
        [topic_id, _, doc_id, _, score, _] = line.split()
        if topic_id not in ordered_topics:
            ordered_topics.append(topic_id)
        scores_per_topic[topic_id].append((doc_id, score))
#normalize
for _id in tqdm(scores_per_topic, desc="normalizing"):
    _scores = np.asarray([float(x[1]) for x in scores_per_topic[_id]])
    normalized_scores = (_scores - np.min(_scores))/np.ptp(_scores)
    for (did, _), score in zip(scores_per_topic[_id], normalized_scores):
        guid = "{}-{}".format(_id, did)
        bm25_scores[guid] = score

In [73]:
len(scores)

58090

In [54]:
## from IPython.core.debugger import set_trace

import subprocess

trec_path = "/ssd2/arthur/trec_eval/trec_eval"
qrel_path = "/ssd2/arthur/TREC2019/data/msmarco-docdev-qrels.tsv"
cmd = "{} -q -c {} {}"



best_map = 0.0

runs_format = "{} Q0 {} {} {} BERT_BM25\n" #topic_id, doc_id, ranking, score

n_alphas = 50
for a in range(0, n_alphas):
    alpha = a/n_alphas
    beta = 1-alpha

    run_file = os.path.join("/ssd2/arthur/TREC2019/data/runs/bert_eval/bert-sequence-{}.res".format(alpha))

    topic_results = []
    last_topic = eval_examples[0].guid.split("-")[1]
    with open(run_file, 'w') as outf, open(bm25_run_file) as inf:
        for counter, (example, score) in enumerate(zip(eval_examples, scores)):
#             print(example.guid, score)
            [_, topic_id, doc_id] = example.guid.split("-")
            if topic_id != last_topic:
                last_topic = topic_id
                print(topic_id)
                break
#                 topic_results.sort(key = lambda x:x['score'], reverse=True)
#                 for rank, topic in enumerate(topic_results):
#                     outf.write(runs_format.format(topic['topic_id'], topic['doc_id'], rank, topic['score']))
#                 topic_results = []
#             topic_results.append({'topic_id': topic_id, 'doc_id': doc_id, 'score': alpha*score+beta*bm25_scores[f"{topic_id}-{doc_id}"]})
#             last_topic = topic_id
#         for rank, topic in enumerate(topic_results):
#             outf.write(runs_format.format(topic['topic_id'], topic['doc_id'], rank, topic['score']))
#     result = subprocess.check_output(cmd.format(trec_path, qrel_path, run_file).split()).decode('utf-8')
#     _map = float(result.split("\n")[-26].split("\t")[-1])
#     print("alpha: {}\t map: {}".format(alpha, _map))
#     if _map > best_map:
#         best_map = _map
#         print("best map found for alpha {}, map={}".format(alpha, _map))
#         best_file = run_file

In [None]:
# eval script:
cmd = "/ssd2/arthur/terrier-core/bin/terrier batchevaluate -f -q {}".format(os.path.join(data_dir, "msmarco-docdev-qrels.tsv"))
output = subprocess.run(cmd.split(), capture_output=True)
lines = output.stdout.decode("utf-8").split("\n")[3:-1]
max_score = 0.0
for i, j in list(zip(lines[:-1], lines[1:]))[::2]:
    alpha = i.split("-")[-1].split(".res")[0]
    score = float(j.split(":")[-1])
    print(alpha, score)
    if score > max_score:
        max_score = score
        best_alpha = alpha
print(best_alpha, max_score)

In [None]:
26.01846062624611

In [None]:
example.guid

In [None]:
[x for x in topic_results if x['doc_id']=="D3240836"]