In [1]:
import pytorch_transformers
import sys
import torch
%load_ext autoreload
%autoreload 2
sys.path.insert(0, "/ssd2/arthur/TREC2019/scripts/")
from msmarco_dataset import MsMarcoDataset
from args_parser import getArgs
from torch.utils.data import DataLoader
import os
from tqdm.autonotebook import tqdm                         
from sklearn.metrics import f1_score, average_precision_score, accuracy_score
import numpy as np
os.environ["CUDA_VISIBLE_DEVICES"]="2,3,4,5,6,7"  # specify which GPU(s) to be used

In [2]:
from pytorch_transformers import DistilBertForSequenceClassification
model = DistilBertForSequenceClassification.from_pretrained("/ssd2/arthur/TREC2019/data/models/distilbert-model/")

In [3]:
data_dir = "/ssd2/arthur/TREC2019/data"

labeled=True
datasets_names = ["test"]

batch_size = 1024
dataloaders = {}
datasets = {}

for dataset in datasets_names:
    _file = os.path.join(data_dir, "triples-tokenized", "cut-test.top100")
    assert os.path.isfile(_file)
    datasets[dataset] = MsMarcoDataset(_file, data_dir, distil=True, invert_label=True, labeled=labeled)
    dataloaders[dataset] = DataLoader(datasets[dataset], batch_size=batch_size, shuffle=False)

HBox(children=(IntProgress(value=1, bar_style='info', description='Counting lines on file...', max=1, style=Pr…




In [4]:
seed = 42
torch.cuda.manual_seed_all(seed)
n_gpu = torch.cuda.device_count()
device = torch.device("cuda" if (torch.cuda.is_available() and n_gpu > 0) else "cpu")
model = torch.nn.DataParallel(model)
model = model.to(device)

In [5]:
import torch
softmax = torch.nn.Softmax(dim=1)

preds = {}
for dataset in datasets_names:
    eval_loss = 0.0
    nb_eval_steps = 0
    preds[dataset] = None
    out_label_ids = 0
    _preds = None
    dataloader = dataloaders[dataset]
    for index, batch in tqdm(enumerate(dataloader), desc="{} Dataset".format(dataset), total = len(dataloader)):
        model.eval()
        with torch.no_grad():
            if labeled:
                inputs = {'input_ids': batch[0].to(device),
                          'attention_mask': batch[1].to(device),
                          'labels': batch[3].to(device)}
            else:
                inputs = {'input_ids': batch[0].to(device),
                          'attention_mask': batch[1].to(device)}
            outputs = model(**inputs)
            if labeled:
                tmp_eval_loss, logits = outputs[:2]
                eval_loss+=tmp_eval_loss.mean().item()
                nb_eval_steps+=1
            else:
                logits = outputs[0]

            if _preds is None:
                _preds = logits.detach().cpu().numpy()
                if labeled:
                    out_label_ids = inputs['labels'].detach().cpu().numpy().flatten()

            else:
                batch_predictions = logits.detach().cpu().numpy()
                _preds = np.append(_preds, batch_predictions, axis=0)
                if labeled:
                    batch_ground_truth = inputs['labels'].detach().cpu().numpy().flatten()
                    out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy().flatten(), axis=0)
                    if index%20 == 0:
                        print("\taccuracy: {}".format(accuracy_score(batch_ground_truth, np.argmax(batch_predictions, axis=1))))
                        print("\tf1_score: {}".format(f1_score(batch_ground_truth, np.argmax(batch_predictions, axis=1))))
    if labeled:
        assert len(_preds) == len(out_label_ids)
    torch.save(_preds, os.path.join(data_dir, 'predictions', '{}_.distilBERT.tensor'.format(dataset)))
    preds[dataset] = softmax(torch.as_tensor(_preds))[:,0].cpu().numpy()

HBox(children=(IntProgress(value=0, description='test Dataset', max=153, style=ProgressStyle(description_width…



	accuracy: 0.9814453125
	f1_score: 0.9906265416872224
	accuracy: 0.9833984375
	f1_score: 0.9915966386554622
	accuracy: 0.982421875
	f1_score: 0.9910979228486646
	accuracy: 0.9775390625
	f1_score: 0.9886082218920258
	accuracy: 0.98046875
	f1_score: 0.9901185770750986
	accuracy: 0.9736328125
	f1_score: 0.9866137828458107
	accuracy: 0.9765625
	f1_score: 0.9881305637982196



In [6]:
preds[dataset] = softmax(torch.as_tensor(_preds)).cpu().numpy()[:,0]

In [7]:
#load QL scores and normalize
from collections import defaultdict
ql_scores = defaultdict(lambda:defaultdict(lambda:[]))
ordered_topics = defaultdict(lambda:[])
scores_per_topic = defaultdict(lambda:defaultdict(lambda:[]))
for dataset in datasets_names:
    QL_run_file = "/ssd2/arthur/TREC2019/data/runs/indri_test_10_10.run"
    last_topic = None
    normalized_scores = []

    with open(QL_run_file, 'r') as inf:
        for counter, line in tqdm(enumerate(inf), desc="reading run file", total=len(datasets[dataset])):
            [topic_id, _, doc_id, _, score, _] = line.split()
            if topic_id not in ordered_topics[dataset]:
                ordered_topics[dataset].append(topic_id)
            scores_per_topic[dataset][topic_id].append((doc_id, score))
    #normalize
    for _id in tqdm(scores_per_topic[dataset], desc="normalizing"):
        _scores = np.asarray([float(x[1]) for x in scores_per_topic[dataset][_id]])
        normalized_scores = (_scores - np.min(_scores))/np.ptp(_scores)
        for (did, _), score in zip(scores_per_topic[dataset][_id], normalized_scores):
            guid = "{}-{}".format(_id, did)
            ql_scores[dataset][guid] = score
print(len(ql_scores["test"]))

HBox(children=(IntProgress(value=0, description='reading run file', max=155800, style=ProgressStyle(descriptio…




HBox(children=(IntProgress(value=0, description='normalizing', max=1558, style=ProgressStyle(description_width…


155800


In [8]:
import subprocess
import os
trec_eval_path = "/ssd2/arthur/trec_eval/trec_eval"
dev_qrel_path = "/ssd2/arthur/TREC2019/data/qrels/test_qrels"
cmd = "{} -q -c {} {}"
map_cmd = "{} -q -m map {} {}"
ndcg_cmd = "{} -q -m ndcg {} {}"
best_map = 0.0
best_ndcg = 0.0
runs_format = "{} Q0 {} {} {} DISTILBERT_QL\n" #topic_id, doc_id, ranking, score
# preds[dataset] = softmax(torch.as_tensor(_preds))[:,1].cpu().numpy()
# alphas = [0.0, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, .45, .5, .55, .6, .65, .7, .75, .8, 0.85, .9, .95, 1.0]
alphas = [0.0, 0.85, 1.0]
for dataset in datasets:
#     ql_run_file = os.path.join("/ssd2/arthur/TREC2019/data/runs/{}_QL.run".format(dataset))
    ql_run_file = "/ssd2/arthur/TREC2019/data/runs/indri_test_10_10.run"
    for alpha in alphas:
        beta = 1-alpha
        out_run_file = os.path.join("/ssd2/arthur/TREC2019/data/runs/{}_distilBert-{}.run".format(dataset, alpha))
        topic_results = [] 
        last_topic = -1
        with open(ql_run_file, 'r') as inf, open(out_run_file, 'w') as outf:
            for counter, (example, score) in enumerate(zip(inf, preds[dataset])):
                topic_id, _, doc_id, _, _, _ = example.split()
                guid = "{}-{}".format(topic_id, doc_id)
                if topic_id != last_topic and len(topic_results) > 0:
                    topic_results.sort(key=lambda x:x['score'], reverse=True)
                    for rank, topic in enumerate(topic_results):
                        outf.write(runs_format.format(topic['topic_id'], topic['doc_id'], rank, topic['score']))
                    topic_results = []
                topic_results.append({'topic_id':topic_id, 'doc_id':doc_id, 'score':alpha*score + beta*ql_scores[dataset][guid]})
                last_topic = topic_id

            #dump last topic
            topic_results.sort(key=lambda x:x['score'], reverse=True)
            for rank, topic in enumerate(topic_results):
                outf.write(runs_format.format(topic['topic_id'], topic['doc_id'], rank, topic['score']))
        
        if labeled:
            qrel_file = os.path.join(data_dir, 'qrels', '{}_qrels'.format(dataset))
            ndcgs = subprocess.check_output(ndcg_cmd.format(trec_eval_path, qrel_file, out_run_file).split()).decode('utf-8')
#             maps = [float(x.strip().split("\t")[-1]) for x in maps.split("\n") if len(x)>2]
            _ndcg = float(ndcgs.split("\n")[-2].split("\t")[-1])
            print("\talpha: {}\t ndcg: {}".format(alpha, _ndcg))
            if _ndcg > best_ndcg:
                best_ndcg = _ndcg
                best_alpha = alpha

                

	alpha: 0.0	 ndcg: 0.2179
	alpha: 0.85	 ndcg: 0.2881
	alpha: 1.0	 ndcg: 0.2816


In [None]:
ndcgs

In [None]:
ql_scores[dataset][guid]

In [None]:
import subprocess
import os
trec_eval_path = "/ssd2/arthur/trec_eval/trec_eval"
dev_qrel_path = "/ssd2/arthur/TREC2019/data/qrels/test_qrels"
cmd = "{} -q -c {} {}"
map_cmd = "{} -q -m ndcg {} {}"
best_map = 0.0
runs_format = "{} Q0 {} {} {} DISTILBERT_QL\n" #topic_id, doc_id, ranking, score

In [None]:
qrel_file = os.path.join(data_dir, 'qrels', 'test_qrels')
for alpha in [0.0, 0.85, 1.0]:
    out_run_file = os.path.join("/ssd2/arthur/TREC2019/data/runs/{}_distilBert-{}.run".format(dataset, alpha))
    print(out_run_file)
    map_cmd = "{} -q -m recip_rank {} {}"
    ndcgs = subprocess.check_output(map_cmd.format(trec_eval_path, qrel_file, out_run_file).split()).decode('utf-8')
    _ndcg = float(ndcgs.split("\n")[-2].split("\t")[-1])
    print("\talpha: {}\t ndcg: {}".format(alpha, _ndcg))
# maps = subprocess.check_output(map_cmd.format(trec_eval_path, qrel_file, out_run_file).split()).decode('utf-8')
# maps = [float(x.strip().split("\t")[-1]) for x in maps.split("\n") if len(x)>2]
# _map = float(result.split("\n")[-26].split("\t")[-1])
# print("\talpha: {}\t map: {}".format(alpha, _map))
# if _map > best_map:
#     best_map = _map
#     best_alpha = alpha



In [None]:
float(maps.split("\n")[-2].split("\t")[-1])