In [6]:
import pytorch_transformers
import sys
import torch
%load_ext autoreload
%autoreload 2
sys.path.insert(0, "/ssd2/arthur/TREC2019/scripts/")
from msmarco_dataset import MsMarcoDataset
from args_parser import getArgs
from torch.utils.data import DataLoader
import os
from tqdm.autonotebook import tqdm                         
from sklearn.metrics import f1_score, average_precision_score, accuracy_score
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
from pytorch_transformers import DistilBertForSequenceClassification

# Directory holding the DistilBERT sequence-classification checkpoint to evaluate.
distilbert_model_dir = "/ssd2/arthur/TREC2019/data/models/distilbert-model"
model = DistilBertForSequenceClassification.from_pretrained(distilbert_model_dir)

In [2]:
# Root of the experiment data; the other paths in this cell are built from it.
data_dir = "/ssd2/arthur/TREC2019/data"

# Number of examples per batch handed to the DataLoaders.
batch_size = 2048

# All tokenized triples files live in the same sub-directory.
triples_dir = os.path.join(data_dir, "triples-tokenized")

# Fail fast if any expected input file is missing.
dev_file = os.path.join(triples_dir, "dev-triples.top100")
assert os.path.isfile(dev_file)

test_file = os.path.join(triples_dir, "test-triples.top100")
assert os.path.isfile(test_file)

fulldev_file = os.path.join(triples_dir, "fulldev-triples.top100")
assert os.path.isfile(fulldev_file)

In [3]:
def _build_split(triples_file):
    """Create the dataset and its sequential (unshuffled) DataLoader for one triples file.

    Returns a ``(dataset, dataloader)`` tuple. The MsMarcoDataset flags
    (``distil``, ``invert_label``, ``force``) are passed through unchanged;
    their exact meaning is defined in msmarco_dataset -- TODO confirm there.
    """
    split_dataset = MsMarcoDataset(triples_file, data_dir, distil=True, invert_label=True, force=True)
    split_loader = DataLoader(split_dataset, batch_size=batch_size, shuffle=False)
    return split_dataset, split_loader


# Sanity-check each split against its expected number of examples.
dev_dataset, dev_dataloader = _build_split(dev_file)
assert len(dev_dataset)==363500

test_dataset, test_dataloader = _build_split(test_file)
assert len(test_dataset)==155800

# The full dev file must contain exactly the dev and test examples combined.
fulldev_dataset, fulldev_dataloader = _build_split(fulldev_file)
assert len(fulldev_dataset) == len(dev_dataset) + len(test_dataset)

# Lookup tables keyed by split name, used by the later cells.
dataloaders = dict(dev=dev_dataloader, test=test_dataloader, fulldev=fulldev_dataloader)
datasets = dict(dev=dev_dataset, test=test_dataset, fulldev=fulldev_dataset)

HBox(children=(IntProgress(value=1, bar_style='info', description='Counting lines on file...', max=1, style=Pr…




HBox(children=(IntProgress(value=0, description='Computing offset dictionary', max=363500, style=ProgressStyle…




HBox(children=(IntProgress(value=1, bar_style='info', description='Counting lines on file...', max=1, style=Pr…




HBox(children=(IntProgress(value=0, description='Computing offset dictionary', max=155800, style=ProgressStyle…




HBox(children=(IntProgress(value=1, bar_style='info', description='Counting lines on file...', max=1, style=Pr…




HBox(children=(IntProgress(value=0, description='Computing offset dictionary', max=519300, style=ProgressStyle…

In [None]:
# Seed both the CPU and every CUDA generator so runs are repeatable.
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Use the GPU(s) when at least one is visible, otherwise fall back to CPU.
n_gpu = torch.cuda.device_count()
device = torch.device("cuda" if (torch.cuda.is_available() and n_gpu > 0) else "cpu")

# Wrap only once: re-running this cell must not nest DataParallel inside
# DataParallel, which would happen with an unconditional re-assignment.
if not isinstance(model, torch.nn.DataParallel):
    model = torch.nn.DataParallel(model)
model = model.to(device)

In [9]:
import torch

# Softmax over the class axis (dim=1) of the model logits.
softmax = torch.nn.Softmax(dim=1)


def _scores_from_logits(logits):
    """Convert an (examples x classes) array of logits into a list of scores.

    The score of an example is column 0 of the softmax output (presumably the
    "relevant" class given invert_label=True on the datasets -- TODO confirm).
    """
    return list(softmax(torch.as_tensor(logits))[:, 0].cpu().numpy())


# Raw logits are cached per split so inference only has to run once.
predictions_dir = os.path.join(data_dir, 'predictions')
# Create the cache directory up front so a long inference run cannot fail at save time.
os.makedirs(predictions_dir, exist_ok=True)

preds = {}
for dataset in ['test', 'dev', 'fulldev']:
    cache_file = os.path.join(predictions_dir, '{}_distilBERT.tensor'.format(dataset))
    if os.path.isfile(cache_file):
        # NOTE(review): torch.load unpickles the file; only load caches you produced yourself.
        preds[dataset] = _scores_from_logits(torch.load(cache_file))
        continue

    dataloader = dataloaders[dataset]
    eval_loss = 0.0
    nb_eval_steps = 0
    # Collect per-batch arrays and concatenate once at the end; growing an
    # array with np.append on every batch copies all previous data each time.
    logit_batches = []
    label_batches = []

    model.eval()
    with torch.no_grad():
        for index, batch in tqdm(enumerate(dataloader), desc="{} Dataset".format(dataset), total = len(dataloader)):
            inputs = {'input_ids': batch[0].to(device),
                      'attention_mask': batch[1].to(device),
                      'labels': batch[3].to(device)}
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            # .mean() collapses the per-replica losses returned by DataParallel.
            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            batch_predictions = logits.detach().cpu().numpy()
            batch_ground_truth = inputs['labels'].detach().cpu().numpy().flatten()
            logit_batches.append(batch_predictions)
            label_batches.append(batch_ground_truth)

            # Periodic progress report on the current batch only.
            if index%50 == 0:
                batch_classes = np.argmax(batch_predictions, axis=1)
                print("\taccuracy: {}".format(accuracy_score(batch_ground_truth, batch_classes)))
                print("\tf1_score: {}".format(f1_score(batch_ground_truth, batch_classes)))

    if not logit_batches:
        raise ValueError("dataloader for '{}' produced no batches".format(dataset))

    _preds = np.concatenate(logit_batches, axis=0)
    out_label_ids = np.concatenate(label_batches, axis=0)
    assert len(_preds) == len(out_label_ids)
    print("\t{} mean eval loss: {}".format(dataset, eval_loss / nb_eval_steps))

    # Cache the raw logits, but expose the same softmax scores as the cached
    # branch above so downstream cells see one format regardless of cache state.
    torch.save(_preds, cache_file)
    preds[dataset] = _scores_from_logits(_preds)

In [7]:
# Load the QL run files and min-max normalize the scores within each topic.
#
# Produces:
#   ordered_topics[dataset]              -> topic ids in order of first appearance
#   scores_per_topic[dataset][topic_id]  -> list of (doc_id, raw score string)
#   ql_scores[dataset]["<topic>-<doc>"]  -> normalized score in [0, 1]

from collections import defaultdict

ql_scores = defaultdict(lambda: defaultdict(list))
ordered_topics = defaultdict(list)
scores_per_topic = defaultdict(lambda: defaultdict(list))
for dataset in ['test', 'dev', 'fulldev']:
    QL_run_file = os.path.join(data_dir, "runs", "{}_QL.run".format(dataset))
    topic_scores = scores_per_topic[dataset]

    with open(QL_run_file, 'r') as inf:
        for line in tqdm(inf, desc="reading run file", total=len(datasets[dataset])):
            # Six whitespace-separated columns; only topic, document and score are used.
            topic_id, _, doc_id, _, score, _ = line.split()
            # Dict membership is O(1); scanning the ordered list on every line is not.
            # The check must come before the append below, which creates the key.
            if topic_id not in topic_scores:
                ordered_topics[dataset].append(topic_id)
            topic_scores[topic_id].append((doc_id, score))

    # normalize
    for _id in tqdm(topic_scores, desc="normalizing"):
        _scores = np.asarray([float(x[1]) for x in topic_scores[_id]])
        score_range = np.ptp(_scores)
        if score_range > 0:
            normalized_scores = (_scores - np.min(_scores)) / score_range
        else:
            # Single-document or constant-score topic: avoid 0/0 -> NaN and
            # let every document get 0 so QL does not influence the ranking.
            normalized_scores = np.zeros_like(_scores)
        for (did, _), score in zip(topic_scores[_id], normalized_scores):
            guid = "{}-{}".format(_id, did)
            ql_scores[dataset][guid] = score
            

HBox(children=(IntProgress(value=0, description='reading run file', max=155800, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='normalizing', max=1558, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='reading run file', max=363500, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='normalizing', max=3635, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='reading run file', max=519300, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='normalizing', max=5193, style=ProgressStyle(description_width…

In [17]:
import subprocess
import os
from itertools import groupby

trec_eval_path = "/ssd2/arthur/trec_eval/trec_eval"
dev_qrel_path = "/ssd2/arthur/TREC2019/data/qrels/dev_qrels"
cmd = "{} -q -c {} {}"
map_cmd = "{} -q -m map {} {}"
# NOTE(review): best_map / best_alpha are tracked across ALL splits (including
# 'test'), exactly as before; restrict to 'dev' if alpha is meant to be tuned.
best_map = 0.0
best_alpha = None
runs_format = "{} Q0 {} {} {} DISTILBERT_QL\n" #topic_id, doc_id, ranking, score


def _overall_map(trec_eval_output):
    """Return the summary MAP from trec_eval output.

    Looks for the line whose measure is ``map`` and whose query id is ``all``
    instead of relying on a fixed line offset from the end of the output.
    Raises ValueError when no such line is present.
    """
    for line in trec_eval_output.splitlines():
        fields = line.split()
        if len(fields) == 3 and fields[0] == "map" and fields[1] == "all":
            return float(fields[2])
    raise ValueError("no 'map all' line found in trec_eval output")


alphas = [0.0, 0.85, 1.0]
for dataset in datasets:
    print(dataset)
    ql_run_file = os.path.join(data_dir, "runs", "{}_QL.run".format(dataset))

    # Parse the QL run once per split; it does not depend on alpha.
    run_entries = []
    with open(ql_run_file, 'r') as inf:
        for example in inf:
            if not example.strip():
                continue
            topic_id, _, doc_id, _, _, _ = example.split()
            run_entries.append((topic_id, doc_id))

    # Predictions are matched to run lines purely by position, so a length
    # mismatch would silently misalign (or truncate) the scores.
    if len(run_entries) != len(preds[dataset]):
        raise ValueError("{}: {} run lines but {} predictions".format(
            dataset, len(run_entries), len(preds[dataset])))

    for alpha in alphas:
        beta = 1-alpha
        out_run_file = os.path.join(data_dir, "runs", "{}_distilBert-{}.run".format(dataset, alpha))
        with open(out_run_file, 'w') as outf:
            # groupby batches consecutive lines of the same topic, which also
            # takes care of flushing the final topic.
            paired = zip(run_entries, preds[dataset])
            for topic_id, members in groupby(paired, key=lambda pair: pair[0][0]):
                topic_results = []
                for (_, doc_id), score in members:
                    guid = "{}-{}".format(topic_id, doc_id)
                    # Linear interpolation of the model score and the normalized QL score.
                    topic_results.append({'topic_id':topic_id, 'doc_id':doc_id, 'score':alpha*score + beta*ql_scores[dataset][guid]})
                topic_results.sort(key=lambda x:x['score'], reverse=True)
                for rank, topic in enumerate(topic_results):
                    outf.write(runs_format.format(topic['topic_id'], topic['doc_id'], rank, topic['score']))

        qrel_file = os.path.join(data_dir, 'qrels', '{}_qrels'.format(dataset))
        result = subprocess.check_output(cmd.format(trec_eval_path, qrel_file , out_run_file).split()).decode('utf-8')
        maps = subprocess.check_output(map_cmd.format(trec_eval_path, qrel_file, out_run_file).split()).decode('utf-8')
        # One value per output line of the map-only run (kept for inspection).
        maps = [float(x.strip().split("\t")[-1]) for x in maps.split("\n") if len(x)>2]
        _map = _overall_map(result)
        print("\talpha: {}\t map: {}".format(alpha, _map))
        if _map > best_map:
            best_map = _map
            best_alpha = alpha


dev
	alpha: 0.0	 map: 0.2196
	alpha: 0.85	 map: 0.375
	alpha: 1.0	 map: 0.3638
test
	alpha: 0.0	 map: 0.2274
	alpha: 0.85	 map: 0.3833
	alpha: 1.0	 map: 0.3729
fulldev
	alpha: 0.0	 map: 0.2219
	alpha: 0.85	 map: 0.3775
	alpha: 1.0	 map: 0.3665
