In [2]:
import argparse
import os
import sys
import pickle
from pytorch_transformers import BertTokenizer
from collections import Counter, defaultdict
from tqdm.auto import tqdm
from itertools import product
from gensim.models import KeyedVectors
from gensim.similarities import SoftCosineSimilarity
import numpy as np
from scipy.spatial.distance import cosine
import random
import re

def _build_parser():
    """Create the parser that declares every path and hyper-parameter of the notebook."""
    option_table = (
        # (flag, value type, default)
        ("--data_home", str, "/ssd2/arthur/TREC2019/data"),
        ("--docs_file", str, "docs/tokenized-msmarco-docs.tsv"),
        ("--queries_file", str, "queries/test_queries.tsv"),
        ("--axioms", str, "TFC1,TFC2,MTDC,LNC1,LNC2,LB1,LB2,STMC1,STMC2,STMC3,TP"),
        ("--top100", str, "runs/indri_test_10_10.run"),
        ("--cpus", int, 1),
        ("--total_docs", int, 3213835),
        ("--delta", int, 100),
        ("--idf_file", str, "docs/IDFS/IDFS-FULL"),
        ("--embeddings_path", str, "GloVe/w2v.txt"),
        ("--stmc_sim", float, 0.2),
        ("--delta_lb", float, 0.2),
        ("--min_df_lb", int, 100000),
    )
    built = argparse.ArgumentParser()
    for flag, value_type, default_value in option_table:
        built.add_argument(flag, type=value_type, default=default_value)
    return built


parser = _build_parser()

# There is no real command line in a notebook, so the overrides are listed explicitly.
argv = ["--cpus", "1", "--axioms", "TFC1"]
args = parser.parse_args(argv)

# Put the project's scripts directory first on the import path.
sys.path.insert(0, "/ssd2/arthur/TREC2019/scripts/")

In [3]:
def getcontent(doc_id, docs_file, offset_dict):
    """Read one line of ``docs_file``, starting at the offset stored for ``doc_id``.

    Args:
        doc_id: key to look up in ``offset_dict``.
        docs_file: path of the text file to read from.
        offset_dict: mapping from document id to a file position usable with ``seek``.

    Returns:
        The line found at that position, including its trailing newline if present.

    Raises:
        KeyError: if ``doc_id`` has no entry in ``offset_dict``.
    """
    start = offset_dict[doc_id]
    with open(docs_file) as handle:
        handle.seek(start)
        return handle.readline()

docs_path = os.path.join(args.data_home, args.docs_file)
top_100_path = os.path.join(args.data_home, args.top100)
# Check both inputs before any file is read, so a wrong path fails with a clear message.
assert os.path.isfile(top_100_path), "run file not found: {}".format(top_100_path)
assert os.path.isfile(docs_path), "docs file not found: {}".format(docs_path)
# NOTE: pickle.load can execute arbitrary code; only load an offset file you generated yourself.
with open(docs_path+".offset", 'rb') as offset_f:
    offset_dict = pickle.load(offset_f)
#set of valid documents
tuples = defaultdict(lambda: set())  # topic_id -> doc ids listed for that topic in the run
all_docs = {}    # doc_id -> whitespace-split tokens of the document line
docs_lens = {}   # doc_id -> number of tokens in all_docs[doc_id]
scores = {}      # "<topic_id>-<doc_id>" -> score column of the run, kept as a string
# Read the run inside a `with` block so the handle is closed even if a line is malformed.
with open(top_100_path) as run_f:
    for i in tqdm(run_f, total=155800):
        topic_id, _ , doc_id, _, score, _ = i.split()
        tuples[topic_id].add(doc_id)
        scores[f"{topic_id}-{doc_id}"] = score
        # Each document is fetched once, even if several topics retrieve it.
        if doc_id in all_docs:
            continue
        all_docs[doc_id] = getcontent(doc_id, docs_path, offset_dict).split()
        docs_lens[doc_id] = len(all_docs[doc_id])
args.scores = scores

HBox(children=(IntProgress(value=0, max=155800), HTML(value='')))




In [4]:
#create a triples file with the instances needed

In [5]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length=509):
    """Truncates a sequence pair in place to the maximum length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
def text_to_tokens(query, document):
    """Build the ``[CLS] query [SEP] document [SEP]`` token list for one pair.

    The pair is shortened with ``_truncate_seq_pair`` (default length limit)
    before the special tokens are added.

    Args:
        query: sequence of query tokens.
        document: sequence of document tokens.

    Returns:
        A new list: ``["[CLS]"] + query tokens + ["[SEP]"] + document tokens + ["[SEP]"]``.
        The arguments themselves are left unchanged.
    """
    # _truncate_seq_pair pops tokens from the lists it receives. Truncating the
    # caller's own lists would permanently shorten objects that are reused for
    # other pairs, so work on copies instead.
    tokens_a = list(query)
    tokens_b = list(document)
    _truncate_seq_pair(tokens_a, tokens_b)
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
    return tokens


In [7]:
queries_file = os.path.join(args.data_home, args.queries_file)
tokenized_queries = {}  # query id -> list of BERT word-piece tokens
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Read the queries inside a `with` block so the file handle is closed when done.
with open(queries_file) as queries_f:
    for line in tqdm(queries_f, total=1558):
        # Each line must be exactly "<query id>\t<query text>".
        q_id, query_text = line.strip().split("\t")
        tokenized_query = tokenizer.tokenize(query_text)
        tokenized_queries[q_id] = tokenized_query

HBox(children=(IntProgress(value=0, max=1558), HTML(value='')))




In [8]:
multiply_factor = dict()   # doc id -> number of copies of the document in its LNC2 variant
doc_lens = []
processed_docs = set()
processed_docs_per_topic = defaultdict(lambda: set())
instances = []             # (topic_id, doc id, doc id + "-LNC2") per written pair
with open("/ssd2/arthur/TREC2019/data/triples-tokenized/LNC2_test-triples.top100", 'w') as outf, \
        open(top_100_path) as run_f:
    for line in tqdm(run_f, total = 155800):
        topic_id, _ , D, _, score, _ = line.split()
        doc = all_docs[D]
        # Only documents shorter than 256 tokens are used. Empty documents are
        # skipped as well: repeating an empty list never grows it, so the
        # duplication loop below would never terminate.
        if not 0 < len(doc) < 256:
            continue
        processed_docs_per_topic[topic_id].add(D)
        processed_docs.add(D)
        multiply_factor[D] = 1
        # Append whole copies of the document while the room left up to 512
        # tokens is strictly larger than one copy.
        new_doc = doc.copy()
        while 512 - len(new_doc) > len(doc):
            multiply_factor[D] += 1
            new_doc += doc
        # Pass copies to text_to_tokens so that any in-place truncation it does
        # cannot shorten the shared entries of tokenized_queries or all_docs.
        query_tokens = tokenized_queries[topic_id]
        original_tokens = text_to_tokens(list(query_tokens), list(doc))
        outf.write("{}-{}\t{}\n".format(topic_id, D, original_tokens))
        extended_tokens = text_to_tokens(list(query_tokens), new_doc)
        outf.write("{}-{}-LNC2\t{}\n".format(topic_id, D, extended_tokens))
        instances.append((topic_id, D, D+"-LNC2"))

HBox(children=(IntProgress(value=0, max=155800), HTML(value='')))




In [9]:
# Mean, standard deviation and median of the per-document copy counts.
tuple(stat(list(multiply_factor.values())) for stat in (np.mean, np.std, np.median))

(2.647257260193605, 1.5655546098891808, 2.0)

In [11]:
# Model and data setup for scoring the LNC2 pairs.
# Statement order matters here: the GPU selection is set before torch is imported,
# and the scripts directory is put on sys.path before msmarco_dataset is imported.
os.environ["CUDA_VISIBLE_DEVICES"]="2,3,4,5,6,7"  # specify which GPU(s) to be used
from pytorch_transformers import DistilBertForSequenceClassification
import sys
import torch
# Load a DistilBERT sequence classifier from a local model directory.
model = DistilBertForSequenceClassification.from_pretrained("/ssd2/arthur/TREC2019/data/models/distilbert-cut/")
# Wrap the model in DataParallel (multi-GPU wrapper).
model = torch.nn.DataParallel(model)
sys.path.insert(0, "/ssd2/arthur/TREC2019/scripts/")
batch_size = 1024
from msmarco_dataset import MsMarcoDataset
from torch.utils.data import DataLoader
# Dataset built from the file written by the LNC2 triples cell.
# NOTE(review): the meaning of distil / invert_label / labeled / force is defined in
# msmarco_dataset, which is not visible here — confirm before changing them.
dataset = MsMarcoDataset("/ssd2/arthur/TREC2019/data/triples-tokenized/LNC2_test-triples.top100", args.data_home, distil=True, invert_label=True, labeled=False, force=True)
# shuffle=False keeps the batches in dataset order.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

HBox(children=(IntProgress(value=1, bar_style='info', description='Counting lines on file...', max=1, style=Pr…




HBox(children=(IntProgress(value=0, description='Computing offset dictionary', max=14904, style=ProgressStyle(…




In [12]:
seed = 42
torch.cuda.manual_seed_all(seed)
n_gpu = torch.cuda.device_count()
device = torch.device("cuda" if (torch.cuda.is_available() and n_gpu > 0) else "cpu")
model = model.to(device)
softmax = torch.nn.Softmax(dim=1)

preds = {}
eval_loss = 0.0
nb_eval_steps = 0
preds[dataset] = None
out_label_ids = 0

# Run the model over every batch and collect the logits.
# The per-batch arrays are gathered in a list and concatenated once at the end;
# calling np.append inside the loop would re-copy all previous rows on every batch.
model.eval()
batch_logits = []
with torch.no_grad():
    for index, batch in tqdm(enumerate(dataloader), desc="{} Dataset".format(dataset), total = len(dataloader)):
        inputs = {'input_ids': batch[0].to(device),
                  'attention_mask': batch[1].to(device)}
        outputs = model(**inputs)
        logits = outputs[0]
        batch_logits.append(logits.detach().cpu().numpy())
assert batch_logits, "the dataloader produced no batches"
_preds = np.concatenate(batch_logits, axis=0)
# Validate before saving so a prediction file of the wrong size is never written:
# two rows are expected per instance.
assert len(_preds) == len(instances)*2
torch.save(_preds, os.path.join(args.data_home, 'predictions', 'LNC2.distilBERT.tensor'))

HBox(children=(IntProgress(value=0, description='<msmarco_dataset.MsMarcoDataset object at 0x7fe1a5ca20f0> Dat…




In [33]:
from collections import defaultdict
ql_scores = defaultdict(lambda:[])         # "<topic_id>-<doc_id>" -> min-max normalised run score
ordered_topics = []                        # topic ids in order of first appearance in the run
scores_per_topic = defaultdict(lambda:[])  # topic_id -> [(doc_id, score string), ...]
QL_run_file = "/ssd2/arthur/TREC2019/data/runs/LNC.run"
last_topic = None
normalized_scores = []
with open(QL_run_file, 'r') as inf:
    for counter, line in tqdm(enumerate(inf), desc="reading run file", total=len(dataset)):
        [topic_id, _, doc_id, _, score, _] = line.split()
        # A topic is new exactly when it has no entry yet in scores_per_topic;
        # this dict lookup replaces a linear scan of ordered_topics per line.
        if topic_id not in scores_per_topic:
            ordered_topics.append(topic_id)
        scores_per_topic[topic_id].append((doc_id, score))
assert sum([len(scores_per_topic[x]) for x in scores_per_topic]) == len(_preds)
#normalize
for _id in tqdm(scores_per_topic, desc="normalizing"):
    _scores = np.asarray([float(x[1]) for x in scores_per_topic[_id]])
    score_range = np.ptp(_scores)
    if score_range > 0:
        normalized_scores = (_scores - np.min(_scores))/score_range
    else:
        # A topic with one document, or with identical scores, has a zero range.
        # Dividing by it would produce NaN, so such topics are mapped to 0.0.
        normalized_scores = np.zeros_like(_scores)
    for (did, _), score in zip(scores_per_topic[_id], normalized_scores):
        guid = "{}-{}".format(_id, did)
        ql_scores[guid] = score
# One normalised score per prediction row; this also fails if a (topic, doc) pair repeats.
assert len(ql_scores) == len(_preds)

HBox(children=(IntProgress(value=0, description='reading run file', max=13882, style=ProgressStyle(description…




HBox(children=(IntProgress(value=0, description='normalizing', max=1343, style=ProgressStyle(description_width…




In [13]:
# Apply the softmax to the raw model outputs, then keep column 0 of every row.
_row_probabilities = softmax(torch.as_tensor(_preds))
preds = _row_probabilities[:, 0].cpu().numpy()

In [22]:
# Number of (topic_id, doc id, doc id + "-LNC2") triples collected while writing the LNC2 file.
len(instances)

7452

In [18]:
# NOTE(review): `qrels` is only assigned in the cell that follows this one, so on a
# fresh kernel this cell raises NameError unless that cell has been run first.
len(qrels)

1558

In [20]:
qrels_path = os.path.join(args.data_home, 'qrels', "test_qrels")
qrels = {}  # topic_id -> list of doc ids that appear in the qrels file for that topic
# Read inside a `with` block so the file handle is closed when done.
with open(qrels_path) as qrels_f:
    for line in qrels_f:
        # Four tab-separated columns per line; `rel` is read but not used for filtering.
        topic_id, _, doc_id, rel = line.split("\t")
        qrels.setdefault(topic_id, []).append(doc_id)

# Count the instances whose document is listed in the qrels of its topic.
instances_with_relevant = 0
for i in instances:
    # A topic without any qrels entry has no listed documents; .get avoids a KeyError.
    if i[1] in qrels.get(i[0], ()):
        instances_with_relevant += 1
print(instances_with_relevant)

82


In [14]:
# Pair every prediction with its successor, then keep every second pair so that
# rows (0, 1), (2, 3), ... are compared.
# NOTE(review): presumably each pair is (original document, LNC2 document), which is the
# order the two rows are written to the triples file — confirm the dataset keeps that order.
_list = list(zip(preds[:-1], preds[1:]))
fulfills = sum(1 for first, second in _list[::2] if first <= second)
# Fraction of instances whose first score does not exceed the second.
fulfills/len(instances)

0.055152979066022546

In [34]:
alphas = [0.0, 0.85, 1.0]
runs_format = "{} Q0 {} {} {} DISTILBERT_QL\n" #topic_id, doc_id, ranking, score
preds = softmax(torch.as_tensor(_preds))[:,0].cpu().numpy()


def _flush_topic(outf, results):
    """Write the entries of one topic to ``outf``, ranked by descending score (ranks start at 0)."""
    ranked = sorted(results, key=lambda entry: entry['score'], reverse=True)
    for rank, entry in enumerate(ranked):
        outf.write(runs_format.format(entry['topic_id'], entry['doc_id'], rank, entry['score']))


# One output run per alpha: each score is alpha * model score + (1 - alpha) * normalised run score.
for alpha in alphas:
    beta = 1-alpha
    out_run_file = os.path.join("/ssd2/arthur/TREC2019/data/runs/LNC2_dev_distilBert-{}.run".format(alpha))
    topic_results = []
    last_topic = -1
    with open(QL_run_file, 'r') as inf, open(out_run_file, 'w') as outf:
        # Lines of the run file are paired with prediction rows by position.
        for example, score in zip(inf, preds):
            topic_id, _, doc_id, _, _, _ = example.split()
            guid = "{}-{}".format(topic_id, doc_id)
            # A change of topic id closes the previous topic: rank and write it out.
            if topic_results and topic_id != last_topic:
                _flush_topic(outf, topic_results)
                topic_results = []
            combined = alpha*score + beta*ql_scores[guid]
            topic_results.append({'topic_id':topic_id, 'doc_id':doc_id, 'score':combined})
            last_topic = topic_id
        # The last topic has no following line to trigger its flush.
        _flush_topic(outf, topic_results)

In [43]:
# Size of `instances` when this cell was run.
# NOTE(review): the stored output differs from the earlier count of the same list, so it
# presumably comes from a different run of the notebook — verify.
len(instances)

6941

In [None]:
# NOTE(review): unfinished cell — it only references the built-in `len` without calling it.
len

In [38]:
# Load the diagnostic instances once; they do not depend on alpha.
# NOTE: this rebinds the name `dataset` (used earlier for the MsMarcoDataset object).
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
dataset_path = os.path.join(args.data_home, "diagnostics/LNC2-instances")
with open(dataset_path, 'rb') as dataset_f:
    dataset = pickle.load(dataset_f)

for alpha in alphas:
    # "<topic_id>-<doc_id>" -> score read from the run written for this alpha.
    LNC2_scores = {}
    run_file = "/ssd2/arthur/TREC2019/data/runs/LNC2_dev_distilBert-{}.run".format(alpha)
    with open(run_file) as run_f:
        for line in run_f:
            topic_id, _, doc_id, _, score, _ = line.split()
            pair_id = "{}-{}".format(topic_id, doc_id)
            LNC2_scores[pair_id] = float(score)
    # An instance counts as an agreement when the first document scores strictly higher than the second.
    agreements = 0
    for topic_id, di_id, dj_id in dataset:
        guid1 = f"{topic_id}-{di_id}"
        guid2 = f"{topic_id}-{dj_id}"
        if LNC2_scores[guid1] > LNC2_scores[guid2]:
            agreements+=1
    print(alpha, len(dataset), agreements, agreements / len(dataset))

0.0 6941 17 0.002449214810546031
0.85 6941 2205 0.31767756807376457
1.0 6941 3644 0.5249963982135138
