In [3]:
import os
data_home = "/ssd2/arthur/TREC2019/data/"
run_file = "/ssd2/arthur/TREC2019/data/msmarco-doctrain-top100"
assert os.path.isfile(run_file)

In [4]:
from tqdm import tqdm_notebook as tqdm
import os
import gzip
import csv
import random
from collections import defaultdict
from pytorch_transformers import BertTokenizer
import subprocess
split = 'train'
top_k = 100

queries_file = os.path.join(data_home, f"msmarco-doc{split}-queries.tsv.gz")
lookup_file = os.path.join(data_home, "msmarco-docs-lookup.tsv.gz")
qrels_file = os.path.join(data_home, f"msmarco-doc{split}-qrels.tsv.gz")
docs_file = os.path.join(data_home, "msmarco-docs.tsv")
output_file = os.path.join(data_home, "ql_bert_{}_top{}.tsv".format(split, top_k))

n_topics = 0

number_of_lines_to_process = int(subprocess.check_output("wc -l {}".format(run_file).split()).decode("utf=8").split()[0])

querystring = {}
with gzip.open(queries_file, 'rt', encoding='utf-8') as f:
    tsvreader = csv.reader(f, delimiter="\t")
    for [topicid, querystring_of_topicid] in tsvreader:
        querystring[topicid] = querystring_of_topicid
        n_topics +=1
        

docoffset = {}
with gzip.open(lookup_file, 'rt', encoding='utf-8') as f:
    tsvreader = csv.reader(f, delimiter="\t")
    for [docid, _, offset] in tsvreader:
        docoffset[docid] = int(offset)

qrel = {}
with gzip.open(qrels_file, 'rt', encoding='utf8') as f:
    tsvreader = csv.reader(f, delimiter="\t")
    for [topicid, _, docid, rel] in tsvreader:
        assert rel == "1"
        if topicid in qrel:
            qrel[topicid].append(docid)
        else:
            qrel[topicid] = [docid]

def getcontent(docid, file_name):
    """getcontent(docid, f) will get content for a given docid (a string) from filehandle f.
    The content has four tab-separated strings: docid, url, title, body.
    """
    with open(file_name, encoding='utf-8') as f:
        f.seek(docoffset[docid])
        line = f.readline()
        assert line.startswith(docid + "\t"), \
            f"Looking for {docid}, found {line}"
    return line.rstrip()

expected_total = n_topics * top_k

In [10]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length=509):
    """Truncates a sequence pair in place to the maximum length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()



def text_to_tokens(query, document, tokenizer):
    tokens_a = tokenizer.tokenize(query)
    tokens_b = tokenizer.tokenize(document)
    _truncate_seq_pair(tokens_a, tokens_b)
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
    return tokens


In [13]:
mp.cpu_count()

56

In [6]:
#pre-process positions for each chunk
cpus = 32
number_of_chunks = cpus
block_offset = dict()
lines_per_chunk = number_of_lines_to_process // 32
# lines_per_chunk = 5
excess_lines = number_of_lines_to_process%32
start = 0
f = open(run_file)
with open(run_file) as f:
    current_chunk = 0
    counter = 0
    line = f.readline()
    while(line):
        if (counter + 1) % lines_per_chunk == 0:
            block_offset[current_chunk] = f.tell()
            print(current_chunk, block_offset[current_chunk])
            current_chunk+=1
        line = f.readline()
        
        counter+=1

0 60193884
1 120416162
2 180823912
3 241223703
4 301567036
5 361853175
6 422140324
7 482440712
8 542735323
9 602984979
10 663223311
11 723516992
12 783810715
13 844096525
14 904379955
15 964676182
16 1024948188
17 1084863259
18 1144799569
19 1204963668
20 1265098610
21 1325255755
22 1385257620
23 1445165078
24 1505249723
25 1565443251
26 1625627610
27 1685813834
28 1746015535
29 1806217155
30 1866415130
31 1926617495


In [11]:
import multiprocessing as mp, os

def process_chunk(chunk_no, block_offset, inf, no_lines):
    lines = []
    with open(inf, 'r') as f:
        f.seek(block_offset[chunk_no])
        for i in range(no_lines):
            lines.append(f.readline().strip())
    tokenizer = BertTokenizer.from_pretrained(os.path.join(data_home, "models"))
    output_line_format = "{}-{}\t{}\t{}\n"
    with open("/ssd2/arthur/TREC2019/data/{}-triples.{}".format(split, chunk_no), 'w', encoding='utf-8') as outf:
        for line in tqdm(lines):
            [topic_id, _, doc_id, ranking, score, _] = line.split()
            is_relevant = doc_id in qrel[topic_id]
            query = querystring[topic_id]
            document = getcontent(doc_id, docs_file)
            tokenized = text_to_tokens(query, document, tokenizer)
            outf.write(output_line_format.format(topic_id, doc_id, tokenized, int(is_relevant)))


pbar = tqdm(total=32)
def update(*a):
    pbar.update()
pool = mp.Pool(32)
jobs = []
for i in tqdm(range(32)):
    jobs.append(pool.apply_async(process_chunk, args=(i, block_offset, run_file, lines_per_chunk, ),callback=update ))
for job in jobs:
    job.get()

pool.close()


HBox(children=(IntProgress(value=0, max=32), HTML(value='')))

HBox(children=(IntProgress(value=0, max=32), HTML(value='')))

Process ForkPoolWorker-157:
Process ForkPoolWorker-130:
Process ForkPoolWorker-139:
Process ForkPoolWorker-137:
Process ForkPoolWorker-138:
Process ForkPoolWorker-140:
Process ForkPoolWorker-135:
Process ForkPoolWorker-148:
Process ForkPoolWorker-154:
Process ForkPoolWorker-150:
Process ForkPoolWorker-136:
Process ForkPoolWorker-133:
Process ForkPoolWorker-142:
Process ForkPoolWorker-147:
Traceback (most recent call last):
Process ForkPoolWorker-155:


KeyboardInterrupt: 

Process ForkPoolWorker-129:
Process ForkPoolWorker-146:
Process ForkPoolWorker-143:
Process ForkPoolWorker-144:
Process ForkPoolWorker-134:
Process ForkPoolWorker-149:
Process ForkPoolWorker-159:
Process ForkPoolWorker-152:
Process ForkPoolWorker-151:
Process ForkPoolWorker-141:
Traceback (most recent call last):
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-156:
Process ForkPoolWorker-132:
Process ForkPoolWorker-158:
Process ForkPoolWorker-145:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-131:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
T

  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-11-68a98138262e>", line 17, in process_chunk
    tokenized = text_to_tokens(query, document, tokenizer)

  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-11-68a98138262e>", line 17, in process_chunk
    tokenized = text_to_tokens(query, document, tokenizer)
  File "<ipython-input-11-68a98138262e>", line 17, in process_chunk
    tokenized = text_to_tokens(query, document, tokenizer)
  File "<ipython-input-11-68a98138262e>", line 17, in process_chunk
    tokenized = text_to_tokens(query, document, tokenizer)
  File "<ipython-input-11-68a98138262e>", line 17, in process_chunk
    tokenized = text_to_tokens(query, document, tokenizer)
  File "<ipython-input-10-c005b547fc13>", line 16, in text_to_tokens
    tokens_b = tokenizer.tokenize(document)
  File "<ipython-input-11-68a98138262e>", line 17, in process_chunk
    tokenized = t

  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 361, in tokenize
    tokenized_text = split_on_tokens(added_tokens, text)
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 361, in tokenize
    tokenized_text = split_on_tokens(added_tokens, text)
  File "<ipython-input-10-c005b547fc13>", line 16, in text_to_tokens
    tokens_b = tokenizer.tokenize(document)
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/multiprocessing/queues.py", line 352, in get
    res = self._reader.recv_bytes()
  File "<ipython-input-11-68a98138262e>", line 17, in process_chunk
    tokenized = text_to_tokens(query, document, tokenizer)
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 361, in tokenize
    tokenized_text = split_on_tokens(added_tokens, text)
  File "/home/arthur/miniconda3/envs/bert

  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in <genexpr>
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in <genexpr>
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in split_on_tokens
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in <genexpr>
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in split_on_tokens
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transforme

  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in <genexpr>
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in split_on_tokens
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in <genexpr>
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in <genexpr>
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in <genexpr>
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tok

  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in split_on_tokens
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in split_on_tokens
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in <genexpr>
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in split_on_tokens
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in split_on_tokens
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorc

  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in split_on_tokens
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in <genexpr>
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in split_on_tokens
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in split_on_tokens
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in <genexpr>
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_tran

  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in split_on_tokens
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in split_on_tokens
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in <genexpr>
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in split_on_tokens
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in split_on_tokens
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorc

  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in split_on_tokens
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in <genexpr>
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 354, in split_on_tokens
    return self._tokenize(text, **kwargs)
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in <genexpr>
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in <genexpr>
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transforme

  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_bert.py", line 149, in _tokenize
    for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 354, in split_on_tokens
    return self._tokenize(text, **kwargs)
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 358, in <genexpr>
    for sub_text in split_text), [])[:-1]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 354, in split_on_tokens
    return self._tokenize(text, **kwargs)
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_bert.py", line 149, in _tokenize
    for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_t

  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_bert.py", line 237, in tokenize
    text = self._clean_text(text)
KeyboardInterrupt
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 354, in split_on_tokens
    return self._tokenize(text, **kwargs)
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_bert.py", line 252, in tokenize
    split_tokens.extend(self._run_split_on_punc(token))
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_bert.py", line 277, in _run_split_on_punc
    char = chars[i]
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_bert.py", line 278, in _run_split_on_punc
    if _is_punctuation(char):
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/t

KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_bert.py", line 418, in _is_control
    if cat.startswith("C"):
KeyboardInterrupt
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_bert.py", line 335, in _clean_text
    output.append(char)
KeyboardInterrupt
KeyboardInterrupt
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_utils.py", line 354, in split_on_tokens
    return self._tokenize(text, **kwargs)
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_bert.py", line 319, in _is_chinese_char
    (cp >= 0xF900 and cp <= 0xFAFF) or  #
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
  File "/home/arthur/miniconda3/envs/bert/lib/python3.7/site-packages/pytorch_transformers/tokenization_bert.py", line 149, in _tokenize

In [None]:
import multiprocessing as mp,os
number_of_lines_to_process = int(subprocess.check_output("wc -l {}".format(run_file).split()).decode("utf=8").split()[0])
run_file = "/ssd2/arthur/TREC2019/data/msmarco-doctrain-top100"

def process_wrapper(chunkStart, chunkSize, chunk_no):
    with open("input.txt") as f, open("output_{}.txt".format(chunk_no)) as outf:
        f.seek(chunkStart)
        lines = f.read(chunkSize).splitlines()
        for line in lines:
            [topic_id, _, doc_id, ranking, score, _] = line.split()
            is_relevant = doc_id in qrel[topic_id]
            query = querystring[topic_id]
            document = getcontent(doc_id, f)
            tokenized = text_to_tokens(query, document)
            outf.write(output_line_format.format(topic_id, doc_id, tokenized, int(is_relevant)))

def chunkify(fname,size=1024*1024):
    fileEnd = os.path.getsize(fname)
    print(fileEnd)
    with open(fname,'r') as f:
        chunkEnd = f.tell()
        while True:
            chunkStart = chunkEnd
            f.seek(size,1)
            f.readline()
            chunkEnd = f.tell()
            yield chunkStart, chunkEnd - chunkStart
            if chunkEnd > fileEnd:
                break

# pool = mp.Pool(32)
# jobs = []
for n_chunk, chunkStart, chunkSize in enumerate(chunkify(run_file)):
    print(n_chunk)




In [None]:
36699977//32

In [None]:
number_of_lines_to_process

In [None]:
from multiprocessing import os

def process_chunk()

output_line_format = "{}-{}\t{}\t{}\n" #query_id, tokenized_text, label
with open(run_file, 'r') as inf, open(docs_file, 'r') as f, open(output_file, 'w', encoding='utf-8') as outf:
    for counter, line in tqdm(enumerate(inf), total=expected_total):
        [topic_id, _, doc_id, ranking, score, _] = line.split()
        is_relevant = doc_id in qrel[topic_id]
        
        query = querystring[topic_id]
        document = getcontent(doc_id, f)
        tokenized = text_to_tokens(query, document)
        outf.write(output_line_format.format(topic_id, doc_id, tokenized, int(is_relevant)))