### Code to extract potential hard negatives

In [1]:
!wget https://msmarco.blob.core.windows.net/msmarcoranking/top1000.dev.tar.gz
!wget https://msmarco.blob.core.windows.net/msmarcoranking/qrels.dev.tsv
!tar xzf top1000.dev.tar.gz

print('Generate potential hard negatives for the query dev set\n') 

# qrels.dev.tsv contains relevant passages for each query in dev
# the dict qid_pid stores all relevant (query, passage) pairs
qid_pid = {}
qids = set()
with open("/content/qrels.dev.tsv") as infile:
  for line in infile:
    qid, _, pid, rel = line.split('\t')
    qid_pid[(qid, pid)] = rel
    qids.add(qid)

# top1000.dev contains the top 1000 passages retrieved by BM25. 
# For each (query, passage) pair in top1000.dev, if it's not contained in the qrels file, it's a potential hard negative 
#   - not relevant but is retrieved by BM25,
#   - or relevant but is not assessed by the assessors
with open('/content/potential_strong_neg.dev.tsv', 'w') as outfile:
  with open("/content/top1000.dev") as infile:
    for line in infile:
      qid, pid, query, passage = line.split('\t')
      if qid not in qids:
        print("query {} not in qrels.tsv\n")
        continue
      if (qid, pid) not in qid_pid:
        s = '\t'.join([str(qid), str(pid), query, passage])
        outfile.write(s + '\n')

# print file size
!ls -sh /content/potential_strong_neg.dev.tsv

--2020-11-16 21:18:58--  https://msmarco.blob.core.windows.net/msmarcoranking/top1000.dev.tar.gz
Resolving msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)... 40.112.152.16
Connecting to msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)|40.112.152.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 687414398 (656M) [application/x-gzip]
Saving to: ‘top1000.dev.tar.gz’


2020-11-16 21:19:43 (14.7 MB/s) - ‘top1000.dev.tar.gz’ saved [687414398/687414398]

--2020-11-16 21:19:43--  https://msmarco.blob.core.windows.net/msmarcoranking/qrels.dev.tsv
Resolving msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)... 40.112.152.16
Connecting to msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)|40.112.152.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1201626 (1.1M) [text/tab-separated-values]
Saving to: ‘qrels.dev.tsv’


2020-11-16 21:19:44 (1.57 MB/s) - ‘qrels.dev.tsv’ saved [1201626/1201626]

Ge

In [None]:
!wget https://msmarco.blob.core.windows.net/msmarcoranking/top1000.train.tar.gz
!wget https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv
!tar xzf top1000.train.tar.gz

############Alert#############
# This requires prob 200GB space. I failed to run it on Colab
##############################

print('Generating potential hard negatives for the query train set\n')

qid_pid = {}
qids = set()
with open("/content/qrels.train.tsv") as infile:
  for line in infile:
    qid, _, pid, rel = line.split('\t')
    qid_pid[(qid, pid)] = rel
    qids.add(qid)


with open('/content/potential_strong_neg.train.tsv', 'w') as outfile:
  with open("/content/top1000.train.txt") as infile:
    for line in infile:
      qid, pid, query, passage = line.split('\t')
      if qid not in qids:
        print("query {} not in qrels.tsv\n")
        continue
      if (qid, pid) not in qid_pid:
        s = '\t'.join([str(qid), str(pid), query, passage])
        outfile.write(s + '\n')

--2020-11-13 22:51:34--  https://msmarco.blob.core.windows.net/msmarcoranking/top1000.train.tar.gz
Resolving msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)... 40.112.152.16
Connecting to msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)|40.112.152.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11519984492 (11G) [application/x-gzip]
Saving to: ‘top1000.train.tar.gz’


2020-11-13 23:13:19 (3.62 MB/s) - Read error at byte 4949278720/11519984492 (Connection timed out). Retrying.

--2020-11-13 23:13:20--  (try: 2)  https://msmarco.blob.core.windows.net/msmarcoranking/top1000.train.tar.gz
Connecting to msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)|40.112.152.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11519984492 (11G) [application/x-gzip]
Saving to: ‘top1000.train.tar.gz’

--2020-11-13 23:27:52--  https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv
Resolving msmarco.blo

In [2]:
%%shell
# Stop at the first failing command so that a broken download fails the cell
# instead of being masked by the exit status of `ls`.
set -e
# -c resumes a partially downloaded archive (11G) rather than starting from byte 0.
wget -c https://msmarco.blob.core.windows.net/msmarcoranking/top1000.train.tar.gz
# Show the on-disk size of the downloaded archive.
ls -sh top1000.train.tar.gz

--2020-11-16 21:28:42--  https://msmarco.blob.core.windows.net/msmarcoranking/top1000.train.tar.gz
Resolving msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)... 40.112.152.16
Connecting to msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)|40.112.152.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11519984492 (11G) [application/x-gzip]
Saving to: ‘top1000.train.tar.gz’

top1000.train.tar.g   0%[                    ]   2.98M  2.70MB/s               

CalledProcessError: ignored

In [None]:
%%shell
# List the members of the train archive without extracting anything.
tar --list --file=top1000.train.tar.gz

top1000.train.txt


