In [3]:
import pandas as pd 

DATA_DIR='/projets/iris/PROJETS/lboualil/workdata/msmarco-passage'
RUN_DIR='/projets/iris/PROJETS/lboualil/workdata/run_files'

## Read Run file top1000 with Anserini BM25 with Doc2Qeury

In [30]:
run=pd.read_csv(f'{RUN_DIR}/run.dev.small.tsv', delimiter='\t', header=None, names=['q_id', 'p_id','rank'])
run.head(10)

Unnamed: 0,q_id,p_id,rank
0,1048585,7187160,1
1,1048585,7187157,2
2,1048585,7187163,3
3,1048585,7187158,4
4,1048585,7617404,5
5,1048585,7187156,6
6,1048585,7187155,7
7,1048585,2298838,8
8,1048585,574346,9
9,1048585,2298839,10


## Read qrels file q_id, p_id, label=1

In [31]:
qrels=pd.read_csv(f'{DATA_DIR}/qrels.dev.small.tsv', delimiter='\t', header=None, names=['q_id','0', 'p_id','label'], usecols=['q_id', 'p_id','label'])
print(qrels.shape)
qrels.sample(10)

(7437, 3)


Unnamed: 0,q_id,p_id,label
234,1088302,7094594,1
6322,1026711,7861043,1
206,1088658,7092278,1
5811,283344,7795548,1
3708,481961,7533440,1
4223,662282,7590829,1
302,1087327,7102802,1
6870,1021639,7713994,1
3849,395538,7548990,1
7384,208198,8001318,1


In [32]:
(qrels[(qrels['label']==1)]).shape

(7437, 3)

## Read queries file q_id, q_txt

In [37]:
queries=pd.read_csv(f'{DATA_DIR}/queries.dev.small.tsv', delimiter='\t', header=None, names=['q_id', 'q_txt'])
print(queries.shape)
queries.sample(10)

(6980, 2)


Unnamed: 0,q_id,q_txt
386,275534,how many books did montag steal
5461,995598,which car brands hold value
5775,1085035,what did daniel fahrenheit invent
3052,874299,what lizards like to be handled
1493,1089925,the bancorp routing number
6116,493543,school age definition
1422,1089036,vasospasms caused by what
6954,830649,what is the meaning behind welcome to the blac...
1169,1085545,what county is geneva il in?
3936,402427,is an american staffordshire a pitbull


In [38]:
queries.set_index('q_id', inplace=True, drop=True)
queries.shape

(6980, 1)

In [39]:
queries=queries.sort_values('q_id')
queries.head(10)

Unnamed: 0_level_0,q_txt
q_id,Unnamed: 1_level_1
2,Androgen receptor define
1215,3 levels of government in canada and their res...
1288,3/5 of 60
1576,60x40 slab cost
2235,Bethel University was founded in what year
2798,Does Suddenlink Carry ESPN3
2962,Explain what a bone scan is and what it is use...
4696,Is the Louisiana sales tax 4.75
4947,Ludacris Net Worth
5925,Sony PS-LX300USB how to connect to pc


In [42]:
queries.loc[478691,:]

q_txt    positive effects of agriculture
Name: 478691, dtype: object

## Read collection/passages p_id, p_txt

In [4]:
passages = pd.read_csv(f'{DATA_DIR}/collection.tsv', delimiter='\t', header=None, names=['p_id', 'p_txt'])
print(passages.shape)
passages.sample(10)

(8841823, 2)


Unnamed: 0,p_id,p_txt
1103334,1103334,"Branding, Identity & Logo Design Explained. A ..."
1234033,1234033,"If you continue to cycle during the month, you..."
8296941,8296941,1. Make Sure It's Actually a Subpoena. If it i...
191623,191623,(Recall from Part 1 of this series that a sche...
6328289,6328289,The best way to circulate the warm air is to r...
7691105,7691105,Best Answer: Lots of suspects here! It's doubt...
7525976,7525976,by Indika. AES vs TKIP. When communicating ove...
1141969,1141969,For the city in Florida with movie studios sim...
38205,38205,The inside of your bones are filled with a sof...
980758,980758,Capillaries are the blood vessels that lead fr...


## Merge run, queries, collection to obtain the text of queries and passages

In [40]:
dff = pd.merge(pd.merge(run, queries, on='q_id'), passages, on='p_id')
print(dff.shape)
dff.sample(10)

(6974598, 5)


Unnamed: 0,q_id,p_id,rank,q_txt,p_txt
6390624,478691,3745234,205,positive effects of agriculture,As a leading executive in the field of manufac...
5972762,717751,7980789,229,what is analytical approach to intelligence,10 Smartest Countries in Europe. Published on ...
73838,815421,7947567,923,what is the definition of restatement,MTS is a file extension for an AVCHD (Advanced...
3659234,1100580,4903523,410,fin de siecle meaning,Sogeti was the original name for the entire Ca...
1316396,1068276,2205995,976,what is a rebuttal expert witness,"Note: In hearsay law, witness means someone wh..."
195575,689885,2669644,381,what is a malanga,What impressed me about Ruthless Supplements i...
1832738,1097619,713987,72,how long would the border wall need to be?,President Donald Trump has directed the Depart...
2905245,1102001,6314158,395,why is the whooping crane endangered,1973: With the support of President Richard Ni...
2544645,853344,1600504,166,what is the weather forecast at this time what...,Current U.S. National Radar--Current. The Curr...
2035438,761627,7979686,982,what is it when your battery loses t charge,"Capacity of the charger. Normally, batteries a..."


In [43]:
run.shape[0] == dff.shape[0]

True

## Add the column label from qrels

In [44]:
dffm=pd.merge(dff,qrels,on=['q_id','p_id'])
dff['label']=[0]*dff.shape[0]
dffinal=pd.concat([dffm,dff], sort=False).drop_duplicates(['q_id','p_id'], keep='first')
dffinal.head()

Unnamed: 0,q_id,p_id,rank,q_txt,p_txt,label
0,1048585,7187158,4,what is paula deen's brother,Paula Deen and her brother Earl W. Bubba Hiers...,1
1,1084602,7126763,4,cast of velvet on netflix,"Velvet (tv series 2013â2016) - imdb, With pa...",1
2,2,4339068,10,Androgen receptor define,"The androgen receptor (AR), also known as NR3C...",1
3,1048642,7187023,33,what is paranoid sc,Paranoid schizophrenia is a psychotic disorder...,1
4,992618,4692413,10,where is seneca south carolina?,"Seneca is a city in Oconee County, South Carol...",1


In [45]:
dffinal[(dffinal['label']==0)].shape[0] == run.shape[0]-dffm.shape[0]

True

In [47]:
dfsorted=dffinal.sort_values(by=['q_id','rank'])
dfsorted.reset_index(drop=True, inplace=True)
dfsorted.head(1010)

Unnamed: 0,q_id,p_id,rank,q_txt,p_txt,label
0,2,7035024,1,Androgen receptor define,SRF may bring a number of other proteins to th...,0
1,2,1782337,2,Androgen receptor define,Enzalutamide is an androgen receptor inhibitor...,0
2,2,2022782,3,Androgen receptor define,The androgen dihydrotestosterone binds to the ...,0
3,2,4339073,4,Androgen receptor define,Androgens and androgen receptors (AR) play a p...,0
4,2,4339075,5,Androgen receptor define,"During androgen-independent progression, prost...",0
5,2,5414414,6,Androgen receptor define,Most of the androgen receptors on the market h...,0
6,2,7496508,7,Androgen receptor define,Androgens and androgen receptors also have oth...,0
7,2,3634076,8,Androgen receptor define,SARMS or Selective Androgen Receptor Modulator...,0
8,2,1001873,9,Androgen receptor define,The AR gene provides instructions for making a...,0
9,2,4339068,10,Androgen receptor define,"The androgen receptor (AR), also known as NR3C...",1


## How many queries have top1000 docs retrieved

In [15]:
df1000=dfsorted[(dfsorted['rank']==1000)]
top1000queries=set(df1000['q_id'])

In [16]:
for q in top1000queries:
    if len(dfsorted[(dfsorted['q_id']==q)])<1000:
        print(q)
        break

In [17]:
dfsortedtop1000= dfsorted[dfsorted['q_id'].isin(top1000queries)]

In [21]:
len(dfsortedtop1000[dfsortedtop1000['label']==1])

6590

In [23]:
dfsortedtop1000=dfsortedtop1000.sort_values(by=['q_id','rank'])
dfsortedtop1000.shape

(6972000, 6)

In [24]:
dfsortedtop1000=dfsortedtop1000.reset_index(drop=True)
dfsortedtop1000.head(1010)

Unnamed: 0,q_id,p_id,rank,q_txt,p_txt,label
0,2,7035024,1,Androgen receptor define,SRF may bring a number of other proteins to th...,0
1,2,1782337,2,Androgen receptor define,Enzalutamide is an androgen receptor inhibitor...,0
2,2,2022782,3,Androgen receptor define,The androgen dihydrotestosterone binds to the ...,0
3,2,4339073,4,Androgen receptor define,Androgens and androgen receptors (AR) play a p...,0
4,2,4339075,5,Androgen receptor define,"During androgen-independent progression, prost...",0
5,2,5414414,6,Androgen receptor define,Most of the androgen receptors on the market h...,0
6,2,7496508,7,Androgen receptor define,Androgens and androgen receptors also have oth...,0
7,2,3634076,8,Androgen receptor define,SARMS or Selective Androgen Receptor Modulator...,0
8,2,1001873,9,Androgen receptor define,The AR gene provides instructions for making a...,0
9,2,4339068,10,Androgen receptor define,"The androgen receptor (AR), also known as NR3C...",1


In [25]:
dfsortedtop1000.to_csv(f'{DATA_DIR}/top1000.doc2query.dev.small.csv')

In [35]:
dfsortedtop1000[:1000000].to_csv(f'{DATA_DIR}/top1000.doc2query.dev.small_1000queries.csv')

In [36]:
df=dfsortedtop1000[:1000000]
len(set(df.loc[:,'q_id']))

1000

## Save the query and passage ids in a separate txt file

In [43]:
dev= pd.read_csv(f'{DATA_DIR}/top1000.doc2query.dev.small_1000queries.csv', index_col=0)
dev.head()

Unnamed: 0,q_id,p_id,rank,q_txt,p_txt,label
0,2,7035024,1,Androgen receptor define,SRF may bring a number of other proteins to th...,0
1,2,1782337,2,Androgen receptor define,Enzalutamide is an androgen receptor inhibitor...,0
2,2,2022782,3,Androgen receptor define,The androgen dihydrotestosterone binds to the ...,0
3,2,4339073,4,Androgen receptor define,Androgens and androgen receptors (AR) play a p...,0
4,2,4339075,5,Androgen receptor define,"During androgen-independent progression, prost...",0


In [44]:
dev100queries=dev.head(100*1000)
#dev100queries=dev100queries.drop(columns=['q_id','p_id'])
dev100queries.to_csv(f'{DATA_DIR}/top1000.doc2query.dev.small_100queries.csv')

In [45]:
df= pd.read_csv(f'{DATA_DIR}/marked_top1000.doc2query.dev.small_100queries.csv', index_col=0)
with open(f'{DATA_DIR}/query_doc_ids.top1000.doc2query.dev.small_100queries.txt', 'w') as ids_file:
    for i, row in df.iterrows():
        ids_file.write(f"{row['q_id']}\t{row['p_id']}\n")


In [48]:
df=df.drop(columns=['q_id','p_id','rank'])
df.head()

Unnamed: 0,q_txt,p_txt,label,marked_q,marked_p
0,Androgen receptor define,SRF may bring a number of other proteins to th...,0,[e0]Androgen[\e0] [e1]receptor[\e1] define,SRF may bring a number of other proteins to th...
1,Androgen receptor define,Enzalutamide is an androgen receptor inhibitor...,0,[e0]Androgen[\e0] [e1]receptor[\e1] define,Enzalutamide is an [e0]androgen[\e0] [e1]recep...
2,Androgen receptor define,The androgen dihydrotestosterone binds to the ...,0,[e0]Androgen[\e0] [e1]receptor[\e1] define,The [e0]androgen[\e0] dihydrotestosterone bind...
3,Androgen receptor define,Androgens and androgen receptors (AR) play a p...,0,[e0]Androgen[\e0] [e1]receptor[\e1] define,[e0]Androgens[\e0] and [e0]androgen[\e0] [e1]r...
4,Androgen receptor define,"During androgen-independent progression, prost...",0,[e0]Androgen[\e0] [e1]receptor[\e1] define,"During androgen-independent progression , pros..."


In [49]:
df.to_csv(f'{DATA_DIR}/marked_top1000.doc2query.dev.small_100queries_ids_free.csv')

In [None]:
## Base files without markers

In [3]:
df= pd.read_csv(f'{DATA_DIR}/top1000.doc2query.dev.small_100queries.csv', index_col=0)
with open(f'{DATA_DIR}/doc2query_run/base/query_doc_ids.top1000.doc2query.dev.small_100queries_no_markers.txt', 'w') as ids_file:
    for i, row in df.iterrows():
        ids_file.write(f"{row['q_id']}\t{row['p_id']}\n")

In [6]:
df=df.drop(columns=['q_id','p_id','rank'])
df.head()

KeyError: "['q_id' 'p_id' 'rank'] not found in axis"

In [7]:
df.head(1003)

Unnamed: 0,q_txt,p_txt,label
0,Androgen receptor define,SRF may bring a number of other proteins to th...,0
1,Androgen receptor define,Enzalutamide is an androgen receptor inhibitor...,0
2,Androgen receptor define,The androgen dihydrotestosterone binds to the ...,0
3,Androgen receptor define,Androgens and androgen receptors (AR) play a p...,0
4,Androgen receptor define,"During androgen-independent progression, prost...",0
5,Androgen receptor define,Most of the androgen receptors on the market h...,0
6,Androgen receptor define,Androgens and androgen receptors also have oth...,0
7,Androgen receptor define,SARMS or Selective Androgen Receptor Modulator...,0
8,Androgen receptor define,The AR gene provides instructions for making a...,0
9,Androgen receptor define,"The androgen receptor (AR), also known as NR3C...",1


In [5]:
df.to_csv(f'{DATA_DIR}/doc2query_run/base/top1000.doc2query.dev.small_100queries_ids_free.csv')

## complete the <1000 query runs

In [48]:
df1000=dfsorted[(dfsorted['rank']==1000)]
top1000queries=set(df1000['q_id'])

In [49]:
all_queries=set(dfsorted.loc[:,'q_id'])
not1000queries=[q for q in all_queries if q not in top1000queries]


In [50]:
not1000queries= set(not1000queries)

In [51]:
not1000queries

{405737, 537995, 1088884, 1090291, 1098481, 1099981, 1100010, 1100094}

In [52]:
dfsortedtop1000= dfsorted[dfsorted['q_id'].isin(top1000queries)]
len(dfsortedtop1000)

6972000

In [62]:
NUM_DOCS=1000
for q in not1000queries:
    dfake=pd.DataFrame()
    dff=dfsorted[(dfsorted['q_id']==q)]
    n=max(0,NUM_DOCS-dff.shape[0])
    dfake['q_id']=[q]*n
    dfake['p_id']=["00000000"]*n
    dfake['p_txt']=['FAKE DOCUMENT']*n
    dfake['q_txt']=[queries.loc[q,'q_txt']]*n
    dfake['label']=[0]*n
    dfake['rank']=list(range(dff.shape[0]+1,NUM_DOCS+1))
    dff=dff.append(dfake, ignore_index=True)
    dfsortedtop1000=dfsortedtop1000.append(dff, ignore_index=True)

In [63]:
dfsortedtop1000.tail(1010)

Unnamed: 0,label,p_id,p_txt,q_id,q_txt,rank
6978990,0,00000000,FAKE DOCUMENT,1088884,wadi sirhan,991
6978991,0,00000000,FAKE DOCUMENT,1088884,wadi sirhan,992
6978992,0,00000000,FAKE DOCUMENT,1088884,wadi sirhan,993
6978993,0,00000000,FAKE DOCUMENT,1088884,wadi sirhan,994
6978994,0,00000000,FAKE DOCUMENT,1088884,wadi sirhan,995
6978995,0,00000000,FAKE DOCUMENT,1088884,wadi sirhan,996
6978996,0,00000000,FAKE DOCUMENT,1088884,wadi sirhan,997
6978997,0,00000000,FAKE DOCUMENT,1088884,wadi sirhan,998
6978998,0,00000000,FAKE DOCUMENT,1088884,wadi sirhan,999
6978999,0,00000000,FAKE DOCUMENT,1088884,wadi sirhan,1000


In [65]:
dfsortedtop1000=dfsortedtop1000.sort_values(by=['q_id','rank'])
dfsortedtop1000.shape

(6980000, 6)

In [66]:
dfsortedtop1000=dfsortedtop1000.reset_index(drop=True)
dfsortedtop1000.head(1010)

Unnamed: 0,label,p_id,p_txt,q_id,q_txt,rank
0,0,7035024,SRF may bring a number of other proteins to th...,2,Androgen receptor define,1
1,0,1782337,Enzalutamide is an androgen receptor inhibitor...,2,Androgen receptor define,2
2,0,2022782,The androgen dihydrotestosterone binds to the ...,2,Androgen receptor define,3
3,0,4339073,Androgens and androgen receptors (AR) play a p...,2,Androgen receptor define,4
4,0,4339075,"During androgen-independent progression, prost...",2,Androgen receptor define,5
5,0,5414414,Most of the androgen receptors on the market h...,2,Androgen receptor define,6
6,0,7496508,Androgens and androgen receptors also have oth...,2,Androgen receptor define,7
7,0,3634076,SARMS or Selective Androgen Receptor Modulator...,2,Androgen receptor define,8
8,0,1001873,The AR gene provides instructions for making a...,2,Androgen receptor define,9
9,1,4339068,"The androgen receptor (AR), also known as NR3C...",2,Androgen receptor define,10


In [67]:
dfsortedtop1000.to_csv(f"{DATA_DIR}/top1000.doc2query.dev.small_full.csv")

In [68]:
df= pd.read_csv(f'{DATA_DIR}/top1000.doc2query.dev.small_full.csv', index_col=0)
with open(f'{DATA_DIR}/query_doc_ids.top1000.doc2query.dev.small_full.txt', 'w') as ids_file:
    for i, row in df.iterrows():
        ids_file.write(f"{row['q_id']}\t{row['p_id']}\n")

  mask |= (ar1 == a)


In [69]:
df=df.drop(columns=['q_id','p_id','rank'])
df.head()

Unnamed: 0,label,p_txt,q_txt
0,0,SRF may bring a number of other proteins to th...,Androgen receptor define
1,0,Enzalutamide is an androgen receptor inhibitor...,Androgen receptor define
2,0,The androgen dihydrotestosterone binds to the ...,Androgen receptor define
3,0,Androgens and androgen receptors (AR) play a p...,Androgen receptor define
4,0,"During androgen-independent progression, prost...",Androgen receptor define


In [71]:
df.shape

(6980000, 3)

In [70]:
df.to_csv(f"{DATA_DIR}/top1000.doc2query.dev.small_full_ids_free.csv")

In [3]:
df= pd.read_csv(f'{DATA_DIR}/top1000.doc2query.dev.small_full.csv', index_col=0)

  mask |= (ar1 == a)


In [9]:
df.head(1745000*3+1)

Unnamed: 0,label,p_id,p_txt,q_id,q_txt,rank
0,0,7035024,SRF may bring a number of other proteins to th...,2,Androgen receptor define,1
1,0,1782337,Enzalutamide is an androgen receptor inhibitor...,2,Androgen receptor define,2
2,0,2022782,The androgen dihydrotestosterone binds to the ...,2,Androgen receptor define,3
3,0,4339073,Androgens and androgen receptors (AR) play a p...,2,Androgen receptor define,4
4,0,4339075,"During androgen-independent progression, prost...",2,Androgen receptor define,5
5,0,5414414,Most of the androgen receptors on the market h...,2,Androgen receptor define,6
6,0,7496508,Androgens and androgen receptors also have oth...,2,Androgen receptor define,7
7,0,3634076,SARMS or Selective Androgen Receptor Modulator...,2,Androgen receptor define,8
8,0,1001873,The AR gene provides instructions for making a...,2,Androgen receptor define,9
9,1,4339068,"The androgen receptor (AR), also known as NR3C...",2,Androgen receptor define,10


In [1]:
# TF marked
import pandas as pd 
df= pd.read_csv('/projets/iris/PROJETS/lboualil/workdata/msmarco-passage/TF_Marked_Data/dev/marked_TF_top1000_doc2query_full.csv', index_col=0)

  mask |= (ar1 == a)


In [10]:
df.head()

Unnamed: 0,label,p_id,p_txt,q_id,q_txt,rank,marked_p,marked_q,stats
0,0,7035024,SRF may bring a number of other proteins to th...,2,Androgen receptor define,1,SRF may bring a number of other proteins to th...,[e0]Androgen[\e0] [e1]receptor[\e1] define,"[1, 1]"
1,0,1782337,Enzalutamide is an androgen receptor inhibitor...,2,Androgen receptor define,2,Enzalutamide is an [e0]androgen[\e0] [e1]recep...,[e0]Androgen[\e0] [e1]receptor[\e1] define,"[5, 4]"
2,0,2022782,The androgen dihydrotestosterone binds to the ...,2,Androgen receptor define,3,The [e0]androgen[\e0] dihydrotestosterone bind...,[e0]Androgen[\e0] [e1]receptor[\e1] define,"[3, 2]"
3,0,4339073,Androgens and androgen receptors (AR) play a p...,2,Androgen receptor define,4,[e0]Androgens[\e0] and [e0]androgen[\e0] [e1]r...,[e0]Androgen[\e0] [e1]receptor[\e1] define,"[5, 2]"
4,0,4339075,"During androgen-independent progression, prost...",2,Androgen receptor define,5,"During androgen-independent progression , pros...",[e0]Androgen[\e0] [e1]receptor[\e1] define,"[5, 4]"


In [2]:
with open('/projets/iris/PROJETS/lboualil/workdata/msmarco-passage/TF_Marked_Data/dev/query_doc_ids.top1000.doc2query.dev.small_full.txt', 'w') as ids_file:
    for i, row in df.iterrows():
        ids_file.write(f"{row['q_id']}\t{row['p_id']}\n")

In [11]:
df=df.drop(columns=['q_id','p_id','rank'])
df.to_csv("/projets/iris/PROJETS/lboualil/workdata/msmarco-passage/TF_Marked_Data/dev/marked_TF_top1000_doc2query_full_ids_free.csv")

In [13]:
d=pd.read_csv("/projets/iris/PROJETS/lboualil/workdata/msmarco-passage/TF_Marked_Data/dev/marked_TF_top1000_doc2query_full_ids_free.csv",index_col=0)

In [1]:
import pandas as pd
l=[]
for chunk in pd.read_csv(f"/projets/iris/PROJETS/lboualil/workdata/msmarco-passage/TF_Marked_Data/dev/marked_TF_top1000_doc2query_full_ids_free.csv",index_col=0, chunksize=1745000):
  l.append(chunk)
  print(chunk.shape)

  mask |= (ar1 == a)


(1745000, 6)
(1745000, 6)
(1745000, 6)
(1745000, 6)
