# main

## retrieval

### definitions

In [98]:
# Input query file handed to indrirunquery, and the path the ranked results
# are written to before evaluation.
train_queries_file = 'train/queriesROBUST_train_1.xml'
train_results_file = 'train/train_1_res.csv' 

# Pseudo-relevance-feedback flags, passed to indrirunquery as-is. Reference:
# https://lemur.sourceforge.io/indri/IndriRunQuery.html
pseudo_rel_feedb_params = '-fbDocs=10 -fbTerms=4 -fbMu=0.5 -fbOrigWeight=0.8'

### run queries in indri

In [101]:
def run_indri(pseudo_rel_feedb_params, train_queries_file, index='ROBUSTindex'):
    """Run ``indrirunquery`` in TREC output format and return its output lines.

    The command is started directly (no shell) from an argument list, so the
    parameters are not re-interpreted by whichever shell the kernel happens to
    use, and a failing run is reported instead of silently returning garbage.

    Parameters
    ----------
    pseudo_rel_feedb_params : str
        Extra command-line flags, separated by whitespace
        (e.g. ``'-fbDocs=10 -fbTerms=4'``). May be empty.
    train_queries_file : str
        Path of the query file passed as the last argument.
    index : str, optional
        Value for the ``-index=`` flag. Defaults to ``'ROBUSTindex'``.

    Returns
    -------
    list of str
        Standard output of ``indrirunquery``, one element per line.

    Raises
    ------
    RuntimeError
        If ``indrirunquery`` exits with a non-zero status.
    FileNotFoundError
        If the ``indrirunquery`` executable cannot be found.
    """
    import subprocess

    command = ['indrirunquery', f'-index={index}', '-trecFormat=true']
    # Each whitespace-separated flag becomes its own argument.
    command.extend(pseudo_rel_feedb_params.split())
    command.append(train_queries_file)

    # stderr is captured separately so warnings never mix into the result lines.
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    if completed.returncode != 0:
        raise RuntimeError(
            f'indrirunquery failed with exit status {completed.returncode}: '
            f'{completed.stderr.strip()}'
        )
    return completed.stdout.splitlines()

# Run the training queries with the feedback flags defined earlier and keep
# the captured indrirunquery output lines for the following cells.
qres = run_indri(pseudo_rel_feedb_params, train_queries_file)

### indri query results to pandas DataFrame

In [104]:
import pandas as pd

def res_to_df(qres):
    """Convert TREC-format result lines into a pandas DataFrame.

    Each non-blank line must hold exactly six whitespace-separated fields:
    ``<queryID> Q0 <DocID> <rank> <score> <runID>``.

    Parameters
    ----------
    qres : iterable of str
        Result lines, e.g. the captured output of ``indrirunquery``.

    Returns
    -------
    pandas.DataFrame
        One row per result line with columns ``queryID, Q0, DocID, rank,
        score, runID``. All values are kept as strings. Empty input gives an
        empty frame that still has these six columns.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly six fields.
    """
    # query result trec format
    # <queryID> Q0 <DocID> <rank> <score> <runID>
    columns = ['queryID', 'Q0', 'DocID', 'rank', 'score', 'runID']

    # split() with no argument tolerates tabs and repeated spaces; blank lines
    # (e.g. a trailing newline in the captured output) are skipped.
    records = [line.split() for line in qres if line.strip()]

    for fields in records:
        if len(fields) != len(columns):
            # Fail loudly rather than letting pandas pad short rows with None.
            raise ValueError(
                f'expected {len(columns)} fields per result line, '
                f'got {len(fields)}: {" ".join(fields)!r}'
            )

    # Passing columns= to the constructor also gives an empty input the
    # right header instead of a length-mismatch error.
    return pd.DataFrame(records, columns=columns)

# Parse the captured result lines, then show the first rows and the
# (rows, columns) shape as a quick sanity check.
qres_df = res_to_df(qres)
print(qres_df.head())
print(qres_df.shape)

  queryID  Q0        DocID rank     score  runID
0     301  Q0  FBIS4-41991    1  -5.28365  indri
1     301  Q0  FBIS4-38364    2  -5.35954  indri
2     301  Q0  FBIS4-55395    3  -5.39989  indri
3     301  Q0   FBIS4-7811    4  -5.42014  indri
4     301  Q0  FBIS3-23986    5  -5.44377  indri
(1000, 6)


## improve ranking

### !!!! our ranking model !!!!

In [95]:
# TODO: re-rank the rows of qres_df here.
# The new ranking must stay in TREC format: <queryID> Q0 <DocID> <rank> <score> <runID>

### ranking results to file

In [105]:
def res_to_file(qres_df, train_results_file):
    """Write the ranking to ``train_results_file``, one row per line.

    Values are separated by a single space; neither the column header nor
    the DataFrame index is written.
    """
    csv_options = {'sep': ' ', 'header': False, 'index': False}
    qres_df.to_csv(train_results_file, **csv_options)

# Persist the current ranking so the evaluation step can read it from disk.
res_to_file(qres_df, train_results_file)

## results evaluation
*Requires building trec_eval in Cygwin and running the notebook from Cygwin.*

In [118]:
def run_trec_eval(train_results_file, qrels_file='qrels_50_Queries', trec_eval_bin=None):
    """Run trec_eval on a results file and return the value of its ``map`` line.

    trec_eval is started directly (no shell) and its output is filtered in
    Python, so no external ``grep`` is needed and the executable path is
    built with the platform's own path separator.

    Parameters
    ----------
    train_results_file : str
        Path of the ranking file to evaluate.
    qrels_file : str, optional
        Path of the relevance-judgement file. Defaults to
        ``'qrels_50_Queries'``.
    trec_eval_bin : str, optional
        Path of the trec_eval executable. Defaults to ``trec_eval.exe``
        inside the ``trec_eval-9.0.7`` directory.

    Returns
    -------
    str
        Third tab-separated field of the first output line starting with
        ``map``, returned as text exactly as trec_eval printed it.

    Raises
    ------
    RuntimeError
        If trec_eval prints no usable ``map`` line, for example because it
        failed. The message includes the exit status and stderr.
    FileNotFoundError
        If the trec_eval executable cannot be found.
    """
    import os
    import subprocess

    if trec_eval_bin is None:
        trec_eval_bin = os.path.join('trec_eval-9.0.7', 'trec_eval.exe')

    completed = subprocess.run(
        [trec_eval_bin, qrels_file, train_results_file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )

    # Same selection as `grep "^map"` followed by taking the first hit.
    for line in completed.stdout.splitlines():
        if line.startswith('map'):
            fields = line.split('\t')
            if len(fields) >= 3:
                return fields[2]

    raise RuntimeError(
        f'trec_eval produced no "map" line (exit status {completed.returncode}): '
        f'{completed.stderr.strip()}'
    )

# Show the `map` value trec_eval reports for the results file written above.
print(run_trec_eval(train_results_file))

0.0510


# functions

In [119]:
def run_retreival(pseudo_rel_feedb_params, train_queries_file):
    """Run the queries through Indri and return the results as a DataFrame."""
    return res_to_df(run_indri(pseudo_rel_feedb_params, train_queries_file))

def run_our_model(qres_df):
    """Re-ranking hook: currently hands back the very same object it receives."""
    reranked_df = qres_df  # pass-through, no re-ranking is applied
    return reranked_df

def evaluate(qres_df, train_results_file):
    """Write the ranking to ``train_results_file`` and return what run_trec_eval reports for it."""
    res_to_file(qres_df, train_results_file)
    map_value = run_trec_eval(train_results_file)
    return map_value

def run_ir(pseudo_rel_feedb_params, train_queries_file, train_results_file):
    """Full pipeline: retrieve, apply the re-ranking hook, then evaluate.

    Returns whatever ``evaluate`` returns for the re-ranked results.
    """
    return evaluate(
        run_our_model(run_retreival(pseudo_rel_feedb_params, train_queries_file)),
        train_results_file,
    )

# all in a single cell 

In [136]:
# Number substituted into the query-file and results-file names below.
train_queries = 5
train_queries_file = f'train/queriesROBUST_train_{train_queries}.xml'
train_results_file = f'train/train_{train_queries}_res.csv' 

# https://lemur.sourceforge.io/indri/IndriRunQuery.html
pseudo_rel_feedb_params = '-fbDocs=50 -fbTerms=10 -fbMu=0.9 -fbOrigWeight=0.9'
smoothing_rule = '-rule=dirichlet'

# All indrirunquery flags joined into the single string run_ir expects.
params = f'{pseudo_rel_feedb_params} {smoothing_rule}'

# Retrieve, re-rank and evaluate in one call; prints the value run_ir returns.
print(run_ir(params, train_queries_file, train_results_file))

0.1914


# sandbox

### my files

In [24]:
# List the contents of the notebook's working directory.
!ls

ir_chalenge.ipynb
IR_Challenge.pdf
qrels_50_Queries
queriesROBUST.xml
ROBUSTindex
train
trec_eval-9.0.7


### training queries set 

In [25]:
import pandas as pd

# Load the space-separated, header-less judgement file and name its four columns.
a = pd.read_csv('qrels_50_Queries', sep=' ', header=None)
a.columns = ['qid', 'qother', 'docid', 'is_doc_rel']

# Preview, overall size, and the number of rows per query id (sorted by id).
print(a.head())
print(a.shape)
print(a['qid'].value_counts().sort_index())

   qid  qother        docid  is_doc_rel
0  301       0  FBIS3-10082           1
1  301       0  FBIS3-10169           0
2  301       0  FBIS3-10243           1
3  301       0  FBIS3-10319           0
4  301       0  FBIS3-10397           1
(61511, 4)
301    1512
302     874
303     761
304    1080
305    1463
306    1318
307    1236
308    1149
309    1057
310    1399
311    1299
312    1248
313     968
314    1438
315    1417
316    1521
317    1072
318    1393
319    1454
320     960
321    1369
322    1504
323    1291
324     822
325    1148
326     778
327    1332
328    1006
329    1224
330    1277
331    1381
332    1243
333     978
334    1204
335     950
336    1518
337    1044
338    1230
339     932
340    1409
341    1134
342    1612
343    1549
344    1250
345     867
346    1371
347    1079
348    1668
349    1437
350    1285
Name: qid, dtype: int64


### all queries

In [26]:
# Dump the complete query file to the cell output for inspection.
with open('queriesROBUST.xml', 'r') as queries_file:
    print(queries_file.read())

<parameters>
	<query>
		<number>301</number>
		<text>#combine( international organized crime )</text>
	</query>
	<query>
		<number>302</number>
		<text>#combine( poliomyelitis post polio )</text>
	</query>
	<query>
		<number>303</number>
		<text>#combine( hubble telescope achievements )</text>
	</query>
	<query>
		<number>304</number>
		<text>#combine( endangered species mammals )</text>
	</query>
	<query>
		<number>305</number>
		<text>#combine( dangerous vehicles )</text>
	</query>
	<query>
		<number>306</number>
		<text>#combine( african civilian deaths )</text>
	</query>
	<query>
		<number>307</number>
		<text>#combine( new hydroelectric projects )</text>
	</query>
	<query>
		<number>308</number>
		<text>#combine( implant dentistry )</text>
	</query>
	<query>
		<number>309</number>
		<text>#combine( rap crime )</text>
	</query>
	<query>
		<number>310</number>
		<text>#combine( radio waves brain cancer )</text>
	</query>
	<query>
		<number>311</number>
		<text>#combine( industrial e

### running indri query from python

#### run single word query 

In [2]:
# Ad-hoc check: run the single-term query "bobb" against ROBUSTindex.
!indrirunquery -index=ROBUSTindex -query=bobb

-5.74828	LA032890-0110	0	1264
-6.90624	LA051190-0116	0	496
-6.94036	LA091590-0120	0	600
-6.98527	LA011490-0143	0	1823
-7.20127	LA051190-0032	0	183
-7.27628	LA102589-0005	0	392
-7.3039	LA092989-0050	0	473
-7.36427	LA050390-0031	0	658
-7.42449	LA032290-0221	0	854
-7.48015	LA021389-0012	0	1046
-7.55353	LA010189-0137	0	1316
-7.56094	LA011990-0091	0	3266
-7.60661	LA011890-0224	0	1524
-7.7691	LA010889-0195	0	2234
-7.8919	LA053090-0144	0	177
-7.89451	LA053190-0148	0	184
-7.91443	LA082990-0004	0	238
-7.92965	LA102189-0094	0	280
-7.96012	LA053190-0112	0	366
-7.98084	LA102689-0029	0	426
-8.00515	LA102689-0024	0	498
-8.03213	LA021590-0069	0	580
-8.06471	LA012490-0009	0	682
-8.09657	LA060790-0089	0	785
-8.28672	FBIS4-67398	0	1473
-8.28924	FBIS4-21487	0	1483
-8.61431	LA042990-0169	0	3013


#### run all queries from file

In [48]:
# Run the queries from the training file against ROBUSTindex; no -trecFormat
# flag is given here, so the output is not in the six-column TREC layout.
!indrirunquery -index=ROBUSTindex train/queriesROBUST_train_5.xml

-5.79587	FBIS4-41991	0	1124
-5.90807	FBIS4-38364	0	2152
-5.95047	FBIS4-55395	0	932
-5.98923	FBIS4-7811	0	2237
-6.01082	FBIS3-23986	0	1002
-6.07622	FBIS3-24143	0	2182
-6.08028	FBIS3-37418	0	1574
-6.14724	FBIS4-22471	0	1697
-6.18212	FBIS4-46734	0	2338
-6.18403	FBIS4-16951	0	2181
-6.2255	FBIS3-19646	0	456
-6.22583	FBIS3-21961	0	457
-6.24489	FBIS4-46846	0	2290
-6.25641	FBIS4-68801	0	2307
-6.26477	FBIS3-18129	0	809
-6.29473	FBIS4-46780	0	2772
-6.29581	FBIS3-58058	0	935
-6.29581	FBIS3-58028	0	935
-6.3039	FBIS3-24277	0	1513
-6.31434	FBIS4-19535	0	361
-6.32007	FBIS4-7688	0	620
-6.3209	FBIS3-41247	0	644
-6.3209	FBIS3-26415	0	644
-6.33596	FBIS3-19199	0	360
-6.3527	FBIS4-40260	0	774
-6.36653	FBIS4-45477	0	839
-6.36683	FBIS4-43801	0	840
-6.36873	FBIS4-43965	0	450
-6.38745	FBIS4-10817	0	867
-6.39191	FBIS4-41541	0	1126
-6.39934	LA121990-0141	0	319
-6.40266	LA102290-0116	0	345
-6.40729	LA041789-0010	0	2334
-6.42149	FBIS3-24145	0	7442
-6.42948	FBIS4-40936	0	531
-6.43117	FBIS4-22596	0	1793
-6.43482	LA0

-10.7869	LA060689-0128	0	904
-10.7872	LA030589-0043	0	905
-10.7896	LA080689-0092	0	913
-10.7975	FR940505-2-00124	0	940
-10.8001	FT934-3608	0	949
-10.8021	FT942-12872	0	956
-10.8038	FT943-15591	0	962
-10.805	FT921-503	0	966
-10.8059	FT932-13081	0	969
-10.8059	FT933-14487	0	969
-10.811	LA032590-0128	0	987
-10.8116	FBIS4-57620	0	989
-10.8125	FT922-11472	0	992
-10.8139	FR940810-0-00247	0	997
-10.814	FBIS4-60455	0	315
-10.8142	LA010689-0155	0	998
-10.8148	FBIS3-29421	0	1000
-10.815	LA050690-0154	0	1001
-10.8153	FT933-8843	0	1002
-10.8165	FBIS4-44662	0	1006
-10.817	LA123089-0084	0	1008
-10.8179	FBIS3-51444	0	1011
-10.8182	FBIS4-46151	0	1012
-10.8196	LA083090-0083	0	1017
-10.8207	LA072490-0143	0	1021
-10.823	FT923-803	0	1029
-10.8239	LA111789-0130	0	1032
-10.826	FBIS4-49074	0	754
-10.8287	LA100490-0212	0	1049
-10.8295	FBIS3-5693	0	359
-10.8352	FBIS4-49929	0	524
-10.8365	LA011190-0120	0	1077
-10.8368	LA103189-0032	0	1078
-10.8401	LA033189-0163	0	1090
-10.8413	LA121189-0039	0	1094
-10.844	LA120

-7.48345	FBIS4-22168	0	2300
-7.48601	FR941213-2-00083	0	1289
-7.48654	FR940628-1-00078	0	476
-7.48958	FR940505-1-00221	0	678
-7.4899	FR940425-0-00030	0	679
-7.49001	FT943-11205	0	490
-7.49214	FBIS3-58463	0	321
-7.49235	FBIS4-66625	0	497
-7.49246	LA102189-0139	0	585
-7.49337	FR940930-0-00159	0	429
-7.49404	FBIS4-22563	0	1627
-7.49602	FBIS3-595	0	508
-7.49848	FR940303-2-00168	0	578
-7.50092	FR940921-0-00049	0	838
-7.50101	FR940921-0-00103	0	777
-7.50165	FBIS3-10720	0	525
-7.50297	LA051590-0035	0	529
-7.50429	FR940104-1-00042	0	969
-7.50519	FR941104-0-00021	0	870
-7.51074	FBIS4-177	0	743
-7.51133	FR940921-0-00041	0	811
-7.51167	FBIS4-68207	0	3445
-7.51262	FR941213-0-00077	0	998
-7.51284	FBIS3-22412	0	380
-7.51331	FR940805-1-00043	0	624
-7.51351	LA062290-0006	0	752
-7.51475	FR940425-0-00042	0	759
-7.51676	FR940106-2-00132	0	829
-7.51681	FR940505-1-00444	0	701
-7.51837	LA021289-0135	0	576
-7.51861	FR940930-0-00200	0	1019
-7.52026	FR940921-0-00040	0	777
-7.52452	LA071289-0003	0	788
-7.52614	