<a href="https://colab.research.google.com/github/ChenKua/xir/blob/main/robust04_terrier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach the user's Google Drive to the Colab VM; the TREC archives are read
# from it in the next cell.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!tar -xzvf "/content/drive/MyDrive/TREC-Disk-4.tar.gz" 
!tar -xzvf "/content/drive/MyDrive/TREC-Disk-5.tar.gz" 

Install PyTerrier - this installs the latest released version of the `python-terrier` package from PyPI.

In [None]:
!pip install python-terrier

import pyterrier as pt
if not pt.started():
    pt.init(mem=8000, version='snapshot', tqdm='notebook', 
            boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"]
           )


In [None]:
# Directory that is scanned for the TREC Disks 4 & 5 corpus files.
DISK45_PATH = "/content/trec45"
# Directory where the Terrier index is written to / loaded from.
INDEX_DIR = "/content/indices"

# Indexing the corpus

In [None]:
import os

# Build the Terrier index for Robust04 (TREC Disks 4 & 5), or reuse the one
# already present in INDEX_DIR so that re-running the notebook is cheap.
index_properties = os.path.join(INDEX_DIR, "data.properties")
if os.path.exists(index_properties):
    indexref = pt.IndexRef.of(index_properties)
else:
    # no-one indexes the congressional record in directory /CR/
    # indeed, recent copies from NIST dont contain it
    # we also remove some of the other unneeded files;
    # files under /DTDS/ are skipped as well: the indexer only reports
    # "found no documents" warnings for them.
    bad = ['/CR/', '/AUX/', '/DTDS/', 'READCHG', 'READMEFB', 'READFRCG',
           'READMEFR', 'READMEFT', 'READMELA']
    # Keep a file only if none of the excluded markers occurs in its path.
    files = [
        path for path in pt.io.find_files(DISK45_PATH)
        if not any(marker in path for marker in bad)
    ]
    # Fail early with a clear message instead of building an empty index.
    if not files:
        raise FileNotFoundError(
            "No corpus files found under %s - was the collection extracted there?"
            % DISK45_PATH
        )
    indexer = pt.TRECCollectionIndexer(INDEX_DIR, verbose=True)
    indexref = indexer.index(files)
    # processing the files took 7 minutes; the total indexing process took 7m40

index = pt.IndexFactory.of(indexref)
print(index.getCollectionStatistics().toString())

  0%|          | 0/2303 [00:00<?, ?files/s]

07:51:39.270 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /content/trec45/TREC-Disk-4/DTDS/CREDTD. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
07:51:39.297 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /content/trec45/TREC-Disk-4/DTDS/CRHDTD. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
07:51:39.306 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /content/trec45/TREC-Disk-4/DTDS/FR94DTD. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression failed
07:51:39.323 [main] WARN org.terrier.indexing.MultiDocumentFileCollection - TRECCollection found no documents in /content/trec45/TREC-Disk-4/DTDS/FTDTD. Perhaps trec.collection.class is wrongly set, TrecDocTags are incorrect, or decompression fai

# Retrieval - Simple Weighting Models

In [None]:
# One retriever per Terrier weighting model, all reading from the same index.
# The order of names on the left matches the order of wmodel strings on the right.
BM25, DPH, PL2, DLM = (
    pt.BatchRetrieve(index, wmodel=wmodel_name)
    for wmodel_name in ("BM25", "DPH", "PL2", "DirichletLM")
)

In [None]:
# Compare the four weighting models on the trec-robust-2004 topics and qrels.
robust04 = pt.get_dataset("trec-robust-2004")
baseline_systems = [BM25, DPH, PL2, DLM]
baseline_names = ["BM25", "DPH", "PL2", "Dirichlet QL"]

pt.Experiment(
    baseline_systems,
    robust04.get_topics(),
    robust04.get_qrels(),
    eval_metrics=["map", "P_10", "P_20", "ndcg_cut_20"],
    names=baseline_names,
)

Downloading trec-robust-2004 topics to /root/.pyterrier/corpora/trec-robust-2004/04.testset.gz


04.testset.gz:   0%|          | 0.00/33.5k [00:00<?, ?iB/s]

Downloading trec-robust-2004 qrels to /root/.pyterrier/corpora/trec-robust-2004/qrels.robust2004.txt


qrels.robust2004.txt:   0%|          | 0.00/6.24M [00:00<?, ?iB/s]

Unnamed: 0,name,map,P_10,P_20,ndcg_cut_20
0,BM25,0.241766,0.426104,0.349398,0.408061
1,DPH,0.251307,0.44739,0.361446,0.422524
2,PL2,0.229383,0.420884,0.343775,0.402179
3,Dirichlet QL,0.236826,0.407631,0.337952,0.39687


# Retrieval - Query Expansion

In [None]:
# Query-expansion rewriters, each built over the same index.
Bo1 = pt.rewrite.Bo1QueryExpansion(index)
KL = pt.rewrite.KLQueryExpansion(index)
RM3 = pt.rewrite.RM3(index)

# Each expansion pipeline is: BM25 first pass >> rewrite the query >> BM25 again.
# The dict keeps insertion order, so systems and names stay aligned.
qe_pipelines = {
    "BM25": BM25,
    "+Bo1": BM25 >> Bo1 >> BM25,
    "+KL": BM25 >> KL >> BM25,
    "+RM3": BM25 >> RM3 >> BM25,
}

robust04 = pt.get_dataset("trec-robust-2004")
pt.Experiment(
    list(qe_pipelines.values()),
    robust04.get_topics(),
    robust04.get_qrels(),
    eval_metrics=["map", "P_10", "P_20", "ndcg_cut_20"],
    names=list(qe_pipelines.keys()),
)

Unnamed: 0,name,map,P_10,P_20,ndcg_cut_20
0,BM25,0.241766,0.426104,0.349398,0.408061
1,+Bo1,0.279458,0.448996,0.378916,0.436533
2,+KL,0.279401,0.444177,0.378313,0.435196
3,+RM3,0.27673,0.453414,0.380522,0.430947
