In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import random
import time
import warnings

from tqdm import tqdm

# Suppress unnecessary FutureWarning
warnings.filterwarnings('ignore', category=FutureWarning)

In [12]:
# Load the three lab CSV files (documents, topics, relevance judgements)
# from the shared Kaggle input directory.
DATA_DIR = '/kaggle/input/lab-nir-2024'

docs = pd.read_csv(f'{DATA_DIR}/lab_docs.csv')
topics = pd.read_csv(f'{DATA_DIR}/lab_topics.csv')
qrels = pd.read_csv(f'{DATA_DIR}/lab_qrels.csv')

In [3]:
# Download and run the SDKMAN installer, load SDKMAN into the same shell, then
# use it to install the `11.0.22-amzn` Java candidate.
# NOTE(review): the installer script is piped from the network straight into bash,
# so it runs without being inspected first.
# stdin of `sdk install` is redirected from /dev/null — presumably so it never
# waits on an interactive prompt; TODO confirm.
!curl -s "https://get.sdkman.io" | bash && source "$HOME/.sdkman/bin/sdkman-init.sh" && sdk install java 11.0.22-amzn < /dev/null


                                -+syyyyyyys:
                            `/yho:`       -yd.
                         `/yh/`             +m.
                       .oho.                 hy                          .`
                     .sh/`                   :N`                `-/o`  `+dyyo:.
                   .yh:`                     `M-          `-/osysoym  :hs` `-+sys:      hhyssssssssy+
                 .sh:`                       `N:          ms/-``  yy.yh-      -hy.    `.N-````````+N.
               `od/`                         `N-       -/oM-      ddd+`     `sd:     hNNm        -N:
              :do`                           .M.       dMMM-     `ms.      /d+`     `NMMs       `do
            .yy-                             :N`    ```mMMM.      -      -hy.       /MMM:       yh
          `+d+`           `:/oo/`       `-/osyh/ossssssdNMM`           .sh:         yMMN`      /m.
         -dh-           :ymNMMMMy  `-/shmNm-`:N/-.``   `.sN            /N-         `NMMy      .m/
  

In [4]:
pip install python-terrier

Collecting python-terrier
  Downloading python-terrier-0.10.1.tar.gz (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.7/110.7 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting wget (from python-terrier)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting pyjnius>=1.4.2 (from python-terrier)
  Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting matchpy (from python-terrier)
  Downloading matchpy-0.5.5-py3-none-any.whl.metadata (12 kB)
Collecting chest (from python-terrier)
  Downloading chest-0.2.3.tar.gz (9.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting nptyping==1.4.4 (from python-terrier)
  Downloading nptyping-1.4.4-py3-none-any.whl.metadata (7.7 kB)
Collecting ir_datasets>=0.3.2 (from python-terrier)
  Downloading ir_datasets-0.5.7-py3-none-any.whl.metadata (12 

In [5]:
# Export JAVA_HOME before pyterrier is imported; the path is SDKMAN's `current`
# Java candidate — presumably the JDK installed by the SDKMAN cell (TODO confirm).
%env JAVA_HOME=/root/.sdkman/candidates/java/current
import pyterrier as pt
# Only initialise PyTerrier when it has not been started yet, so re-running
# this cell does not call pt.init() a second time.
if not pt.started():
    pt.init()

env: JAVA_HOME=/root/.sdkman/candidates/java/current
terrier-assemblies 5.9 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [6]:
# Build the baseline index: DFIndexer with no properties overridden, written
# to ./indexes/default (any existing index there is overwritten).
# The wall-clock time covers indexing, opening the index and printing its
# collection statistics.
start_time = time.time()

indexer = pt.DFIndexer("./indexes/default", overwrite=True)
index_ref = indexer.index(docs["text"], docs["docno"].astype(str))
index = pt.IndexFactory.of(index_ref)
print(index.getCollectionStatistics().toString())

end_time = time.time()
print(end_time - start_time)

# Cell output: number of entries in the lexicon.
len(index.getLexicon())

Number of documents: 2453
Number of terms: 23693
Number of postings: 208487
Number of fields: 0
Number of tokens: 273373
Field names: []
Positions:   false

5.137542486190796


23693

In [None]:
# Estimate the average time of one lexicon lookup by term, using up to
# N_LOOKUPS randomly chosen lexicon entries of the current `index`.
N_LOOKUPS = 1000

lexicon = index.getLexicon()
n_terms = len(lexicon)

# Sample positions from the lexicon that actually exists (never more than it
# contains) instead of a hard-coded range, so every sampled position is hit.
ix_range = random.sample(range(n_terms), min(N_LOOKUPS, n_terms))
sampled_positions = set(ix_range)  # O(1) membership test inside the loop

# timing
search_times = []
for ix, kv in enumerate(lexicon):
    if ix not in sampled_positions:
        continue
    term = kv.getKey()
    # Time only the lookup itself; perf_counter is a monotonic,
    # high-resolution clock suited to very short durations.
    start_time = time.perf_counter()
    pointer = lexicon[term]
    end_time = time.perf_counter()
    search_times.append(end_time - start_time)

# Average over the lookups that were actually performed (NaN if there were none).
print(sum(search_times) / len(search_times) if search_times else float("nan"))

In [None]:
# Index variant: term pipeline set to "Stopwords" only.
# Written to ./indexes/stopwords; the timed region covers indexing, opening
# the index and printing its collection statistics.
start_time = time.time()

indexer = pt.DFIndexer("./indexes/stopwords", overwrite=True)
indexer.setProperty("termpipelines", "Stopwords")
docnos = docs["docno"].astype(str)
index_ref = indexer.index(docs["text"], docnos)
index = pt.IndexFactory.of(index_ref)
collection_stats = index.getCollectionStatistics()
print(collection_stats.toString())

end_time = time.time()
print(end_time - start_time)

# Cell output: number of entries in the lexicon.
len(index.getLexicon())

In [None]:
# Index variant: term pipeline set to "PorterStemmer" only.
# Written to ./indexes/stemmer; the timed region covers indexing, opening
# the index and printing its collection statistics.
start_time = time.time()

indexer = pt.DFIndexer("./indexes/stemmer", overwrite=True)
indexer.setProperty("termpipelines", "PorterStemmer")
docnos = docs["docno"].astype(str)
index_ref = indexer.index(docs["text"], docnos)
index = pt.IndexFactory.of(index_ref)
collection_stats = index.getCollectionStatistics()
print(collection_stats.toString())

end_time = time.time()
print(end_time - start_time)

# Cell output: number of entries in the lexicon.
len(index.getLexicon())

In [None]:
# Index variant: empty term pipeline (the "termpipelines" property is set to "").
# Written to ./indexes/none; the timed region covers indexing, opening
# the index and printing its collection statistics.
start_time = time.time()

indexer = pt.DFIndexer("./indexes/none", overwrite=True)
indexer.setProperty("termpipelines", "")
docnos = docs["docno"].astype(str)
index_ref = indexer.index(docs["text"], docnos)
index = pt.IndexFactory.of(index_ref)
collection_stats = index.getCollectionStatistics()
print(collection_stats.toString())

end_time = time.time()
print(end_time - start_time)

# Cell output: number of entries in the lexicon.
len(index.getLexicon())

In [11]:
# BM25 retriever over whatever `index` currently refers to.
# NOTE(review): several cells assign `index`, so the index searched here
# depends on which of them ran last — confirm this is the intended one.
BM25 = pt.BatchRetrieve(
    index,
    wmodel="BM25",
)

In [16]:
# Star import kept deliberately: it supplies the measure objects (NDCG here),
# and other cells may rely on further names it brings in.
from pyterrier.measures import *
# NOTE(review): K is not used in this cell — presumably consumed elsewhere; verify.
K = 5

# Evaluate BM25 on the lab topics and relevance judgements.
# The ids are cast to strings on copies passed straight into the experiment,
# so this cell no longer relies on a separate cast cell having been run
# beforehand; `topics` and `qrels` themselves are left untouched.
pt.Experiment(
    retr_systems=[BM25],
    names=["BM25"],
    topics=topics.assign(qid=topics["qid"].astype(str)),
    qrels=qrels.assign(
        qid=qrels["qid"].astype(str),
        docno=qrels["docno"].astype(str),
    ),
    eval_metrics=[NDCG@10, 'map'],
)

Unnamed: 0,name,nDCG@10,map
0,BM25,0.842503,0.628454


In [None]:
# Inspect the topics table: print its (rows, columns) shape, then show five
# randomly drawn rows as the cell output.
topics_shape = topics.shape
print(topics_shape)
topics.sample(n=5)

In [None]:
# Show five randomly drawn rows of the relevance judgements.
qrels.sample(n=5)

In [None]:
# Report the dtype pandas assigned to the `label` column.
qrels.dtypes["label"]

In [14]:
# Store the query ids (in both tables) and the document ids of the
# judgements as strings.
topics["qid"] = topics["qid"].astype(str)
for id_column in ("qid", "docno"):
    qrels[id_column] = qrels[id_column].astype(str)

In [None]:
# Check for missing query ids in the judgements.
# Once the column has been cast with astype(str), a missing value is stored as
# the literal string "nan" and isna() alone would never flag it, so both forms
# are checked.
(qrels["qid"].isna() | qrels["qid"].eq("nan")).any()