In [4]:
#!pip install nltk
#!pip install pytorch_lightning
#!pip install gensim
#!pip install pyserini==0.12.0
#!pip install python-terrier
#!pip install ipywidgets
#!pip install --upgrade notebook jupyter
#!jupyter nbextension enable --py widgetsnbextension

usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: console dejavu events execute kernel kernelspec lab
labextension labhub migrate nbconvert notebook qtconsole run server
troubleshoot trust

Jupyter command `jupyter-nbextension` not found.


In [17]:
# Import libraries
import os
import json
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader, random_split

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger

import nltk
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import TfidfVectorizer

#from src.dataset import mydataset
#from src.utils import utils, dataset_utils, eval_utils, trie

from sklearn.model_selection import train_test_split

In [3]:
# Download the NLTK data packages 'punkt', 'stopwords' and 'wordnet'.
# NOTE(review): presumably required by the text-preprocessing helpers used to
# build the corpus — confirm against the project utilities.
nltk.download('punkt')
nltk.download('stopwords')
# Last expression of the cell: its return value is what the notebook displays.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alirezarafiei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alirezarafiei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alirezarafiei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Dimensionality of the word2vec embeddings (passed as `vector_size`).
EMBEDDING_SIZE = 120
# Used as the word2vec context `window`; a falsy value falls back to 5.
MAX_TOKENS = 7
# Cutoff K for the evaluation metrics.
# NOTE(review): not referenced by any cell visible in this section.
K = 10      # K param metrics

# On-disk locations of the cached artifacts (relative to the notebook's cwd).
QUERIES_PATH = './materials/queries.json'
DOCS_PATH = './materials/documents.json'
W2V_PATH = './materials/word2vec_model.bin'
# NOTE(review): not referenced by any cell visible in this section.
BT_MOD_PATH = './materials/bt_model.pth'

In [4]:
# Rebuild the cached artifacts (queries JSON, documents JSON, word2vec model)
# when any of them is missing; afterwards always load them back from disk so
# both code paths hand the same objects to the rest of the notebook.
artifact_paths = (QUERIES_PATH, DOCS_PATH, W2V_PATH)
if not all(os.path.exists(path) for path in artifact_paths):
    # NOTE(review): `dataset_utils` is only imported in a commented-out line of
    # the imports cell, so this branch raises NameError on a fresh kernel
    # unless that import is restored.
    # Build dictionaries and corpus
    queries, documents, corpus = dataset_utils.build_dicts(
        max_topics=None, max_docs=10)

    # Create the target directories first: open(..., 'w') and model.save()
    # fail with FileNotFoundError when the parent directory does not exist.
    for path in artifact_paths:
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)

    # Write the dictionaries to file
    with open(QUERIES_PATH, 'w') as json_file:
        json.dump(queries, json_file)
    with open(DOCS_PATH, 'w') as json_file:
        json.dump(documents, json_file)

    # Train the word2vec model (CBOW, since sg=0)
    w2v_model = Word2Vec(
        sentences=corpus, vector_size=EMBEDDING_SIZE,
        window=MAX_TOKENS if MAX_TOKENS else 5,
        min_count=1, sg=0, epochs=10)

    # Save w2v model
    w2v_model.save(W2V_PATH)

# Read from json
with open(QUERIES_PATH, 'r') as json_file:
    queries = json.load(json_file)
with open(DOCS_PATH, 'r') as json_file:
    documents = json.load(json_file)

# Load model
w2v_model = Word2Vec.load(W2V_PATH)

### PyTerrier-based approaches

In [5]:
import string

import pyterrier as pt

# Initialise PyTerrier only if it has not already been started in this kernel.
already_started = pt.started()
if not already_started:
    pt.init()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [15]:
# One row per document: its id and raw text.
doc_records = [
    {"docno": doc_id, "text": details['raw']}
    for doc_id, details in documents.items()
]
docs = pd.DataFrame(doc_records)

# Index the raw document text on disk, replacing any index already there.
indexer = pt.DFIndexer("./materials/index_pt/index", overwrite=True)
index_ref = indexer.index(docs["text"], docs["docno"])

# One row per query: its id (as a string) and raw text.
query_records = [
    {"qid": str(qid), "query": details['raw']}
    for qid, details in queries.items()
]
quer = pd.DataFrame(query_records)

# One row per (query, listed document) pair, each labelled 1.
qrel_records = [
    {"qid": str(qid), "docno": docno, "label": 1}
    for qid, details in queries.items()
    for docno in details['docids_list']
]
qrels = pd.DataFrame(qrel_records)

# Translation table mapping every character of string.punctuation to None (i.e. delete it).
translator = str.maketrans(dict.fromkeys(string.punctuation))


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Characters are deleted, not replaced by spaces, so words joined by
    punctuation are merged (e.g. ``"a-b"`` becomes ``"ab"``).
    """
    return text.translate(translator)

# Strip punctuation from every query string before retrieval.
quer['query'] = quer['query'].map(remove_punctuation)


In [16]:
# Load the index
index = pt.IndexFactory.of(index_ref)

# One retriever per Terrier weighting model to compare.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
tf_idf = pt.BatchRetrieve(index, wmodel="TF_IDF")
pl2 = pt.BatchRetrieve(index, wmodel="PL2")

# Evaluate all three retrievers on the same queries and relevance judgements.
evaluation = pt.Experiment(
    [bm25, tf_idf, pl2],
    quer,
    qrels,
    eval_metrics=['map', 'ndcg', 'P', 'recall'],
    round=4,
    verbose=True,  # a real boolean; the string "true" only worked by being truthy
)

# to_string() prints every metric column; plain print(evaluation) elides the
# middle columns with "..." because the frame is wider than pandas' default.
print(evaluation.to_string())

pt.Experiment:   0%|          | 0/3 [00:00<?, ?system/s]

pt.Experiment: 100%|██████████| 3/3 [03:05<00:00, 61.91s/system]

         name     map    ndcg     P@5    P@10    P@15    P@20    P@30  P@100  \
0    BR(BM25)  0.9456  0.9751  0.9614  0.9167  0.6399  0.4853  0.3264  0.099   
1  BR(TF_IDF)  0.9458  0.9752  0.9617  0.9171  0.6400  0.4854  0.3264  0.099   
2     BR(PL2)  0.9378  0.9720  0.9552  0.9043  0.6368  0.4843  0.3261  0.099   

    P@200  ...  P@1000     R@5    R@10    R@15    R@20    R@30   R@100  \
0  0.0496  ...    0.01  0.4807  0.9167  0.9599  0.9706  0.9791  0.9896   
1  0.0496  ...    0.01  0.4808  0.9171  0.9601  0.9708  0.9792  0.9896   
2  0.0496  ...    0.01  0.4776  0.9043  0.9553  0.9686  0.9784  0.9896   

    R@200   R@500  R@1000  
0  0.9923  0.9944  0.9952  
1  0.9922  0.9944  0.9952  
2  0.9924  0.9944  0.9952  

[3 rows x 21 columns]



