In [12]:
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as gensim_downloader
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert import Searcher
import joblib
import warnings
import os
import datetime
import ir_datasets
import pyterrier as pt
from pyterrier.measures import *

# Location handed to the Terrier indexer below.
index_path = './vaswaniindex/'

# Vaswani test collection, resolved through PyTerrier's "irds:" dataset prefix.
dataset =  pt.get_dataset("irds:vaswani")
# Index every document yielded by the corpus iterator. `index` is what the
# retrievers in the later cells are constructed from.
# NOTE(review): IndexingType.MEMORY presumably keeps the index in memory instead
# of persisting it under index_path -- confirm before relying on files on disk.
index = pt.index.IterDictIndexer(
    index_path,
    type=pt.index.IndexingType.MEMORY,
).index(dataset.get_corpus_iter())


vaswani documents:   0%|                                                                                                                      | 0/11429 [00:00<?, ?it/s][A
vaswani documents:   0%|                                                                                                              | 1/11429 [00:00<36:02,  5.28it/s][A
vaswani documents:   3%|███▏                                                                                                      | 345/11429 [00:00<00:07, 1418.80it/s][A
vaswani documents:   5%|████▉                                                                                                     | 529/11429 [00:00<00:08, 1323.48it/s][A
vaswani documents:   6%|██████▎                                                                                                   | 685/11429 [00:00<00:07, 1388.42it/s][A
vaswani documents:   7%|███████▊                                                                                                  | 840/114

In [13]:
# Shared NLP resources used by the query-rewriting helpers defined further down.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
# NOTE: despite the variable name, the vectors loaded here are the
# "glove-wiki-gigaword-100" embeddings from the gensim downloader.
word2vec_model = gensim_downloader.load("glove-wiki-gigaword-100")

# Pre-fitted vectorizer and pre-computed document vectors.
# SECURITY: joblib.load / pickle.load can execute arbitrary code while
# deserialising -- only load artefacts that were produced by a trusted run.
vectorizer = joblib.load('trained/vectorizervaswani.pkl')
# Context manager guarantees the handle is closed even if unpickling fails
# (the handle used to stay open for the lifetime of the kernel).
with open("trained/doc_vecsvaswani.pickle", 'rb') as file:
    doc_vecs = pickle.load(file)

# Raw collection as a two-column TSV without a header row.
data_dir = './project-root/vaswani/raw/'
collection = pd.read_csv(data_dir + "collection.tsv", sep='\t', 
                                names=['doc_id', 'text'])

In [14]:
# Start PyTerrier's Java backend only when it is not already running.
if not pt.java.started():
    pt.init()

# Terrier tokeniser instance; strip_markup() uses it to re-tokenise query text.
tokenizer = pt.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()
def strip_markup(text):
    """Re-tokenise ``text`` with the Terrier tokeniser and join the tokens with single spaces."""
    terrier_tokens = tokenizer.getTokens(text)
    return " ".join(terrier_tokens)

def _preprocess_text(text: str) -> list:
    """Lower-case and tokenise ``text``, drop stop words, and lemmatise what remains.

    Returns the lemmas as a list, in their original order.
    """
    lemmas = []
    for tok in word_tokenize(text.lower()):
        if tok in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(tok))
    return lemmas

def stop_lemma(text: str) -> str:
    """Remove stop words and lemmatise ``text``; return the result as one string.

    The token pipeline is exactly ``_preprocess_text``; this wrapper only joins
    the lemmas with single spaces. (The return annotation used to say ``list``
    although a string has always been returned.)
    """
    return ' '.join(_preprocess_text(text))

def stop_porter(text: str) -> str:
    """Remove stop words from ``text`` and Porter-stem the remaining tokens.

    Returns the stems joined by single spaces. (The return annotation used to
    say ``list`` although a string has always been returned.)
    """
    tokens = word_tokenize(text.lower())
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)
    
def stop_word(text: str) -> str:
    """Lower-case and tokenise ``text`` and drop every token found in ``stop_words``.

    Returns the surviving tokens joined by single spaces. (The return
    annotation used to say ``list`` although a string has always been returned.)
    """
    tokens = word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token not in stop_words)

def expand_query_wordnet(query: str, num_expansions: int = 2) -> str:
    """Expand ``query`` with WordNet synonyms of its content words.

    The query is first reduced to lemmas via ``_preprocess_text``. For each
    lemma, the first two synsets are inspected and up to ``num_expansions``
    lemma names are taken from each; scanning of further synsets stops once at
    least ``num_expansions`` synonyms have been collected for that token.

    The returned string lists the original lemmas first (in query order)
    followed by the synonyms in discovery order. Previously the terms were
    joined from a ``set``, whose iteration order for strings changes between
    interpreter runs, so order-sensitive consumers received a different query
    on every kernel restart. The *set* of terms is unchanged.

    Args:
        query: Raw query text.
        num_expansions: Number of lemma names read per synset, and the
            per-token synonym count at which synset scanning stops.

    Returns:
        Space-joined expanded query.
    """
    tokens = _preprocess_text(query)
    # dict keys keep insertion order while still de-duplicating terms.
    expanded_terms = dict.fromkeys(tokens)
    for token in tokens:
        synonyms = {}
        for syn in wordnet.synsets(token)[:2]:  # Limit to top 2 synsets
            for lemma in syn.lemmas()[:num_expansions]:
                synonym = lemma.name().lower()
                if synonym != token:
                    synonyms.setdefault(synonym, None)
            if len(synonyms) >= num_expansions:
                break
        expanded_terms.update(synonyms)
    return ' '.join(expanded_terms)

def expand_query_word2vec(query: str, num_expansions: int = 2, threshold: float = 0.7,
                          topn: int = 3) -> str:
    """Expand ``query`` with the nearest embedding neighbours of each word.

    The query is split on whitespace; for every word, the ``topn`` most similar
    entries of ``word2vec_model`` are appended after the original words
    (neighbours equal to the word itself, ignoring case, are skipped). Words
    missing from the model's vocabulary are left unexpanded.

    Args:
        query: Query text; split on whitespace without further normalisation.
        num_expansions: Accepted for interface compatibility.
            NOTE(review): currently unused -- the neighbour count is ``topn``.
        threshold: Accepted for interface compatibility.
            NOTE(review): currently unused -- similarity scores are not
            filtered. Honouring it would change the reported results.
        topn: Number of neighbours requested per query word (was a hard-coded
            local constant of 3; the default keeps that behaviour).

    Returns:
        The original words followed by all collected neighbours, space-joined.
    """
    words = query.split()
    expanded_words = words.copy()

    for word in words:
        try:
            neighbours = word2vec_model.most_similar(word, topn=topn)
        except KeyError:
            # Out-of-vocabulary word: keep it, but there is nothing to add.
            continue
        expanded_words.extend(w for w, _ in neighbours if w.lower() != word.lower())

    return ' '.join(expanded_words)

def expand_query_pseudo_relevance(doc_vecs, query: str, collection: pd.DataFrame, 
                                 vectorizer: TfidfVectorizer, top_k: int = 3, 
                                 num_expansions: int = 2) -> str:
    """Expand ``query`` with terms taken from its top-ranked documents.

    Steps:
      1. Vectorise the query and rank the rows of ``doc_vecs`` by cosine similarity.
      2. Take the ``top_k`` best rows, look them up positionally in
         ``collection`` and re-vectorise their ``'text'`` column.
      3. Combine the mean document vector and the query vector with weights
         0.7 / 0.3 and pick the ``num_expansions`` highest-weighted features.
      4. Append those features to the query, skipping any that already occur
         among the lower-cased, whitespace-split query terms.

    Args:
        doc_vecs: Document vectors, comparable with ``vectorizer.transform``
            output. Assumed to be row-aligned with ``collection`` -- TODO confirm.
        query: Query text.
        collection: Frame with a ``'text'`` column, indexed positionally.
        vectorizer: Fitted vectorizer exposing ``transform`` and
            ``get_feature_names_out``.
        top_k: Number of top-ranked documents used as feedback.
        num_expansions: Maximum number of terms appended.

    Returns:
        ``query`` followed by a space and the space-joined expansion terms.
        If an ``IndexError`` occurs while selecting feedback documents or
        terms, a ``RuntimeWarning`` is emitted and ``query`` is returned
        unchanged. Previously that path printed the whole collection and then
        crashed with ``UnboundLocalError`` on the final ``return``.
    """
    try:
        query_vec = vectorizer.transform([query])
        similarities = cosine_similarity(query_vec, doc_vecs)[0]
        top_k_indices = np.argsort(similarities)[-top_k:]
        top_k_docs = collection.iloc[top_k_indices]
        top_k_vecs = vectorizer.transform(top_k_docs['text']).toarray()
        mean_top_k = np.mean(top_k_vecs, axis=0)
        original_vec = query_vec.toarray()[0]
        combined_vec = 0.7 * mean_top_k + 0.3 * original_vec  # Rocchio-like weighting
        feature_names = vectorizer.get_feature_names_out()
        top_indices = np.argsort(combined_vec)[-num_expansions:]
        # Built once instead of re-splitting the query for every candidate term.
        query_terms = set(query.lower().split())
        expansion_terms = [feature_names[idx] for idx in top_indices 
                          if feature_names[idx] not in query_terms]
    except IndexError as exc:
        # Best-effort fallback: keep the experiment running with the original
        # query, but make the failure visible without dumping the collection.
        warnings.warn(
            f"pseudo-relevance expansion failed for query {query!r} "
            f"(collection rows: {len(collection)}): {exc}; returning the query unchanged",
            RuntimeWarning,
        )
        return query
    return query + ' ' + ' '.join(expansion_terms)

def expand_porter_stemmer(text: str) -> str:
    """Porter-stem every token of the lower-cased ``text`` (stop words are kept)."""
    stems = map(stemmer.stem, word_tokenize(text.lower()))
    return ' '.join(stems)
    
def comb(text: str, doc_vecs, collection, vectorizer) -> str:
    """Chain all rewriting steps: WordNet, then embeddings, then pseudo-relevance, then stemming.

    Each stage receives the output of the previous one; the Porter-stemmed
    result of the last expansion is returned.
    """
    with_synonyms = expand_query_wordnet(text)
    with_neighbours = expand_query_word2vec(with_synonyms)
    with_feedback = expand_query_pseudo_relevance(
        doc_vecs, with_neighbours, collection, vectorizer
    )
    return expand_porter_stemmer(with_feedback)

  tokenizer = pt.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()


In [6]:
  
# Baseline: BM25 over `index`, with the "termpipelines" property set to an empty
# string and the "qe" control switched off.
# NOTE(review): the empty term pipeline presumably disables Terrier's own
# stop-word removal/stemming so that only the Python rewrites below act on the
# query -- confirm against the Terrier configuration docs.
no_qe = pt.terrier.Retriever(index, wmodel="BM25", metadata=["docno", "text"], properties={"termpipelines": ""}, controls={"qe": "off"})
# One pipeline per rewriting strategy: rewrite the "query" field, re-tokenise it
# with strip_markup(), then hand it to the baseline retriever.
qe_stop = pt.apply.query(lambda q: strip_markup(stop_word(q["query"]))) >> no_qe
qe_sp = pt.apply.query(lambda q: strip_markup(stop_porter(q["query"]))) >> no_qe
qe_sl = pt.apply.query(lambda q: strip_markup(stop_lemma(q["query"]))) >> no_qe
qe_wordnet = pt.apply.query(lambda q: strip_markup(expand_query_wordnet(q["query"]))) >> no_qe
qe_word2vec = pt.apply.query(lambda q: strip_markup(expand_query_word2vec(q["query"]))) >> no_qe
qe_pseudo = pt.apply.query(lambda q: strip_markup(expand_query_pseudo_relevance(doc_vecs, q["query"], collection, vectorizer))) >> no_qe
qe_stem = pt.apply.query(lambda q: strip_markup(expand_porter_stemmer(q["query"]))) >> no_qe
qe_comb = pt.apply.query(lambda q: strip_markup(comb(q["query"], doc_vecs, collection, vectorizer))) >> no_qe
# Evaluate all systems on the dataset's topics and qrels; `baseline = 0` makes
# the first system (no rewriting) the reference for the +/-/p-value columns.
# The order of `names` must match the order of the system list.
pt.Experiment(
    [no_qe, qe_stop, qe_sp, qe_sl, qe_stem, qe_wordnet, qe_word2vec, qe_pseudo, qe_comb],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=[pt.measures.MAP(rel=1), RR(rel=1), nDCG@10],
    baseline = 0,
    names = ['None', 'Stop', 'Stop-Porter', 'Stop-Lemma', 'Stemming', 'Wordnet', 'Word2Vec', 'Pseudo-relevance', 'Combined']
)

Unnamed: 0,name,AP,RR,nDCG@10,AP +,AP -,AP p-value,RR +,RR -,RR p-value,nDCG@10 +,nDCG@10 -,nDCG@10 p-value
0,,0.296517,0.725665,0.446609,,,,,,,,,
1,Stop,0.296986,0.727457,0.447187,1.0,0.0,0.3199335,1.0,0.0,0.3199335,1.0,0.0,0.3199335
2,Stop-Porter,0.27703,0.686217,0.423711,6.0,28.0,0.01626489,3.0,9.0,0.02370605,5.0,17.0,0.03863622
3,Stop-Lemma,0.294246,0.721999,0.442597,3.0,4.0,0.4183245,2.0,2.0,0.5562424,3.0,4.0,0.4046201
4,Stemming,0.276865,0.685659,0.422176,6.0,29.0,0.01567204,3.0,9.0,0.02231831,5.0,17.0,0.03180577
5,Wordnet,0.203801,0.543942,0.32179,18.0,73.0,4.070873e-10,12.0,41.0,4.91318e-06,14.0,64.0,5.001826e-08
6,Word2Vec,0.186555,0.553907,0.305802,8.0,84.0,6.39105e-13,10.0,42.0,1.081216e-05,17.0,66.0,2.818189e-09
7,Pseudo-relevance,0.285681,0.707794,0.437438,7.0,82.0,0.000620686,1.0,13.0,0.05836703,5.0,20.0,0.07538543
8,Combined,0.162655,0.479302,0.266303,12.0,80.0,1.597736e-11,9.0,48.0,2.775704e-08,15.0,69.0,2.52355e-09


In [8]:
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert import Searcher
from colbert.data import Collection
import pyterrier as pt
import pandas as pd
import joblib
import pickle
from pyterrier.measures import *
# Start PyTerrier's Java backend only when it is not already running.
if not pt.java.started():
  pt.init()
# NOTE: here `data_dir` is the path of the collection *file*; this rebinds the
# global of the same name that an earlier cell set to the containing directory.
data_dir = './project-root/vaswani/raw/collection.tsv'
collection_text = pd.read_csv(data_dir, sep="\t", names=['id', 'text'])
# (disabled) loading of qrels from a local TSV file:
# qrelstsv = pd.read_csv('./project-root/vaswani/raw/qrels.dev.tsv', sep="\t", names=['qid','iteration','docno','label'])
# qrelstsv['qid'] = qrelstsv['qid'].astype(str)
# qrelstsv['iteration'] = qrelstsv['iteration'].astype(str)
# qrelstsv['docno'] = qrelstsv['docno'].astype(str)
# qrelstsv['label'] = qrelstsv['label'].astype(int)
# ColBERT collection built from the document texts, in file order.
collection2 = Collection(data=collection_text['text'].tolist())
import warnings
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)
# `searcher` is only created when this code runs as the main module.
if __name__=='__main__':
    with Run().context(RunConfig(nranks=1, experiment="vaswaniindex")):
        config = ColBERTConfig(
            root="experiments",
            collection=collection2
        )
        # Searcher over the ColBERT index named "vaswaniindex".
        searcher = Searcher(index="vaswaniindex", config=config)

class MyColbert:
    """Minimal retrieval wrapper around a ColBERT ``Searcher``.

    Exposes a ``transform(topics)`` method that returns one row per retrieved
    document with the columns ``docno``, ``rank``, ``score`` and ``qid``.

    Args:
        searcher: Object with ``search(query, k=...)`` returning three parallel
            sequences (document ids, ranks, scores).
        method: Optional ``str -> str`` query rewriter applied before
            searching; ``None`` leaves the query untouched.
        k: Number of results requested per query (default 100, the value that
            used to be hard-coded).
    """

    RESULT_COLUMNS = ['docno', 'rank', 'score', 'qid']

    def __init__(self, searcher, method, k=100):
        self.searcher = searcher 
        self.method = method
        self.k = k
    
    def transform(self, df):
        """Run every query in ``df`` and return the stacked result frame.

        Args:
            df: Topics frame with ``qid`` and ``query`` columns.

        Returns:
            A DataFrame with columns ``docno``, ``rank``, ``score``, ``qid``.
            Each query's rows keep their own 0-based index. For an empty
            ``df`` an empty frame with those columns is returned (previously
            ``None``).
        """
        per_query = []
        for _, row in df.iterrows():
            query = row['query']
            if self.method is not None:
                query = self.method(query)
            hits = self.searcher.search(query, k=self.k)
            # The three parallel sequences become the three columns.
            result = pd.DataFrame(hits).transpose()
            result.columns = ['docno', 'rank', 'score']
            # Shift ids by one. NOTE(review): presumably ColBERT ids are 0-based
            # positions in the collection list while Vaswani docnos start at 1
            # -- confirm.
            result['docno'] = (result['docno'] + 1).astype(int)
            result['qid'] = row['qid']
            per_query.append(result)

        if not per_query:
            return pd.DataFrame(columns=self.RESULT_COLUMNS)
        # One concat at the end instead of re-copying the accumulated frame
        # after every query (which was quadratic in the number of queries).
        return pd.concat(per_query)
        
        
# ColBERT variants: same rewriting strategies as the BM25 cell, but each rewrite
# is passed to MyColbert as its `method` (no strip_markup() step here).
# These assignments rebind the no_qe / qe_* globals used by the BM25 cell.
no_qe = MyColbert(searcher, None)
qe_stop = MyColbert(searcher, lambda x: stop_word(x))
qe_sp = MyColbert(searcher, lambda x: stop_porter(x))
qe_sl = MyColbert(searcher, lambda x: stop_lemma(x))
qe_wordnet = MyColbert(searcher, lambda x: expand_query_wordnet(x))
qe_word2vec = MyColbert(searcher, lambda x: expand_query_word2vec(x))
# The pseudo-relevance and combined variants read `collection_text` (columns
# 'id', 'text'), not the `collection` frame used in the BM25 cell.
qe_pseudo = MyColbert(searcher, lambda x: expand_query_pseudo_relevance(doc_vecs, x, collection_text, vectorizer))
qe_stem = MyColbert(searcher, lambda x: expand_porter_stemmer(x))
qe_comb = MyColbert(searcher, lambda x: comb(x, doc_vecs, collection_text, vectorizer))

# Same evaluation protocol as the BM25 cell; system 0 (no rewriting) is the
# baseline, and `names` must stay aligned with the system list.
pt.Experiment(
    [no_qe, qe_stop, qe_sp, qe_sl, qe_stem, qe_wordnet, qe_word2vec, qe_pseudo, qe_comb],
    # [no_qe],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=[pt.measures.MAP(rel=1), RR(rel=1), nDCG@10],
    baseline = 0,
    names = ['None', 'Stop', 'Stop-Porter', 'Stop-Lemma', 'Stemming', 'Wordnet', 'Word2Vec', 'Pseudo-relevance', 'Combined']
)


[Apr 08, 15:05:33] #> Loading codec...
[Apr 08, 15:05:33] #> Loading IVF...
[Apr 08, 15:05:33] #> Loading doclens...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1508.74it/s]

[Apr 08, 15:05:33] #> Loading codes and residuals...



100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 48.03it/s]


#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . measurement of dielectric constant of liquids by the use of microwave techniques, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1, 10903,  1997,  3280,  2571, 22601,  5377,  1997, 26820,
         2011,  1996,  2224,  1997, 18302,  5461,   102,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103], device='cuda:0')
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')






Unnamed: 0,name,AP,RR,nDCG@10,AP +,AP -,AP p-value,RR +,RR -,RR p-value,nDCG@10 +,nDCG@10 -,nDCG@10 p-value
0,,0.287854,0.747713,0.481308,,,,,,,,,
1,Stop,0.28328,0.747092,0.465798,44.0,38.0,0.4426888,11.0,12.0,0.9693507,32.0,41.0,0.07252273
2,Stop-Porter,0.152022,0.490303,0.280053,16.0,76.0,2.270297e-13,12.0,46.0,1.106572e-08,18.0,69.0,9.466255e-13
3,Stop-Lemma,0.269861,0.751633,0.44331,38.0,52.0,0.01784855,12.0,12.0,0.8553236,27.0,55.0,0.0004901051
4,Stemming,0.162226,0.529872,0.29973,14.0,78.0,1.280454e-13,8.0,41.0,1.924347e-07,16.0,70.0,6.124664e-12
5,Wordnet,0.164237,0.541097,0.289337,9.0,83.0,1.839297e-13,8.0,46.0,9.570426e-07,13.0,72.0,1.745998e-14
6,Word2Vec,0.18307,0.561631,0.334193,13.0,79.0,6.131425e-11,7.0,49.0,3.516649e-06,16.0,67.0,1.626242e-09
7,Pseudo-relevance,0.274182,0.717897,0.455576,35.0,57.0,0.01624291,12.0,18.0,0.129094,28.0,52.0,0.0082174
8,Combined,0.079989,0.362666,0.176048,3.0,89.0,7.883576e-22,5.0,66.0,1.56226e-13,7.0,80.0,5.439623e-20


In [None]:
import pyterrier as pt
from pyterrier.measures import *
from pyterrier_t5 import MonoT5ReRanker, DuoT5ReRanker
# Neural re-rankers, constructed with their default settings.
monoT5 = MonoT5ReRanker()
duoT5 = DuoT5ReRanker()

# First-stage BM25 retriever. Uses pt.terrier.Retriever -- the same constructor
# as the other cells of this notebook -- instead of the legacy pt.BatchRetrieve
# name. Same Terrier settings as the BM25 baseline: empty term pipeline, QE off.
bm25 = pt.terrier.Retriever(index, wmodel="BM25", properties={"termpipelines": ""}, controls={"qe": "off"})
# BM25 top-50 -> attach document text from the dataset -> monoT5 re-ranking.
mono_pipeline = (bm25 % 50) >> pt.text.get_text(dataset, "text") >> monoT5
# monoT5 top-5 -> duoT5 re-ranking. `%` binds tighter than `>>`; the
# parentheses only make the existing grouping explicit.
duo_pipeline = (mono_pipeline % 5) >> duoT5

# Every variant rewrites the query first and then runs the full
# BM25 -> monoT5 -> duoT5 pipeline, so the re-rankers also see the rewritten query.
no_qe = duo_pipeline
qe_stop = pt.apply.query(lambda q: strip_markup(stop_word(q["query"]))) >> no_qe
qe_sp = pt.apply.query(lambda q: strip_markup(stop_porter(q["query"]))) >> no_qe
qe_sl = pt.apply.query(lambda q: strip_markup(stop_lemma(q["query"]))) >> no_qe
qe_stem = pt.apply.query(lambda q: strip_markup(expand_porter_stemmer(q["query"]))) >> no_qe
qe_wordnet = pt.apply.query(lambda q: strip_markup(expand_query_wordnet(q["query"]))) >> no_qe
qe_word2vec = pt.apply.query(lambda q: strip_markup(expand_query_word2vec(q["query"]))) >> no_qe
qe_pseudo = pt.apply.query(lambda q: strip_markup(expand_query_pseudo_relevance(doc_vecs, q["query"], collection, vectorizer))) >> no_qe
qe_comb = pt.apply.query(lambda q: strip_markup(comb(q["query"], doc_vecs, collection, vectorizer))) >> no_qe


# Only the first 50 topics are evaluated in this cell; system 0 (no rewriting)
# is the baseline, and `names` must stay aligned with the system list.
pt.Experiment(
    [no_qe, qe_stop, qe_sp, qe_sl, qe_stem, qe_wordnet, qe_word2vec, qe_pseudo, qe_comb],
    dataset.get_topics()[:50],
    dataset.get_qrels(),
    eval_metrics=[pt.measures.MAP(rel=1), RR(rel=1), nDCG@10],
    baseline = 0,
    names = ['None', 'Stop', 'Stop-Porter', 'Stop-Lemma', 'Stemming', 'Wordnet', 'Word2Vec', 'Pseudo-relevance', 'Combined']
)

In [15]:
import xgboost as xgb

from pyterrier.measures import *


# XGBoost ranker hyper-parameters (nDCG ranking objective, fixed random seed).
params = {'objective': 'rank:ndcg',
          'learning_rate': 0.1,
          'gamma': 1.0, 
          'min_child_weight': 0.1,
          'max_depth': 6,
          'random_state': 42
         }
topics = dataset.get_topics()
qrels = dataset.get_qrels()
# 60 % / 20 % / 20 % train / validation / test split, in topic order (no shuffling).
# Positional slicing yields the same three frames that np.split(topics, [...])
# produced, without relying on the deprecated DataFrame.swapaxes that np.split
# goes through for DataFrames.
train_end = int(.6*len(topics))
valid_end = int(.8*len(topics))
train_topics = topics.iloc[:train_end]
valid_topics = topics.iloc[train_end:valid_end]
test_topics = topics.iloc[valid_end:]

# BM25 candidate retrieval with eleven weighting-model scores attached as
# features. Same Terrier settings as the BM25 baseline: empty term pipeline,
# QE off.
fbr3f = pt.terrier.FeaturesRetriever(index, wmodel="BM25", features=  ['WMODEL:TF_IDF', 'WMODEL:PL2', 'WMODEL:BM25', 
                                                 'WMODEL:DirichletLM', 'WMODEL:Hiemstra_LM', 
                                                 'WMODEL:DFR_BM25', 'WMODEL:InL2', 'WMODEL:LGD', 
                                                 'WMODEL:DLH', 'WMODEL:DPH', 'WMODEL:LemurTF_IDF'], properties={"termpipelines": ""}, controls={"qe": "off"})
BaseLTR_LM = fbr3f >> pt.ltr.apply_learned_model(xgb.sklearn.XGBRanker(**params), form='ltr')
# Train on the training topics, using the validation topics as the held-out set.
BaseLTR_LM.fit(train_topics, qrels, valid_topics, qrels)

# Query rewriting is applied only at evaluation time; the ranker itself was
# fitted on the unmodified queries above.
no_qe = BaseLTR_LM
qe_stop = pt.apply.query(lambda q: strip_markup(stop_word(q["query"]))) >> no_qe
qe_sp = pt.apply.query(lambda q: strip_markup(stop_porter(q["query"]))) >> no_qe
qe_sl = pt.apply.query(lambda q: strip_markup(stop_lemma(q["query"]))) >> no_qe
qe_stem = pt.apply.query(lambda q: strip_markup(expand_porter_stemmer(q["query"]))) >> no_qe
qe_wordnet = pt.apply.query(lambda q: strip_markup(expand_query_wordnet(q["query"]))) >> no_qe
qe_word2vec = pt.apply.query(lambda q: strip_markup(expand_query_word2vec(q["query"]))) >> no_qe
qe_pseudo = pt.apply.query(lambda q: strip_markup(expand_query_pseudo_relevance(doc_vecs, q["query"], collection, vectorizer))) >> no_qe
qe_comb = pt.apply.query(lambda q: strip_markup(comb(q["query"], doc_vecs, collection, vectorizer))) >> no_qe


# Evaluate on the held-out test topics only; system 0 (no rewriting) is the
# baseline, and `names` must stay aligned with the system list.
pt.Experiment(
    [no_qe, qe_stop, qe_sp, qe_sl, qe_stem, qe_wordnet, qe_word2vec, qe_pseudo, qe_comb],
    test_topics,
    qrels,
    eval_metrics=[pt.measures.MAP(rel=1), RR(rel=1), nDCG@10],
    baseline = 0,
    names = ['None', 'Stop', 'Stop-Porter', 'Stop-Lemma', 'Stemming', 'Wordnet', 'Word2Vec', 'Pseudo-relevance', 'Combined']
)



Unnamed: 0,name,AP,RR,nDCG@10,AP +,AP -,AP p-value,RR +,RR -,RR p-value,nDCG@10 +,nDCG@10 -,nDCG@10 p-value
0,,0.063776,0.337277,0.139544,,,,,,,,,
1,Stop,0.064467,0.363313,0.144371,5.0,2.0,0.570021,4.0,2.0,0.238625,2.0,1.0,0.248295
2,Stop-Porter,0.139352,0.253369,0.141009,15.0,4.0,0.008896,8.0,8.0,0.337981,6.0,7.0,0.974255
3,Stop-Lemma,0.10092,0.350505,0.171458,13.0,2.0,0.114308,8.0,4.0,0.714737,5.0,5.0,0.309293
4,Stemming,0.149641,0.335027,0.174221,16.0,3.0,0.005234,11.0,6.0,0.982675,10.0,6.0,0.479805
5,Wordnet,0.092496,0.359553,0.130274,9.0,10.0,0.312095,6.0,10.0,0.835844,4.0,8.0,0.860879
6,Word2Vec,0.048062,0.158027,0.054901,7.0,12.0,0.162927,5.0,12.0,0.022776,0.0,12.0,0.010706
7,Pseudo-relevance,0.049534,0.233699,0.096809,1.0,16.0,0.005344,0.0,9.0,0.094776,0.0,6.0,0.027252
8,Combined,0.059476,0.217967,0.088333,9.0,10.0,0.831395,7.0,12.0,0.34263,5.0,9.0,0.290636
