In [10]:
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as gensim_downloader
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert import Searcher
import joblib
import warnings
import os
import datetime
import ir_datasets
import pyterrier as pt
from pyterrier.measures import *

# Directory under which the Terrier index for the ANTIQUE corpus is built.
index_path = './antiqueindex/index'

# ANTIQUE "test/non-offensive" split; also the source of topics and qrels
# for every experiment further down.
dataset = pt.get_dataset("irds:antique/test/non-offensive")

# Stream the corpus iterator into an IterDictIndexer configured with the
# MEMORY indexing type; `index` is what the Terrier retrievers are built on.
index = pt.index.IterDictIndexer(index_path, type=pt.index.IndexingType.MEMORY).index(
    dataset.get_corpus_iter()
)


antique/test/non-offensive documents:   0%|                                                                                                  | 0/403666 [00:00<?, ?it/s][A
antique/test/non-offensive documents:   0%|                                                                                       | 1/403666 [00:00<48:51:42,  2.29it/s][A
antique/test/non-offensive documents:   0%|                                                                                       | 261/403666 [00:00<11:01, 609.72it/s][A
antique/test/non-offensive documents:   0%|                                                                                       | 386/403666 [00:00<09:41, 693.16it/s][A
antique/test/non-offensive documents:   0%|                                                                                       | 494/403666 [00:00<09:00, 745.47it/s][A
antique/test/non-offensive documents:   0%|▏                                                                                      | 594/403

In [11]:
# Shared NLP resources used by the query-rewriting helpers.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Pre-trained "glove-wiki-gigaword-100" vectors fetched through gensim's downloader.
word2vec_model = gensim_downloader.load("glove-wiki-gigaword-100")

# Artefacts saved by an earlier training step (presumably the fitted TF-IDF
# vectorizer and the matching document matrix -- confirm against the code
# that produced them).
# NOTE(review): joblib.load / pickle.load execute arbitrary code embedded in
# the file; only load artefacts that were produced by this project.
vectorizer = joblib.load('trained/vectorizerantique.pkl')
# Context manager so the file handle is closed once the object is loaded
# (the handle was previously left open for the lifetime of the kernel).
with open("trained/doc_vecsantique.pickle", 'rb') as file:
    doc_vecs = pickle.load(file)

# Raw collection as a two-column (doc_id, text) frame read from a header-less TSV.
data_dir = './project-root/antique/raw/'
collection = pd.read_csv(data_dir + "collection.tsv", sep='\t',
                         names=['doc_id', 'text'])

In [12]:
# Start PyTerrier's Java backend only when it is not already running.
if not pt.java.started():
    pt.init()

# Tokeniser instance returned by Terrier's Tokeniser.getTokeniser(), reached
# through the Java bridge; strip_markup() uses it to normalise query text.
tokenizer = pt.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()
def strip_markup(text):
    """Pass ``text`` through the Terrier tokeniser and re-join the tokens with single spaces."""
    terms = tokenizer.getTokens(text)
    return " ".join(terms)

def _preprocess_text(text: str) -> list:
    """Lowercase and tokenize ``text``, drop English stop words, lemmatize the rest.

    Returns:
        The lemmatized tokens as a list, in their original order.
    """
    kept = []
    for tok in word_tokenize(text.lower()):
        if tok in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(tok))
    return kept

def stop_lemma(text: str) -> str:
    """Remove English stop words from ``text`` and lemmatize what remains.

    Args:
        text: Raw query text.

    Returns:
        The surviving lemmatized tokens joined by single spaces (a string,
        not a list as the previous annotation claimed).
    """
    # Same lowercase / tokenize / stop-word filter / lemmatize steps as
    # _preprocess_text, so reuse it rather than duplicating the pipeline.
    return ' '.join(_preprocess_text(text))

def stop_porter(text: str) -> str:
    """Remove English stop words from ``text`` and Porter-stem what remains.

    Args:
        text: Raw query text.

    Returns:
        The surviving stemmed tokens joined by single spaces (a string,
        not a list as the previous annotation claimed).
    """
    tokens = word_tokenize(text.lower())
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)
    
def stop_word(text: str) -> str:
    """Lowercase and tokenize ``text`` and drop English stop words.

    Args:
        text: Raw query text.

    Returns:
        The surviving tokens joined by single spaces (a string, not a list
        as the previous annotation claimed).
    """
    tokens = word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token not in stop_words)

def expand_query_wordnet(query: str, num_expansions: int = 2) -> str:
    """Expand ``query`` with WordNet synonyms of its content words.

    The query is preprocessed with ``_preprocess_text``; for every resulting
    token, lemma names are collected from at most its first two synsets.

    Args:
        query: Raw query text.
        num_expansions: Number of lemmas inspected per synset, and the synonym
            count at which the synset loop stops for a token. Because the
            count is only checked after each synset, a token can contribute
            more than ``num_expansions`` synonyms.

    Returns:
        The preprocessed query tokens followed by the collected synonyms,
        de-duplicated and joined by single spaces. Terms appear in a
        deterministic order: query tokens first, then synonyms in discovery
        order.
    """
    tokens = _preprocess_text(query)
    # A dict keeps insertion order, so the joined query is identical on every
    # run. Joining a set of strings (as before) yields an order that changes
    # with the interpreter's hash seed, which matters for order-sensitive
    # consumers of the expanded query.
    expanded_terms = dict.fromkeys(tokens)
    for token in tokens:
        synonyms = []
        for syn in wordnet.synsets(token)[:2]:  # Limit to top 2 synsets
            for lemma in syn.lemmas()[:num_expansions]:
                synonym = lemma.name().lower()
                if synonym != token and synonym not in synonyms:
                    synonyms.append(synonym)
            if len(synonyms) >= num_expansions:
                break
        # Keys already present keep their original position.
        expanded_terms.update(dict.fromkeys(synonyms))
    return ' '.join(expanded_terms)

def expand_query_word2vec(query: str, num_expansions: int = 2, threshold: float = 0.7) -> str:
    """Append embedding-space neighbours of each query word to ``query``.

    Args:
        query: Query text; split on whitespace, no other normalisation.
        num_expansions: Accepted but currently not used by the implementation.
        threshold: Accepted but currently not used by the implementation.

    Returns:
        The original words followed by up to three neighbours per word (those
        that differ from the word when compared case-insensitively), joined
        by single spaces. Words missing from the model are left unexpanded.
    """
    neighbours_per_word = 3
    original_terms = query.split()
    additions = []

    for term in original_terms:
        try:
            neighbours = word2vec_model.most_similar(term, topn=neighbours_per_word)
        except KeyError:
            # The model has no vector for this word; nothing to add for it.
            continue
        lowered = term.lower()
        for candidate, _score in neighbours:
            if candidate.lower() != lowered:
                additions.append(candidate)

    return ' '.join(original_terms + additions)

def expand_query_pseudo_relevance(doc_vecs, query: str, collection: pd.DataFrame,
                                  vectorizer: TfidfVectorizer, top_k: int = 3,
                                  num_expansions: int = 2) -> str:
    """Expand ``query`` with terms taken from its most similar documents.

    The query is vectorized and compared (cosine similarity) against
    ``doc_vecs``; the ``top_k`` most similar rows are looked up positionally
    in ``collection``, their ``text`` is re-vectorized and averaged, and the
    mean is blended with the query vector (0.7 / 0.3). The ``num_expansions``
    highest-weighted features of the blend that are not already query words
    are appended to the query.

    Args:
        doc_vecs: Document matrix in the feature space of ``vectorizer``. Row
            ``i`` must correspond to row ``i`` of ``collection``, since the
            similarity indices are used with ``collection.iloc``.
        query: Query text.
        collection: Frame with a ``text`` column.
        vectorizer: Fitted vectorizer providing ``transform`` and
            ``get_feature_names_out``.
        top_k: Number of top-ranked documents used as feedback.
        num_expansions: Number of candidate expansion features considered.

    Returns:
        ``query`` followed by a space and the space-joined expansion terms.
        If an ``IndexError`` occurs during the computation (e.g. ``doc_vecs``
        and ``collection`` are misaligned), a ``RuntimeWarning`` is issued and
        the unmodified ``query`` is returned.
    """
    try:
        query_vec = vectorizer.transform([query])
        similarities = cosine_similarity(query_vec, doc_vecs)[0]
        top_k_indices = np.argsort(similarities)[-top_k:]
        top_k_docs = collection.iloc[top_k_indices]
        top_k_vecs = vectorizer.transform(top_k_docs['text']).toarray()
        mean_top_k = np.mean(top_k_vecs, axis=0)
        original_vec = query_vec.toarray()[0]
        combined_vec = 0.7 * mean_top_k + 0.3 * original_vec  # Rocchio-like weighting
        feature_names = vectorizer.get_feature_names_out()
        top_indices = np.argsort(combined_vec)[-num_expansions:]
        # Loop-invariant: compute the query's own words once.
        query_terms = set(query.lower().split())
        expansion_terms = [feature_names[idx] for idx in top_indices
                           if feature_names[idx] not in query_terms]
    except IndexError as exc:
        # Previously this branch only printed debug state (including the whole
        # collection) and then fell through to the return statement, which
        # raised UnboundLocalError because expansion_terms was never assigned.
        # Report the problem and fall back to the unexpanded query instead.
        warnings.warn(
            f"pseudo-relevance expansion skipped for query {query!r}: {exc} "
            f"(collection has {len(collection)} rows)",
            RuntimeWarning,
        )
        return query
    return query + ' ' + ' '.join(expansion_terms)

def expand_porter_stemmer(text: str) -> str:
    """Lowercase and tokenize ``text``, Porter-stem every token (stop words
    included), and return the stems joined by single spaces."""
    stems = []
    for tok in word_tokenize(text.lower()):
        stems.append(stemmer.stem(tok))
    return ' '.join(stems)
    
def comb(text: str, doc_vecs, collection, vectorizer) -> str:
    """Chain all expansion strategies and stem the result.

    Order: WordNet synonyms, then embedding neighbours, then pseudo-relevance
    feedback (which uses ``doc_vecs``, ``collection`` and ``vectorizer``),
    and finally Porter stemming of the whole expanded query.
    """
    with_synonyms = expand_query_wordnet(text)
    with_neighbours = expand_query_word2vec(with_synonyms)
    with_feedback = expand_query_pseudo_relevance(
        doc_vecs, with_neighbours, collection, vectorizer
    )
    return expand_porter_stemmer(with_feedback)

  tokenizer = pt.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()


In [6]:
  
# Baseline: BM25 over the Terrier index with an empty "termpipelines" property
# and Terrier's own query expansion control switched off, so query rewriting
# only happens in the Python stages below.
no_qe = pt.terrier.Retriever(
    index,
    wmodel="BM25",
    metadata=["docno", "text"],
    properties={"termpipelines": ""},
    controls={"qe": "off"},
)

# One pipeline per rewriting strategy: rewrite the query text, normalise it
# with strip_markup, then hand it to the same BM25 retriever.
qe_stop = pt.apply.query(lambda row: strip_markup(stop_word(row["query"]))) >> no_qe
qe_sp = pt.apply.query(lambda row: strip_markup(stop_porter(row["query"]))) >> no_qe
qe_sl = pt.apply.query(lambda row: strip_markup(stop_lemma(row["query"]))) >> no_qe
qe_stem = pt.apply.query(lambda row: strip_markup(expand_porter_stemmer(row["query"]))) >> no_qe
qe_wordnet = pt.apply.query(lambda row: strip_markup(expand_query_wordnet(row["query"]))) >> no_qe
qe_word2vec = pt.apply.query(lambda row: strip_markup(expand_query_word2vec(row["query"]))) >> no_qe
qe_pseudo = pt.apply.query(
    lambda row: strip_markup(
        expand_query_pseudo_relevance(doc_vecs, row["query"], collection, vectorizer)
    )
) >> no_qe
qe_comb = pt.apply.query(
    lambda row: strip_markup(comb(row["query"], doc_vecs, collection, vectorizer))
) >> no_qe

# Evaluate every variant on all topics; baseline=0 compares each system
# against the first entry (the unmodified BM25 run).
pt.Experiment(
    [no_qe, qe_stop, qe_sp, qe_sl, qe_stem, qe_wordnet, qe_word2vec, qe_pseudo, qe_comb],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=[pt.measures.MAP(rel=3), RR(rel=3), nDCG@10],
    baseline=0,
    names=['None', 'Stop', 'Stop-Porter', 'Stop-Lemma', 'Stemming',
           'Wordnet', 'Word2Vec', 'Pseudo-relevance', 'Combined'],
)

Unnamed: 0,name,AP(rel=3),RR(rel=3),nDCG@10,AP(rel=3) +,AP(rel=3) -,AP(rel=3) p-value,RR(rel=3) +,RR(rel=3) -,RR(rel=3) p-value,nDCG@10 +,nDCG@10 -,nDCG@10 p-value
0,,0.206183,0.510478,0.516219,,,,,,,,,
1,Stop,0.206431,0.505513,0.521679,18.0,16.0,0.846007,11.0,7.0,0.3968924,23.0,10.0,0.02321216
2,Stop-Porter,0.197871,0.495555,0.501451,22.0,38.0,0.01111413,14.0,18.0,0.1263107,26.0,30.0,0.03673257
3,Stop-Lemma,0.204345,0.49408,0.517108,19.0,22.0,0.3808033,12.0,11.0,0.08490211,23.0,16.0,0.8224546
4,Stemming,0.191009,0.472298,0.463479,8.0,80.0,7.396386e-07,6.0,43.0,0.0006188521,6.0,70.0,1.687955e-10
5,Wordnet,0.142027,0.401388,0.327931,24.0,142.0,5.4594780000000005e-17,26.0,93.0,1.464995e-05,17.0,150.0,7.283639e-34
6,Word2Vec,0.118136,0.376206,0.283915,25.0,142.0,1.389732e-19,32.0,96.0,1.50102e-06,13.0,157.0,3.7379400000000004e-40
7,Pseudo-relevance,0.171921,0.449169,0.411002,1.0,167.0,4.4335150000000003e-23,2.0,75.0,1.092964e-07,2.0,136.0,5.27575e-25
8,Combined,0.100194,0.281996,0.229649,22.0,147.0,3.3727540000000003e-23,18.0,122.0,6.620245e-14,12.0,162.0,1.4369e-45


In [None]:
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert import Searcher
from colbert.data import Collection
import pyterrier as pt
import pandas as pd
import joblib
import pickle
from pyterrier.measures import *
# Start PyTerrier's Java backend only when it is not already running.
if not pt.java.started():
  pt.init()
# The TF-IDF artefacts are not reloaded here (loading code left commented out);
# the pseudo-relevance variants below still need `vectorizer` and `doc_vecs`
# to exist in the kernel.
# vectorizer = joblib.load('trained/vectorizerantique.pkl')
# file = open("trained/doc_vecsantique.pickle",'rb') 
# doc_vecs = pickle.load(file)
# NOTE: despite its name, data_dir holds the path of the TSV file itself here.
data_dir = './project-root/antique/raw/collection.tsv'
# Header-less TSV read into an (id, text) frame; MyColbert.transform uses it
# to map positional document indices to ids.
collection_text = pd.read_csv(data_dir, sep="\t", names=['id', 'text'])
# collection = collection #.dropna().reset_index(drop=True)
# ColBERT Collection built from the passage texts, in file order.
collection2 = Collection(data=collection_text['text'].tolist())
# print(len(collection))
import warnings
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)
# `searcher` is only assigned inside this guard; the code that follows uses it
# unconditionally.
if __name__=='__main__':
    # Open the "antiqueindex" index inside a single-rank ColBERT run context,
    # with "experiments" as the root directory.
    with Run().context(RunConfig(nranks=1, experiment="antiqueindex")):
        config = ColBERTConfig(
            root="experiments",
            collection=collection2
        )
        searcher = Searcher(index="antiqueindex", config=config)

class MyColbert:
    """Adapter exposing a ColBERT ``Searcher`` through a ``transform(df)`` method.

    Args:
        searcher: Object whose ``search(query, k=...)`` returns three parallel
            sequences, interpreted as (document index, rank, score).
        method: Optional callable applied to each query string before the
            search; ``None`` leaves queries untouched.
        k: Number of results requested per query (default 100, the value that
            was previously hard-coded).

    Document ids are resolved through the module-level ``collection_text``
    frame: the document index returned by the searcher is used as a row
    position and the ``id`` column supplies the ``docno``.
    """

    # Column layout of the frame returned by transform().
    _COLUMNS = ['doc_index', 'rank', 'score', 'qid', 'docno']

    def __init__(self, searcher, method, k=100):
        self.searcher = searcher
        self.method = method
        self.k = k

    def transform(self, df):
        """Run every query in ``df`` and return all hits in one frame.

        Args:
            df: Frame with ``qid`` and ``query`` columns.

        Returns:
            DataFrame with columns ``doc_index``, ``rank``, ``score``, ``qid``
            and ``docno`` (one row per hit, fresh 0..n-1 index). An empty
            frame with those columns is returned when ``df`` has no rows;
            previously ``None`` was returned in that case.
        """
        per_query = []
        for _, row in df.iterrows():
            query = row['query']
            if self.method is not None:
                query = self.method(query)
            hits = self.searcher.search(query, k=self.k)
            # Three parallel sequences -> one row per hit.
            result = pd.DataFrame(hits).transpose()
            result.columns = ['doc_index', 'rank', 'score']
            result['qid'] = row['qid']
            # Vectorized positional lookup instead of one .iloc call per hit;
            # to_numpy() avoids index alignment on assignment.
            positions = result['doc_index'].astype(int).to_numpy()
            result['docno'] = collection_text['id'].iloc[positions].to_numpy()
            per_query.append(result)
        if not per_query:
            return pd.DataFrame(columns=self._COLUMNS)
        # Concatenate once; concatenating inside the loop copies the
        # accumulated frame on every iteration.
        return pd.concat(per_query, ignore_index=True)
        
# ColBERT retrieval with each query-rewriting strategy; None means the query
# is passed to the searcher unchanged.
no_qe = MyColbert(searcher, None)
qe_stop = MyColbert(searcher, stop_word)
qe_sp = MyColbert(searcher, stop_porter)
qe_sl = MyColbert(searcher, stop_lemma)
qe_stem = MyColbert(searcher, expand_porter_stemmer)
qe_wordnet = MyColbert(searcher, expand_query_wordnet)
qe_word2vec = MyColbert(searcher, expand_query_word2vec)
# These two need the TF-IDF artefacts in addition to the query text.
qe_pseudo = MyColbert(
    searcher,
    lambda text: expand_query_pseudo_relevance(doc_vecs, text, collection_text, vectorizer),
)
qe_comb = MyColbert(
    searcher,
    lambda text: comb(text, doc_vecs, collection_text, vectorizer),
)

# Same evaluation protocol as the BM25 experiment: all topics, first system
# as the comparison baseline.
pt.Experiment(
    [no_qe, qe_stop, qe_sp, qe_sl, qe_stem, qe_wordnet, qe_word2vec, qe_pseudo, qe_comb],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=[pt.measures.MAP(rel=3), RR(rel=3), nDCG@10],
    baseline=0,
    names=['None', 'Stop', 'Stop-Porter', 'Stop-Lemma', 'Stemming',
           'Wordnet', 'Word2Vec', 'Pseudo-relevance', 'Combined'],
)

[Apr 08, 15:07:30] #> Loading codec...
[Apr 08, 15:07:30] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


[Apr 08, 15:07:30] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


[Apr 08, 15:07:31] #> Loading IVF...
[Apr 08, 15:07:31] #> Loading doclens...



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 297.47it/s][A

[Apr 08, 15:07:31] #> Loading codes and residuals...




  0%|                                                                                                                                            | 0/17 [00:00<?, ?it/s][A
  6%|███████▊                                                                                                                            | 1/17 [00:00<00:03,  4.06it/s][A
 12%|███████████████▌                                                                                                                    | 2/17 [00:00<00:03,  4.56it/s][A
 18%|███████████████████████▎                                                                                                            | 3/17 [00:00<00:02,  5.29it/s][A
 24%|███████████████████████████████                                                                                                     | 4/17 [00:00<00:02,  5.95it/s][A
 29%|██████████████████████████████████████▊                                                                                             |


#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . how can we get concentration onsomething, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2129,  2064,  2057,  2131,  6693,  2006, 14045, 20744,
          102,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103], device='cuda:0')
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')



In [13]:
import xgboost as xgb
from pyterrier.measures import *
# Hyper-parameters passed to xgboost's XGBRanker (NDCG ranking objective).
params = {'objective': 'rank:ndcg',
          'learning_rate': 0.1,
          'gamma': 1.0,
          'min_child_weight': 0.1,
          'max_depth': 6,
          'random_state': 42
         }
topics = dataset.get_topics()
qrels = dataset.get_qrels()

# Positional 60 / 20 / 20 split of the topics into train / validation / test.
# Plain iloc slices produce the same three frames np.split() returned, without
# routing a DataFrame through numpy (which depends on the deprecated
# DataFrame.swapaxes).
n_topics = len(topics)
train_end, valid_end = int(.6 * n_topics), int(.8 * n_topics)
train_topics = topics.iloc[:train_end]
valid_topics = topics.iloc[train_end:valid_end]
test_topics = topics.iloc[valid_end:]

# Weighting models computed as learning-to-rank features for every document
# retrieved by the BM25 first stage.
ltr_features = [
    'WMODEL:TF_IDF', 'WMODEL:PL2', 'WMODEL:BM25',
    'WMODEL:DirichletLM', 'WMODEL:Hiemstra_LM',
    'WMODEL:DFR_BM25', 'WMODEL:InL2', 'WMODEL:LGD',
    'WMODEL:DLH', 'WMODEL:DPH', 'WMODEL:LemurTF_IDF',
]
fbr3f = pt.terrier.FeaturesRetriever(
    index,
    wmodel="BM25",
    features=ltr_features,
    properties={"termpipelines": ""},
    controls={"qe": "off"},
)

# Feature retrieval followed by an XGBoost ranker, fitted on the training
# topics with the validation topics as the held-out set.
ltr_ranker = xgb.sklearn.XGBRanker(**params)
BaseLTR_LM = fbr3f >> pt.ltr.apply_learned_model(ltr_ranker, form='ltr')
BaseLTR_LM.fit(train_topics, qrels, valid_topics, qrels)

# The fitted LTR pipeline is the baseline; each variant rewrites the query
# text and normalises it with strip_markup before the same pipeline runs.
no_qe = BaseLTR_LM
qe_stop = pt.apply.query(lambda row: strip_markup(stop_word(row["query"]))) >> no_qe
qe_sp = pt.apply.query(lambda row: strip_markup(stop_porter(row["query"]))) >> no_qe
qe_sl = pt.apply.query(lambda row: strip_markup(stop_lemma(row["query"]))) >> no_qe
qe_stem = pt.apply.query(lambda row: strip_markup(expand_porter_stemmer(row["query"]))) >> no_qe
qe_wordnet = pt.apply.query(lambda row: strip_markup(expand_query_wordnet(row["query"]))) >> no_qe
qe_word2vec = pt.apply.query(lambda row: strip_markup(expand_query_word2vec(row["query"]))) >> no_qe
qe_pseudo = pt.apply.query(
    lambda row: strip_markup(
        expand_query_pseudo_relevance(doc_vecs, row["query"], collection, vectorizer)
    )
) >> no_qe
qe_comb = pt.apply.query(
    lambda row: strip_markup(comb(row["query"], doc_vecs, collection, vectorizer))
) >> no_qe

# Evaluate on the held-out test topics only; baseline=0 compares every
# variant against the un-rewritten LTR run.
pt.Experiment(
    [no_qe, qe_stop, qe_sp, qe_sl, qe_stem, qe_wordnet, qe_word2vec, qe_pseudo, qe_comb],
    test_topics,
    qrels,
    eval_metrics=[pt.measures.MAP(rel=3), RR(rel=3), nDCG@10],
    baseline=0,
    names=['None', 'Stop', 'Stop-Porter', 'Stop-Lemma', 'Stemming',
           'Wordnet', 'Word2Vec', 'Pseudo-relevance', 'Combined'],
)



Unnamed: 0,name,AP(rel=3),RR(rel=3),nDCG@10,AP(rel=3) +,AP(rel=3) -,AP(rel=3) p-value,RR(rel=3) +,RR(rel=3) -,RR(rel=3) p-value,nDCG@10 +,nDCG@10 -,nDCG@10 p-value
0,,0.070774,0.187955,0.189935,,,,,,,,,
1,Stop,0.106989,0.336787,0.29399,30.0,0.0,0.0005466004,24.0,0.0,0.001393457,21.0,1.0,0.000104452
2,Stop-Porter,0.224211,0.617373,0.498221,36.0,0.0,1.406408e-07,31.0,0.0,1.856026e-07,33.0,0.0,1.116302e-09
3,Stop-Lemma,0.138028,0.416627,0.358319,32.0,1.0,0.0002114026,27.0,0.0,0.0002203127,26.0,1.0,5.268125e-06
4,Stemming,0.191392,0.525078,0.420322,24.0,7.0,1.482381e-05,21.0,4.0,1.017049e-05,25.0,1.0,1.69742e-06
5,Wordnet,0.092059,0.335166,0.250261,22.0,12.0,0.0949733,19.0,11.0,0.009432858,18.0,11.0,0.01228663
6,Word2Vec,0.0513,0.143417,0.109214,8.0,24.0,0.1405187,10.0,22.0,0.288265,5.0,21.0,0.002834697
7,Pseudo-relevance,0.060833,0.186163,0.1553,1.0,29.0,0.07513778,1.0,19.0,0.9525271,2.0,16.0,0.01238989
8,Combined,0.093269,0.345265,0.227814,21.0,15.0,0.2447202,23.0,13.0,0.03503491,20.0,13.0,0.3279141


In [None]:
import pyterrier as pt
from pyterrier.measures import *
from pyterrier_t5 import MonoT5ReRanker, DuoT5ReRanker
# T5 re-rankers created with their default settings.
monoT5 = MonoT5ReRanker()
duoT5 = DuoT5ReRanker()

# First-stage BM25 retriever. Uses pt.terrier.Retriever, the same class as the
# BM25 experiment, instead of the legacy pt.BatchRetrieve name.
bm25 = pt.terrier.Retriever(index, wmodel="BM25", properties={"termpipelines": ""}, controls={"qe": "off"})
# Keep the top 50 BM25 hits, attach document text, re-rank with monoT5.
mono_pipeline = (bm25 % 50) >> pt.text.get_text(dataset, "text") >> monoT5
# Keep the top 5 monoT5 hits and re-rank them with duoT5. The parentheses
# spell out the grouping Python already applies (% binds tighter than >>).
duo_pipeline = (mono_pipeline % 5) >> duoT5

# The mono/duo T5 pipeline is the baseline; each variant rewrites the query
# text and normalises it with strip_markup before the same pipeline runs.
no_qe = duo_pipeline
qe_stop = pt.apply.query(lambda row: strip_markup(stop_word(row["query"]))) >> no_qe
qe_sp = pt.apply.query(lambda row: strip_markup(stop_porter(row["query"]))) >> no_qe
qe_sl = pt.apply.query(lambda row: strip_markup(stop_lemma(row["query"]))) >> no_qe
qe_stem = pt.apply.query(lambda row: strip_markup(expand_porter_stemmer(row["query"]))) >> no_qe
qe_wordnet = pt.apply.query(lambda row: strip_markup(expand_query_wordnet(row["query"]))) >> no_qe
qe_word2vec = pt.apply.query(lambda row: strip_markup(expand_query_word2vec(row["query"]))) >> no_qe
qe_pseudo = pt.apply.query(
    lambda row: strip_markup(
        expand_query_pseudo_relevance(doc_vecs, row["query"], collection, vectorizer)
    )
) >> no_qe
qe_comb = pt.apply.query(
    lambda row: strip_markup(comb(row["query"], doc_vecs, collection, vectorizer))
) >> no_qe

# Only the first 50 topics are evaluated in this experiment.
pt.Experiment(
    [no_qe, qe_stop, qe_sp, qe_sl, qe_stem, qe_wordnet, qe_word2vec, qe_pseudo, qe_comb],
    dataset.get_topics()[:50],
    dataset.get_qrels(),
    eval_metrics=[pt.measures.MAP(rel=3), RR(rel=3), nDCG@10],
    baseline=0,
    names=['None', 'Stop', 'Stop-Porter', 'Stop-Lemma', 'Stemming',
           'Wordnet', 'Word2Vec', 'Pseudo-relevance', 'Combined'],
)