In [14]:
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as gensim_downloader
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert import Searcher
import joblib
import warnings
import os
import datetime
import ir_datasets
import pyterrier as pt
from pyterrier.measures import *

# LoTTE "lifestyle" dev/search split, resolved through PyTerrier's ir_datasets bridge.
# `dataset` supplies the corpus, topics and qrels used by every cell below.
dataset =  pt.get_dataset("irds:lotte/lifestyle/dev/search")

In [15]:
# Build the Terrier index over the whole corpus; IndexingType.MEMORY is requested,
# and `index_path` is still handed to the indexer as its location argument.
index_path = './lotteindex/'
indexer = pt.index.IterDictIndexer(index_path, type=pt.index.IndexingType.MEMORY)
index = indexer.index(dataset.get_corpus_iter())

lotte/lifestyle/dev/search documents: 100%|███████████████████████████████████████████████████████████████████████████████████| 268893/268893 [03:16<00:00, 1366.08it/s]


In [16]:
# Shared NLP resources used by the query-rewriting helpers defined further down.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
# Pre-trained GloVe vectors (loaded through gensim's downloader) for embedding-based expansion.
word2vec_model = gensim_downloader.load("glove-wiki-gigaword-100")

# Pre-fitted vectorizer and pre-computed document vectors for pseudo-relevance expansion
# (presumably produced by the same vectorizer over `collection` -- verify when regenerating).
# NOTE: joblib/pickle can execute arbitrary code while loading; only load artefacts you built yourself.
vectorizer = joblib.load('trained/vectorizerlotte.pkl')
# The context manager closes the file handle even if unpickling raises.
with open("trained/doc_vecslotte.pickle", 'rb') as file:
    doc_vecs = pickle.load(file)

# Raw passages as a tab-separated file; the column names are supplied here.
data_dir = './project-root/lotte/raw/'
collection = pd.read_csv(data_dir + "collection.tsv", sep='\t',
                         names=['doc_id', 'text'])

In [17]:
# Start PyTerrier's Java backend only when it is not already running.
if not pt.java.started():
    pt.init()

# Terrier's tokeniser, obtained from the Java side; strip_markup() below relies on it.
tokenizer = pt.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()
def strip_markup(text):
    """Run ``text`` through the Terrier tokeniser and re-join the tokens with single spaces."""
    terrier_tokens = tokenizer.getTokens(text)
    return " ".join(terrier_tokens)

def _preprocess_text(text: str) -> list:
    """Lower-case and tokenize ``text``, drop stop words, and lemmatize what is left.

    Returns the surviving lemmas as a list, in their original order.
    """
    lemmas = []
    for word in word_tokenize(text.lower()):
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

def stop_lemma(text: str) -> str:
    """Remove stop words from ``text`` and lemmatize the remaining tokens.

    Returns the processed tokens joined by single spaces (a string; the previous
    ``-> list`` annotation was wrong).
    """
    # Same pipeline as _preprocess_text (lower-case, tokenize, filter, lemmatize);
    # only the return shape differs.
    return ' '.join(_preprocess_text(text))

def stop_porter(text: str) -> str:
    """Remove stop words from ``text`` and Porter-stem the remaining tokens.

    Returns the stems joined by single spaces (a string; the previous
    ``-> list`` annotation was wrong).
    """
    tokens = word_tokenize(text.lower())
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)
    
def stop_word(text: str) -> str:
    """Lower-case and tokenize ``text`` and drop every stop word.

    Returns the remaining tokens joined by single spaces (a string; the previous
    ``-> list`` annotation was wrong).
    """
    tokens = word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token not in stop_words)

def expand_query_wordnet(query: str, num_expansions: int = 2) -> str:
    """Expand ``query`` with WordNet synonyms of its content words.

    The query is stop-word-filtered and lemmatized first. For every remaining
    token, lemma names are collected from at most its first two synsets, taking
    at most ``num_expansions`` lemmas per synset and stopping once at least
    ``num_expansions`` distinct synonyms have been gathered. Because the count is
    only checked after a synset has been processed, a token can contribute
    slightly more than ``num_expansions`` synonyms.

    Returns the de-duplicated terms joined by spaces: the query tokens in their
    original order, followed by the synonyms in discovery order. The terms used
    to be joined straight from a ``set``, so their order was arbitrary and could
    change between interpreter runs, which matters for order-sensitive rankers.
    """
    tokens = _preprocess_text(query)
    # dicts preserve insertion order, giving de-duplication with a stable order.
    expanded_terms = dict.fromkeys(tokens)
    for token in tokens:
        synonyms = {}
        for syn in wordnet.synsets(token)[:2]:  # Limit to top 2 synsets
            for lemma in syn.lemmas()[:num_expansions]:
                synonym = lemma.name().lower()
                if synonym != token:
                    synonyms.setdefault(synonym)
            if len(synonyms) >= num_expansions:
                break
        # Terms that are already present keep their original position.
        expanded_terms.update(synonyms)
    return ' '.join(expanded_terms)

def expand_query_word2vec(query: str, num_expansions: int = 2, threshold: float = 0.7) -> str:
    """Append embedding-space neighbours of every whitespace-separated query word.

    For each word the three most similar vocabulary entries are requested from
    ``word2vec_model``; neighbours equal to the word itself (case-insensitively)
    are dropped, and words missing from the model's vocabulary are skipped.

    NOTE(review): ``num_expansions`` and ``threshold`` are accepted but not used;
    the neighbour count is fixed at 3 and no similarity cut-off is applied.

    Returns the original words followed by all collected neighbours, space-joined.
    """
    neighbours_per_word = 3
    query_words = query.split()
    output_words = list(query_words)

    for word in query_words:
        try:
            neighbours = word2vec_model.most_similar(word, topn=neighbours_per_word)
        except KeyError:
            # Out-of-vocabulary word: nothing to add for it.
            continue
        lowered = word.lower()
        output_words.extend(
            candidate for candidate, _score in neighbours
            if candidate.lower() != lowered
        )

    return ' '.join(output_words)

def expand_query_pseudo_relevance(doc_vecs, query: str, collection: pd.DataFrame,
                                 vectorizer: TfidfVectorizer, top_k: int = 3,
                                 num_expansions: int = 2) -> str:
    """Expand ``query`` with terms taken from its top-ranked documents.

    The query is vectorized and compared against ``doc_vecs`` by cosine
    similarity. The ``top_k`` most similar rows of ``collection`` are
    re-vectorized and averaged, and the average is blended with the query vector
    (0.7 feedback / 0.3 query). The ``num_expansions`` highest-weighted features
    of the blend that are not already query words are appended to the query.

    Args:
        doc_vecs: pre-computed document vectors; assumed to be row-aligned with
            ``collection`` and built with ``vectorizer`` (TODO confirm).
        query: the raw query string.
        collection: frame with a ``text`` column holding the documents.
        vectorizer: fitted vectorizer providing ``transform`` and
            ``get_feature_names_out``.
        top_k: number of feedback documents.
        num_expansions: maximum number of expansion terms to append.

    Returns:
        ``query`` followed by a space and the space-joined expansion terms. The
        query is returned unchanged when ``top_k`` or ``num_expansions`` is not
        positive, or when an ``IndexError`` occurs while selecting feedback
        documents (a warning is emitted in that case).
    """
    # A slice like [-0:] would select *everything*, so treat non-positive sizes as "no expansion".
    if top_k <= 0 or num_expansions <= 0:
        return query

    try:
        query_vec = vectorizer.transform([query])
        similarities = cosine_similarity(query_vec, doc_vecs)[0]
        top_k_indices = np.argsort(similarities)[-top_k:]
        top_k_docs = collection.iloc[top_k_indices]
        top_k_vecs = vectorizer.transform(top_k_docs['text']).toarray()
        mean_top_k = np.mean(top_k_vecs, axis=0)
        original_vec = query_vec.toarray()[0]
        combined_vec = 0.7 * mean_top_k + 0.3 * original_vec  # Rocchio-like weighting
        feature_names = vectorizer.get_feature_names_out()
        top_indices = np.argsort(combined_vec)[-num_expansions:]
    except IndexError as err:
        # Previously this branch printed debug state and then fell through to the
        # return statement with `expansion_terms` unbound, raising a NameError.
        # Report the problem and fall back to the unexpanded query instead.
        warnings.warn(
            f"pseudo-relevance expansion failed for query {query!r} ({err}); "
            f"collection has {len(collection)} rows, doc_vecs shape is "
            f"{getattr(doc_vecs, 'shape', None)}; returning the query unexpanded"
        )
        return query

    # Computed once instead of re-splitting the query for every candidate term.
    query_terms = set(query.lower().split())
    expansion_terms = [feature_names[idx] for idx in top_indices
                       if feature_names[idx] not in query_terms]
    return query + ' ' + ' '.join(expansion_terms)

def expand_porter_stemmer(text: str) -> str:
    """Lower-case and tokenize ``text``, Porter-stem every token (stop words are kept),
    and return the stems joined by single spaces."""
    stems = map(stemmer.stem, word_tokenize(text.lower()))
    return ' '.join(stems)
    
def comb(text: str, doc_vecs, collection, vectorizer) -> str:
    """Apply every rewriting step in sequence and return the final query string.

    Order: WordNet synonym expansion, embedding-neighbour expansion,
    pseudo-relevance expansion (using ``doc_vecs``, ``collection`` and
    ``vectorizer``), and finally Porter stemming of the whole expanded query.
    """
    q = expand_query_wordnet(text)
    q = expand_query_word2vec(q)
    q = expand_query_pseudo_relevance(doc_vecs, q, collection, vectorizer)
    return expand_porter_stemmer(q)

# This statement was indented by two spaces, which is an IndentationError at cell level
# directly after a function body; it is kept, dedented, as a plain module-level assignment.
tokenizer = pt.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()


In [10]:
  
# BM25 baseline: Terrier's own term pipeline and query expansion are switched off so
# that the only query processing is the rewriting done by the helpers in this notebook.
no_qe = pt.terrier.Retriever(
    index,
    wmodel="BM25",
    metadata=["docno", "text"],
    properties={"termpipelines": ""},
    controls={"qe": "off"},
)

# Each variant rewrites the query text, normalises it with the Terrier tokeniser,
# and then hands it to the same BM25 retriever.
qe_stop = pt.apply.query(lambda row: strip_markup(stop_word(row["query"]))) >> no_qe
qe_sp = pt.apply.query(lambda row: strip_markup(stop_porter(row["query"]))) >> no_qe
qe_sl = pt.apply.query(lambda row: strip_markup(stop_lemma(row["query"]))) >> no_qe
qe_wordnet = pt.apply.query(lambda row: strip_markup(expand_query_wordnet(row["query"]))) >> no_qe
qe_word2vec = pt.apply.query(lambda row: strip_markup(expand_query_word2vec(row["query"]))) >> no_qe
qe_pseudo = pt.apply.query(
    lambda row: strip_markup(
        expand_query_pseudo_relevance(doc_vecs, row["query"], collection, vectorizer)
    )
) >> no_qe
qe_stem = pt.apply.query(lambda row: strip_markup(expand_porter_stemmer(row["query"]))) >> no_qe
qe_comb = pt.apply.query(
    lambda row: strip_markup(comb(row["query"], doc_vecs, collection, vectorizer))
) >> no_qe

# Compare all variants on the first 100 topics; system 0 (no rewriting) is the
# baseline for the +/- counts and p-values.
pt.Experiment(
    [no_qe, qe_stop, qe_sp, qe_sl, qe_stem, qe_wordnet, qe_word2vec, qe_pseudo, qe_comb],
    dataset.get_topics()[:100],
    dataset.get_qrels(),
    eval_metrics=[pt.measures.MAP(rel=1), RR(rel=1), nDCG@10],
    baseline=0,
    names=['None', 'Stop', 'Stop-Porter', 'Stop-Lemma', 'Stemming', 'Wordnet', 'Word2Vec', 'Pseudo-relevance', 'Combined'],
)

Unnamed: 0,name,AP,RR,nDCG@10,AP +,AP -,AP p-value,RR +,RR -,RR p-value,nDCG@10 +,nDCG@10 -,nDCG@10 p-value
0,,0.402428,0.542495,0.466339,,,,,,,,,
1,Stop,0.402499,0.541729,0.466227,5.0,2.0,0.04462129,2.0,1.0,0.361568,0.0,1.0,0.3197485
2,Stop-Porter,0.38908,0.519141,0.448789,7.0,11.0,0.04937568,4.0,9.0,0.106647,2.0,9.0,0.06909043
3,Stop-Lemma,0.402438,0.541606,0.466227,4.0,3.0,0.8674949,1.0,2.0,0.292129,0.0,1.0,0.3197485
4,Stemming,0.388123,0.519572,0.448334,2.0,14.0,0.03565601,2.0,10.0,0.112845,2.0,10.0,0.06228793
5,Wordnet,0.364904,0.524304,0.41981,20.0,55.0,0.0156816,19.0,28.0,0.520091,13.0,36.0,0.006128246
6,Word2Vec,0.279027,0.388899,0.319963,14.0,73.0,8.067188e-07,13.0,60.0,1.4e-05,14.0,56.0,6.976985e-08
7,Pseudo-relevance,0.378381,0.51582,0.433032,0.0,69.0,6.793019e-07,0.0,30.0,0.00058,0.0,27.0,6.06634e-05
8,Combined,0.273699,0.397468,0.319803,13.0,74.0,6.343878e-06,13.0,58.0,0.000285,13.0,56.0,1.650005e-06


In [12]:
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert import Searcher
from colbert.data import Collection
import pyterrier as pt
import pandas as pd
import joblib
import pickle
from pyterrier.measures import *
# Start PyTerrier's Java backend only when it is not already running.
if not pt.java.started():
  pt.init()
import warnings
# Suppress FutureWarning messages from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

# data_dir = './project-root/lotte/raw/collection.tsv'
# collection2 = Collection(data_dir)

# Open a ColBERT searcher over the "lotteindex.nbits_2" index inside a single-rank
# run context named "lotteindex"; the ColBERT root directory is "experiments".
if __name__ == '__main__':
    run_settings = RunConfig(nranks=1, experiment="lotteindex")
    with Run().context(run_settings):
        config = ColBERTConfig(root="experiments")
        searcher = Searcher(index="lotteindex.nbits_2", config=config)

class MyColbert:
    """Adapter exposing a searcher through a ``transform(topics) -> results`` method.

    Args:
        searcher: object with a ``search(query, k=...)`` method. Its return value
            is turned into a frame whose three columns are labelled ``docno``,
            ``rank`` and ``score`` (presumably ColBERT's (ids, ranks, scores)
            triple -- confirm against the searcher in use).
        method: optional callable applied to every query string before searching;
            ``None`` leaves queries untouched.
        k: number of results requested per query (defaults to the previous
            hard-coded value of 100).
    """

    def __init__(self, searcher, method, k=100):
        self.searcher = searcher
        self.method = method
        self.k = k

    def transform(self, df):
        """Search every row of ``df`` and return all results in one frame.

        ``df`` must provide ``qid`` and ``query`` columns. The result has the
        columns ``docno`` (cast to int), ``rank``, ``score`` and ``qid``. An
        empty ``df`` yields an empty frame with those columns instead of ``None``.
        """
        per_query = []
        for _, row in df.iterrows():
            query = row['query']
            if self.method is not None:
                query = self.method(query)
            result = self.searcher.search(query, k=self.k)
            # The searcher returns one sequence per field; transpose to one row per hit.
            result = pd.DataFrame(result).transpose()
            result = result.dropna()
            result.columns = ['docno', 'rank', 'score']
            result['qid'] = row['qid']
            result['docno'] = result['docno'].astype(int)
            per_query.append(result)

        if not per_query:
            return pd.DataFrame(columns=['docno', 'rank', 'score', 'qid'])
        # Concatenate once at the end; concatenating inside the loop re-copies
        # every earlier result on each iteration.
        return pd.concat(per_query)


# One ColBERT-backed system per query-rewriting strategy. Single-argument rewriters are
# passed directly; the two that need extra resources are wrapped in a lambda.
no_qe = MyColbert(searcher, None)
qe_stop = MyColbert(searcher, stop_word)
qe_sp = MyColbert(searcher, stop_porter)
qe_sl = MyColbert(searcher, stop_lemma)
qe_wordnet = MyColbert(searcher, expand_query_wordnet)
qe_word2vec = MyColbert(searcher, expand_query_word2vec)
qe_pseudo = MyColbert(
    searcher,
    lambda text: expand_query_pseudo_relevance(doc_vecs, text, collection, vectorizer),
)
qe_stem = MyColbert(searcher, expand_porter_stemmer)
qe_comb = MyColbert(searcher, lambda text: comb(text, doc_vecs, collection, vectorizer))

# Leftover debugging snippets (disabled): inspect qrels / raw ColBERT hits / BM25 hits for single topics.
# print(dataset.get_qrels()[dataset.get_qrels()['qid'] == '3'])
# print(searcher.search(dataset.get_topics()[3:5]['query'].to_list()[0], k=10))
# bm25 = pt.terrier.Retriever(index, wmodel="BM25", metadata=["docno", "text"], properties={"termpipelines": ""}, controls={"qe": "off"})
# print(bm25.search(dataset.get_topics()[3:4]['query'].tolist()[0]))

# Evaluate every ColBERT variant on the first 100 topics. `baseline = 0` makes the
# first system (no query rewriting) the reference for the +/- counts and p-values.
pt.Experiment(
    [no_qe, qe_stop, qe_sp, qe_sl, qe_stem, qe_wordnet, qe_word2vec, qe_pseudo, qe_comb],
    # [bm25, no_qe, qe_stop],
    dataset.get_topics()[:100],
    dataset.get_qrels(),
    eval_metrics=[pt.measures.MAP(rel=1), RR(rel=1), nDCG@10],
    baseline = 0,
    names = ['None', 'Stop', 'Stop-Porter', 'Stop-Lemma', 'Stemming', 'Wordnet', 'Word2Vec', 'Pseudo-relevance', 'Combined']
)

[Apr 08, 15:05:15] #> Loading collection...
0M 
[Apr 08, 15:05:18] #> Loading codec...
[Apr 08, 15:05:18] #> Loading IVF...
[Apr 08, 15:05:18] #> Loading doclens...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 881.14it/s]

[Apr 08, 15:05:18] #> Loading codes and residuals...



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:01<00:00,  8.47it/s]



#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . how much should i feed my 1 year old english mastiff, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2129,  2172,  2323,  1045,  5438,  2026,  1015,  2095,
         2214,  2394, 15429, 13355,   102,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103], device='cuda:0')
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')



Unnamed: 0,name,AP,RR,nDCG@10,AP +,AP -,AP p-value,RR +,RR -,RR p-value,nDCG@10 +,nDCG@10 -,nDCG@10 p-value
0,,0.543767,0.734818,0.619591,,,,,,,,,
1,Stop,0.513091,0.704623,0.585067,25.0,42.0,0.05805481,10.0,22.0,0.1740075,23.0,33.0,0.02502095
2,Stop-Porter,0.475116,0.637293,0.537085,22.0,47.0,0.001418708,10.0,30.0,0.006667636,23.0,40.0,0.0001835455
3,Stop-Lemma,0.520067,0.70885,0.588432,22.0,43.0,0.1376274,8.0,22.0,0.2248369,22.0,32.0,0.0434316
4,Stemming,0.493347,0.649204,0.555943,21.0,38.0,0.003360043,7.0,25.0,0.004612093,16.0,32.0,0.0002916572
5,Wordnet,0.399838,0.577746,0.469215,15.0,64.0,1.547434e-07,6.0,41.0,1.809109e-05,16.0,57.0,2.662564e-08
6,Word2Vec,0.440104,0.61229,0.50263,15.0,59.0,3.417246e-05,7.0,38.0,0.0001009065,11.0,55.0,6.097241e-07
7,Pseudo-relevance,0.510186,0.705128,0.571117,24.0,43.0,0.005145547,11.0,22.0,0.08836116,19.0,36.0,0.0001290521
8,Combined,0.250983,0.342083,0.289599,7.0,79.0,4.299057e-16,2.0,68.0,2.699006e-15,5.0,74.0,5.117821e-18


In [18]:
import xgboost as xgb
from pyterrier.measures import *
# Hyper-parameters for the XGBoost ranker (rank:ndcg objective, fixed seed).
params = {'objective': 'rank:ndcg',
          'learning_rate': 0.1,
          'gamma': 1.0,
          'min_child_weight': 0.1,
          'max_depth': 6,
          'random_state': 42
         }
topics = dataset.get_topics()[:200]
qrels = dataset.get_qrels()

# Positional 60/20/20 train/validation/test split. Plain iloc slices give the same
# partition as np.split(topics, [...]) without going through the DataFrame.swapaxes
# path that recent pandas versions deprecate.
train_end = int(.6 * len(topics))
valid_end = int(.8 * len(topics))
train_topics = topics.iloc[:train_end]
valid_topics = topics.iloc[train_end:valid_end]
test_topics = topics.iloc[valid_end:]

# BM25 candidate retrieval plus one feature per listed weighting model; Terrier's own
# term pipeline and query expansion are switched off.
fbr3f = pt.terrier.FeaturesRetriever(
    index,
    wmodel="BM25",
    features=['WMODEL:TF_IDF', 'WMODEL:PL2', 'WMODEL:BM25',
              'WMODEL:DirichletLM', 'WMODEL:Hiemstra_LM',
              'WMODEL:DFR_BM25', 'WMODEL:InL2', 'WMODEL:LGD',
              'WMODEL:DLH', 'WMODEL:DPH', 'WMODEL:LemurTF_IDF'],
    properties={"termpipelines": ""},
    controls={"qe": "off"},
)
BaseLTR_LM = fbr3f >> pt.ltr.apply_learned_model(xgb.sklearn.XGBRanker(**params), form='ltr')
BaseLTR_LM.fit(train_topics, qrels, valid_topics, qrels)

# The learned model is trained on unmodified queries; every variant below only rewrites
# the query text before it reaches the same trained pipeline.
no_qe = BaseLTR_LM
qe_stop = pt.apply.query(lambda q: strip_markup(stop_word(q["query"]))) >> no_qe
qe_sp = pt.apply.query(lambda q: strip_markup(stop_porter(q["query"]))) >> no_qe
qe_sl = pt.apply.query(lambda q: strip_markup(stop_lemma(q["query"]))) >> no_qe
qe_stem = pt.apply.query(lambda q: strip_markup(expand_porter_stemmer(q["query"]))) >> no_qe
qe_wordnet = pt.apply.query(lambda q: strip_markup(expand_query_wordnet(q["query"]))) >> no_qe
qe_word2vec = pt.apply.query(lambda q: strip_markup(expand_query_word2vec(q["query"]))) >> no_qe
qe_pseudo = pt.apply.query(lambda q: strip_markup(expand_query_pseudo_relevance(doc_vecs, q["query"], collection, vectorizer))) >> no_qe
qe_comb = pt.apply.query(lambda q: strip_markup(comb(q["query"], doc_vecs, collection, vectorizer))) >> no_qe


# Evaluate on the held-out final 20% of the selected topics; system 0 is the baseline.
pt.Experiment(
    [no_qe, qe_stop, qe_sp, qe_sl, qe_stem, qe_wordnet, qe_word2vec, qe_pseudo, qe_comb],
    test_topics,
    qrels,
    eval_metrics=[pt.measures.MAP(rel=1), RR(rel=1), nDCG@10],
    baseline = 0,
    names = ['None', 'Stop', 'Stop-Porter', 'Stop-Lemma', 'Stemming', 'Wordnet', 'Word2Vec', 'Pseudo-relevance', 'Combined']
)



Unnamed: 0,name,AP,RR,nDCG@10,AP +,AP -,AP p-value,RR +,RR -,RR p-value,nDCG@10 +,nDCG@10 -,nDCG@10 p-value
0,,0.059785,0.103185,0.082706,,,,,,,,,
1,Stop,0.058037,0.098661,0.07705,17.0,2.0,0.472541,13.0,3.0,0.348213,1.0,1.0,0.347125
2,Stop-Porter,0.131126,0.215826,0.181941,27.0,9.0,0.015466,26.0,8.0,0.030976,14.0,3.0,0.006221
3,Stop-Lemma,0.077329,0.142662,0.103562,23.0,3.0,0.239002,20.0,4.0,0.168471,4.0,2.0,0.370761
4,Stemming,0.128499,0.209785,0.183004,25.0,7.0,0.014539,24.0,7.0,0.024283,14.0,2.0,0.003392
5,Wordnet,0.047081,0.094073,0.065288,13.0,18.0,0.381543,13.0,16.0,0.815112,4.0,6.0,0.349704
6,Word2Vec,0.045462,0.080509,0.068017,14.0,17.0,0.334337,14.0,16.0,0.456105,5.0,5.0,0.410031
7,Pseudo-relevance,0.043942,0.078486,0.062751,0.0,24.0,0.085775,0.0,22.0,0.06889,0.0,4.0,0.184552
8,Combined,0.059283,0.10124,0.08518,22.0,15.0,0.976811,21.0,15.0,0.962632,7.0,6.0,0.923149


In [None]:
import pyterrier as pt
from pyterrier.measures import *
from pyterrier_t5 import MonoT5ReRanker, DuoT5ReRanker
# Neural re-rankers from pyterrier_t5, created with their default settings.
monoT5 = MonoT5ReRanker()
duoT5 = DuoT5ReRanker()

# First-stage BM25 retriever. pt.terrier.Retriever is used, as in the earlier cells,
# instead of the deprecated pt.BatchRetrieve alias.
bm25 = pt.terrier.Retriever(index, wmodel="BM25", properties={"termpipelines": ""}, controls={"qe": "off"})
# Top 50 BM25 hits -> attach document text -> monoT5; then the top 5 of that -> duoT5.
mono_pipeline = (bm25 % 50) >> pt.text.get_text(dataset, "text") >> monoT5
duo_pipeline = (mono_pipeline % 5) >> duoT5

# Every variant rewrites the query text before it enters the same re-ranking pipeline.
no_qe = duo_pipeline
qe_stop = pt.apply.query(lambda q: strip_markup(stop_word(q["query"]))) >> no_qe
qe_sp = pt.apply.query(lambda q: strip_markup(stop_porter(q["query"]))) >> no_qe
qe_sl = pt.apply.query(lambda q: strip_markup(stop_lemma(q["query"]))) >> no_qe
qe_stem = pt.apply.query(lambda q: strip_markup(expand_porter_stemmer(q["query"]))) >> no_qe
qe_wordnet = pt.apply.query(lambda q: strip_markup(expand_query_wordnet(q["query"]))) >> no_qe
qe_word2vec = pt.apply.query(lambda q: strip_markup(expand_query_word2vec(q["query"]))) >> no_qe
qe_pseudo = pt.apply.query(lambda q: strip_markup(expand_query_pseudo_relevance(doc_vecs, q["query"], collection, vectorizer))) >> no_qe
qe_comb = pt.apply.query(lambda q: strip_markup(comb(q["query"], doc_vecs, collection, vectorizer))) >> no_qe


# Evaluate on the first 50 topics; system 0 (no query rewriting) is the baseline.
pt.Experiment(
    [no_qe, qe_stop, qe_sp, qe_sl, qe_stem, qe_wordnet, qe_word2vec, qe_pseudo, qe_comb],
    dataset.get_topics()[:50],
    dataset.get_qrels(),
    eval_metrics=[pt.measures.MAP(rel=1), RR(rel=1), nDCG@10],
    baseline = 0,
    names = ['None', 'Stop', 'Stop-Porter', 'Stop-Lemma', 'Stemming', 'Wordnet', 'Word2Vec', 'Pseudo-relevance', 'Combined']
)