In [1]:
import ir_datasets
from tqdm.notebook import tqdm
import pickle
import pandas as pd
import re
import string
from rank_bm25 import BM25Okapi
from ir_measures import nDCG, MAP, RBP, Recall, Qrel, ScoredDoc, calc_aggregate
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
with open('data/multi-subset.pkl', 'rb') as file:
    multi_subset = pickle.load(file)

In [3]:
dataset = ir_datasets.load("neuclir/1/multi/trec-2023")
qrels = [(qrel.query_id, qrel.doc_id, qrel.relevance, qrel.iteration) for qrel in dataset.qrels_iter()]

df_qrels = pd.DataFrame(
    qrels, 
    columns=[
        'query_id', 
        'doc_id', 
        'relevance', 
        'iteration'
    ]
)

english_queries = [
    (
        query.query_id, 
        query.title, 
        query.description, 
        query.fa_mt_title, 
        query.fa_mt_description, 
        query.ru_mt_title, 
        query.ru_mt_description, 
        query.zh_mt_title, 
        query.zh_mt_description, 
    ) 
    for query in dataset.queries_iter()
]

df_queries = pd.DataFrame(
    english_queries, 
    columns=[
        'query_id', 
        'title', 
        'description', 
        'fa_mt_title', 
        'fa_mt_description', 
        'ru_mt_title', 
        'ru_mt_description', 
        'zh_mt_title', 
        'zh_mt_description', 
    ]
)


df_documents = pd.DataFrame(multi_subset, columns=["id", "title", "content"])

In [4]:
def evaluate(qrels, result):
    qrels = [
        Qrel(query_id=query_id, doc_id=doc_id, relevance=relevance)
        for query_id, doc_id, relevance, _ in qrels   
    ]

    runs = [
        ScoredDoc(query_id=query_id, doc_id=doc_id, score=score)
        for query_id, doc_id, score in result
    ]
    scores = calc_aggregate([nDCG@20, MAP, RBP(rel=1), Recall@100, Recall@1000], qrels, runs)

    return scores

In [None]:
# Merge qrels with documents to get corresponding title 
df_merged = pd.merge(df_qrels, df_documents, how='left', left_on='doc_id', right_on='id')

df_merged.rename(columns={'title': 'document_title'}, inplace=True)

# Extract 
def get_mt_title(row):
    if row['iteration'] == 'fas':
        return row['fa_mt_title']
    elif row['iteration'] == 'rus':
        return row['ru_mt_title']
    elif row['iteration'] == 'zho':
        return row['zh_mt_title']
    else:
        return None  

# Apply the function to create a new column with the concatenated title
df_merged['query_title_translated'] = df_merged.apply(get_mt_title, axis=1)

# Optionally, drop unnecessary columns like fa_mt_title, ru_mt_title, zh_mt_title if you no longer need them
df_merged.drop(columns=['fa_mt_title', 'ru_mt_title', 'zh_mt_title', 'fa_mt_description', 'ru_mt_description', 'zh_mt_description'], inplace=True)

In [7]:
def preprocess(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    return text.split()

In [None]:
def bm25(df_merged): 

    grouped = df_merged.groupby("query_id")

    # For each query, apply BM25
    results = []
    for query_id, group in tqdm(grouped):
        query_title = group["query_title_translated"].iloc[0] 
        docs_title = group["document_title"].tolist()  
        docs_content = group["content"].tolist()  
        
        # Preprocess titles and content
        query_tokens = preprocess(query_title)
        doc_title_tokens = [preprocess(doc) for doc in docs_title]
        doc_content_tokens = [preprocess(doc) for doc in docs_content]
        
        # BM25 calculation for document titles
        bm25_title = BM25Okapi(doc_title_tokens)
        scores_title = bm25_title.get_scores(query_tokens)
        
        # BM25 calculation for document content
        bm25_content = BM25Okapi(doc_content_tokens)
        scores_content = bm25_content.get_scores(query_tokens)
        
        # Calculate BM25 for concatenated title and content
        docs_title_and_content = [title + " " + content for title, content in zip(docs_title, docs_content)]
        doc_title_and_content_tokens = [preprocess(doc) for doc in docs_title_and_content]
        bm25_title_and_content = BM25Okapi(doc_title_and_content_tokens)
        scores_title_and_content = bm25_title_and_content.get_scores(query_tokens)
        
        # Append results for each document
        for doc_id, score_title, score_content, score_title_and_content, doc_title in zip(group["doc_id"], scores_title, scores_content, scores_title_and_content, docs_title):
            results.append({
                "query_id": query_id,
                "doc_id": doc_id,
                "document_title": doc_title,
                "score_title": score_title,
                "score_content": score_content,
                "score_title_and_content": score_title_and_content
            })

    # Create DataFrame with BM25 scores
    bm25_results = pd.DataFrame(results)

    # Merge BM25 results with original dataframe
    final_df = pd.merge(df_merged, bm25_results, on=["query_id", "doc_id"])

    # Sort documents within each query by bm25_score inside a query
    sorted_final_df = final_df.sort_values(by=["query_id", "score_title"], ascending=[True, False])

    return sorted_final_df


In [9]:
bm25_df = bm25(df_merged)

bm25_runs_title = sorted(zip(bm25_df['query_id'], bm25_df['doc_id'], bm25_df['score_title']), key=lambda x: (x[0], -x[2]))
bm25_runs_content = sorted(zip(bm25_df['query_id'], bm25_df['doc_id'], bm25_df['score_content']), key=lambda x: (x[0], -x[2]))
bm25_runs_title_and_content= sorted(zip(bm25_df['query_id'], bm25_df['doc_id'], bm25_df['score_title_and_content']), key=lambda x: (x[0], -x[2]))

  0%|          | 0/76 [00:00<?, ?it/s]

In [10]:
evaluate(qrels, bm25_runs_title)

{R@100: 0.14563090791393538,
 nDCG@20: 0.19282047568519042,
 RBP(rel=1): 0.2674535363370284,
 AP: 0.22238520862678107,
 R@1000: 0.879572641565378}

In [11]:
evaluate(qrels, bm25_runs_content)

{R@100: 0.18155917029250382,
 nDCG@20: 0.233174007671653,
 RBP(rel=1): 0.34040014205674474,
 AP: 0.2544200727112165,
 R@1000: 0.8834343550279027}

In [12]:
evaluate(qrels, bm25_runs_title_and_content)

{R@100: 0.18083872999666104,
 nDCG@20: 0.23678173299404812,
 RBP(rel=1): 0.34433565736077315,
 AP: 0.25477962239826385,
 R@1000: 0.88309759916862}

In [None]:
def tfidf(df_merged):
    # For each query, apply TF-IDF
    grouped = df_merged.groupby("query_id")

    results = []
    for query_id, group in tqdm(grouped):
        query_title = group["query_title_translated"].iloc[0] 
        docs_title = group["document_title"].tolist()  
        docs_content = group["content"].tolist() 
        
        # Preprocess titles and content
        query_tokens = preprocess(query_title)
        doc_title_tokens = [preprocess(doc) for doc in docs_title]
        doc_content_tokens = [preprocess(doc) for doc in docs_content]
        
        # TF-IDF calculation for document titles
        vectorizer_title = TfidfVectorizer()
        tfidf_title = vectorizer_title.fit_transform(docs_title)
        query_tfidf_title = vectorizer_title.transform([query_title])
        scores_title = cosine_similarity(query_tfidf_title, tfidf_title).flatten()
        
        # TF-IDF calculation for document content
        vectorizer_content = TfidfVectorizer()
        tfidf_content = vectorizer_content.fit_transform(docs_content)
        query_tfidf_content = vectorizer_content.transform([query_title])
        scores_content = cosine_similarity(query_tfidf_content, tfidf_content).flatten()
        
        # TF-IDF calculation for concatenated title and content
        docs_title_and_content = [title + " " + content for title, content in zip(docs_title, docs_content)]
        vectorizer_title_and_content = TfidfVectorizer()
        tfidf_title_and_content = vectorizer_title_and_content.fit_transform(docs_title_and_content)
        query_tfidf_title_and_content = vectorizer_title_and_content.transform([query_title])
        scores_title_and_content = cosine_similarity(query_tfidf_title_and_content, tfidf_title_and_content).flatten()
        
        # Append results for each document
        for doc_id, score_title, score_content, score_title_and_content, doc_title in zip(group["doc_id"], scores_title, scores_content, scores_title_and_content, docs_title):
            results.append({
                "query_id": query_id,
                "doc_id": doc_id,
                "document_title": doc_title,
                "score_title": score_title,
                "score_content": score_content,
                "score_title_and_content": score_title_and_content
            })

    # Create DataFrame with TF-IDF scores
    tfidf_results = pd.DataFrame(results)

    # Merge TF-IDF results with original dataframe
    final_df = pd.merge(df_merged, tfidf_results, on=["query_id", "doc_id"])

    # Sort documents within each query by score_title descending and query_id 
    sorted_final_df = final_df.sort_values(by=["query_id", "score_title"], ascending=[True, False])

    return sorted_final_df

In [14]:
tfidf_df = tfidf(df_merged)

tfidf_runs_title = sorted(zip(tfidf_df['query_id'], tfidf_df['doc_id'], tfidf_df['score_title']), key=lambda x: (x[0], -x[2]))
tfidf_runs_content = sorted(zip(tfidf_df['query_id'], tfidf_df['doc_id'], tfidf_df['score_content']), key=lambda x: (x[0], -x[2]))
tfidf_runs_title_and_content= sorted(zip(tfidf_df['query_id'], tfidf_df['doc_id'], tfidf_df['score_title_and_content']), key=lambda x: (x[0], -x[2]))

  0%|          | 0/76 [00:00<?, ?it/s]

In [15]:
evaluate(qrels, tfidf_runs_title)

{R@100: 0.1533779267713137,
 nDCG@20: 0.20222656522819074,
 RBP(rel=1): 0.2849204994203492,
 AP: 0.22690498394177694,
 R@1000: 0.8816415525072032}

In [16]:
evaluate(qrels, tfidf_runs_content)

{R@100: 0.18640436045800213,
 nDCG@20: 0.21617987974336925,
 RBP(rel=1): 0.30768511978516333,
 AP: 0.25478898588165183,
 R@1000: 0.8853175971573662}

In [17]:
evaluate(qrels, tfidf_runs_title_and_content)

{R@100: 0.18638355509462465,
 nDCG@20: 0.22151818944627474,
 RBP(rel=1): 0.3146175197853463,
 AP: 0.25490511606903354,
 R@1000: 0.8849928452878565}