# Baseline using BM25

## Imports

In [1]:
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
import nltk
import string

In [2]:
nltk.download("stopwords")
STOPWORDS = set(nltk.corpus.stopwords.words("english"))
PUNCTUATIONS = string.punctuation

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sigurd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# STOPWORDS

## Read data

In [45]:
# Queries dev, train or eval
# queriesDev = "data/queries/queries.dev.tsv"
queriesDev = "D:/DAT640/queries.dev.tsv"
queries_dev_df = pd.read_csv(queriesDev, sep='\t', header=None)
print(len(queries_dev_df))

# NOTE Not used
# queriesTrain = "data/queries/queries.train.tsv"
# queriesTrain = "D:/DAT640/queries.train.tsv"
# queries_train_df = pd.read_csv(queriesTrain, sep='\t', header=None)
# print(len(queries_train_df))

# queriesEval = "data/queries/queries.eval.tsv"
# queriesEval = "D:/DAT640/queries.eval.tsv"
# queries_eval_df = pd.read_csv(queriesEval, sep='\t', header=None)
# print(len(queries_eval_df))

101093
808731


In [5]:
# Passages to rank based on query
# collectionFile = "data/collection/collection.tsv"
collectionFile = "D:/DAT640/collection.tsv"
collection_df = pd.read_csv(collectionFile, sep='\t', header=None) #, index_col=0)
# len(collection_df)

In [6]:
collection_df.head()

Unnamed: 0,0,1
0,0,The presence of communication amid scientific ...
1,1,The Manhattan Project and its atomic bomb help...
2,2,Essay on The Manhattan Project - The Manhattan...
3,3,The Manhattan Project was the name for a proje...
4,4,versions of each volume as well as complementa...


In [None]:
# qrelsDev = "data/qrels.dev.tsv"
qrelsDev = "D:/DAT640/qrels.dev.tsv"
qrels_dev_df = pd.read_csv(qrelsDev, sep='\t', header=None)
qrels_dev_df.head(10)

In [213]:
ql = []
pl = []
for r in qrels_dev_df.iterrows():
    qid = r[1][0]
    pid = r[1][2]
    ql.append(queries_dev_df.loc[queries_dev_df[0]==qid])
    pl.append(collection_df.loc[collection_df[0]==pid])

qDF = pd.concat(ql,ignore_index=True)
colq =["qid", "query"]
qDF.columns = colq

pDF = pd.concat(pl,ignore_index=True)
colp =["pid", "passages"]
pDF.columns = colp


In [215]:
query_sample = qDF
passage_sample = pDF

## Preprocessing

In [216]:
queries_id = np.array(query_sample.iloc[:, 0])
queries = np.array(query_sample.iloc[:, -1])
queries[0]

'. what is a corporation?'

In [217]:
passages_id = np.array(passage_sample.iloc[:, 0])
passages = np.array(passage_sample.iloc[:, -1])
passages[0]

"McDonald's Corporation is one of the most recognizable corporations in the world. A corporation is a company or group of people authorized to act as a single entity (legally a person) and recognized as such in law. Early incorporated entities were established by charter (i.e. by an ad hoc act granted by a monarch or passed by a parliament or legislature)."

In [219]:
def tokenize(corpus):
    tokenized_corpus = []

    for doc in corpus:
        # Remove specific punctuations
        for punctuation in PUNCTUATIONS:
            doc = doc.replace(punctuation, " ")

        # Get only the words, not the whitespace
        words = [word for word in doc.split(" ") if word]

        # Remove specific stopwords
        words = [word for word in words if word not in STOPWORDS]

        # Add to the list of tokenized docs
        tokenized_corpus.append(words)
        

    return tokenized_corpus

In [220]:
tokenized_queries = tokenize(queries)
tokenized_queries[0:3]

[['corporation'],
 ['rachel', 'carson', 'write', 'obligation', 'endure'],
 ['rachel', 'carson', 'write', 'obligation', 'endure']]

In [145]:
# colTokenizedDF = pd.DataFrame(columns=["pid", "passage"])
# count = 0
# with open(collectionFile, encoding="utf-8") as f:
#     # f.readline(4)
#     for line in f:
#         if count % 100000 == 0:
#             print(count)
#         count += 1
#         # print(line.split("\t"))
#         tokenized = tokenize(line[1])
#         colTokenizedDF = pd.concat([colTokenizedDF, pd.DataFrame.from_dict({"pid": line[0], "passage": tokenized})], ignore_index=True)
#         # break

In [221]:
tokenized_passages = tokenize(passages)
# tokenized_passages[0:3]

In [222]:
# Dictionary to look up id
query_lookup = {}
for idx, query in enumerate(queries):
    query_lookup[query] = queries_id[idx]

# tokenized_query_lookup = {}
# for idx, (tokenized, query) in enumerate(zip(tokenized_queries, queries)):
#     tokenized_query_lookup[" ".join(tokenized)] = queries_id[idx]

passage_lookup = {}
for idx, passage in enumerate(passages):
    passage_lookup[passage] = passages_id[idx]

## BM25 Implementation
- https://pypi.org/project/rank-bm25/
- http://www.cs.otago.ac.nz/homepages/andrew/papers/2014-2.pdf

In [274]:
bm25 = BM25Okapi(tokenize(passages))
bm25_rankings = {}


In [249]:
#  Get ground truth
def load_ground_truth(qrels_df : pd.DataFrame) -> dict[str, set[str]]:
    qrels_dict = {}
    for idx, row in qrels_df.iterrows():
        qid = row[0]
        pid = row[2]
        relevance = row[3]
        if relevance > 0:
            if qid not in qrels_dict:
                qrels_dict[qid] = set()
            qrels_dict[qid].add(pid)
    return qrels_dict

ground_truth = load_ground_truth(qrels_dev_df)

In [250]:
def get_precision_1000(query_id, bm_25_rankings, ground_truth):
    bm_25 = set(bm_25_rankings[query_id])
    truth = ground_truth[query_id]
    prec = len(bm_25.intersection(truth)) / len(bm_25)
    return prec

In [293]:
def get_average_precision_1000(query_id, bm_25_rankings, ground_truth) -> float:
    vals = []
    over = 1
    for rank_idx, rank in enumerate(bm_25_rankings[query_id]):
        under = rank_idx+1
        if rank in ground_truth[query_id]:
            vals.append(over / under)
            over += 1
    AP = sum(vals) / len(ground_truth[query_id])

    return AP

In [296]:
def get_reciprocal_rank(query_id, bm_25_rankings, ground_truth) -> float:
    AP = 0
    for rank_idx, rank in enumerate(bm_25_rankings[query_id]):
        under = rank_idx+1
        if rank in ground_truth[query_id]:
            AP = 1 / under
            break

    return AP

In [None]:
def get_mean_eval_measure(bm_25_rankings, ground_truths, eval_function) -> float:
    results = []
    for query in bm_25_rankings:
        results.append(eval_function(bm_25_rankings[query], ground_truths[query]))
    return sum(results) / len(results)

In [278]:
num_queries = 6
query_tokens = tokenized_queries[0:num_queries]
query_tokens[0]

['corporation']

In [298]:
for idx, query in enumerate(query_tokens):
    top_1000 = bm25.get_top_n(query, passages, n=5)
    query_index = query_lookup[queries[idx]]
    bm25_rankings[query_index] = [passage_lookup[passage_key] for passage_key in top_1000]

In [301]:
measures = pd.DataFrame(columns=["query", "precision", "avg_precision", "reciprocal_rank"])
m =[]
for query_id in bm25_rankings:
    score = get_precision_1000(query_id, bm25_rankings, ground_truth)
    pres = get_average_precision_1000(query_id, bm25_rankings, ground_truth)
    rec = get_reciprocal_rank(query_id, bm25_rankings, ground_truth)
    m.append((query_id, score, pres, rec))

In [336]:
print(f"Query ID\tRankings\t\t\t\t\tGround truth")
for qid in bm25_rankings:
    r = " ".join([str(v) for v in bm25_rankings[qid]])
    g = " ".join([str(v) for v in ground_truth[qid]])
    print(f"{qid}\t\t{r}\t\t{g}")


Query ID	Rankings					Ground truth
1102432		5501299 5501299 6456242 6419354 6419354		2026790
1102431		1459230 7452193 7975471 7970517 7680370		7066866 7066867
1090282		7066900 426316 7622282 7185364 7622280		7066900
39449		7066905 7773569 7501532 7999377 7904147		7066905
76162		7467195 7066915 7755622 7443562 7755288		7066915


In [312]:
print(f"Query ID\tpre\tavg_pre\trr")
for vals in m:
    print(f"{vals[0]}\t\t{round(vals[1], 3)}\t{round(vals[2], 3)}\t{round(vals[3], 3)}")

Query ID	pre	avg_pre	rr
1102432		0.0	0.0	0
1102431		0.0	0.0	0
1090282		0.2	1.0	1.0
39449		0.2	1.0	1.0
76162		0.2	0.5	0.5


In [154]:
# TODO:
    # Use train/eval to see how well the bm25 works

In [None]:
# # Best result from each query
# results = {}
# t = 0
# # Some tokenized queries are the same
# for query, tokenized in zip(queries, tokenized_queries):
#     t += 1
#     scores = bm25.get_scores(tokenized)
#     d = {}
#     for id, i in enumerate(scores):
#         if i:
#             d[str(id)] = 1
#     results[str(query_lookup[query])] = d
# print(len(results))
# print(t)

## Resultater
- Se M5-retrieval_evaluation for å se hvordan man skal sammenlikne rank og ground truth

In [None]:
def create_qrels_dict(qrels_df : pd.DataFrame) -> dict[str, dict[str, int]]:
    qrels_dict = {}
    for idx, row in qrels_df.iterrows():
        qid = str(row[0])
        pid = str(row[2])
        relevance = row[3]

        if qid not in qrels_dict:
            qrels_dict[qid] = {}
        qrels_dict[qid][pid] = relevance

    return qrels_dict

qrels_dict = create_qrels_dict(qrels_dev_df)
# qrels_dict

In [None]:
from ranx import Qrels, Run


qrels = Qrels(q)

## Create a run
run = Run(r)

# Evaluate

In [None]:
from ranx import evaluate

# Compute score for a single metric
evaluate(qrels, run, "ndcg@5")


# Compute scores for multiple metrics at once
evaluate(qrels, run, ["map@5", "mrr"])

{'map@5': 0.0, 'mrr': 0.0}

In [None]:
from ranx import compare

# Compare different runs and perform Two-sided Paired Student's t-Test
report = compare(
    qrels=qrels,
    runs=[run_1, run_2, run_3, run_4, run_5],
    metrics=["map@100", "mrr@100", "ndcg@10"],
    max_p=0.01  # P-value threshold
)