In [10]:
import time
import string, json, csv
import numpy as np
import pandas as pd
import nltk
from rank_bm25 import BM25Okapi
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

NLTK setup

In [11]:
# Fetch the NLTK resources used by the tokenizer, stop-word filter and
# lemmatizer below. Downloads are skipped when the data is already up to date.
nltk.download('punkt',   quiet=True)
# NLTK >= 3.8.2 loads the Punkt sentence model from 'punkt_tab' instead of the
# pickled 'punkt' package; without it nltk.word_tokenize raises LookupError on
# a fresh environment. Both are requested so old and new NLTK versions work.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords',quiet=True)
# Kept as the final expression so the cell still displays the download status.
nltk.download('wordnet', quiet=True)

True

Loading datasets

In [12]:
# Input locations: the collection pickle and the train/dev query TSVs.
COLLECTION_PATH     = '../subtask4b_collection_data.pkl'
TRAIN_QUERY_PATH    = '../subtask4b_query_tweets_train.tsv'
DEV_QUERY_PATH      = '../subtask4b_query_tweets_dev.tsv'
# Output locations for prediction files.
# NOTE(review): the train path is relative to the working directory while the
# dev path points at the parent directory -- confirm the asymmetry is intended.
OUT_TRAIN_PRED_PATH = 'predictions_train.tsv'
OUT_DEV_PRED_PATH   = '../predictions_dev.tsv'

# SECURITY: unpickling can execute arbitrary code; only load a collection file
# obtained from a trusted source.
df_col   = pd.read_pickle(COLLECTION_PATH)
# Query files are tab-separated; post_id is read as a string rather than being
# type-inferred (presumably to keep the identifiers verbatim -- TODO confirm).
df_train = pd.read_csv(TRAIN_QUERY_PATH, sep='\t', dtype={'post_id':str})
df_dev   = pd.read_csv(DEV_QUERY_PATH,   sep='\t', dtype={'post_id':str})

Preprocessing + unigram & bigram tokenizer

In [13]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Translation table mapping every ASCII punctuation character to a space.
# Built once here instead of on every call: the tokenizer runs for each title,
# abstract and query, and the table never changes.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

def tokenize_and_ngrams(text: str):
    """Turn raw text into a bag of lemmatized unigrams plus adjacent bigrams.

    Steps: lowercase, replace punctuation with spaces, word-tokenize with
    NLTK, keep purely alphabetic tokens that are not English stop words,
    lemmatize each kept token, then append one ``"left_right"`` bigram for
    every pair of neighbouring kept tokens.

    Parameters
    ----------
    text : str
        Text to tokenize. Any non-string value (``None``, ``NaN``,
        ``pd.NA``, ...) is treated as empty text, so missing values in a
        DataFrame column yield an empty token list instead of raising.

    Returns
    -------
    list[str]
        All unigrams (in order) followed by all bigrams (in order).
    """
    if not isinstance(text, str):
        # `text or ''` would let NaN through (NaN is truthy) and then crash
        # on `.lower()`; treat every missing / non-text value as empty.
        text = ''
    cleaned = text.lower().translate(_PUNCT_TO_SPACE)
    tokens = [
        lemmatizer.lemmatize(tok)
        for tok in nltk.word_tokenize(cleaned)
        # The stop-word check runs on the surface form, before lemmatization.
        if tok.isalpha() and tok not in stop_words
    ]
    # Bigrams are formed over the filtered tokens, so words separated only by
    # removed stop words or punctuation become neighbours.
    bigrams = [f"{left}_{right}" for left, right in zip(tokens, tokens[1:])]
    return tokens + bigrams

Build the BM25 corpus: title×2 + abstract

In [14]:
# Pull the three parallel columns out of the collection; missing titles and
# abstracts become empty strings so the tokenizer always receives text.
uids      = df_col['cord_uid'].tolist()
titles    = df_col['title'].fillna('').tolist()
abstracts = df_col['abstract'].fillna('').tolist()

# One token list per document, aligned with `uids`. The title tokens are
# repeated twice so title matches contribute more term frequency than
# abstract matches (title-boost ×2).
bm25_corpus = [
    tokenize_and_ngrams(title) * 2 + tokenize_and_ngrams(abstract)
    for title, abstract in zip(titles, abstracts)
]

Initialize BM25 with tuned params

In [15]:
# Okapi BM25 index over the tokenized corpus (one token list per document).
# k1 controls term-frequency saturation and b the strength of document-length
# normalisation; these values replace the library defaults
# (presumably k1=1.5, b=0.75 in rank_bm25 -- verify against the installed version).
bm25 = BM25Okapi(bm25_corpus, k1=1.0, b=0.9)

Retrieval function (top-5)

In [16]:
def retrieve_top5(text: str, k: int = 5):
    """Return the ids of the best-scoring documents for a query text.

    The query is tokenized with the same tokenizer as the corpus, scored
    against every document by the module-level ``bm25`` index, and the
    highest-scoring documents are returned best-first.

    Parameters
    ----------
    text : str
        Raw query text (e.g. a tweet).
    k : int, default 5
        Number of ids to return. The default keeps the original top-5
        behaviour; pass a larger value when evaluating deeper cutoffs such
        as MRR@10, which otherwise cannot differ from MRR@5.

    Returns
    -------
    list
        Up to ``k`` entries of ``uids``, ordered by descending BM25 score.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")
    q_toks = tokenize_and_ngrams(text)
    scores = bm25.get_scores(q_toks)
    # argsort is ascending, so reverse it before keeping the first k indices.
    best = np.argsort(scores)[::-1][:k]
    return [uids[i] for i in best]

Run + evaluate function

In [17]:
def compute_mrrs(df, ks=(1,5,10)):
    """Compute Mean Reciprocal Rank at several cutoffs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'cord_uid'`` column (the gold document id per row)
        and a ``'preds'`` column holding the ranked predictions for that row,
        either as a JSON-encoded list (string) or as an already-decoded
        sequence.
    ks : iterable of int, default (1, 5, 10)
        Positive cutoffs. For each ``k`` a row contributes ``1 / rank`` when
        the gold id appears within the first ``k`` predictions, else 0.

    Returns
    -------
    dict[str, float]
        ``{"MRR@k": value}`` for every ``k`` in ``ks``. An empty frame yields
        0.0 for every cutoff instead of raising ``ZeroDivisionError``.
    """
    golds = df['cord_uid'].tolist()
    # Decode each prediction list once; tolerate frames whose predictions
    # were never JSON-encoded.
    ranked_lists = [
        json.loads(p) if isinstance(p, str) else list(p)
        for p in df['preds']
    ]
    n = len(golds)

    # 1-based rank of the gold id in each list (None when absent). Computed
    # once and reused for every cutoff rather than re-scanning per k.
    gold_ranks = [
        preds.index(gold) + 1 if gold in preds else None
        for gold, preds in zip(golds, ranked_lists)
    ]

    mrrs = {}
    for k in ks:
        total = sum(1.0 / rank for rank in gold_ranks
                    if rank is not None and rank <= k)
        mrrs[f"MRR@{k}"] = total / n if n else 0.0
    return mrrs

def run_and_time(df, name):
    """Retrieve predictions for every query in ``df``, time it, and report MRR.

    Side effect: a ``'preds'`` column is written onto ``df`` in place, holding
    the JSON-encoded list of retrieved ids for each row's ``'tweet_text'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Query frame with ``'tweet_text'`` and ``'cord_uid'`` columns.
    name : str
        Label used as the ``[name]`` prefix of every printed line.

    Returns
    -------
    tuple[float, dict]
        Wall-clock seconds spent on retrieval and the MRR dictionary.
    """
    def _encode_hits(tweet):
        # Stored as a JSON string; compute_mrrs decodes it again.
        return json.dumps(retrieve_top5(tweet))

    t0 = time.perf_counter()
    df['preds'] = df['tweet_text'].map(_encode_hits)
    elapsed = time.perf_counter() - t0

    # NOTE: retrieve_top5(q) yields five ids per query here, so the MRR@10
    # figure cannot differ from MRR@5.
    mrrs = compute_mrrs(df, ks=(1, 5, 10))

    report = [f"[{name}] Retrieval time: {elapsed:.2f}s"]
    report.extend(f"[{name}] {metric} = {value:.3f}" for metric, value in mrrs.items())
    print("\n".join(report))
    print()

    return elapsed, mrrs

Execute for train & dev

In [18]:
# Run retrieval + MRR evaluation on both query sets. Each call also writes a
# 'preds' column onto the frame it is given. This is the slow cell: retrieval
# scores every query against the whole collection (the recorded run below took
# roughly 21 minutes for train and 2 minutes for dev).
train_time, train_mrrs = run_and_time(df_train, 'TRAIN')
dev_time,   dev_mrrs   = run_and_time(df_dev,   'DEV')

[TRAIN] Retrieval time: 1254.35s
[TRAIN] MRR@1 = 0.583
[TRAIN] MRR@5 = 0.626
[TRAIN] MRR@10 = 0.626

[DEV] Retrieval time: 133.08s
[DEV] MRR@1 = 0.593
[DEV] MRR@5 = 0.638
[DEV] MRR@10 = 0.638

