# Traditional IR Model

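TF-IDF model: ranks the CORD-19 papers for each query tweet by cosine similarity between TF-IDF vectors and reports MRR@1 and MRR@5.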

In [5]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
from nltk.tokenize import RegexpTokenizer

### Load Dataset

In [7]:
# Candidate collection: pickled object holding the CORD-19 papers' metadata.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# when it comes from a trusted source.
with open('subtask4b_collection_data.pkl', 'rb') as collection_file:
    collection_data = pickle.load(collection_file)

# Query tweets: the train and dev splits are tab-separated files.
df_dev = pd.read_csv('subtask4b_query_tweets_dev.tsv', sep='\t')
df_train = pd.read_csv('subtask4b_query_tweets_train.tsv', sep='\t')

# Check dataset: the tweets carry implicit references to CORD-19 papers, and
# the first collection row shows the metadata fields stored for each paper.
display(df_train)
print(collection_data.iloc[0])

Unnamed: 0,post_id,tweet_text,cord_uid
0,0,Oral care in rehabilitation medicine: oral vul...,htlvpvz5
1,1,this study isn't receiving sufficient attentio...,4kfl29ul
2,2,"thanks, xi jinping. a reminder that this study...",jtwb17u8
3,3,Taiwan - a population of 23 million has had ju...,0w9k8iy1
4,4,Obtaining a diagnosis of autism in lower incom...,tiqksd69
...,...,...,...
12848,14248,"""evidence on covid-19 reveals a growing body o...",9169o29b
12849,14249,Outdoor lighting has detrimental impacts on lo...,s2bpha8l
12850,14250,"26/ and influenza virus (and other pathogens, ...",atloc9th
12851,14251,does it?'sars-cov-2-naïve vaccinees had a 13.0...,t4y1ylb3


cord_uid                                                     umvrwgaw
source_x                                                          PMC
title               Professional and Home-Made Face Masks Reduce E...
doi                                      10.1371/journal.pone.0002618
pmcid                                                      PMC2440799
pubmed_id                                                    18612429
license                                                         cc-by
abstract            BACKGROUND: Governments are preparing for a po...
publish_time                                               2008-07-09
authors             van der Sande, Marianne; Teunis, Peter; Sabel,...
journal                                                      PLoS One
mag_id                                                            NaN
who_covidence_id                                                  NaN
arxiv_id                                                          NaN
label               

### TF-IDF (Term Frequency-Inverse Document Frequency)

In [8]:
# One retrieval document per paper: title and abstract joined by a single
# space, with missing values replaced by empty strings first.
titles = collection_data['title'].fillna('')
abstracts = collection_data['abstract'].fillna('')
collection_data['full_text'] = titles + ' ' + abstracts

# Parallel lists: paper_ids[i] is the cord_uid of the paper whose text is
# paper_texts[i].
paper_ids = collection_data['cord_uid'].tolist()
paper_texts = collection_data['full_text'].tolist()

# Tokenization: every maximal run of word characters becomes a token, so
# punctuation and whitespace are dropped.
tokenizer = RegexpTokenizer(r'\w+')


def nltk_tokenizer(text):
    """Lowercase ``text`` and split it into word-character tokens.

    Args:
        text (str): Raw document or query string.

    Returns:
        list[str]: Lowercased tokens in order of appearance.
    """
    return tokenizer.tokenize(text.lower())


# Vectorization
vectorizer = TfidfVectorizer(
    tokenizer=nltk_tokenizer,
    # The custom tokenizer replaces the default regex-based one; pass None so
    # scikit-learn does not warn that ``token_pattern`` is set but unused.
    token_pattern=None,
    stop_words='english',
    # Vocabulary is capped at the 50,000 most frequent terms in the corpus.
    max_features=50000
)
# Fit the vocabulary / IDF weights on the papers and vectorize them in one go.
doc_vectors = vectorizer.fit_transform(paper_texts)

# Compute TF-IDF similarity and retrieve top-5 documents
TOP_K = 5  # ranking cutoff shared by the saved predictions and the MRR@5 score

top5_predictions = []
reciprocal_ranks = []

# Vectorize all tweets in a single call instead of once per loop iteration;
# each row is the same vector a per-tweet ``transform`` would produce.
tweet_vectors = vectorizer.transform(df_train['tweet_text'])

# Walk the gold ids by position: ``iterrows`` would build a Series for every
# row only to read two fields from it.
for row_pos, true_id in enumerate(tqdm(df_train['cord_uid'], total=len(df_train))):
    # Similarity of this tweet to every paper (1 x n_papers -> flat array).
    scores = cosine_similarity(tweet_vectors[row_pos], doc_vectors)[0]

    # Positions of the highest-scoring papers, best first.
    top_idx = np.argsort(scores)[::-1][:TOP_K]
    top_ids = [paper_ids[i] for i in top_idx]
    top5_predictions.append(top_ids)

    # Reciprocal rank of the gold paper, or 0.0 when it is not in the top-k.
    if true_id in top_ids:
        rr = 1 / (top_ids.index(true_id) + 1)
    else:
        rr = 0.0
    reciprocal_ranks.append(rr)

# Attach the ranked ids to the training frame and export them per post.
df_train['preds'] = top5_predictions
df_train[['post_id', 'preds']].to_csv('predictions_tfidf.tsv', sep='\t', index=False)

100%|██████████| 12853/12853 [01:07<00:00, 189.19it/s]


### Results

In [11]:
# MRR@1: a query scores only when the gold paper is ranked first (reciprocal
# rank exactly 1.0); every other query contributes 0.
top1_hits = [1.0 if rr_value == 1.0 else 0 for rr_value in reciprocal_ranks]
mrr_result_1 = np.mean(top1_hits)

# MRR@5: plain average of the stored reciprocal ranks.
mrr_result_5 = np.mean(reciprocal_ranks)

print(f"\nMRR@5 Results: {{1: {mrr_result_1:.10f}, 5: {mrr_result_5:.10f}}}")


MRR@5 Results: {1: 0.4667392827, 5: 0.5307412018}


In [12]:
# 1-based rank of the gold paper within each tweet's predictions, or None when
# it was not retrieved. Predictions and gold ids are paired by position rather
# than through ``df_train.loc[idx]``, which only lines up while the frame still
# has its default 0..n-1 index.
match_ranks = [
    preds.index(true_id) + 1 if true_id in preds else None
    for preds, true_id in zip(top5_predictions, df_train['cord_uid'])
]

df_result = df_train.copy()
# Nullable integer dtype keeps ranks as integers with <NA> for misses instead
# of upcasting the whole column to float (1.0 / NaN).
df_result['match_rank'] = pd.array(match_ranks, dtype='Int64')
display_cols = ['post_id', 'tweet_text', 'cord_uid', 'preds', 'match_rank']
df_display = df_result[display_cols]

# Show the column-restricted view rather than the full result frame.
display(df_display)


Unnamed: 0,post_id,tweet_text,cord_uid,preds,match_rank
0,0,Oral care in rehabilitation medicine: oral vul...,htlvpvz5,"[htlvpvz5, am11yqbf, rwgqkow3, 65gedo6u, 4aps0...",1.0
1,1,this study isn't receiving sufficient attentio...,4kfl29ul,"[7xt894vr, fm8koqjd, bjvg2ivr, 7tto4hr7, z514v...",
2,2,"thanks, xi jinping. a reminder that this study...",jtwb17u8,"[vx1hjh26, jbpmbm9m, jtwb17u8, w98847ai, s0w95...",3.0
3,3,Taiwan - a population of 23 million has had ju...,0w9k8iy1,"[0w9k8iy1, l4y7v729, lsgm7y5t, veeavho5, zxe95...",1.0
4,4,Obtaining a diagnosis of autism in lower incom...,tiqksd69,"[tiqksd69, k7smwz6w, b0dzhsrh, nuzt4jcf, ljt9r...",1.0
...,...,...,...,...,...
12848,14248,"""evidence on covid-19 reveals a growing body o...",9169o29b,"[s2hp3sat, ykxr9q1j, jgq968f6, 1y1ik2u9, 9169o...",5.0
12849,14249,Outdoor lighting has detrimental impacts on lo...,s2bpha8l,"[s2bpha8l, 8a3fp7ym, nwb7qf9l, jjiiutd5, v0dkb...",1.0
12850,14250,"26/ and influenza virus (and other pathogens, ...",atloc9th,"[6danlh8h, 7y6ok9a2, xp2qkk52, g17lp8ch, olhgu...",
12851,14251,does it?'sars-cov-2-naïve vaccinees had a 13.0...,t4y1ylb3,"[t4y1ylb3, 7a543f7v, a4klrp3h, o86wki37, sjsaw...",1.0
