In [36]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from rank_bm25 import BM25Okapi
from collections import Counter

In [37]:
# Load the query tweets; read_table uses a tab separator by default.
test_df = pd.read_table('subtask4b_query_tweets_test.tsv')

In [38]:
# Load the paper collection that the retrieval cells below index and look up.
# NOTE(review): read_pickle unpickles the file, and unpickling can execute
# arbitrary code -- only load this file if it comes from a trusted source.
df_collection = pd.read_pickle('subtask4b_collection_data.pkl')

In [39]:
# Setup preprocessing
# Fetch the NLTK resources used below (download order kept as before).
for _nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# Shared by preprocess(): English stop-word set and a WordNet lemmatizer.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def preprocess(text):
    """Lowercase and tokenize ``text``, drop stop words, lemmatize the rest.

    Parameters
    ----------
    text : object
        Raw text. Non-string values are converted with ``str``. A missing
        value (``None`` or a float NaN) produces no tokens.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order, with every token found in
        the module-level ``stop_words`` set removed.
    """
    # str(None) / str(nan) would otherwise be tokenized as the words
    # "none" / "nan" and end up in the index as if they were real content.
    if text is None or (isinstance(text, float) and text != text):
        return []
    tokens = re.findall(r'\b\w+\b', str(text).lower())
    return [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]

def merge_fields(title, abstract, title_weight=3.0, abstract_weight=1.0):
    """Build one weighted bag of tokens from a title and an abstract.

    Both fields are run through ``preprocess``. Each distinct token is then
    repeated ``round(title_weight * title_count + abstract_weight *
    abstract_count)`` times in the result.

    Parameters
    ----------
    title, abstract : object
        Raw field values, passed unchanged to ``preprocess``.
    title_weight, abstract_weight : float
        Multipliers applied to a token's count in the respective field.

    Returns
    -------
    list of str
        Repeated tokens in a deterministic order: tokens in order of first
        appearance in the title, followed by tokens that occur only in the
        abstract.
    """
    title_counter = Counter(preprocess(title))
    abstract_counter = Counter(preprocess(abstract))

    # dict.fromkeys de-duplicates while keeping first-seen order. Iterating a
    # set of strings here would make the output order vary between
    # interpreter runs because of string hash randomization.
    vocabulary = dict.fromkeys([*title_counter, *abstract_counter])

    weighted_tokens = []
    for token in vocabulary:
        # A Counter returns 0 for a token the field does not contain.
        weight = title_weight * title_counter[token] + abstract_weight * abstract_counter[token]
        # Built-in round() rounds halves to the nearest even integer; a
        # non-positive result contributes no copies of the token.
        weighted_tokens.extend([token] * round(weight))
    return weighted_tokens

# Prepare corpus
# One weighted token list per row of df_collection, in row order, so that a
# position in the BM25 score array can be mapped back to a row with .iloc.
tokenized_corpus = [
    merge_fields(title, abstract)
    for title, abstract in zip(df_collection['title'], df_collection['abstract'])
]
bm25 = BM25Okapi(tokenized_corpus, k1=2.0, b=0.6)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mataonbas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mataonbas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/mataonbas/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [40]:
def get_top_cord_uids(query, top_k=5):
    """Return the ``cord_uid`` values of the best-scoring documents for ``query``.

    Parameters
    ----------
    query : object
        Raw query text; it is tokenized with ``preprocess``.
    top_k : int, default 5
        Maximum number of results to return. A value of zero or less yields
        an empty list.

    Returns
    -------
    list
        ``cord_uid`` values from ``df_collection`` ordered from highest to
        lowest BM25 score. Documents with equal scores keep their order in
        the collection.
    """
    if top_k <= 0:
        return []
    tokenized_query = preprocess(query)
    doc_scores = bm25.get_scores(tokenized_query)
    # Sorting the negated scores puts the best match first; a stable sort
    # makes the order of tied documents reproducible (collection order).
    ranked = np.argsort(-doc_scores, kind='stable')[:top_k]
    return df_collection.iloc[ranked]['cord_uid'].tolist()

In [41]:
# Rank the collection for every tweet in the test set and keep the resulting
# cord_uid list in a new 'preds' column.
test_df['preds'] = test_df['tweet_text'].apply(get_top_cord_uids)

In [42]:
# Write only the post_id and preds columns as a tab-separated file, without
# the DataFrame index.
test_df.to_csv(
    'predictions_fromtestset.tsv',
    sep='\t',
    index=False,
    columns=['post_id', 'preds'],
)

print("Predictions saved to predictions_fromtestset.tsv")

Predictions saved to predictions_fromtestset.tsv
