In [None]:
#Necessary imports
import ast
import csv
import os
import pickle
import re

import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from torch.nn.functional import cosine_similarity
from torch.nn.functional import normalize

In [None]:
# Load train & dev queries, and paper collection

# Load paper metadata.
# SECURITY NOTE(review): pickle.load can run arbitrary code embedded in the
# file, so only open a collection pickle that comes from a trusted source.
with open("subtask4b_collection_data.pkl", "rb") as f:
    papers_df = pickle.load(f)

# One combined text field per paper. Missing titles/abstracts are replaced by
# "" first: string + NaN evaluates to NaN, which would blank the whole field
# for any paper that lacks either part.
papers_df["text"] = papers_df["title"].fillna("") + ". " + papers_df["abstract"].fillna("")

# Load tweet queries (tab-separated)
train_df = pd.read_csv("subtask4b_query_tweets_train.tsv", sep="\t")
dev_df = pd.read_csv("subtask4b_query_tweets_dev.tsv", sep="\t")

print(f"Loaded: {len(train_df)} train tweets, {len(dev_df)} dev tweets, {len(papers_df)} papers.")

In [None]:
# Load SBERT model

# Other checkpoints, left disabled:
#model = SentenceTransformer('all-MiniLM-L6-v2')
#model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
#model = SentenceTransformer('multi-qa-MiniLM-L6-dot-v1')
#model = SentenceTransformer('allenai/specter')
#model = SentenceTransformer('msmarco-distilbert-base-tas-b')

# "This model uses contextualized embeddings from BERT as fixed-length representations of queries and documents.
# These representations are used for similarity-based retrieval, making this an NLP representation learning approach."
# Per the original note here, 'multi-qa-mpnet-base-cos-v1' is the checkpoint that was used as the basis for fine-tuning.

# NOTE(review): the original comment said "Load the fine-tuned model", but the
# identifier below is the base checkpoint name rather than a path to a
# fine-tuned model directory -- confirm which one is intended.
model = SentenceTransformer("multi-qa-mpnet-base-cos-v1")

In [None]:
import re

def clean_text(text):
    """Strip URLs, @mentions and #hashtags from a tweet and tidy its spacing.

    The three kinds of token are deleted outright (the hashtag word goes with
    its '#'); afterwards every run of whitespace is collapsed to one space and
    the ends are trimmed.
    """
    # Deletion order: URLs, then mentions, then hashtags.
    for pattern in (r"http\S+", r"@\w+", r"#\w+"):
        text = re.sub(pattern, "", text)

    # The deletions leave gaps behind; squeeze them and trim the ends.
    return re.sub(r"\s+", " ", text).strip()

# Store a cleaned copy of every dev tweet next to the raw text.
dev_df["clean_tweet_text"] = dev_df["tweet_text"].map(clean_text)

In [None]:
# Load the precomputed paper embeddings.
# map_location="cpu" lets a file that was saved from a GPU session load on a
# machine without CUDA; the tensor is used on the CPU further down anyway.
# NOTE(review): torch.load unpickles the file -- only load trusted files.
# NOTE(review): presumably row i of this tensor belongs to row i of papers_df
# (retrieved indices are used as papers_df row positions) -- confirm.
paper_embeddings = torch.load("paper_embeddings.pt", map_location="cpu")


In [None]:
# Embed the cleaned dev tweets with the loaded model; the result is returned
# as a tensor rather than a NumPy array.
dev_query_embeddings = model.encode(
    dev_df["clean_tweet_text"].to_list(),
    convert_to_tensor=True,
    show_progress_bar=True,
)

In [None]:
from torch.nn.functional import cosine_similarity

# Ensure tensors are on the same device
dev_query_embeddings = dev_query_embeddings.to("cpu")
paper_embeddings = paper_embeddings.to("cpu")

# Cosine similarity of every query against every paper: shape = [#queries, #papers].
# Rows are L2-normalised and multiplied as matrices. Broadcasting [Q, 1, D]
# against [1, P, D] gives the same scores but has to form a Q x P x D
# element-wise product first, which does not fit in memory for a realistic
# number of queries and papers.
similarity_scores = torch.matmul(
    normalize(dev_query_embeddings, p=2, dim=1),   # shape: [Q, D]
    normalize(paper_embeddings, p=2, dim=1).T      # shape: [D, P]
)

# Get top 5 similar papers for each dev query (fewer if the collection holds
# fewer than 5 papers, since topk rejects k larger than the dimension)
top_k = torch.topk(similarity_scores, k=min(5, similarity_scores.shape[1]), dim=1)
top_k_indices = top_k.indices  # indices of top papers

In [None]:
for i in range(3):  # preview the retrieval result for the first 3 dev tweets
    tweet_id = dev_df.loc[i, "post_id"]
    tweet_text = dev_df.loc[i, "tweet_text"]

    # Translate the retrieved row positions into cord_uid values.
    top_paper_ids = []
    for pid in top_k_indices[i]:
        top_paper_ids.append(papers_df.iloc[pid.item()]["cord_uid"])

    print(f"\nTweet {i+1} — ID: {tweet_id}")
    print("Tweet:", tweet_text)
    print("Top 5 paper IDs:", top_paper_ids)


In [None]:
def mean_reciprocal_rank_at_k(topk_indices, ground_truth_ids, paper_df, k=5):
    """Mean reciprocal rank of the gold paper within each query's top-k list.

    Args:
        topk_indices: one sequence of retrieved paper row positions per query,
            best match first (e.g. the 2-D ``indices`` tensor of ``torch.topk``).
        ground_truth_ids: gold ``cord_uid`` for each query, in the same order
            as ``topk_indices``.
        paper_df: DataFrame whose ``cord_uid`` column is looked up by row
            position using the retrieved indices.
        k: only the first ``k`` retrieved papers of each query are considered.

    Returns:
        The average over queries of ``1 / rank`` of the gold paper (1-based),
        counting 0.0 for a query whose gold paper is not in its top-k.
        Returns 0.0 when there are no queries.
    """
    # Pull the id column out once; a DataFrame row lookup per retrieved index
    # is far slower than indexing a plain list.
    cord_uids = paper_df["cord_uid"].tolist()

    reciprocal_ranks = []
    for i, topk in enumerate(topk_indices):
        true_pid = ground_truth_ids[i]
        topk_cord_uids = [cord_uids[int(pid)] for pid in topk[:k]]

        if true_pid in topk_cord_uids:
            # ranks are 1-based
            reciprocal_ranks.append(1.0 / (topk_cord_uids.index(true_pid) + 1))
        else:
            # true document not in top-k
            reciprocal_ranks.append(0.0)

    # Guard the empty case instead of dividing by zero.
    if not reciprocal_ranks:
        return 0.0
    return sum(reciprocal_ranks) / len(reciprocal_ranks)

# Score the dev-set retrieval: MRR over the top 5 retrieved papers per tweet
mrr5 = mean_reciprocal_rank_at_k(
    topk_indices=top_k_indices,
    ground_truth_ids=dev_df["cord_uid"].tolist(),
    paper_df=papers_df,
    k=5,
)
print(f"📊 MRR@5 on dev set: {mrr5:.4f}")

In [None]:
#default
#def get_topk_predictions_from_embeddings(query_embeddings, top_k=5):
#    results = []
#    for query_emb in query_embeddings:
#        cos_scores = cosine_similarity(query_emb.unsqueeze(0), paper_embeddings).squeeze()
#        top_indices = torch.topk(cos_scores, k=min(top_k, len(paper_embeddings))).indices.tolist()
#        top_cord_uids = papers_df.iloc[top_indices]["cord_uid"].tolist()
#        results.append(top_cord_uids)
#    return results

# Batched version
def get_topk_predictions_batched(query_embeddings, paper_embeddings, papers_df, top_k=5, batch_size=16):
    """Retrieve the most cosine-similar papers for every query, batch by batch.

    Args:
        query_embeddings: 2-D tensor with one query embedding per row.
        paper_embeddings: 2-D tensor with one paper embedding per row; row
            positions are used to index ``papers_df["cord_uid"]``.
        papers_df: DataFrame providing the ``cord_uid`` column.
        top_k: number of papers to return per query. Capped at the number of
            paper embeddings, because ``topk`` raises when asked for more
            entries than exist.
        batch_size: number of queries scored per matrix product.

    Returns:
        A list with one entry per query; each entry is a list of ``cord_uid``
        values ordered from most to least similar.
    """
    paper_norm = normalize(paper_embeddings, p=2, dim=1)
    paper_ids = papers_df["cord_uid"].tolist()  # store once for speed
    k = min(top_k, paper_norm.shape[0])
    predictions = []

    for start_idx in range(0, len(query_embeddings), batch_size):
        # Slicing past the end is safe, so no explicit end clamp is needed.
        query_batch = query_embeddings[start_idx:start_idx + batch_size]
        query_norm = normalize(query_batch, p=2, dim=1)

        # Dot products of unit vectors are cosine similarities: [batch, #papers]
        similarity_matrix = torch.matmul(query_norm, paper_norm.T)
        batch_top = similarity_matrix.topk(k=k, dim=1).indices  # shape: [batch, k]

        predictions.extend(
            [paper_ids[i] for i in row] for row in batch_top.tolist()
        )

    return predictions


In [None]:
# Run predictions (batched)

# No earlier cell produces embeddings for the train tweets, so build them here
# the same way the dev embeddings are built: clean the text, encode it, and
# put the result on the device that holds the paper embeddings.
# NOTE(review): assumes the train TSV has a "tweet_text" column like the dev TSV.
train_df["clean_tweet_text"] = train_df["tweet_text"].apply(clean_text)
train_query_embeddings = model.encode(
    train_df["clean_tweet_text"].tolist(),
    show_progress_bar=True,
    convert_to_tensor=True
).to(paper_embeddings.device)

train_df["preds"] = get_topk_predictions_batched(train_query_embeddings, paper_embeddings, papers_df, top_k=5)
dev_df["preds"] = get_topk_predictions_batched(dev_query_embeddings, paper_embeddings, papers_df, top_k=5)

Evaluation

In [None]:
# Evaluate MRR
def get_performance_mrr(data, col_gold, col_pred, list_k=[1, 5, 10]):
    """Compute MRR@k for several cut-offs over a DataFrame of predictions.

    Args:
        data: DataFrame with one row per query.
        col_gold: name of the column holding the gold id of each query.
        col_pred: name of the column holding the ranked predictions, either as
            a list/tuple or as the string form of one (as read back from a
            TSV/CSV file).
        list_k: cut-offs to evaluate. The default list is only iterated,
            never modified.

    Returns:
        Dict mapping each k to the mean over rows of ``1 / rank`` of the gold
        id within the first k predictions (0.0 for a miss); 0.0 for every k
        when ``data`` has no rows.
    """
    # Parse every row once up front rather than once per cut-off.
    parsed_rows = []
    for gold, preds in zip(data[col_gold], data[col_pred]):
        if isinstance(preds, str):
            # literal_eval accepts only Python literals, so a string read from
            # a file cannot execute code the way eval() would.
            try:
                preds = ast.literal_eval(preds)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                preds = []
        # Anything that is not a ranked list counts as "no predictions".
        if not isinstance(preds, (list, tuple)):
            preds = []
        parsed_rows.append((gold, list(preds)))

    d_performance = {}
    for k in list_k:
        scores = []
        for gold, preds in parsed_rows:
            top = preds[:k]
            scores.append(1.0 / (top.index(gold) + 1) if gold in top else 0.0)
        d_performance[k] = sum(scores) / len(scores) if scores else 0.0
    return d_performance

In [None]:
# Report MRR at the default cut-offs for both splits
for split_name, split_df in (("Train", train_df), ("Dev", dev_df)):
    print(f"{split_name} MRR:", get_performance_mrr(split_df, "cord_uid", "preds"))

In [None]:
# Write the dev-set predictions to a two-column TSV: the post id and the
# string form of its ranked prediction list
with open("predictions.tsv", "w", newline='') as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["post_id", "preds"])
    writer.writerows(
        [post_id, str(preds)]
        for post_id, preds in zip(dev_df["post_id"], dev_df["preds"])
    )