In [1]:
#Necessary imports
import os
import csv
import pandas as pd
import pickle
from sentence_transformers import SentenceTransformer
import re
import torch
from torch.nn.functional import cosine_similarity
from torch.nn.functional import normalize

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load train & dev queries, and paper collection
# Load paper metadata
with open("subtask4b_collection_data.pkl", "rb") as f:
    papers_df = pickle.load(f)

papers_df["text"] = papers_df["title"] + ". " + papers_df["abstract"]

# Load tweet queries
train_df = pd.read_csv("subtask4b_query_tweets_train.tsv", sep="\t")
dev_df = pd.read_csv("subtask4b_query_tweets_dev.tsv", sep="\t")

print(f"Loaded: {len(train_df)} train tweets, {len(dev_df)} dev tweets, {len(papers_df)} papers.")

Loaded: 12853 train tweets, 1400 dev tweets, 7718 papers.


  papers_df = pickle.load(f)


In [18]:
# Load SBERT model

#model = SentenceTransformer('all-MiniLM-L6-v2')
#model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
#model = SentenceTransformer('multi-qa-MiniLM-L6-dot-v1')
#model = SentenceTransformer('allenai/specter')
#model = SentenceTransformer('msmarco-distilbert-base-tas-b')
#model = SentenceTransformer("intfloat/e5-base-v2")
#model = SentenceTransformer("intfloat/e5-large-v2") (way too slow and didn't work at the end)
#“This model uses contextualized embeddings from BERT as fixed-length representations of queries and documents. 
# These representations are used for similarity-based retrieval, making this an NLP representation learning approach.”
#--> model = SentenceTransformer('multi-qa-mpnet-base-cos-v1') # This model has been used to fine-tune and below is the path to the fine-tuned model.

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("fine_tuned_gte_b16_e4")  # change as needed

In [19]:
# Define cleaning function. This one is a bit agressive cleaning so result got worsen
#def clean_text(text):
#    text = text.lower()
#    text = re.sub(r"http\\S+|www\\S+", "", text)
#    text = re.sub(r"[@#]\\w+", "", text)
#    text = re.sub(r"[^a-z0-9\\s]", "", text)
#    return text.strip()

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)         # remove URLs
    text = re.sub(r"[@#]\w+", "", text)                # remove @mentions and #hashtags
    text = re.sub(r"[^\w\s\-\/]", "", text)            # keep alphanum + dash/slash
    return text.strip()


In [20]:
# Apply cleaning
#train_df["clean_tweet_text"] = train_df["tweet_text"].apply(clean_text)
#dev_df["clean_tweet_text"] = dev_df["tweet_text"].apply(clean_text)

# Clean tweets
dev_df["clean_tweet_text"] = dev_df["tweet_text"].apply(clean_text)
train_df["clean_tweet_text"] = train_df["tweet_text"].apply(clean_text)

# Clean papers
papers_df["text"] = papers_df["title"] + ". " + papers_df["abstract"]
papers_df["clean_text"] = papers_df["text"].apply(clean_text)


In [21]:
# Encode train queries
train_query_embeddings = model.encode(
    train_df["clean_tweet_text"].tolist(),
    convert_to_tensor=True,
    show_progress_bar=True
)

# Encode dev queries
dev_query_embeddings = model.encode(
    dev_df["clean_tweet_text"].tolist(),
    convert_to_tensor=True,
    show_progress_bar=True
)

# Encode papers (passages)
paper_embeddings = model.encode(
    papers_df["clean_text"].tolist(),
    convert_to_tensor=True,
    show_progress_bar=True
)


Batches: 100%|████████████████████████████████| 402/402 [04:32<00:00,  1.48it/s]
Batches: 100%|██████████████████████████████████| 44/44 [00:28<00:00,  1.54it/s]
Batches: 100%|████████████████████████████████| 242/242 [23:25<00:00,  5.81s/it]


In [None]:
# Encode document collection
paper_embeddings = model.encode(papers_df["text"].tolist(), show_progress_bar=True, convert_to_tensor=True)

In [22]:
# Save paper embeddings to avoid recomputing in future runs
#torch.save(paper_embeddings, "paper_embeddings.pt")

# To load later:
paper_embeddings = torch.load("paper_embeddings.pt")

In [23]:
def get_topk_predictions_batched(query_embeddings, paper_embeddings, papers_df, top_k=5, batch_size=16):
    paper_norm = normalize(paper_embeddings, p=2, dim=1)
    paper_ids = papers_df["cord_uid"].tolist()  # store once for speed
    predictions = []

    for start_idx in range(0, len(query_embeddings), batch_size):
        end_idx = min(start_idx + batch_size, len(query_embeddings))
        query_batch = query_embeddings[start_idx:end_idx]
        query_norm = normalize(query_batch, p=2, dim=1)

        # 🔧 Fix device mismatch
        paper_norm_device = paper_norm.to(query_norm.device)

        similarity_matrix = torch.matmul(query_norm, paper_norm_device.T)
        top_k_indices = similarity_matrix.topk(k=top_k, dim=1).indices  # shape: [batch_size, top_k]

        for indices in top_k_indices:
            preds = [paper_ids[i] for i in indices.tolist()]
            predictions.append(preds)

    return predictions


In [24]:
# Run predictions default
#train_df["preds"] = get_topk_predictions_from_embeddings(train_query_embeddings)
#dev_df["preds"] = get_topk_predictions_from_embeddings(dev_query_embeddings)

train_df["preds"] = get_topk_predictions_batched(train_query_embeddings, paper_embeddings, papers_df, top_k=5)
dev_df["preds"] = get_topk_predictions_batched(dev_query_embeddings, paper_embeddings, papers_df, top_k=5)

Evaluation

In [25]:
# Evaluate MRR
def get_performance_mrr(data, col_gold, col_pred, list_k=[1, 5, 10]):
    d_performance = {}
    for k in list_k:
        scores = []
        for _, row in data.iterrows():
            gold = row[col_gold]
            preds = row[col_pred]
            if isinstance(preds, str):
                try:
                    preds = eval(preds)
                except:
                    preds = []
            if gold in preds[:k]:
                rank = preds[:k].index(gold) + 1
                scores.append(1.0 / rank)
            else:
                scores.append(0.0)
        d_performance[k] = sum(scores) / len(scores) if scores else 0.0
    return d_performance

In [26]:
# Print results default
print("Train MRR:", get_performance_mrr(train_df, "cord_uid", "preds"))
print("Dev MRR:", get_performance_mrr(dev_df, "cord_uid", "preds"))

Train MRR: {1: 7.780284758422158e-05, 5: 0.00024248554163749058, 10: 0.00024248554163749058}
Dev MRR: {1: 0.0, 5: 0.0, 10: 0.0}


In [None]:
# Save predictions to TSV file
with open("predictions.tsv", "w", newline='') as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["post_id", "preds"]) 

    for _, row in dev_df.iterrows():
        post_id = row["post_id"]
        preds = str(row["preds"])
        writer.writerow([post_id, preds])