# Import Libraries

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import torch
import random
from keybert import KeyBERT

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Loading and Preparation

In [5]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Research Paper Recommender System/arXiv_scientific dataset.csv')

In [6]:
df.head()

Unnamed: 0,id,title,category,category_code,published_date,updated_date,authors,first_author,summary,summary_word_count
0,cs-9308101v1,Dynamic Backtracking,Artificial Intelligence,cs.AI,8/1/93,8/1/93,['M. L. Ginsberg'],'M. L. Ginsberg',Because of their occasional need to return to ...,79
1,cs-9308102v1,A Market-Oriented Programming Environment and ...,Artificial Intelligence,cs.AI,8/1/93,8/1/93,['M. P. Wellman'],'M. P. Wellman',Market price systems constitute a well-underst...,119
2,cs-9309101v1,An Empirical Analysis of Search in GSAT,Artificial Intelligence,cs.AI,9/1/93,9/1/93,"['I. P. Gent', 'T. Walsh']",'I. P. Gent',We describe an extensive study of search in GS...,167
3,cs-9311101v1,The Difficulties of Learning Logic Programs wi...,Artificial Intelligence,cs.AI,11/1/93,11/1/93,"['F. Bergadano', 'D. Gunetti', 'U. Trinchero']",'F. Bergadano',As real logic programmers normally use cut (!)...,174
4,cs-9311102v1,Software Agents: Completing Patterns and Const...,Artificial Intelligence,cs.AI,11/1/93,11/1/93,"['J. C. Schlimmer', 'L. A. Hermens']",'J. C. Schlimmer',To support the goal of allowing users to recor...,187


In [7]:
df.shape

(136238, 10)

In [8]:
# Combine title and category and summary into a single text field for each paper
df["text"] = df["title"] + " " + df["category"] + " " + df["summary"]

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Model Initialization and Embeddings Generation

In [13]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')
embedder.to(device)
# Compute embeddings for all documents
text = df["text"].tolist()
embeddings = embedder.encode(text, show_progress_bar=True, convert_to_tensor=True, device = device)
kw_model = KeyBERT()

Batches:   0%|          | 0/4258 [00:00<?, ?it/s]

In [14]:
embeddings.shape

torch.Size([136238, 384])

# CORE FUNCTIONS (RECOMMENDER & KEYWORD EXTRACTION)

In [20]:
def get_top_n_recommendations(query_text, n=5, df_corpus=df, embeddings_corpus=embeddings, embedder=embedder, device=device):
    """
    Given a query text, returns the top n most similar research papers from the corpus.
    Uses SentenceTransformer embeddings and Cosine Similarity.
    """
    # 1. Encode the query text
    query_embedding = embedder.encode(query_text, convert_to_tensor=True, device=device)

    # 2. Compute cosine similarity scores
    cos_scores = util.pytorch_cos_sim(query_embedding, embeddings_corpus)[0]

    # Get the top n+1 scores to exclude the query itself (if it exists in the corpus)
    topk = torch.topk(cos_scores, k=n + 1, sorted=True)
    top_indices_with_self = topk.indices.cpu().tolist()

    # Find the index of the query paper in the corpus to filter it out
    query_paper_indices = df_corpus.index[df_corpus['text'] == query_text].tolist()

    # Filter out the query paper itself
    final_indices = []
    for idx in top_indices_with_self:
        # Check if the row index corresponds to the query text index
        if df_corpus.index[idx] not in query_paper_indices:
            final_indices.append(idx)
        if len(final_indices) == n:
            break

    return df_corpus.iloc[final_indices].copy(), final_indices

In [21]:
def extract_keywords(text, top_n=5, model=kw_model):
    """
    Extracts top keywords from a given text using KeyBERT.
    Returns a set of keywords.
    """
    keywords = model.extract_keywords(text, stop_words='english', top_n=top_n)
    # Each element in keywords is a tuple: (keyword, score). We extract only the keyword.
    return {kw for kw, score in keywords}

In [22]:
def calculate_map_at_k(df_corpus, num_queries=10, k=5):
    """
    Calculates Mean Average Precision at k (MAP@k) for a set of queries.
    Relevance proxy: Same 'category' as the query paper.
    """
    print(f"\n--- Starting MAP@{k} Validation with {num_queries} Queries ---")

    # Select random query indices for the test set
    # Using indices 5 onwards to select from the diverse set of simulated papers
    available_indices = df_corpus.index[5:].tolist()
    if len(available_indices) < num_queries:
        # Fallback for very small corpus - selecting randomly with replacement
        query_indices = random.choices(df_corpus.index.tolist(), k=num_queries)
    else:
        query_indices = random.sample(available_indices, num_queries)

    precision_scores = []
    results_list = []

    for idx in query_indices:
        query_paper = df_corpus.iloc[idx]
        query_text = query_paper['text']
        query_category = query_paper['category']

        # 1. Get Recommendations
        recommended_papers, _ = get_top_n_recommendations(query_text, n=k)

        # 2. Determine Relevance (Proxy: Same Category)
        num_relevant = (recommended_papers['category'] == query_category).sum()

        # 3. Calculate P@k
        precision_at_k = num_relevant / k
        precision_scores.append(precision_at_k)

        results_list.append({
            'Query Paper Title': query_paper['title'],
            'Query Category': query_category,
            f'Recommendations Retrieved (k={k})': k,
            'Relevant Recommendations (Same Category)': num_relevant,
            f'Precision@{k}': precision_at_k
        })

    # 4. Calculate MAP@k (Mean Average Precision)
    mean_average_precision_at_k = np.mean(precision_scores)

    results_df = pd.DataFrame(results_list)

    return results_df, mean_average_precision_at_k

# VALIDATION AND EVALUATION (MAP@K)

In [27]:
NUM_QUERIES = 10
K_VALUE = 5

# --- Execute MAP@5 Validation ---
results_df_map, map_at_k = calculate_map_at_k(df, num_queries=NUM_QUERIES, k=K_VALUE)

print("\nValidation Results (MAP@5 for a test set of 10 queries):")
print(results_df_map)
print("\n--- Evaluation Metric Summary ---")
print(f"Mean Average Precision @ {K_VALUE} (MAP@{K_VALUE}): {map_at_k:.4f}")


--- Starting MAP@5 Validation with 10 Queries ---

Validation Results (MAP@5 for a test set of 10 queries):
                                   Query Paper Title  \
0  Limitations of the NTK for Understanding Gener...   
1           Learning Features that Predict Cue Usage   
2  Transductive Zero-Shot Learning with Adaptive ...   
3                        Political Speech Generation   
4  Adversarial Regression. Generative Adversarial...   
5                    Sparse Probability of Agreement   
6  Fine-Tuning Pre-Trained Language Models Effect...   
7  Learning Named Entity Tagger using Domain-Spec...   
8                 Multi-view Hierarchical Clustering   
9  FiNER: Financial Numeric Entity Recognition fo...   

                                      Query Category  \
0                                   Machine Learning   
1         Computation and Language (Legacy category)   
2            Computer Vision and Pattern Recognition   
3  Computation and Language (Natural Language Pro.

In [26]:
# --- Execute Keyword Extraction for the 10 papers used in the MAP@5 validation ---
print("\nStarting KeyBERT extraction for the 10 query papers from the MAP@5 test set...")
K_VALUE = 5
keywords_list = []

# Iterate through the titles that we know exist from the successful validation results
for title in results_df_map['Query Paper Title'].tolist():
    # Retrieve the paper using the title (iloc[0] is safe now)
    # Using the title to find the paper in the corpus (df)
    paper = df[df['title'] == title].iloc[0]

    # Run the KeyBERT extraction
    keywords = extract_keywords(paper["text"], top_n=K_VALUE)

    keywords_list.append({'Paper Title': paper['title'], 'Keywords': list(keywords)})

keywords_df = pd.DataFrame(keywords_list)
print("\nExtracted Keywords (using KeyBERT) for the 10 MAP@5 test papers:")
print(keywords_df)


Starting KeyBERT extraction for the 10 query papers from the MAP@5 test set...

Extracted Keywords (using KeyBERT) for the 10 MAP@5 test papers:
                                         Paper Title  \
0  Accelerated Sparse Bayesian Learning via Scree...   
1  Wasserstein Soft Label Propagation on Hypergra...   
2  Efficient Model Compression Techniques with Fi...   
3  A NIR-to-VIS face recognition via part adaptiv...   
4  MMF: A loss extension for feature learning in ...   
5  Context-Dependent Anomaly Detection for Low Al...   
6  Kinematic-Layout-aware Random Forests for Dept...   
7  A block-random algorithm for learning on distr...   
8  ROIC-DM: Robust Text Inference and Classificat...   
9  Learning to Adapt Domain Shifts of Moral Value...   

                                            Keywords  
0     [lasso, sparsest, sparse, screening, sparsity]  
1  [hypergraphs, supervised, labels, graphs, wass...  
2  [pruning, fishleg, optimizer, compression, prune]  
3        [pose, r