In [1]:
"""
Hybrid Retrieval System for Scientific Claim Source Retrieval

This implementation combines Query Expansion and Multi-Hop approaches to improve
retrieval performance for matching tweets with relevant scientific papers.

Key Features:
1. Query Expansion: Enhances queries with semantically similar terms from a domain-specific word pool
2. Multi-Hop Enrichment: Uses a two-step process to enrich query representations
3. Hybrid Approach: Combines both lexical and semantic information for better retrieval

The system uses SBERT for encoding and cosine similarity for matching.
"""

'\nHybrid Retrieval System for Scientific Claim Source Retrieval\n\nThis implementation combines Query Expansion and Multi-Hop approaches to improve\nretrieval performance for matching tweets with relevant scientific papers.\n\nKey Features:\n1. Query Expansion: Enhances queries with semantically similar terms from a domain-specific word pool\n2. Multi-Hop Enrichment: Uses a two-step process to enrich query representations\n3. Hybrid Approach: Combines both lexical and semantic information for better retrieval\n\nThe system uses SBERT for encoding and cosine similarity for matching.\n'

In [2]:
import pandas as pd
import pickle
import torch
from sentence_transformers import SentenceTransformer, util
from torch.nn.functional import normalize
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class HybridRetrievalSystem:
    """
    A hybrid retrieval system that combines Query Expansion and Multi-Hop approaches.
    
    This system is designed to improve the matching between informal tweets and
    formal scientific papers by:
    1. Expanding queries with relevant domain terms
    2. Enriching query representations through multi-hop reasoning
    3. Combining both approaches for better retrieval performance
    """
    
    def __init__(self, model_name='multi-qa-mpnet-base-cos-v1'):
        """
        Initialize the hybrid retrieval system.
        
        Args:
            model_name (str): Name of the SBERT model to use for encoding
        """
        self.model = SentenceTransformer(model_name)
        self.paper_embeddings = None  # Pre-computed embeddings for all papers
        self.papers_df = None        # DataFrame containing paper information
        self.word_pool = None        # Domain-specific vocabulary
        self.word_embeddings = None  # Pre-computed embeddings for word pool
        self.tfidf_vectorizer = None # TF-IDF vectorizer for word pool creation
        
    def build_word_pool(self, papers_df, min_freq=5, max_terms=100):
        """
        Build a domain-specific word pool from the corpus using TF-IDF.
        
        This method:
        1. Extracts text from paper titles and abstracts
        2. Uses TF-IDF to identify important domain terms
        3. Pre-computes embeddings for these terms
        
        Args:
            papers_df (pd.DataFrame): DataFrame containing paper information
            min_freq (int): Minimum document frequency for terms
            max_terms (int): Maximum number of terms to include
            
        Returns:
            np.ndarray: Array of selected terms
        """
        # Create text column if it doesn't exist
        if 'text' not in papers_df.columns:
            papers_df['text'] = papers_df['title'] + '. ' + papers_df['abstract']
        
        # Combine all paper texts
        all_texts = papers_df['text'].tolist()
        
        # Initialize TF-IDF vectorizer
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=max_terms,
            min_df=min_freq,
            stop_words='english'
        )
        
        # Fit and transform the texts
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(all_texts)
        
        # Get feature names (terms)
        self.word_pool = self.tfidf_vectorizer.get_feature_names_out()
        
        # Pre-compute embeddings for the word pool
        self.word_embeddings = self.model.encode(
            self.word_pool.tolist(),
            convert_to_tensor=True,
            show_progress_bar=True
        )
        
        return self.word_pool
    
    def clean_text(self, text):
        """
        Clean and normalize text by removing URLs, mentions, and special characters.
        
        Args:
            text (str): Input text to clean
            
        Returns:
            str: Cleaned text
        """
        text = text.lower()
        text = re.sub(r"http\S+|www\S+", "", text)  # remove URLs
        text = re.sub(r"[@#]\w+", "", text)         # remove @mentions and #hashtags
        text = re.sub(r"[^\w\s\-/]", "", text)      # keep alphanum + dash/slash
        return text.strip()
    
    def expand_query(self, query, top_n=3, expansion_weight=0.3):
        """
        Expand query using semantic similarity and TF-IDF weighting.
        
        This method:
        1. Cleans the input query
        2. Finds semantically similar terms from the word pool
        3. Weights and adds these terms to create an expanded query
        
        Args:
            query (str): Original query text
            top_n (int): Number of terms to add
            expansion_weight (float): Weight for expanded terms
            
        Returns:
            tuple: (expanded_query, query_embedding)
        """
        # Clean the query
        clean_query = self.clean_text(query)
        
        # Get query embedding
        query_emb = self.model.encode(clean_query, convert_to_tensor=True)
        
        # Calculate similarity scores
        scores = util.cos_sim(query_emb, self.word_embeddings)[0]
        
        # Get top terms
        top_ids = scores.topk(top_n).indices
        top_terms = [self.word_pool[i] for i in top_ids]
        
        # Calculate term weights based on similarity scores
        term_weights = scores[top_ids].tolist()
        term_weights = [w * expansion_weight for w in term_weights]
        
        # Create weighted expansion
        expanded_terms = []
        for term, weight in zip(top_terms, term_weights):
            repeat_count = max(1, int(weight * 3))
            expanded_terms.extend([term] * repeat_count)
        
        # Combine original query with expanded terms
        expanded_query = f"{clean_query} {' '.join(expanded_terms)}"
        
        return expanded_query, query_emb
    
    def multi_hop_enrichment(self, query_emb, top_k=3, alpha=0.85):
        """
        Enrich query embedding using multi-hop approach.
        
        This method:
        1. Finds most similar documents to the query (first hop)
        2. Uses these documents to enrich the query embedding (second hop)
        3. Combines original and enriched embeddings
        
        Args:
            query_emb (torch.Tensor): Query embedding
            top_k (int): Number of documents to consider
            alpha (float): Weight for original vs. enriched embedding
            
        Returns:
            torch.Tensor: Enriched query embedding
        """
        # First hop: Find most similar documents
        cos_scores = torch.nn.functional.cosine_similarity(
            query_emb.unsqueeze(0), 
            self.paper_embeddings
        ).squeeze()
        
        # Get top document embeddings
        top_indices = torch.topk(cos_scores, k=top_k).indices
        top_doc_embs = self.paper_embeddings[top_indices]
        
        # Second hop: Combine query with document information
        avg_top_doc_emb = torch.mean(top_doc_embs, dim=0)
        enriched_emb = alpha * query_emb + (1 - alpha) * avg_top_doc_emb
        
        return enriched_emb
    
    def hybrid_retrieval(self, query, top_k=5, batch_size=16, 
                        expansion_params={'top_n': 3, 'expansion_weight': 0.3},
                        multi_hop_params={'top_k': 3, 'alpha': 0.85}):
        """
        Combine query expansion and multi-hop approaches for retrieval.
        
        This method:
        1. Expands the query with relevant terms
        2. Enriches the query embedding through multi-hop
        3. Uses the enriched embedding for final retrieval
        
        Args:
            query (str): Original query text
            top_k (int): Number of results to return
            batch_size (int): Batch size for processing
            expansion_params (dict): Parameters for query expansion
            multi_hop_params (dict): Parameters for multi-hop enrichment
            
        Returns:
            tuple: (predictions, expanded_query)
        """
        # Step 1: Query Expansion
        expanded_query, query_emb = self.expand_query(
            query, 
            top_n=expansion_params['top_n'],
            expansion_weight=expansion_params['expansion_weight']
        )
        
        # Step 2: Multi-Hop Enrichment
        enriched_emb = self.multi_hop_enrichment(
            query_emb,
            top_k=multi_hop_params['top_k'],
            alpha=multi_hop_params['alpha']
        )
        
        # Step 3: Get predictions using enriched embedding
        query_norm = normalize(enriched_emb.unsqueeze(0), p=2, dim=1)
        similarity_matrix = torch.matmul(query_norm, self.paper_embeddings.T)
        _, top_k_indices = torch.topk(similarity_matrix, k=top_k, dim=1)
        
        # Get predictions
        predictions = self.papers_df.iloc[top_k_indices[0].tolist()]['cord_uid'].tolist()
        
        return predictions, expanded_query
    
    def load_papers(self, papers_df):
        """
        Load and encode paper collection.
        
        This method:
        1. Stores paper information
        2. Computes embeddings for all papers
        3. Normalizes embeddings for better similarity matching
        
        Args:
            papers_df (pd.DataFrame): DataFrame containing paper information
        """
        self.papers_df = papers_df.copy()
        if 'text' not in self.papers_df.columns:
            self.papers_df['text'] = self.papers_df['title'] + '. ' + self.papers_df['abstract']
        
        # Encode papers
        self.paper_embeddings = self.model.encode(
            self.papers_df['text'].tolist(),
            show_progress_bar=True,
            convert_to_tensor=True
        )
        
        # Normalize embeddings
        self.paper_embeddings = normalize(self.paper_embeddings, p=2, dim=1)
    
    def batch_hybrid_retrieval(self, queries, top_k=5, batch_size=16,
                             expansion_params={'top_n': 3, 'expansion_weight': 0.3},
                             multi_hop_params={'top_k': 3, 'alpha': 0.85}):
        """
        Process a batch of queries using hybrid retrieval.
        
        Args:
            queries (list): List of query texts
            top_k (int): Number of results to return per query
            batch_size (int): Batch size for processing
            expansion_params (dict): Parameters for query expansion
            multi_hop_params (dict): Parameters for multi-hop enrichment
            
        Returns:
            tuple: (list of predictions, list of expanded queries)
        """
        all_predictions = []
        expanded_queries = []
        
        for query in queries:
            predictions, expanded_query = self.hybrid_retrieval(
                query,
                top_k=top_k,
                batch_size=batch_size,
                expansion_params=expansion_params,
                multi_hop_params=multi_hop_params
            )
            all_predictions.append(predictions)
            expanded_queries.append(expanded_query)
            
        return all_predictions, expanded_queries

In [4]:
def evaluate_performance(data, col_gold, col_pred, list_k=[1, 5, 10]):
    """
    Evaluate retrieval performance using Mean Reciprocal Rank (MRR).
    
    Args:
        data (pd.DataFrame): DataFrame containing predictions and gold labels
        col_gold (str): Column name for gold labels
        col_pred (str): Column name for predictions
        list_k (list): List of k values for evaluation
        
    Returns:
        dict: MRR scores for each k value
    """
    d_performance = {}
    for k in list_k:
        scores = []
        for _, row in data.iterrows():
            gold = row[col_gold]
            preds = row[col_pred]
            if isinstance(preds, str):
                try:
                    preds = eval(preds)
                except:
                    preds = []
            if gold in preds[:k]:
                rank = preds[:k].index(gold) + 1
                scores.append(1.0 / rank)
            else:
                scores.append(0.0)
        d_performance[k] = sum(scores) / len(scores) if scores else 0.0
    return d_performance

In [7]:
def main():
    """
    Main function to run the hybrid retrieval system.
    
    This function:
    1. Loads the data
    2. Initializes the system
    3. Processes queries
    4. Evaluates performance
    5. Prints results and examples
    """
    # Load data
    with open("/Users/mataonbas/AIR-CheckThat!-GroupProject/CheckThat-ScientificClaimSourceRetrieval/subtask4b_collection_data.pkl", "rb") as f:
        papers_df = pickle.load(f)
    
    train_df = pd.read_csv("/Users/mataonbas/AIR-CheckThat!-GroupProject/CheckThat-ScientificClaimSourceRetrieval/subtask4b_query_tweets_train.tsv", sep="\t", 
                          names=["post_id", "tweet_text", "cord_uid"])
    dev_df = pd.read_csv("/Users/mataonbas/AIR-CheckThat!-GroupProject/CheckThat-ScientificClaimSourceRetrieval/subtask4b_query_tweets_dev.tsv", sep="\t", 
                        names=["post_id", "tweet_text", "cord_uid"])
    
    # Initialize hybrid system
    hybrid_system = HybridRetrievalSystem()
    
    # Build word pool and load papers
    hybrid_system.build_word_pool(papers_df)
    hybrid_system.load_papers(papers_df)
    
    # Process queries
    train_predictions, train_expanded = hybrid_system.batch_hybrid_retrieval(
        train_df['tweet_text'].tolist(),
        expansion_params={'top_n': 3, 'expansion_weight': 0.3},
        multi_hop_params={'top_k': 3, 'alpha': 0.85}
    )
    
    dev_predictions, dev_expanded = hybrid_system.batch_hybrid_retrieval(
        dev_df['tweet_text'].tolist(),
        expansion_params={'top_n': 3, 'expansion_weight': 0.3},
        multi_hop_params={'top_k': 3, 'alpha': 0.85}
    )
    
    # Store results
    train_df['preds'] = train_predictions
    train_df['expanded_text'] = train_expanded
    dev_df['preds'] = dev_predictions
    dev_df['expanded_text'] = dev_expanded
    
    # Evaluate
    train_mrr = evaluate_performance(train_df, 'cord_uid', 'preds')
    dev_mrr = evaluate_performance(dev_df, 'cord_uid', 'preds')
    
    print("Train MRR:", train_mrr)
    print("Dev MRR:", dev_mrr)
    
    # Print some examples
    print("\nExample Query Expansions:")
    for i in range(3):
        print(f"\nOriginal: {train_df['tweet_text'].iloc[i]}")
        print(f"Expanded: {train_df['expanded_text'].iloc[i]}")

In [8]:
if __name__ == "__main__":
    main() 

Batches: 100%|██████████| 4/4 [00:00<00:00, 13.22it/s]
Batches: 100%|██████████| 242/242 [23:58<00:00,  5.94s/it]


Train MRR: {1: 0.4430527462268555, 5: 0.5072804833774134, 10: 0.5072804833774134}
Dev MRR: {1: 0.4475374732334047, 5: 0.51289555079705, 10: 0.51289555079705}

Example Query Expansions:

Original: tweet_text
Expanded: tweet_text transmission time use

Original: Oral care in rehabilitation medicine: oral vulnerability, oral muscle wasting, and hospital-associated oral issues
Expanded: oral care in rehabilitation medicine oral vulnerability oral muscle wasting and hospital-associated oral issues treatment patients hospital

Original: this study isn't receiving sufficient attention. it reveals black/latino/indigenous individuals aren't just succumbing to covid at higher rates than whites but are also passing away *earlier*.   90% of white fatalities occur in those 65+, 90% of black fatalities occur in those 55+,  89% of indigenous fatalities occur in those 45+
Expanded: this study isnt receiving sufficient attention it reveals black/latino/indigenous individuals arent just succumbing to co