In [2]:
import pandas as pd
from rank_bm25 import BM25Okapi
import numpy as np

class FactCheckRetrieval:
    """
    BM25 retrieval over fact-check claims in original and translated form.

    Two independent BM25Okapi indexes are built, one over the 'claim' column
    and one over the 'translated_claim' column. A query is scored against
    both, and every record keeps the higher of its two scores.
    """

    # Column order of the DataFrame returned by ``search`` (also used for the
    # empty result so callers always see the same schema).
    _RESULT_COLUMNS = [
        'fact_check_id',
        'claim',
        'translated_claim',
        'language',
        'similarity_score',
    ]

    def __init__(self, df_fact_checks, max_records=200000):
        """
        Initialize the fact check retrieval system with limited dataset

        Parameters:
        df_fact_checks (pd.DataFrame): DataFrame containing columns ['claim', 'translated_claim', 'language']
        max_records (int): Maximum number of records to use (default: 200000)
        """
        # Take only the first max_records; copy so the caller's frame is not mutated
        self.df_fact_checks = df_fact_checks.head(max_records).copy()

        # Preprocessing step - clean and prepare the data
        for column in ('claim', 'translated_claim'):
            self.df_fact_checks[column] = self._normalize_text(self.df_fact_checks[column])

        # Keep the raw corpora around alongside their whitespace-tokenized form
        self.original_corpus = self.df_fact_checks['claim'].tolist()
        self.translated_corpus = self.df_fact_checks['translated_claim'].tolist()
        self.tokenized_original = [doc.split() for doc in self.original_corpus]
        self.tokenized_translated = [doc.split() for doc in self.translated_corpus]

        # Store the number of records
        self.num_records = len(self.df_fact_checks)

        if self.num_records == 0:
            # BM25Okapi divides by the corpus size while indexing, so an empty
            # corpus cannot be indexed; ``search`` returns an empty result instead.
            self.bm25_original = None
            self.bm25_translated = None
        else:
            # Create BM25 instances
            print("Building BM25 index for original claims...")
            self.bm25_original = BM25Okapi(self.tokenized_original)
            print("Building BM25 index for translated claims...")
            self.bm25_translated = BM25Okapi(self.tokenized_translated)
            print("Indexing complete!")

        print(f"Initialized with {self.num_records} records")

    @staticmethod
    def _normalize_text(series):
        """Return *series* with missing values as '' and every value lower-cased text."""
        # astype(str) keeps stray non-string cells (e.g. numbers) from turning
        # into NaN under the .str accessor and breaking tokenization later.
        return series.fillna('').astype(str).str.lower()

    def search(self, query, top_k=5):
        """
        Search for similar fact checks using BM25

        Parameters:
        query (str): The query text to search for
        top_k (int): Number of results to return; values <= 0 yield no results

        Returns:
        pd.DataFrame: Top k matching fact checks with scores, best first, with
        columns ['fact_check_id', 'claim', 'translated_claim', 'language',
        'similarity_score']. Empty (but with the same columns) when there is
        nothing to return.
        """
        # Ensure we don't request more than we have
        top_k = min(top_k, self.num_records)
        if top_k <= 0 or self.bm25_original is None or self.bm25_translated is None:
            # A negative top_k would otherwise slice from the end of the ranking.
            return pd.DataFrame(columns=self._RESULT_COLUMNS)

        # Preprocess query the same way the corpora were preprocessed
        tokenized_query = query.lower().split()

        # Get scores from both original and translated claims
        scores_original = self.bm25_original.get_scores(tokenized_query)
        scores_translated = self.bm25_translated.get_scores(tokenized_query)

        # Combine scores (taking the maximum score for each document).
        # fmax ignores a NaN on one side, so a degenerate index (e.g. a column
        # holding only empty documents) cannot poison the other index's scores;
        # any remaining NaN is treated as "no match".
        combined_scores = np.fmax(scores_original, scores_translated)
        combined_scores = np.where(np.isnan(combined_scores), 0.0, combined_scores)

        # Stable sort: ties are broken by original record order, deterministically
        top_indices = np.argsort(-combined_scores, kind='stable')[:top_k]

        # Select all hits in one positional lookup instead of row-by-row access
        hits = self.df_fact_checks.iloc[top_indices]
        return pd.DataFrame(
            {
                'fact_check_id': hits.index.to_numpy(),
                'claim': hits['claim'].to_numpy(),
                'translated_claim': hits['translated_claim'].to_numpy(),
                'language': hits['language'].to_numpy(),
                'similarity_score': combined_scores[top_indices],
            },
            columns=self._RESULT_COLUMNS,
        )

def main():
    """
    Demonstrate the retrieval system on a handful of sample queries.

    Reads 'fact_check_claims.csv' from the working directory, builds a
    FactCheckRetrieval with its default record cap, and prints the three
    best matches for every sample query.
    """
    print("Loading dataset...")
    fact_checks = pd.read_csv('fact_check_claims.csv')

    # No max_records is passed, so the class default cap applies
    retriever = FactCheckRetrieval(fact_checks)

    # A mix of short keyword queries and long social-media style posts
    sample_queries = (
        "COVID-19 vaccine effectiveness",
        "election fraud claims",
        "climate change impact",
        "STATE OF SIEGE PROVINCE OF ITURI Friday, May 7, 9 a.m. The state of siege is beginning to bear fruit. More than 26 armed groups estimated at 4,000 militiamen on their way to lay down their arms. Let's all support our #FARDC",
        "Austria stands up against the dictatorship of health The police and army refuse to control health passes in the name of “freedom and human dignity”. They will join a large demonstration against mandatory confinement on November 20, 2021 in Vienna",
    )

    separator = "-" * 80
    for text in sample_queries:
        print(f"\nSearching for: '{text}'")
        matches = retriever.search(text, top_k=3)

        print("\nTop 3 matching fact checks:")
        for row in matches.to_dict("records"):
            claim = row['claim']
            translated = row['translated_claim']
            language = row['language']
            score = row['similarity_score']
            print(f"\nClaim: {claim}")
            print(f"Translated: {translated}")
            print(f"Language: {language}")
            print(f"Similarity Score: {score:.4f}")
            print(separator)

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Loading dataset...
Building BM25 index for original claims...
Building BM25 index for translated claims...
Indexing complete!
Initialized with 153743 records

Searching for: 'COVID-19 vaccine effectiveness'

Top 3 matching fact checks:

Claim: moderna vaccine has 100 effectiveness against severe coronavirus infection
Translated: moderna vaccine has 100 effectiveness against severe coronavirus infection
Language: [('eng', 1.0)]
Similarity Score: 16.4972
--------------------------------------------------------------------------------

Claim: efektivitas vaksin pfizer hanya sebesar 12
Translated: pfizer vaccine effectiveness only 12
Language: [('msa', 1.0)]
Similarity Score: 15.5975
--------------------------------------------------------------------------------

Claim: the survival rate of covid19 without the vaccine is greater than the effectiveness of the vaccine itself
Translated: the survival rate of covid19 without the vaccine is greater than the effectiveness of the vaccine itself
