# Phase 3 — Evidence Retrieval (Hybrid)

This notebook tests and debugs the evidence retrieval pipeline:
- Retrieve from trusted sources: PIB, WHO/health advisories, RBI, major news portals
- Augment with Wikipedia + curated fact-check DB (cache in BigQuery)
- Freshness: bias ranking toward recent content (published within the last 7–14 days)

## Step 1: Setup and Dependencies

In [None]:
# Phase 3: Evidence Retrieval and Source Assessment
# Updated to use actual TruthLens evidence retrieval modules

import asyncio
import json
import math
import os
import re
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

# NOTE(review): later cells reference `feedparser`, `wikipedia`,
# `TfidfVectorizer` and `cosine_similarity`, none of which is imported in the
# visible part of this notebook -- confirm where they are meant to come from.

# Add project root to path for TruthLens modules
project_root = Path().resolve()
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Import actual TruthLens evidence retrieval modules.
# Any ImportError in the mandatory imports below disables the Wikipedia
# connector and the complete pipeline; the notebook then runs on fallbacks.
try:
    # Phase 3 evidence retrieval modules
    from phase3_evidence_retrieval.connectors.wikipedia import WikipediaConnector
    from phase3_evidence_retrieval.connectors.factcheck import PIBFactCheckConnector, PolitiFactConnector
    from phase3_evidence_retrieval.schemas.evidence import RawEvidence, SourceType, Language
    from phase3_evidence_retrieval.scoring.evidence_score import score_evidence, ScoreWeights

    # Phase 4 evidence selection (part of retrieval pipeline)
    from phase4_verification.src.evidence_selection import select_top_evidence, EvidenceItem

    # Pipeline modules (optional: their absence must not disable the rest)
    try:
        from phase3_evidence_retrieval.pipeline.retrieve_pipeline import RetrievePipeline
        pipeline_available = True
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit are not swallowed here.
        pipeline_available = False

    print("✅ Successfully imported TruthLens evidence retrieval modules!")

except ImportError as e:
    print(f"⚠️ Some evidence modules not available: {e}")
    print("Will use available modules and fallback implementations")
    WikipediaConnector = None
    pipeline_available = False

# Fallback imports from Phase 7 pipeline
try:
    from Phase_7.pipeline.retriever import EvidenceRetriever
    phase7_available = True
except Exception:
    # Same reasoning as above: stay best-effort without trapping interrupts.
    phase7_available = False

# Capability report; connectors that were never imported are detected by
# checking whether their name exists in the module globals.
print("📊 Available evidence retrieval capabilities:")
print(f"- Wikipedia Connector: {'✅' if WikipediaConnector else '❌'}")
print(f"- Fact-check Connectors: {'✅' if 'PIBFactCheckConnector' in globals() else '❌'}")
print(f"- Evidence Scoring: {'✅' if 'score_evidence' in globals() else '❌'}")
print(f"- Complete Pipeline: {'✅' if pipeline_available else '❌'}")
print(f"- Phase 7 Retriever: {'✅' if phase7_available else '❌'}")

# Test claims for evidence retrieval.
# NOTE: this is a dict mapping a short label to the claim text; code that
# needs the claim texts must iterate over `test_claims.values()`.
test_claims = {
    'covid_vaccine': "COVID-19 vaccines reduce hospitalization rates by 90%",
    'climate_change': "Human activities are the primary cause of recent climate change",
    'conspiracy_5g': "5G towers cause COVID-19 symptoms and health problems",
    'medical_misinformation': "Drinking bleach can cure COVID-19",
    'political_claim': "The 2020 US election was rigged with widespread voter fraud",
    'factual_science': "Water boils at 100 degrees Celsius at sea level pressure",
    'economic_claim': "Inflation in the US reached 9.1% in June 2022"
}

print(f"\n🎯 Test claims prepared: {len(test_claims)} examples")
for key, claim in test_claims.items():
    print(f"- {key}: {claim[:60]}...")

print("\n🔍 Phase 3 testing will cover:")
print("1. Wikipedia Background Search")
print("2. Fact-check Source Retrieval")
print("3. Evidence Relevance Scoring")
print("4. Source Quality Assessment")
print("5. Multi-source Evidence Aggregation")
print("6. Evidence Selection and Ranking")
print("7. Complete Retrieval Pipeline")

## Step 2: Trusted Source Configuration

In [None]:
# Configuration for trusted sources.
#
# Layout: category -> source key -> settings dict. Every source has a
# 'name', a 'base_url' and a 'credibility_score' (hard-coded in this cell).
# 'rss_feeds' is present for the government, international and media entries
# but not for the fact-check entries; 'search_url' is only set for PIB.
TRUSTED_SOURCES = {
    # Indian government bodies
    'government': {
        'pib': {
            'name': 'Press Information Bureau (PIB)',
            'base_url': 'https://pib.gov.in',
            'rss_feeds': [
                'https://pib.gov.in/rss.aspx',
                'https://pib.gov.in/RSS.aspx?m=Health&ln=1'
            ],
            'search_url': 'https://pib.gov.in/Pressreleasesearch.aspx',
            'credibility_score': 0.95
        },
        'mohfw': {
            'name': 'Ministry of Health and Family Welfare',
            'base_url': 'https://www.mohfw.gov.in',
            'rss_feeds': [
                'https://www.mohfw.gov.in/RSS.xml'
            ],
            'credibility_score': 0.95
        },
        'rbi': {
            'name': 'Reserve Bank of India',
            'base_url': 'https://www.rbi.org.in',
            'rss_feeds': [
                'https://www.rbi.org.in/RSS.xml'
            ],
            'credibility_score': 0.95
        }
    },
    # International health agencies
    'international': {
        'who': {
            'name': 'World Health Organization',
            'base_url': 'https://www.who.int',
            'rss_feeds': [
                'https://www.who.int/rss-feeds/news-english.xml'
            ],
            'credibility_score': 0.95
        },
        'cdc': {
            'name': 'Centers for Disease Control and Prevention',
            'base_url': 'https://www.cdc.gov',
            'rss_feeds': [
                'https://tools.cdc.gov/api/v2/resources/media.rss'
            ],
            'credibility_score': 0.95
        }
    },
    # News media (scored lower than the official sources above)
    'media': {
        'reuters': {
            'name': 'Reuters',
            'base_url': 'https://www.reuters.com',
            'rss_feeds': [
                'https://www.reuters.com/arcio/rss/'
            ],
            'credibility_score': 0.85
        },
        'bbc': {
            'name': 'BBC',
            'base_url': 'https://www.bbc.com',
            'rss_feeds': [
                # NOTE(review): the only feed here using plain http -- confirm
                # whether the https endpoint should be used instead.
                'http://feeds.bbci.co.uk/news/rss.xml'
            ],
            'credibility_score': 0.85
        }
    },
    # Fact-checking sites; no 'rss_feeds' key is configured for these
    'factcheck': {
        'snopes': {
            'name': 'Snopes',
            'base_url': 'https://www.snopes.com',
            'credibility_score': 0.80
        },
        'factcheck_org': {
            'name': 'FactCheck.org',
            'base_url': 'https://www.factcheck.org',
            'credibility_score': 0.85
        }
    }
}

# Echo the configured source keys per category as a quick sanity check
print("Trusted sources configuration loaded:")
for category, sources in TRUSTED_SOURCES.items():
    print(f"  {category}: {list(sources.keys())}")

## Step 3: RSS Feed Retrieval

In [None]:
def fetch_rss_feeds(source_config: Dict, max_articles: int = 20) -> List[Dict]:
    """Collect article records from every RSS feed of one source.

    Args:
        source_config: Settings dict of a single source. ``'rss_feeds'`` is
            optional (no feeds means an empty result); ``'name'`` and
            ``'credibility_score'`` are read for every entry found.
        max_articles: Upper bound on entries taken from EACH feed, so a source
            with several feeds can yield more than this many records.

    Returns:
        A list of article dicts with the keys ``title``, ``url``, ``summary``,
        ``published_date`` (naive ``datetime`` or ``None``), ``source_name``,
        ``credibility_score``, ``feed_url`` and ``content_type``.

    Any exception raised while handling a feed is printed and that feed is
    skipped; records gathered so far are kept.

    NOTE(review): ``feedparser`` is not imported in the visible part of this
    notebook -- without it every feed ends up in the error branch.
    """
    collected: List[Dict] = []

    for feed_url in source_config.get('rss_feeds', []):
        try:
            parsed_feed = feedparser.parse(feed_url)

            for item in parsed_feed.entries[:max_articles]:
                # Prefer the publication timestamp, fall back to the update
                # timestamp; either may be absent or empty.
                time_struct = (
                    getattr(item, 'published_parsed', None)
                    or getattr(item, 'updated_parsed', None)
                )
                # Only the first six fields (year .. second) feed datetime()
                published = datetime(*time_struct[:6]) if time_struct else None

                collected.append({
                    'title': item.get('title', ''),
                    'url': item.get('link', ''),
                    'summary': item.get('summary', ''),
                    'published_date': published,
                    'source_name': source_config['name'],
                    'credibility_score': source_config['credibility_score'],
                    'feed_url': feed_url,
                    'content_type': 'rss_article'
                })

        except Exception as e:
            # Best effort: report the problem and move on to the next feed
            print(f"Error fetching RSS feed {feed_url}: {str(e)}")

    return collected

# Test RSS feed retrieval (mocked for demo)
def mock_rss_articles() -> List[Dict]:
    """Generate mock RSS articles for testing.

    Returns:
        A fresh list of three article dicts (WHO, PIB, Reuters) with the keys
        ``title``, ``url``, ``summary``, ``published_date``, ``source_name``,
        ``credibility_score`` and ``content_type``. ``published_date`` is a
        naive local ``datetime`` set 2, 1 and 5 days before the call.
    """
    # Sample the clock once so the relative ages of the articles are exact
    now = datetime.now()

    return [
        {
            'title': 'WHO Updates COVID-19 Vaccine Safety Guidelines',
            'url': 'https://www.who.int/news/item/covid-vaccine-safety',
            'summary': 'WHO releases new guidelines on COVID-19 vaccine safety monitoring and adverse event reporting.',
            'published_date': now - timedelta(days=2),
            'source_name': 'World Health Organization',
            'credibility_score': 0.95,
            'content_type': 'rss_article'
        },
        {
            'title': 'PIB Fact Check: No Evidence of 80% Severe Side Effects from COVID Vaccines',
            'url': 'https://pib.gov.in/factcheck/covid-vaccine-side-effects',
            'summary': 'PIB fact-checking unit debunks false claims about COVID vaccine side effects.',
            'published_date': now - timedelta(days=1),
            'source_name': 'Press Information Bureau (PIB)',
            'credibility_score': 0.95,
            'content_type': 'rss_article'
        },
        {
            'title': 'Cancer Research: New Treatment Shows Promise in Clinical Trials',
            'url': 'https://www.reuters.com/health/cancer-treatment-trials',
            'summary': 'Researchers report positive results from Phase 2 clinical trials of new cancer treatment.',
            'published_date': now - timedelta(days=5),
            'source_name': 'Reuters',
            'credibility_score': 0.85,
            'content_type': 'rss_article'
        }
    ]

# Build the mock feed and echo title, source, publication day and
# credibility for every entry
mock_articles = mock_rss_articles()
print("Mock RSS articles retrieved:")
for article in mock_articles:
    print(
        f"  - {article['title']} ({article['source_name']})",
        f"    Published: {article['published_date'].strftime('%Y-%m-%d')}",
        f"    Credibility: {article['credibility_score']}",
        sep="\n",
    )

## Step 4: Wikipedia Search and Retrieval

In [None]:
def _wikipedia_article_from_page(page) -> Dict:
    """Convert a page object of the `wikipedia` package into an article dict."""
    try:
        # Categories are optional metadata; a failed lookup must not cost us
        # the article itself, so fall back to an empty list.
        categories = list(page.categories)[:5]  # First 5 categories
    except Exception:
        categories = []

    return {
        'title': page.title,
        'url': page.url,
        'summary': page.summary[:500],  # First 500 chars
        'content': page.content[:2000],  # First 2000 chars
        'published_date': None,  # Wikipedia doesn't have clear publication dates
        'source_name': 'Wikipedia',
        'credibility_score': 0.75,  # Medium credibility
        'content_type': 'wikipedia_article',
        'categories': categories
    }


def search_wikipedia(query: str, max_results: int = 3) -> List[Dict]:
    """Search Wikipedia for relevant articles.

    Args:
        query: Free-text search string.
        max_results: Maximum number of search hits to resolve into pages.

    Returns:
        A list of article dicts with the keys ``title``, ``url``, ``summary``
        (first 500 chars), ``content`` (first 2000 chars), ``published_date``
        (always ``None``), ``source_name``, ``credibility_score``,
        ``content_type`` and ``categories`` (at most 5). Hits that cannot be
        resolved are skipped. Any other error is printed and the articles
        collected up to that point are returned.

    NOTE(review): ``wikipedia`` is not imported in the visible part of this
    notebook -- without it the function prints the error and returns ``[]``.
    """
    wikipedia_articles = []

    try:
        # Search for relevant pages
        search_results = wikipedia.search(query, results=max_results)

        for title in search_results:
            try:
                # Titles returned by search() are exact, so switch off
                # auto_suggest, which may silently substitute another title.
                page = wikipedia.page(title, auto_suggest=False)
                wikipedia_articles.append(_wikipedia_article_from_page(page))

            except wikipedia.exceptions.DisambiguationError as e:
                # Handle disambiguation pages by taking the first option
                if not e.options:
                    continue
                try:
                    page = wikipedia.page(e.options[0], auto_suggest=False)
                    wikipedia_articles.append(_wikipedia_article_from_page(page))
                except Exception:
                    # Best effort: an unresolvable option just drops this hit.
                    # `Exception` (not a bare except) keeps Ctrl-C working.
                    continue

            except (wikipedia.exceptions.PageError, wikipedia.exceptions.WikipediaException):
                continue

    except Exception as e:
        print(f"Error searching Wikipedia: {str(e)}")

    return wikipedia_articles

# Test Wikipedia search (mocked for demo)
def mock_wikipedia_search(query: str) -> List[Dict]:
    """Return canned Wikipedia-style results chosen by keyword.

    The lower-cased query is checked by substring: 'covid' or 'vaccine'
    selects the COVID-19 vaccine entry; otherwise 'cancer' or 'cure' selects
    the cancer treatment entry; anything else yields an empty list. Each call
    builds fresh dicts. The entries carry no 'published_date' key.
    """
    needle = query.lower()

    if any(term in needle for term in ('covid', 'vaccine')):
        return [
            {
                'title': 'COVID-19 vaccine',
                'url': 'https://en.wikipedia.org/wiki/COVID-19_vaccine',
                'summary': 'COVID-19 vaccines are vaccines intended to provide immunity against SARS-CoV-2...',
                'content': 'COVID-19 vaccines have been shown to be safe and effective in clinical trials...',
                'source_name': 'Wikipedia',
                'credibility_score': 0.75,
                'content_type': 'wikipedia_article'
            }
        ]

    if any(term in needle for term in ('cancer', 'cure')):
        return [
            {
                'title': 'Cancer treatment',
                'url': 'https://en.wikipedia.org/wiki/Cancer_treatment',
                'summary': 'Cancer treatment includes various methods to treat cancer patients...',
                'content': 'Cancer treatment varies depending on the type and stage of cancer...',
                'source_name': 'Wikipedia',
                'credibility_score': 0.75,
                'content_type': 'wikipedia_article'
            }
        ]

    # No keyword matched
    return []

# Test Wikipedia search
# `test_claims` is a dict (label -> claim text) and cannot be sliced, so take
# the first two claim texts from its values.
print("Wikipedia search results:")
for claim in list(test_claims.values())[:2]:  # Test first 2 claims
    wiki_results = mock_wikipedia_search(claim)
    print(f"\nQuery: {claim}")
    for result in wiki_results:
        print(f"  - {result['title']}")
        print(f"    Summary: {result['summary'][:100]}...")

## Step 5: Semantic Similarity and Relevance Scoring

In [None]:
def calculate_relevance_score(claim: str, article: Dict) -> float:
    """Score how closely an article matches a claim via TF-IDF cosine similarity.

    The claim is compared against the article's title and summary joined by a
    space. The vectorizer is fitted on just these two texts, with English
    stop words removed and at most 1000 features.

    Returns:
        The cosine similarity as a ``float``, or ``0.0`` (after printing the
        error) when anything in the computation raises.

    NOTE(review): ``TfidfVectorizer`` and ``cosine_similarity`` are not
    imported in the visible part of this notebook -- without them every call
    takes the error branch and returns 0.0.
    """
    try:
        title = article.get('title', '')
        summary = article.get('summary', '')
        documents = [claim, f"{title} {summary}"]

        # Row 0 holds the claim vector, row 1 the article vector
        tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
        vectors = tfidf.fit_transform(documents)
        claim_vector, article_vector = vectors[0:1], vectors[1:2]

        return float(cosine_similarity(claim_vector, article_vector)[0][0])

    except Exception as e:
        print(f"Error calculating relevance score: {str(e)}")
        return 0.0

def calculate_freshness_score(article: Dict, max_age_days: int = 14) -> float:
    """Calculate freshness score based on article publication date.

    Args:
        article: Article dict; only ``'published_date'`` is read. It may be
            missing/``None``, a naive ``datetime`` or a timezone-aware one.
        max_age_days: Length in days of the "fresh" window.

    Returns:
        A ``float`` in ``(0, 1]``:
        * ``0.5`` when the article has no publication date;
        * a linear decay from ``1.0`` (age 0) to ``0.5`` (age
          ``max_age_days``) inside the window;
        * ``0.5 * exp(-(age - max_age_days) / 30)`` beyond the window.
        Dates in the future count as age 0.
    """
    published = article.get('published_date')
    if not published:
        return 0.5  # Neutral score for articles without dates

    # Mixing naive and aware datetimes raises TypeError, so take "now" in the
    # same flavour as the article date (tzinfo=None gives naive local time).
    now = datetime.now(published.tzinfo)

    # Whole days since publication; clamp so a future-dated article (clock
    # skew, bad feed data) cannot score above 1.0.
    age_days = max((now - published).days, 0)

    # The `0 <` guard avoids dividing by zero when max_age_days is 0
    if 0 < max_age_days and age_days <= max_age_days:
        # Linear decay from 1.0 to 0.5 over max_age_days
        return 1.0 - (age_days / max_age_days) * 0.5

    # Exponential decay after max_age_days
    return 0.5 * math.exp(-(age_days - max_age_days) / 30)

def score_evidence_article(claim: str, article: Dict) -> Dict:
    """Combine relevance, credibility and freshness into one evidence score.

    Relevance and freshness come from the sibling scoring helpers;
    credibility is read from the article (0.5 when absent). The overall score
    is the weighted sum 0.5 * relevance + 0.3 * credibility + 0.2 * freshness.

    Returns:
        ``{'article': article, 'scores': {...}, 'claim': claim}`` where
        ``scores`` holds ``relevance``, ``credibility``, ``freshness`` and
        ``overall``.
    """
    relevance = calculate_relevance_score(claim, article)
    credibility = article.get('credibility_score', 0.5)
    freshness = calculate_freshness_score(article)

    # Fixed blend: relevance dominates, freshness matters least
    relevance_weight, credibility_weight, freshness_weight = 0.5, 0.3, 0.2
    overall = (
        relevance * relevance_weight +
        credibility * credibility_weight +
        freshness * freshness_weight
    )

    score_card = {
        'relevance': relevance,
        'credibility': credibility,
        'freshness': freshness,
        'overall': overall
    }
    return {'article': article, 'scores': score_card, 'claim': claim}

# Score one sample claim against the mock RSS items plus the mock Wikipedia
# hits, printing the individual and overall scores of each article
test_claim = "The new COVID vaccine causes severe side effects in 80% of patients"
test_articles = mock_rss_articles() + mock_wikipedia_search(test_claim)

print("Evidence article scoring results:")
print(f"Claim: {test_claim}\n")

scored_articles = []
for article in test_articles:
    scored_article = score_evidence_article(test_claim, article)
    scored_articles.append(scored_article)

    print(
        f"Article: {article['title']}",
        f"  Source: {article['source_name']}",
        f"  Relevance: {scored_article['scores']['relevance']:.3f}",
        f"  Credibility: {scored_article['scores']['credibility']:.3f}",
        f"  Freshness: {scored_article['scores']['freshness']:.3f}",
        f"  Overall Score: {scored_article['scores']['overall']:.3f}\n",
        sep="\n",
    )

## Step 6: Evidence Ranking and Selection

In [None]:
def rank_and_select_evidence(claim: str, articles: List[Dict], top_k: int = 3) -> List[Dict]:
    """Score every article against the claim and keep the best ``top_k``.

    Returns:
        The ``top_k`` highest scored-article dicts (as produced by
        ``score_evidence_article``), best first, each extended with a 1-based
        ``'rank'`` and ``'selected': True``. Articles with equal overall
        scores keep their input order.
    """
    # Score in input order, then order by overall score, highest first
    by_score = sorted(
        (score_evidence_article(claim, candidate) for candidate in articles),
        key=lambda scored: scored['scores']['overall'],
        reverse=True,
    )

    winners = by_score[:top_k]
    for position, scored in enumerate(winners, start=1):
        scored.update(rank=position, selected=True)

    return winners

def extract_relevant_snippets(claim: str, article_content: str, max_snippets: int = 2) -> List[str]:
    """Extract relevant snippets from article content.

    The content is split into sentences on runs of ``.``, ``!`` and ``?``;
    sentences of five words or fewer are ignored. Each remaining sentence is
    scored by the share of the claim's distinct lower-cased,
    whitespace-separated words that also occur in it.

    Args:
        claim: Claim text to match against.
        article_content: Text to pull snippets from.
        max_snippets: Maximum number of snippets to return.

    Returns:
        Up to ``max_snippets`` sentences (without terminating punctuation),
        best match first; sentences sharing no word with the claim are never
        returned. Ties keep their order of appearance in the content.
    """
    # Split content into sentences; a sentence with more than 5 words is
    # necessarily non-empty, so one length check suffices.
    sentences = [
        s.strip()
        for s in re.split(r'[.!?]+', article_content)
        if len(s.split()) > 5
    ]
    if not sentences:
        return []

    # Loop-invariant: build the claim's word set once, not per sentence
    claim_words = set(claim.lower().split())
    if not claim_words:
        # Nothing to match against -> no sentence can score above zero
        return []

    # Simple keyword overlap scoring
    sentence_scores = [
        (sentence, len(claim_words & set(sentence.lower().split())) / len(claim_words))
        for sentence in sentences
    ]

    # Sort by score (stable, so ties stay in document order) and keep the
    # top snippets that actually overlap with the claim
    sentence_scores.sort(key=lambda pair: pair[1], reverse=True)
    return [sentence for sentence, score in sentence_scores[:max_snippets] if score > 0]

# Rank the sample articles for the sample claim and show, for each selected
# item, its source details plus the best-matching snippets
print("Evidence ranking and selection:")
print(f"Claim: {test_claim}\n")

top_evidence = rank_and_select_evidence(test_claim, test_articles)

for evidence in top_evidence:
    article = evidence['article']
    print(
        f"Rank {evidence['rank']}: {article['title']}",
        f"  Source: {article['source_name']}",
        f"  URL: {article['url']}",
        f"  Overall Score: {evidence['scores']['overall']:.3f}",
        sep="\n",
    )

    # Prefer the full content; fall back to the summary when it is missing
    content = article.get('content', article.get('summary', ''))
    snippets = extract_relevant_snippets(test_claim, content)
    if snippets:
        print(f"  Key snippets:")
        for snippet in snippets:
            print(f"    - {snippet[:150]}...")
    print()

## Step 7: Complete Phase 3 Pipeline

In [None]:
def phase3_pipeline(claims: List[str], max_evidence_per_claim: int = 3) -> Dict:
    """Complete Phase 3 pipeline: Evidence Retrieval (mock data sources).

    For every claim the mock RSS articles and the mock Wikipedia hits are
    scored, merged and re-ranked, and the best ``max_evidence_per_claim``
    items are formatted into the result.

    Args:
        claims: Claim texts to process (any iterable of strings).
        max_evidence_per_claim: Maximum evidence items kept per claim.

    Returns:
        A dict with ``phase``, ``input_claims``, ``steps`` (RSS retrieval and
        caching summaries), ``evidence_by_claim`` (claim -> ``{'claim',
        'evidence_sources'}``) and ``status``. On success ``final_output`` is
        added as well. If anything raises, ``status`` is ``'failed'``,
        ``error`` holds the message, ``final_output`` is absent and the
        partial results gathered so far are kept.
    """
    pipeline_result = {
        'phase': 'Phase 3 - Evidence Retrieval (Hybrid)',
        'input_claims': claims,
        'steps': [],
        'evidence_by_claim': {}
    }

    try:
        # The claims are iterated and counted more than once below, so
        # materialise them; a one-shot iterable would otherwise be exhausted.
        claims = list(claims)
        pipeline_result['input_claims'] = claims

        # Step 1: Fetch from trusted sources (RSS feeds)
        all_rss_articles = mock_rss_articles()  # In production, fetch from actual RSS feeds
        pipeline_result['steps'].append({
            'step': 'rss_feed_retrieval',
            'result': {
                'articles_retrieved': len(all_rss_articles),
                # Sorted so the reported order is deterministic across runs
                'sources': sorted({a['source_name'] for a in all_rss_articles})
            }
        })

        # Step 2: Process each claim
        wiki_articles_seen = 0  # tallied here so the search is not repeated for the cache summary
        for claim in claims:
            # Get RSS articles for this claim
            rss_evidence = rank_and_select_evidence(claim, all_rss_articles, max_evidence_per_claim)

            # Get Wikipedia articles for this claim
            wiki_articles = mock_wikipedia_search(claim)
            wiki_articles_seen += len(wiki_articles)
            wiki_evidence = rank_and_select_evidence(claim, wiki_articles, 1)  # Max 1 Wikipedia result

            # Combine and re-rank all evidence
            all_evidence = rss_evidence + wiki_evidence
            all_evidence.sort(key=lambda x: x['scores']['overall'], reverse=True)

            # Select final evidence
            final_evidence = all_evidence[:max_evidence_per_claim]

            # Format evidence for output; ranks are renumbered after the merge
            evidence_sources = []
            for rank, evidence in enumerate(final_evidence, start=1):
                article = evidence['article']
                content = article.get('content', article.get('summary', ''))

                evidence_sources.append({
                    'rank': rank,
                    'title': article['title'],
                    'url': article['url'],
                    'source_name': article['source_name'],
                    'published_date': article.get('published_date'),
                    'credibility_score': article['credibility_score'],
                    'relevance_score': evidence['scores']['relevance'],
                    'overall_score': evidence['scores']['overall'],
                    'key_snippets': extract_relevant_snippets(claim, content),
                    'content_type': article['content_type']
                })

            pipeline_result['evidence_by_claim'][claim] = {
                'claim': claim,
                'evidence_sources': evidence_sources
            }

        # Step 3: Cache results (mock)
        pipeline_result['steps'].append({
            'step': 'evidence_caching',
            'result': {
                'cached_articles': len(all_rss_articles) + wiki_articles_seen,
                'cache_timestamp': datetime.now().isoformat(),
                'cache_ttl_hours': 24
            }
        })

        # Final summary
        total_evidence = sum(
            len(claim_data['evidence_sources'])
            for claim_data in pipeline_result['evidence_by_claim'].values()
        )

        pipeline_result['final_output'] = {
            'claims_processed': len(claims),
            'total_evidence_retrieved': total_evidence,
            'ready_for_phase4': total_evidence > 0
        }
        pipeline_result['status'] = 'success'

    except Exception as e:
        # Top-level boundary of the demo pipeline: report instead of raising
        pipeline_result['error'] = str(e)
        pipeline_result['status'] = 'failed'

    return pipeline_result

# Test complete Phase 3 pipeline
print("=== Testing Complete Phase 3 Pipeline ===")
# `test_claims` maps a short label to the claim text; the pipeline expects the
# claim texts themselves, so pass the dict's values rather than the dict.
phase3_result = phase3_pipeline(list(test_claims.values()))

# Print summary
print(f"Status: {phase3_result['status']}")
if phase3_result['status'] == 'success':
    print(f"Claims processed: {phase3_result['final_output']['claims_processed']}")
    print(f"Total evidence retrieved: {phase3_result['final_output']['total_evidence_retrieved']}")
    print(f"Ready for Phase 4: {phase3_result['final_output']['ready_for_phase4']}")
else:
    # A failed run has no 'final_output'; show the recorded error instead
    print(f"Error: {phase3_result.get('error')}")

# Print evidence for each claim (on failure this holds the partial results)
for claim, evidence_data in phase3_result['evidence_by_claim'].items():
    print(f"\nClaim: {claim}")
    print(f"Evidence sources found: {len(evidence_data['evidence_sources'])}")

    for evidence in evidence_data['evidence_sources']:
        print(f"  {evidence['rank']}. {evidence['title']} ({evidence['source_name']})")
        print(f"     Score: {evidence['overall_score']:.3f} | Credibility: {evidence['credibility_score']}")
        if evidence['key_snippets']:
            print(f"     Key snippet: {evidence['key_snippets'][0][:100]}...")