# Phase 2 — Claim Extraction & Ranking

This notebook tests and debugs the claim extraction and ranking pipeline:
- Detect claim-bearing text and extract claim spans
- Extract atomic claims (simple seq2seq / regex rules) and analyze their context
- Rank by check-worthiness → only factual-looking claims pass

## Step 1: Setup and Dependencies

In [1]:
# Phase 2: Claim Extraction and Ranking Testing Notebook
# Purpose: Test claim detection, span extraction, atomicization, context analysis, and ranking
# using actual TruthLens project modules

import sys
import os
from pathlib import Path
from typing import List, Dict, Tuple, Any
import json

# Put the project root on sys.path so the `extractor` package can be resolved.
# When no __file__ is defined (the usual case in a notebook kernel), the parent
# of the working directory is used instead.
if '__file__' in globals():
    project_root = Path(__file__).parent.parent
else:
    project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# First attempt: import the TruthLens claim extraction modules as-is.
try:
    from extractor.claim_detector import is_claim
    from extractor.claim_extractor import extract_claim_spans
    from extractor.atomicizer import to_atomic, AtomicClaim
    from extractor.context import analyze_context
    from extractor.ranker import score_claim
    from extractor.pipeline import process_text
    print("✅ Successfully imported all TruthLens modules")
except ImportError as first_error:
    print(f"❌ Import error: {first_error}")
    print("📁 Current working directory:", os.getcwd())
    print("📁 Python path:", sys.path[:3])
    print("📁 Trying alternative import method...")

    # Second attempt: expose the working directory and its parent as well.
    # Each one is pushed to the front in turn, so the parent ends up first.
    current_dir = Path.cwd()
    parent_dir = current_dir.parent
    for candidate in (current_dir, parent_dir):
        if str(candidate) not in sys.path:
            sys.path.insert(0, str(candidate))

    try:
        from extractor.claim_detector import is_claim
        from extractor.claim_extractor import extract_claim_spans
        from extractor.atomicizer import to_atomic, AtomicClaim
        from extractor.context import analyze_context
        from extractor.ranker import score_claim
        from extractor.pipeline import process_text
        print("✅ Successfully imported TruthLens modules (alternative method)")
    except ImportError as second_error:
        print(f"❌ Still failing: {second_error}")
        # Last resort: list the sub-directories of the working directory to
        # help locate the package by hand. The import error is not re-raised.
        print("📂 Available directories:")
        for entry in Path.cwd().iterdir():
            if not entry.is_dir():
                continue
            print(f"  {entry.name}/")

# Test data for claim extraction and ranking.
# Each label names the kind of statement the text represents; later cells
# iterate over this mapping and key their results by the same labels.
test_texts = {
    'factual_claim': "COVID-19 vaccines reduce hospitalization rates by 90%.",
    'false_claim': "5G towers cause COVID-19 transmission through radio waves.",
    'opinion': "Pizza is the best food in the world.",
    'conspiracy': "The government uses vaccines to implant tracking microchips.",
    'mixed_content': "The WHO says vaccines are safe, but some Twitter users claim they cause autism.",
    'statistical': "The unemployment rate decreased by 2.3% in Q3 2023.",
    'conditional': "If global temperatures rise by 2 degrees, sea levels may increase significantly.",
    'attributed': "According to Dr. Smith, vitamin D supplements boost immunity.",
    'hedged': "Studies suggest that exercise might reduce depression symptoms.",
    'direct_quote': "The president stated that 'inflation will be under control by next year.'"
}

# The section banners use a real newline ("\n"); the previous "\\n" printed a
# literal backslash-n in front of each banner.
print("\n📊 Test Dataset Loaded:")
print(f"  • Total test texts: {len(test_texts)}")
print(f"  • Text types: {', '.join(test_texts.keys())}")
print("\n🎯 Phase 2 Testing Focus:")
print("  • Claim detection (is_claim)")
print("  • Span extraction (extract_claim_spans)")
print("  • Atomicization (to_atomic)")
print("  • Context analysis (analyze_context)")
print("  • Claim ranking (score_claim)")
print("  • Complete pipeline (process_text)")
print("\n✅ Ready to test TruthLens Phase 2 functionality!")

✅ Successfully imported all TruthLens modules
\n📊 Test Dataset Loaded:
  • Total test texts: 10
  • Text types: factual_claim, false_claim, opinion, conspiracy, mixed_content, statistical, conditional, attributed, hedged, direct_quote
\n🎯 Phase 2 Testing Focus:
  • Claim detection (is_claim)
  • Span extraction (extract_claim_spans)
  • Atomicization (to_atomic)
  • Context analysis (analyze_context)
  • Claim ranking (score_claim)
  • Complete pipeline (process_text)
\n✅ Ready to test TruthLens Phase 2 functionality!


## Step 2: Sentence-Level Claim Detection

In [1]:
def test_claim_detection(texts: Dict[str, str]) -> Dict[str, Dict[str, any]]:
    """
    Test claim detection using TruthLens is_claim function
    
    Args:
        texts: Dictionary of test texts
        
    Returns:
        Dictionary with detection results for each text
    """
    results = {}
    
    print("=== Step 1: Claim Detection Using TruthLens ===\n")
    
    for label, text in texts.items():
        try:
            # Use TruthLens claim detector
            is_claim_detected, confidence = is_claim(text)
            
            result = {
                'text': text,
                'is_claim': is_claim_detected,
                'confidence': confidence,
                'threshold': 0.6,  # Default threshold used by is_claim
                'status': 'success',
                'method': 'truthlens_claim_detector'
            }
            
            # Add interpretation
            if is_claim_detected:
                if confidence > 0.8:
                    interpretation = "Strong claim detected"
                elif confidence > 0.6:
                    interpretation = "Moderate claim detected"
                else:
                    interpretation = "Weak claim detected"
            else:
                interpretation = "No factual claim detected"
                
            result['interpretation'] = interpretation
            results[label] = result
            
            print(f"📊 {label.upper()}")
            print(f"Text: {text}")
            print(f"Is Claim: {is_claim_detected} (confidence: {confidence:.3f})")
            print(f"Interpretation: {interpretation}")
            print(f"Method: TruthLens ML-based claim detector")
            print("-" * 60)
            
        except Exception as e:
            results[label] = {
                'text': text,
                'error': str(e),
                'status': 'failed'
            }
            print(f"❌ Error processing {label}: {e}")
    
    return results

# Run claim detection tests
claim_detection_results = test_claim_detection(test_texts)

# Summary statistics
total_texts = len(test_texts)
successful_tests = sum(1 for r in claim_detection_results.values() if r.get('status') == 'success')
detected_claims = sum(1 for r in claim_detection_results.values() if r.get('is_claim', False))
# Guard the percentage so an empty dataset does not raise ZeroDivisionError.
detection_rate = detected_claims / total_texts * 100 if total_texts else 0.0

print(f"\n📈 CLAIM DETECTION SUMMARY:")
print(f"✅ Successful tests: {successful_tests}/{total_texts}")
print(f"🎯 Claims detected: {detected_claims}/{total_texts}")
print(f"📊 Detection rate: {detection_rate:.1f}%")

# Expected vs Actual analysis
# The labels must be keys of test_texts. The previous list referenced
# 'conspiracy_claim' and 'complex_claim', which do not exist in the dataset,
# so they were always reported as missed and 'conspiracy' as a false positive.
# NOTE(review): the direct assertions are listed here; the conditional,
# attributed, hedged and quoted texts are left out as ambiguous — adjust if
# the intended ground truth differs.
expected_claims = ['factual_claim', 'false_claim', 'conspiracy', 'mixed_content', 'statistical']
unknown_labels = [label for label in expected_claims if label not in test_texts]
if unknown_labels:
    print(f"⚠️ Expected labels not present in test_texts: {unknown_labels}")
actual_claims = [label for label, result in claim_detection_results.items() if result.get('is_claim', False)]

print(f"\n🔍 ANALYSIS:")
print(f"Expected claims: {expected_claims}")
print(f"Detected claims: {actual_claims}")

correctly_detected = set(expected_claims) & set(actual_claims)
missed_claims = set(expected_claims) - set(actual_claims)
false_positives = set(actual_claims) - set(expected_claims)

if correctly_detected:
    print(f"✅ Correctly detected: {list(correctly_detected)}")
if missed_claims:
    print(f"❌ Missed claims: {list(missed_claims)}")
if false_positives:
    print(f"⚠️ False positives: {list(false_positives)}")

NameError: name 'Dict' is not defined

## Step 3: Claim Span Extraction

In [None]:
def test_claim_span_extraction(texts: Dict[str, str]) -> Dict[str, Dict[str, any]]:
    """
    Test 
    claim span extraction using TruthLens extract_claim_spans function
    
    Args:
        texts: Dictionary of test texts
        
    Returns:
        Dictionary with span extraction results
    """
    results = {}
    
    print("=== Step 2: Claim Span Extraction Using TruthLens ===\n")
    
    for label, text in texts.items():
        try:
            # Use TruthLens claim span extractor
            spans = extract_claim_spans(text)
            
            result = {
                'text': text,
                'spans': spans,
                'num_spans': len(spans),
                'status': 'success',
                'method': 'truthlens_span_extractor'
            }
            
            # Process spans for display
            processed_spans = []
            for span in spans:
                if hasattr(span, '__dict__'):
                    # If span is an object with attributes
                    span_data = {
                        'text': getattr(span, 'text', ''),
                        'start': getattr(span, 'start', 0),
                        'end': getattr(span, 'end', 0),
                        'confidence': getattr(span, 'conf', 0.0)
                    }
                elif isinstance(span, dict):
                    # If span is a dictionary
                    span_data = {
                        'text': span.get('text', ''),
                        'start': span.get('start', 0),
                        'end': span.get('end', 0),
                        'confidence': span.get('conf', 0.0)
                    }
                else:
                    # Fallback for other types
                    span_data = {'text': str(span), 'start': 0, 'end': len(str(span)), 'confidence': 0.5}
                
                processed_spans.append(span_data)
            
            result['processed_spans'] = processed_spans
            results[label] = result
            
            print(f"🎯 {label.upper()}")
            print(f"Text: {text}")
            print(f"Number of spans found: {len(spans)}")
            
            if processed_spans:
                for i, span_data in enumerate(processed_spans, 1):
                    print(f"  Span {i}: '{span_data['text']}'")
                    print(f"    Position: {span_data['start']}-{span_data['end']}")
                    print(f"    Confidence: {span_data['confidence']:.3f}")
            else:
                print("  No claim spans extracted")
                
            print(f"Method: TruthLens BIO/ML-based span extractor")
            print("-" * 60)
            
        except Exception as e:
            results[label] = {
                'text': text,
                'error': str(e),
                'status': 'failed'
            }
            print(f"❌ Error processing {label}: {e}")
    
    return results

# Execute the span extraction test over the shared dataset.
span_extraction_results = test_claim_span_extraction(test_texts)

# Aggregate counts over the runs that completed without an error.
successful_extractions = 0
total_spans = 0
for outcome in span_extraction_results.values():
    if outcome.get('status') == 'success':
        successful_extractions += 1
        total_spans += outcome.get('num_spans', 0)

if successful_extractions > 0:
    average_spans = total_spans / successful_extractions
else:
    average_spans = 0

print(f"\n📈 SPAN EXTRACTION SUMMARY:")
print(f"✅ Successful extractions: {successful_extractions}/{len(test_texts)}")
print(f"🎯 Total spans extracted: {total_spans}")
print(f"📊 Average spans per text: {average_spans:.1f}")

# Labels that yielded at least one span, highest span count first
# (ties keep their original order because the sort is stable).
texts_with_spans = sorted(
    (
        (label, outcome.get('num_spans', 0))
        for label, outcome in span_extraction_results.items()
        if outcome.get('status') == 'success' and outcome.get('num_spans', 0) > 0
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

print(f"\n🔍 TEXTS WITH EXTRACTED SPANS:")
for label, count in texts_with_spans:
    quoted = []
    for span_record in span_extraction_results[label].get('processed_spans', []):
        quoted.append(f"'{span_record['text']}'")
    print(f"- {label}: {count} span(s) - {', '.join(quoted)}")

if len(texts_with_spans) == 0:
    print("- No spans were extracted from any texts")

## Step 4: Atomic Claim Decomposition

In [None]:
def test_atomic_decomposition(texts: Dict[str, str]) -> Dict[str, Dict[str, any]]:
    """
    Test atomic claim decomposition using TruthLens to_atomic function
    
    Args:
        texts: Dictionary of test texts
        
    Returns:
        Dictionary with atomic decomposition results
    """
    results = {}
    
    print("=== Step 3: Atomic Claim Decomposition Using TruthLens ===\n")
    
    # Add a specific test for complex claims that should decompose well
    complex_test = "Microsoft bought Activision Blizzard for $68.7 billion and Sony acquired Bungie for $3.6 billion in 2022."
    test_set = {**texts, 'complex_acquisition': complex_test}
    
    for label, text in test_set.items():
        try:
            # Use TruthLens atomic decomposer
            # to_atomic expects (span, srl_frames) - we'll pass None for srl_frames for simplicity
            atomic_claims = to_atomic(text, None)
            
            result = {
                'text': text,
                'atomic_claims': atomic_claims,
                'num_atomic_claims': len(atomic_claims) if atomic_claims else 0,
                'status': 'success',
                'method': 'truthlens_atomicizer'
            }
            
            # Process atomic claims for display
            processed_claims = []
            if atomic_claims:
                for claim in atomic_claims:
                    if isinstance(claim, dict):
                        processed_claim = {
                            'text': claim.get('text', ''),
                            'subject': claim.get('subject', ''),
                            'predicate': claim.get('predicate', ''),
                            'object': claim.get('object', '')
                        }
                    else:
                        # Fallback if claim is not a dict
                        processed_claim = {
                            'text': str(claim),
                            'subject': '',
                            'predicate': '',
                            'object': ''
                        }
                    processed_claims.append(processed_claim)
            
            result['processed_claims'] = processed_claims
            results[label] = result
            
            print(f"⚛️ {label.upper()}")
            print(f"Original: {text}")
            print(f"Number of atomic claims: {len(atomic_claims) if atomic_claims else 0}")
            
            if processed_claims:
                for i, claim in enumerate(processed_claims, 1):
                    print(f"  Atomic Claim {i}:")
                    print(f"    Text: '{claim['text']}'")
                    if claim['subject'] or claim['predicate'] or claim['object']:
                        print(f"    Subject: '{claim['subject']}'")
                        print(f"    Predicate: '{claim['predicate']}'")
                        print(f"    Object: '{claim['object']}'")
            else:
                print("  No atomic decomposition available")
                
            print(f"Method: TruthLens rule-based + LLM atomicizer")
            print("-" * 60)
            
        except Exception as e:
            results[label] = {
                'text': text,
                'error': str(e),
                'status': 'failed'
            }
            print(f"❌ Error processing {label}: {e}")
    
    return results

# Run atomic decomposition tests
atomic_results = test_atomic_decomposition(test_texts)

# Summary statistics
successful_decompositions = sum(1 for r in atomic_results.values() if r.get('status') == 'success')
total_atomic_claims = sum(r.get('num_atomic_claims', 0) for r in atomic_results.values() if r.get('status') == 'success')

print(f"\n📈 ATOMIC DECOMPOSITION SUMMARY:")
print(f"✅ Successful decompositions: {successful_decompositions}/{len(atomic_results)}")
print(f"⚛️ Total atomic claims generated: {total_atomic_claims}")
print(f"📊 Average atomic claims per text: {total_atomic_claims/successful_decompositions if successful_decompositions > 0 else 0:.1f}")

# Show texts that decomposed into multiple claims
multi_claim_texts = [(label, r.get('num_atomic_claims', 0)) for label, r in atomic_results.items() 
                    if r.get('status') == 'success' and r.get('num_atomic_claims', 0) > 1]
multi_claim_texts.sort(key=lambda x: x[1], reverse=True)

print(f"\n🔍 TEXTS WITH MULTIPLE ATOMIC CLAIMS:")
for label, count in multi_claim_texts:
    print(f"- {label}: {count} atomic claims")
    for i, claim in enumerate(atomic_results[label].get('processed_claims', []), 1):
        print(f"  {i}. {claim['text']}")

if not multi_claim_texts:
    print("- No texts were decomposed into multiple atomic claims")
    
print(f"\n🎯 Best decomposition examples:")
# Find the most successful decompositions: rank by number of atomic claims so
# the "top 2" really are the richest decompositions (previously the first two
# entries in insertion order were shown). The sort is stable, so ties keep
# their original order.
best_examples = sorted(
    ((label, r) for label, r in atomic_results.items()
     if r.get('status') == 'success' and r.get('num_atomic_claims', 0) > 0),
    key=lambda item: item[1].get('num_atomic_claims', 0),
    reverse=True,
)

for label, result in best_examples[:2]:  # Show top 2
    claims = result.get('processed_claims', [])
    if claims:
        print(f"- {label}: {result['text'][:50]}...")
        for claim in claims:
            # Only fully structured claims are shown as a triple.
            if claim['subject'] and claim['predicate'] and claim['object']:
                print(f"  → {claim['subject']} | {claim['predicate']} | {claim['object']}")

## Step 5: Context Analysis

In [None]:
def test_context_analysis(texts: Dict[str, str]) -> Dict[str, Dict[str, any]]:
    """
    Test context analysis using TruthLens analyze_context function
    
    Args:
        texts: Dictionary of test texts
        
    Returns:
        Dictionary with context analysis results
    """
    results = {}
    
    print("=== Step 4: Context Analysis Using TruthLens ===\n")
    
    # Create specific test cases for context analysis
    context_test_cases = {
        'definitive': "The WHO confirms that COVID-19 vaccines are safe and effective.",
        'speculative': "Some experts believe that vaccines might cause rare side effects.",
        'attributed': "According to a Twitter post, vaccines are dangerous.",
        'hedged': "It appears that social media may be spreading misinformation.",
        'uncertain': "There could be unknown long-term effects of vaccines.",
        'direct': "Vaccines reduce hospitalization by 90%."
    }
    
    # Combine with original texts for comprehensive testing
    all_test_cases = {**texts, **context_test_cases}
    
    for label, text in all_test_cases.items():
        try:
            # Use TruthLens context analyzer
            # analyze_context expects (claim, sentence) - we'll use text for both
            context_result = analyze_context(text, text)
            
            result = {
                'text': text,
                'context_analysis': context_result,
                'status': 'success',
                'method': 'truthlens_context_analyzer'
            }
            
            # Extract key context features
            if isinstance(context_result, dict):
                modality = context_result.get('modality', 'unknown')
                attribution = context_result.get('attribution', 'none')
                certainty = context_result.get('certainty', 'unknown')
                hedging = context_result.get('hedging', False)
                
                result.update({
                    'modality': modality,
                    'attribution': attribution, 
                    'certainty': certainty,
                    'hedging': hedging
                })
            else:
                result.update({
                    'modality': 'unknown',
                    'attribution': 'none',
                    'certainty': 'unknown',
                    'hedging': False
                })
            
            results[label] = result
            
            print(f"🔍 {label.upper()}")
            print(f"Text: {text}")
            if isinstance(context_result, dict):
                print(f"Context Analysis:")
                for key, value in context_result.items():
                    print(f"  {key}: {value}")
            else:
                print(f"Context result: {context_result}")
            print(f"Method: TruthLens linguistic context analyzer")
            print("-" * 60)
            
        except Exception as e:
            results[label] = {
                'text': text,
                'error': str(e),
                'status': 'failed'
            }
            print(f"❌ Error processing {label}: {e}")
    
    return results

# Execute the context analysis test over the shared dataset.
context_results = test_context_analysis(test_texts)

# Count the analyses that completed without an error.
successful_analyses = len(
    [entry for entry in context_results.values() if entry.get('status') == 'success']
)

print("\n📈 CONTEXT ANALYSIS SUMMARY:")
print(f"✅ Successful analyses: {successful_analyses}/{len(context_results)}")

# Bucket the successful labels by modality and by attribution.
modality_groups = {}
attribution_groups = {}

for label, result in context_results.items():
    if result.get('status') != 'success':
        continue
    modality_groups.setdefault(result.get('modality', 'unknown'), []).append(label)
    attribution_groups.setdefault(result.get('attribution', 'none'), []).append(label)

print("\n🔍 MODALITY DISTRIBUTION:")
for modality in modality_groups:
    print(f"- {modality}: {modality_groups[modality]}")

print("\n🔍 ATTRIBUTION DISTRIBUTION:")
for attribution in attribution_groups:
    print(f"- {attribution}: {attribution_groups[attribution]}")

# Highlight entries that are hedged, attributed, or not plainly definitive.
print("\n🎯 INTERESTING CONTEXT FEATURES:")
for label, result in context_results.items():
    if result.get('status') != 'success':
        continue

    features = []
    if result.get('hedging'):
        features.append('hedged')

    source = result.get('attribution')
    if source and source != 'none':
        features.append(f"attributed to: {source}")

    mode = result.get('modality')
    if mode and mode != 'definitive':
        features.append(f"modality: {mode}")

    if features:
        print(f"- {label}: {', '.join(features)}")

print("\n✅ Context analysis helps determine:")
for takeaway in (
    "  • How certain/definitive claims are",
    "  • What sources are attributed",
    "  • Language hedging and uncertainty markers",
    "  • Modality (definitive, speculative, etc.)",
):
    print(takeaway)

## Step 6: Check-worthiness Scoring and Ranking

In [None]:
def test_claim_scoring(texts: Dict[str, str]) -> Dict[str, Dict[str, any]]:
    """
    Test claim scoring/ranking using TruthLens score_claim function
    
    Args:
        texts: Dictionary of test texts
        
    Returns:
        Dictionary with claim scoring results
    """
    results = {}
    
    print("=== Step 5: Claim Scoring/Ranking Using TruthLens ===\n")
    
    # Add specific high-priority claims for testing
    priority_test_cases = {
        'high_priority_medical': "COVID-19 vaccines cause blood clots in 50% of recipients.",
        'high_priority_tech': "5G towers emit radiation that causes cancer.",
        'high_priority_conspiracy': "The government is using vaccines to implant microchips for tracking.",
        'low_priority_opinion': "Pizza is the best food in the world.",
        'low_priority_greeting': "Have a wonderful day!",
        'medium_priority_politics': "The election results were manipulated by foreign interference."
    }
    
    # Combine with original texts
    all_test_cases = {**texts, **priority_test_cases}
    
    scored_claims = []
    
    for label, text in all_test_cases.items():
        try:
            # Use TruthLens claim scorer
            score = score_claim(text)
            
            result = {
                'text': text,
                'score': score,
                'status': 'success',
                'method': 'truthlens_claim_scorer'
            }
            
            # Categorize priority based on score
            if score >= 0.8:
                priority = "High Priority"
                urgency = "🔴 Immediate fact-check needed"
            elif score >= 0.6:
                priority = "Medium Priority" 
                urgency = "🟡 Should be fact-checked"
            elif score >= 0.4:
                priority = "Low-Medium Priority"
                urgency = "🟢 Consider for fact-checking"
            else:
                priority = "Low Priority"
                urgency = "⚪ Likely not worth fact-checking"
            
            result.update({
                'priority': priority,
                'urgency': urgency
            })
            
            results[label] = result
            scored_claims.append((label, score, text))
            
            print(f"🎯 {label.upper()}")
            print(f"Text: {text}")
            print(f"Check-worthiness Score: {score:.3f}")
            print(f"Priority: {priority}")
            print(f"Urgency: {urgency}")
            print(f"Method: TruthLens ML/heuristic claim scorer")
            print("-" * 60)
            
        except Exception as e:
            results[label] = {
                'text': text,
                'error': str(e),
                'status': 'failed'
            }
            print(f"❌ Error processing {label}: {e}")
    
    return results, scored_claims

# Execute the scoring test; it returns the per-label results and the flat
# (label, score, text) list.
scoring_results, scored_claims = test_claim_scoring(test_texts)

# Rank in place, most check-worthy first.
scored_claims.sort(key=lambda triple: triple[1], reverse=True)

# Summary statistics over the runs that completed without an error.
successful_scorings = 0
score_total = 0
for outcome in scoring_results.values():
    if outcome.get('status') == 'success':
        successful_scorings += 1
        score_total += outcome.get('score', 0)
avg_score = score_total / successful_scorings if successful_scorings > 0 else 0

print("\n📈 CLAIM SCORING SUMMARY:")
print(f"✅ Successful scorings: {successful_scorings}/{len(scoring_results)}")
print(f"📊 Average score: {avg_score:.3f}")

# Count how many successful results fall into each priority label.
priority_counts = {}
for outcome in scoring_results.values():
    if outcome.get('status') != 'success':
        continue
    tier = outcome.get('priority', 'Unknown')
    priority_counts[tier] = priority_counts.get(tier, 0) + 1

print("\n🎯 PRIORITY DISTRIBUTION:")
for tier in priority_counts:
    print(f"- {tier}: {priority_counts[tier]} claims")

# The five highest-scoring claims.
print("\n🏆 TOP RANKED CLAIMS (by check-worthiness):")
position = 0
for label, score, text in scored_claims[:5]:
    position += 1
    tier = scoring_results[label].get('priority', 'Unknown')
    print(f"{position}. {label} (Score: {score:.3f}, {tier})")
    print(f"   Text: {text[:80]}...")

# The three lowest-scoring claims (numbered 1-3 in list order).
print("\n📉 LOWEST RANKED CLAIMS:")
position = 0
for label, score, text in scored_claims[-3:]:
    position += 1
    tier = scoring_results[label].get('priority', 'Unknown')
    print(f"{position}. {label} (Score: {score:.3f}, {tier})")
    print(f"   Text: {text[:80]}...")

# Split the raw scores into three coarse bands.
high_scores = []
medium_scores = []
low_scores = []
for _, value, _ in scored_claims:
    if value >= 0.8:
        high_scores.append(value)
    elif 0.4 <= value < 0.8:
        medium_scores.append(value)
    elif value < 0.4:
        low_scores.append(value)

print("\n📊 SCORE DISTRIBUTION ANALYSIS:")
print(f"🔴 High priority (≥0.8): {len(high_scores)} claims")
print(f"🟡 Medium priority (0.4-0.8): {len(medium_scores)} claims")
print(f"⚪ Low priority (<0.4): {len(low_scores)} claims")

for band_label, band_scores in (
    ("🔴 High priority range", high_scores),
    ("🟡 Medium priority range", medium_scores),
    ("⚪ Low priority range", low_scores),
):
    if band_scores:
        print(f"{band_label}: {min(band_scores):.3f} - {max(band_scores):.3f}")

print("\n✅ Claim scoring identifies which claims need immediate fact-checking attention!")

## Step 7: Complete Phase 2 Pipeline

In [None]:
def test_complete_claim_pipeline(texts: Dict[str, str]) -> Dict[str, Dict[str, any]]:
    """
    Test complete claim extraction pipeline using TruthLens process_text function
    
    Args:
        texts: Dictionary of test texts
        
    Returns:
        Dictionary with complete pipeline results
    """
    results = {}
    
    print("=== Step 6: Complete Claim Processing Pipeline ===\n")
    
    # Test with a comprehensive document
    comprehensive_test = """
    Breaking News: Recent studies from Johns Hopkins University show that COVID-19 vaccines 
    reduce hospitalization rates by 90% among vaccinated individuals. However, social media 
    posts claim that vaccines cause severe side effects in 80% of recipients. According to 
    the WHO, these claims are unsubstantiated. Meanwhile, some people believe that 5G towers 
    are somehow connected to the spread of COVID-19, though scientists have debunked this theory.
    The pharmaceutical company Pfizer reported $36.8 billion in revenue from their COVID-19 
    vaccine in 2021. Critics argue that profit motives may influence vaccine recommendations.
    """
    
    # Combine with original texts plus comprehensive test
    all_test_cases = {**texts, 'comprehensive_document': comprehensive_test}
    
    for label, text in all_test_cases.items():
        try:
            # Use TruthLens complete processing pipeline
            pipeline_result = process_text(text)
            
            result = {
                'text': text,
                'pipeline_result': pipeline_result,
                'num_claims_extracted': len(pipeline_result) if pipeline_result else 0,
                'status': 'success',
                'method': 'truthlens_complete_pipeline'
            }
            
            # Process results for display
            processed_claims = []
            if pipeline_result:
                for claim_data in pipeline_result:
                    if isinstance(claim_data, dict):
                        processed_claim = {
                            'id': claim_data.get('id', ''),
                            'text': claim_data.get('text', ''),
                            'subject': claim_data.get('subject', ''),
                            'predicate': claim_data.get('predicate', ''),
                            'object': claim_data.get('object', ''),
                            'checkworthiness': claim_data.get('checkworthiness', 0.0),
                            'context': claim_data.get('context', {})
                        }
                        processed_claims.append(processed_claim)
            
            result['processed_claims'] = processed_claims
            results[label] = result
            
            print(f"🔄 {label.upper()}")
            print(f"Input: {text[:100]}...")
            print(f"Claims extracted: {len(pipeline_result) if pipeline_result else 0}")
            
            if processed_claims:
                # Sort by checkworthiness for display
                sorted_claims = sorted(processed_claims, key=lambda x: x['checkworthiness'], reverse=True)
                
                for i, claim in enumerate(sorted_claims, 1):
                    print(f"\n  Claim {i} (ID: {claim['id'][:8]}...):")
                    print(f"    Text: '{claim['text']}'")
                    print(f"    Checkworthiness: {claim['checkworthiness']:.3f}")
                    
                    if claim['subject'] or claim['predicate'] or claim['object']:
                        print(f"    Structure: {claim['subject']} | {claim['predicate']} | {claim['object']}")
                    
                    if claim['context']:
                        context = claim['context']
                        if isinstance(context, dict):
                            context_features = []
                            if context.get('modality'):
                                context_features.append(f"modality: {context['modality']}")
                            if context.get('attribution'):
                                context_features.append(f"attribution: {context['attribution']}")
                            if context_features:
                                print(f"    Context: {', '.join(context_features)}")
            else:
                print("  No claims extracted by pipeline")
                
            print(f"Method: TruthLens end-to-end pipeline (detection→extraction→atomicization→context→scoring)")
            print("-" * 80)
            
        except Exception as e:
            results[label] = {
                'text': text,
                'error': str(e),
                'status': 'failed'
            }
            print(f"❌ Error processing {label}: {e}")
    
    return results

# Execute the end-to-end pipeline test over the shared dataset.
pipeline_results = test_complete_claim_pipeline(test_texts)

# Aggregate counts over the runs that completed without an error.
successful_pipelines = 0
total_pipeline_claims = 0
for outcome in pipeline_results.values():
    if outcome.get('status') == 'success':
        successful_pipelines += 1
        total_pipeline_claims += outcome.get('num_claims_extracted', 0)

if successful_pipelines > 0:
    average_claims = total_pipeline_claims / successful_pipelines
else:
    average_claims = 0

print("\n📈 COMPLETE PIPELINE SUMMARY:")
print(f"✅ Successful pipeline runs: {successful_pipelines}/{len(pipeline_results)}")
print(f"🎯 Total claims extracted: {total_pipeline_claims}")
print(f"📊 Average claims per document: {average_claims:.1f}")

# Documents that produced at least one claim, most claims first
# (ties keep their original order because the sort is stable).
productive_docs = sorted(
    (
        (label, outcome.get('num_claims_extracted', 0))
        for label, outcome in pipeline_results.items()
        if outcome.get('status') == 'success' and outcome.get('num_claims_extracted', 0) > 0
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

print("\n🏆 MOST PRODUCTIVE DOCUMENTS:")
for label, count in productive_docs[:3]:
    print(f"- {label}: {count} claims extracted")

# Pool every processed claim from the successful runs and rank by score.
all_claims = [
    (claim['text'], claim['checkworthiness'])
    for outcome in pipeline_results.values()
    if outcome.get('status') == 'success'
    for claim in outcome.get('processed_claims', [])
]
all_claims.sort(key=lambda pair: pair[1], reverse=True)

print("\n🎯 HIGHEST PRIORITY CLAIMS (across all documents):")
rank = 0
for claim_text, score in all_claims[:5]:
    rank += 1
    print(f"{rank}. Score {score:.3f}: {claim_text}")

print("\n=== Phase 2 Testing Complete ===")
print("✅ Claim detection, span extraction, atomicization, context analysis, and ranking all tested!")
print("✅ Complete end-to-end pipeline validated with actual TruthLens modules!")
print("✅ Ready to proceed to Phase 3: Evidence Retrieval")