In [1]:
#!/usr/bin/env python3
"""
Science Live - Zenodo CSV Screening Example

This script demonstrates the full workflow:
1. Load papers from Zenodo CSV
2. Enrich missing abstracts from OpenAlex
3. Screen papers against PICO criteria (with checkpoint/resume support)
4. Export results

Run locally where you have network access to Zenodo/OpenAlex.

CHECKPOINT SUPPORT:
- Saves results after each paper to checkpoint.jsonl
- If interrupted, just run again - already-screened DOIs are skipped
- To start fresh: clear_checkpoint(str(CHECKPOINT_FILE))
"""

from screeningPaper import (
    PICOScreener,
    Paper,
    load_papers_from_zenodo_csv,
    load_papers_from_csv_file,
    enrich_paper_from_openalex,
    fetch_pdfs_for_papers,
    load_results_from_checkpoint,
    clear_checkpoint,
    checkpoint_summary,
    clean_checkpoint,           # NEW: Remove bad entries so they get re-screened
    find_suspicious_entries,    # NEW: Find entries that need re-screening
)
import json
import csv
from pathlib import Path


# =============================================================================
# CONFIGURATION
# =============================================================================

PICO_NANOPUB_URI = "https://w3id.org/np/RAjO8tdVOla9I77PeXF4iY92ULngrpx5_ZSKFkVrCmsW0"
ZENODO_CSV_URL = None

CSV_FILE = "wildfire-sentinel2-ml/screening_results/included_studies.csv"


PICO_NANOPUB_URI = "https://w3id.org/np/RAqmVeNbWgL7sNtsr9GqdX0ZTa6aQf3itQmort-JMy4tM"
ZENODO_CSV_URL = None

CSV_FILE = "pets-biodiversity/screening_results/included_studies.csv"

# Project folder - all files go here
PROJECT_FOLDER = Path("./wildfire-sentinel2-ml")
PDF_FOLDER = PROJECT_FOLDER / "pdfs"
RESULTS_FOLDER = PROJECT_FOLDER / "scanning_results"
CHECKPOINT_FILE = RESULTS_FOLDER / "checkpoint.jsonl"
DEBUG_FOLDER = PROJECT_FOLDER / "debug"  # Saves raw LLM responses for uncertain parses


# =============================================================================
# STEP 1: Load papers from Zenodo CSV
# =============================================================================

def step1_load_papers(limit: int = None) -> list:
    """Load papers from Zenodo CSV with abstract enrichment."""
    
    print("=" * 60)
    print("STEP 1: Loading papers from Zenodo CSV")
    print("=" * 60)

    if ZENODO_CSV_URL is not None:
        papers = load_papers_from_zenodo_csv(
            url=ZENODO_CSV_URL,
            enrich_missing_abstracts=True,  # Fetch from OpenAlex if missing
            limit=limit  # None for all papers
        )
    else:    
        papers = load_papers_from_csv_file(
            filepath=CSV_FILE,
            enrich_missing=True,  # Fetch from OpenAlex if missing
        )
    
    # Summary
    with_abstract = sum(1 for p in papers if p.abstract)
    with_doi = sum(1 for p in papers if p.doi)
    
    print(f"\nSummary:")
    print(f"  Total papers: {len(papers)}")
    print(f"  With DOI: {with_doi}")
    print(f"  With abstract: {with_abstract}")
    
    return papers


# =============================================================================
# STEP 2: Pre-download PDFs (optional but recommended)
# =============================================================================

def step2_download_pdfs(papers: list) -> dict:
    """Pre-download PDFs from Unpaywall for faster screening."""
    
    print("\n" + "=" * 60)
    print("STEP 2: Downloading PDFs (optional)")
    print("=" * 60)
    
    PDF_FOLDER.mkdir(parents=True, exist_ok=True)
    
    stats = fetch_pdfs_for_papers(
        papers=papers,
        pdf_dir=str(PDF_FOLDER),
        skip_existing=True  # Don't re-download
    )
    
    return stats


# =============================================================================
# STEP 3: Screen papers against PICO (with checkpoint support)
# =============================================================================

def step3_screen_papers(papers: list) -> list:
    """Screen papers against PICO research question with checkpoint/resume."""
    
    print("\n" + "=" * 60)
    print("STEP 3: Screening papers against PICO")
    print("=" * 60)
    
    # Ensure results folder exists for checkpoint
    RESULTS_FOLDER.mkdir(parents=True, exist_ok=True)
    
    # Check existing progress
    if CHECKPOINT_FILE.exists():
        summary = checkpoint_summary(str(CHECKPOINT_FILE))
        print(f"\nüìÇ Existing checkpoint found:")
        print(f"   Already screened: {summary['total']}")
        print(f"   INCLUDE: {summary['include']}, EXCLUDE: {summary['exclude']}, ERROR: {summary['error']}")
        print(f"   Remaining: {len(papers) - summary['total']}")
    
    # Create screener
    screener = PICOScreener.from_nanopub_url(
        PICO_NANOPUB_URI,
        pdf_folder=str(PDF_FOLDER),
        model="qwen2.5:14b",  # Or llama3.1:8b, mistral, etc.
        char_limit=25000,
        debug_dir=str(DEBUG_FOLDER)  # Saves raw responses for uncertain parses
    )
    
    print(f"\n{screener.get_pico_summary()}")
    
    # Check Ollama
    if not screener.is_ollama_available():
        print("\n‚ö†Ô∏è  Ollama not available!")
        print("   Start with: ollama run qwen2.5:14b")
        return []
    
    # Screen papers (with checkpoint - will skip already-screened DOIs)
    screener.screen_papers(
        papers, 
        verbose=True,
        checkpoint_file=str(CHECKPOINT_FILE)
    )
    
    # Load ALL results from checkpoint (includes previous runs)
    all_results = load_results_from_checkpoint(str(CHECKPOINT_FILE))
    
    return all_results


# =============================================================================
# STEP 4: Export results
# =============================================================================

def step4_export_results(papers: list, results: list):
    """Export screening results to various formats."""
    
    print("\n" + "=" * 60)
    print("STEP 4: Exporting results")
    print("=" * 60)
    
    RESULTS_FOLDER.mkdir(parents=True, exist_ok=True)
    
    # Create lookup for paper metadata by DOI
    paper_by_doi = {p.doi: p for p in papers if p.doi}
    
    # Combine results with paper metadata
    combined = []
    for result in results:
        paper = paper_by_doi.get(result.paper_doi)
        combined.append({
            "doi": result.paper_doi,
            "title": result.paper_title,
            "year": paper.year if paper else "",
            "decision": result.decision,
            "confidence": result.confidence,
            "reason": result.reason,
            "exclusion_code": result.exclusion_code,
            "matched_population": result.matched_population,
            "matched_intervention": result.matched_intervention,
            "screening_time_ms": result.screening_time_ms
        })
    
    if not combined:
        print("No results to export.")
        return
    
    # Export JSON (full details)
    json_path = RESULTS_FOLDER / "screening_results.json"
    with open(json_path, "w") as f:
        json.dump(combined, f, indent=2)
    print(f"  Saved: {json_path}")
    
    # Export CSV (summary)
    csv_path = RESULTS_FOLDER / "screening_results.csv"
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=combined[0].keys())
        writer.writeheader()
        writer.writerows(combined)
    print(f"  Saved: {csv_path}")
    
    # Export included/excluded lists
    included = [c for c in combined if c["decision"] == "INCLUDE"]
    excluded = [c for c in combined if c["decision"] == "EXCLUDE"]
    
    included_path = RESULTS_FOLDER / "included_papers.csv"
    with open(included_path, "w", newline="") as f:
        if included:
            writer = csv.DictWriter(f, fieldnames=["doi", "title", "year", "confidence", "reason"])
            writer.writeheader()
            writer.writerows([{k: c[k] for k in ["doi", "title", "year", "confidence", "reason"]} for c in included])
    print(f"  Saved: {included_path}")
    
    excluded_path = RESULTS_FOLDER / "excluded_papers.csv"
    with open(excluded_path, "w", newline="") as f:
        if excluded:
            writer = csv.DictWriter(f, fieldnames=["doi", "title", "year", "exclusion_code", "reason"])
            writer.writeheader()
            writer.writerows([{k: c[k] for k in ["doi", "title", "year", "exclusion_code", "reason"]} for c in excluded])
    print(f"  Saved: {excluded_path}")
    
    # Print summary
    print(f"\nüìä SCREENING SUMMARY:")
    print(f"   Total screened: {len(results)}")
    print(f"   INCLUDE: {len(included)} ({100*len(included)/len(results):.1f}%)")
    print(f"   EXCLUDE: {len(excluded)} ({100*len(excluded)/len(results):.1f}%)")
    
    # Exclusion breakdown
    if excluded:
        print(f"\n   Exclusion reasons:")
        from collections import Counter
        codes = Counter(c["exclusion_code"] for c in excluded if c["exclusion_code"])
        for code, count in codes.most_common():
            print(f"     {code}: {count}")


# =============================================================================
# MAIN
# =============================================================================

def main():
    """Run the full screening pipeline."""
    
    print("\nüî¨ SCIENCE LIVE - Systematic Review Screening Pipeline")
    print("=" * 60)
    
    # Create project folder
    PROJECT_FOLDER.mkdir(parents=True, exist_ok=True)
    print(f"Project folder: {PROJECT_FOLDER.absolute()}")
    print(f"Checkpoint: {CHECKPOINT_FILE}")
    print(f"Debug responses: {DEBUG_FOLDER} (for uncertain parses)")
    
    # =========================================================================
    # CHECKPOINT MANAGEMENT OPTIONS (uncomment as needed)
    # =========================================================================
    
    # Option A: Start completely fresh
    # clear_checkpoint(str(CHECKPOINT_FILE))
    
    # Option B: Clean bad entries (empty reasons, suspicious INCLUDEs) 
    #           so they get re-screened with improved parsing
    # clean_checkpoint(str(CHECKPOINT_FILE))
    
    # Option C: Just see what's suspicious
    # suspicious = find_suspicious_entries(str(CHECKPOINT_FILE))
    # print(f"Found {len(suspicious)} suspicious entries:")
    # for s in suspicious[:5]:  # Show first 5
    #     print(f"  - {s['paper_title'][:50]}... ({s['decision']}, conf={s['confidence']})")
    
    # =========================================================================
    
    # Step 1: Load papers
    papers = step1_load_papers(limit=None)  # Set limit=10 for testing
    
    if not papers:
        print("No papers loaded. Exiting.")
        return
    
    # Step 2: Download PDFs (optional - screening will download on-demand)
    # Uncomment to pre-download:
    # step2_download_pdfs(papers)
    
    # Step 3: Screen papers (with checkpoint - safe to interrupt and resume)
    results = step3_screen_papers(papers)
    
    if not results:
        print("No results. Check Ollama status.")
        return
    
    # Step 4: Export
    step4_export_results(papers, results)
    
    print("\n‚úÖ Pipeline complete!")
    print(f"   Results in: {RESULTS_FOLDER.absolute()}")


if __name__ == "__main__":
    main()


üî¨ SCIENCE LIVE - Systematic Review Screening Pipeline
Project folder: /Users/annef/Documents/FAIR2Adapt/systematic-review-pipeline/notebooks/wildfire-sentinel2-ml
Checkpoint: wildfire-sentinel2-ml/scanning_results/checkpoint.jsonl
Debug responses: wildfire-sentinel2-ml/debug (for uncertain parses)
STEP 1: Loading papers from Zenodo CSV
Loaded 7 papers from pets-biodiversity/screening_results/included_studies.csv
  Missing abstracts: 7
Enriching from OpenAlex...
  Enriched 7 papers

Summary:
  Total papers: 7
  With DOI: 7
  With abstract: 7

STEP 3: Screening papers against PICO

üìÇ Existing checkpoint found:
   Already screened: 150
   INCLUDE: 73, EXCLUDE: 77, ERROR: 0
   Remaining: -143
Fetching PICO from: https://w3id.org/np/RAqmVeNbWgL7sNtsr9GqdX0ZTa6aQf3itQmort-JMy4tM
‚úì Loaded: Privacy-Enhancing Technologies for Geospatial Biodiversity Data Sharing: A Scoping Review

PICO Research Question
Title: Privacy-Enhancing Technologies for Geospatial Biodiversity Data Sharing: A S