# Database Search Execution via APIs

This notebook executes systematic review searches across multiple free academic databases using their APIs.

**Supported Databases:**
- OpenAlex (replaces Scopus/WoS)
- arXiv
- Semantic Scholar
- PubMed (via Entrez)
- Europe PMC

**Note:** IEEE Xplore API requires institutional subscription - manual search recommended.

# Install required packages
!pip install requests pyalex arxiv semanticscholar biopython -q

In [1]:
import requests
import json
import time
from datetime import datetime
from pathlib import Path
import csv

# Optional imports - will check availability
try:
    from pyalex import Works, config as pyalex_config
    PYALEX_AVAILABLE = True
except ImportError:
    PYALEX_AVAILABLE = False

try:
    import arxiv
    ARXIV_AVAILABLE = True
except ImportError:
    ARXIV_AVAILABLE = False

try:
    from semanticscholar import SemanticScholar
    S2_AVAILABLE = True
except ImportError:
    S2_AVAILABLE = False

try:
    from Bio import Entrez
    ENTREZ_AVAILABLE = True
except ImportError:
    ENTREZ_AVAILABLE = False

print(f"PyAlex available: {PYALEX_AVAILABLE}")
print(f"arXiv available: {ARXIV_AVAILABLE}")
print(f"Semantic Scholar available: {S2_AVAILABLE}")
print(f"Entrez (PubMed) available: {ENTREZ_AVAILABLE}")

PyAlex available: True
arXiv available: True
Semantic Scholar available: True
Entrez (PubMed) available: True


## Configuration

Edit these settings for your search:

In [2]:
# ============== EDIT THIS SECTION ==============

# Your email (required for polite API access)
EMAIL = "anne.fouilloux@gmail.com"  # CHANGE THIS

# Search parameters
SEARCH_TERMS = {
    "quantum": ["quantum computing", "quantum machine learning", "QML", "QAOA", "quantum annealing"],
    "biodiversity": ["biodiversity", "conservation", "species distribution", "ecological network", "population genetics"]
}

# Date range
START_YEAR = 2015
END_YEAR = 2025

# Output directory
OUTPUT_DIR = Path("../search_results")
OUTPUT_DIR.mkdir(exist_ok=True)

# Maximum results to retrieve per database (set None for all)
MAX_RESULTS = 500

# ==============================================

In [3]:
# Results storage
search_results = {
    "search_date": datetime.now().isoformat(),
    "databases": {}
}

all_records = []  # Combined records for export

## 1. OpenAlex Search

OpenAlex provides comprehensive bibliographic coverage as a free alternative to Scopus/WoS.

In [4]:
def search_openalex(search_terms, start_year, end_year, max_results=500, email=None):
    """Search OpenAlex using their REST API."""
    
    # Build query: (quantum terms) AND (biodiversity terms)
    quantum_query = " OR ".join(search_terms["quantum"])
    bio_query = " OR ".join(search_terms["biodiversity"])
    full_query = f"({quantum_query}) AND ({bio_query})"
    
    base_url = "https://api.openalex.org/works"
    
    params = {
        "search": full_query,
        "filter": f"publication_year:{start_year}-{end_year}",
        "per_page": 200,
        "cursor": "*"
    }
    
    if email:
        params["mailto"] = email
    
    results = []
    total_count = 0
    
    print(f"Searching OpenAlex: {full_query[:80]}...")
    
    while True:
        response = requests.get(base_url, params=params)
        
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            break
            
        data = response.json()
        
        if total_count == 0:
            total_count = data.get("meta", {}).get("count", 0)
            print(f"Total results available: {total_count}")
        
        works = data.get("results", [])
        if not works:
            break
            
        for work in works:
            # Safely extract journal name with multiple null checks
            journal = ""
            primary_loc = work.get("primary_location")
            if primary_loc:
                source_obj = primary_loc.get("source")
                if source_obj:
                    journal = source_obj.get("display_name", "")
            
            record = {
                "source": "OpenAlex",
                "id": work.get("id", ""),
                "doi": work.get("doi", ""),
                "title": work.get("title", ""),
                "year": work.get("publication_year"),
                "authors": "; ".join([name for name in [(a.get("author") or {}).get("display_name") for a in work.get("authorships", [])] if name]),
                "journal": journal,
                "abstract": work.get("abstract", "") or "",
                "type": work.get("type", ""),
                "is_oa": work.get("open_access", {}).get("is_oa", False)
            }
            results.append(record)
            
        if max_results and len(results) >= max_results:
            results = results[:max_results]
            break
            
        # Get next page cursor
        next_cursor = data.get("meta", {}).get("next_cursor")
        if not next_cursor:
            break
        params["cursor"] = next_cursor
        
        time.sleep(0.1)  # Rate limiting
    
    print(f"Retrieved {len(results)} records from OpenAlex")
    return results, total_count

# Execute OpenAlex search
openalex_results, openalex_total = search_openalex(SEARCH_TERMS, START_YEAR, END_YEAR, MAX_RESULTS, EMAIL)
search_results["databases"]["OpenAlex"] = {
    "total_available": openalex_total,
    "retrieved": len(openalex_results)
}
all_records.extend(openalex_results)

Searching OpenAlex: (quantum computing OR quantum machine learning OR QML OR QAOA OR quantum anneali...
Total results available: 467
Retrieved 467 records from OpenAlex


## 2. arXiv Search

Essential for quantum computing preprints.

In [5]:
def search_arxiv(search_terms, start_year, end_year, max_results=500):
    """Search arXiv using their API."""
    
    if not ARXIV_AVAILABLE:
        print("arXiv package not available. Install with: pip install arxiv")
        return [], 0
    
    # Build query
    quantum_terms = " OR ".join([f'all:"{t}"' for t in search_terms["quantum"]])
    bio_terms = " OR ".join([f'all:"{t}"' for t in search_terms["biodiversity"]])
    query = f"({quantum_terms}) AND ({bio_terms})"
    
    # Add category filter for relevant areas
    categories = "cat:quant-ph OR cat:cs.LG OR cat:cs.AI OR cat:q-bio*"
    
    print(f"Searching arXiv...")
    
    client = arxiv.Client()
    search = arxiv.Search(
        query=query,
        max_results=max_results or 1000,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )
    
    results = []
    for paper in client.results(search):
        pub_year = paper.published.year
        if start_year <= pub_year <= end_year:
            record = {
                "source": "arXiv",
                "id": paper.entry_id,
                "doi": paper.doi or "",
                "title": paper.title,
                "year": pub_year,
                "authors": "; ".join([a.name for a in paper.authors]),
                "journal": "arXiv preprint",
                "abstract": paper.summary,
                "type": "preprint",
                "is_oa": True,
                "categories": ", ".join(paper.categories)
            }
            results.append(record)
    
    print(f"Retrieved {len(results)} records from arXiv")
    return results, len(results)

# Execute arXiv search
arxiv_results, arxiv_total = search_arxiv(SEARCH_TERMS, START_YEAR, END_YEAR, MAX_RESULTS)
search_results["databases"]["arXiv"] = {
    "total_available": arxiv_total,
    "retrieved": len(arxiv_results)
}
all_records.extend(arxiv_results)

Searching arXiv...
Retrieved 178 records from arXiv


## 3. PubMed Search

Via NCBI Entrez E-utilities.

In [6]:
def search_pubmed(search_terms, start_year, end_year, max_results=500, email=None):
    """Search PubMed using Entrez E-utilities."""
    
    if not ENTREZ_AVAILABLE:
        print("Biopython not available. Install with: pip install biopython")
        return [], 0
    
    Entrez.email = email or "user@example.com"
    
    # Build PubMed query
    quantum_terms = " OR ".join([f'"{t}"[Title/Abstract]' for t in search_terms["quantum"]])
    bio_terms = " OR ".join([f'"{t}"[Title/Abstract]' for t in search_terms["biodiversity"]])
    query = f"({quantum_terms}) AND ({bio_terms}) AND ({start_year}:{end_year}[pdat])"
    
    print(f"Searching PubMed...")
    
    # Search
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results or 10000)
    record = Entrez.read(handle)
    handle.close()
    
    id_list = record["IdList"]
    total_count = int(record["Count"])
    
    print(f"Found {total_count} records, retrieving {len(id_list)}...")
    
    if not id_list:
        return [], total_count
    
    # Fetch details
    handle = Entrez.efetch(db="pubmed", id=id_list, rettype="xml", retmode="xml")
    records = Entrez.read(handle)
    handle.close()
    
    results = []
    for article in records.get("PubmedArticle", []):
        medline = article.get("MedlineCitation", {})
        art = medline.get("Article", {})
        
        # Get authors
        authors = []
        for author in art.get("AuthorList", []):
            if "LastName" in author:
                name = f"{author.get('LastName', '')} {author.get('ForeName', '')}".strip()
                authors.append(name)
        
        # Get abstract
        abstract_parts = art.get("Abstract", {}).get("AbstractText", [])
        abstract = " ".join([str(p) for p in abstract_parts]) if abstract_parts else ""
        
        # Get year
        pub_date = art.get("Journal", {}).get("JournalIssue", {}).get("PubDate", {})
        year = pub_date.get("Year", "")
        
        # Get DOI
        doi = ""
        for eid in article.get("PubmedData", {}).get("ArticleIdList", []):
            if eid.attributes.get("IdType") == "doi":
                doi = str(eid)
                break
        
        record = {
            "source": "PubMed",
            "id": f"PMID:{medline.get('PMID', '')}",
            "doi": doi,
            "title": str(art.get("ArticleTitle", "")),
            "year": int(year) if year.isdigit() else None,
            "authors": "; ".join(authors),
            "journal": art.get("Journal", {}).get("Title", ""),
            "abstract": abstract,
            "type": "article",
            "is_oa": False  # Would need separate PMC check
        }
        results.append(record)
    
    print(f"Retrieved {len(results)} records from PubMed")
    return results, total_count

# Execute PubMed search
pubmed_results, pubmed_total = search_pubmed(SEARCH_TERMS, START_YEAR, END_YEAR, MAX_RESULTS, EMAIL)
search_results["databases"]["PubMed"] = {
    "total_available": pubmed_total,
    "retrieved": len(pubmed_results)
}
all_records.extend(pubmed_results)

Searching PubMed...
Found 4 records, retrieving 4...
Retrieved 4 records from PubMed


## 4. Europe PMC Search

In [7]:
def search_europepmc(search_terms, start_year, end_year, max_results=500):
    """Search Europe PMC using their REST API."""
    
    # Build query
    quantum_query = " OR ".join([f'"{t}"' for t in search_terms["quantum"]])
    bio_query = " OR ".join([f'"{t}"' for t in search_terms["biodiversity"]])
    query = f"({quantum_query}) AND ({bio_query}) AND (PUB_YEAR:[{start_year} TO {end_year}])"
    
    base_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    
    params = {
        "query": query,
        "format": "json",
        "pageSize": 1000,
        "resultType": "core"
    }
    
    print(f"Searching Europe PMC...")
    
    response = requests.get(base_url, params=params)
    
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return [], 0
    
    data = response.json()
    total_count = data.get("hitCount", 0)
    
    print(f"Found {total_count} records")
    
    results = []
    for item in data.get("resultList", {}).get("result", [])[:max_results]:
        record = {
            "source": "Europe PMC",
            "id": item.get("id", ""),
            "doi": item.get("doi", ""),
            "title": item.get("title", ""),
            "year": item.get("pubYear"),
            "authors": item.get("authorString", ""),
            "journal": item.get("journalTitle", ""),
            "abstract": item.get("abstractText", ""),
            "type": item.get("pubType", ""),
            "is_oa": item.get("isOpenAccess", "N") == "Y"
        }
        results.append(record)
    
    print(f"Retrieved {len(results)} records from Europe PMC")
    return results, total_count

# Execute Europe PMC search
epmc_results, epmc_total = search_europepmc(SEARCH_TERMS, START_YEAR, END_YEAR, MAX_RESULTS)
search_results["databases"]["Europe PMC"] = {
    "total_available": epmc_total,
    "retrieved": len(epmc_results)
}
all_records.extend(epmc_results)

Searching Europe PMC...
Found 597 records
Retrieved 500 records from Europe PMC


## 5. Semantic Scholar Search

In [8]:
def search_semantic_scholar(search_terms, start_year, end_year, max_results=500):
    """Search Semantic Scholar using their API."""
    
    # Build simple query (S2 works better with simpler queries)
    query = "quantum computing biodiversity conservation"
    
    base_url = "https://api.semanticscholar.org/graph/v1/paper/search"
    
    params = {
        "query": query,
        "year": f"{start_year}-{end_year}",
        "fields": "paperId,externalIds,title,year,authors,venue,abstract,isOpenAccess",
        "limit": min(max_results or 100, 100)  # S2 limits to 100 per request
    }
    
    print(f"Searching Semantic Scholar...")
    
    headers = {"Accept": "application/json"}
    
    results = []
    offset = 0
    total_count = 0
    
    while True:
        params["offset"] = offset
        response = requests.get(base_url, params=params, headers=headers)
        
        if response.status_code == 429:
            print("Rate limited, waiting...")
            time.sleep(5)
            continue
            
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            break
        
        data = response.json()
        
        if total_count == 0:
            total_count = data.get("total", 0)
            print(f"Total results available: {total_count}")
        
        papers = data.get("data", [])
        if not papers:
            break
        
        for paper in papers:
            record = {
                "source": "Semantic Scholar",
                "id": paper.get("paperId", ""),
                "doi": paper.get("externalIds", {}).get("DOI", ""),
                "title": paper.get("title", ""),
                "year": paper.get("year"),
                "authors": "; ".join([a.get("name", "") for a in paper.get("authors", [])]),
                "journal": paper.get("venue", ""),
                "abstract": paper.get("abstract", "") or "",
                "type": "article",
                "is_oa": paper.get("isOpenAccess", False)
            }
            results.append(record)
        
        if max_results and len(results) >= max_results:
            results = results[:max_results]
            break
            
        offset += len(papers)
        if offset >= total_count:
            break
            
        time.sleep(1)  # Rate limiting
    
    print(f"Retrieved {len(results)} records from Semantic Scholar")
    return results, total_count

# Execute Semantic Scholar search
s2_results, s2_total = search_semantic_scholar(SEARCH_TERMS, START_YEAR, END_YEAR, MAX_RESULTS)
search_results["databases"]["Semantic Scholar"] = {
    "total_available": s2_total,
    "retrieved": len(s2_results)
}
all_records.extend(s2_results)

Searching Semantic Scholar...
Rate limited, waiting...
Total results available: 621
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Rate limited, waiting...
Retrieved 500 records from Semantic Scholar


## Results Summary

In [9]:
print("\n" + "="*60)
print("SEARCH RESULTS SUMMARY")
print("="*60)
print(f"Search Date: {search_results['search_date']}")
print(f"Date Range: {START_YEAR}-{END_YEAR}")
print()

total_retrieved = 0
for db, counts in search_results["databases"].items():
    print(f"{db}:")
    print(f"  - Total available: {counts['total_available']}")
    print(f"  - Retrieved: {counts['retrieved']}")
    total_retrieved += counts['retrieved']

print()
print(f"TOTAL RECORDS (before deduplication): {total_retrieved}")
print("="*60)


SEARCH RESULTS SUMMARY
Search Date: 2025-12-26T19:05:32.698171
Date Range: 2015-2025

OpenAlex:
  - Total available: 467
  - Retrieved: 467
arXiv:
  - Total available: 178
  - Retrieved: 178
PubMed:
  - Total available: 4
  - Retrieved: 4
Europe PMC:
  - Total available: 597
  - Retrieved: 500
Semantic Scholar:
  - Total available: 621
  - Retrieved: 500

TOTAL RECORDS (before deduplication): 1649


## Export Results

In [10]:
# Export to CSV
csv_path = OUTPUT_DIR / "search_results_combined.csv"

if all_records:
    fieldnames = ["source", "id", "doi", "title", "year", "authors", "journal", "abstract", "type", "is_oa"]
    
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(all_records)
    
    print(f"Exported {len(all_records)} records to {csv_path}")
else:
    print("No records to export")

Exported 1649 records to search_results/search_results_combined.csv


In [11]:
# Export to BibTeX format
def to_bibtex(records, filepath):
    """Convert records to BibTeX format."""
    
    with open(filepath, "w", encoding="utf-8") as f:
        for i, rec in enumerate(records):
            # Generate citation key
            first_author = rec.get("authors", "Unknown").split(";")[0].split()[-1] if rec.get("authors") else "Unknown"
            year = rec.get("year", "XXXX")
            key = f"{first_author}{year}_{i}"
            
            f.write(f"@article{{{key},\n")
            f.write(f"  title = {{{rec.get('title', '')}}},\n")
            f.write(f"  author = {{{rec.get('authors', '')}}},\n")
            f.write(f"  year = {{{year}}},\n")
            f.write(f"  journal = {{{rec.get('journal', '')}}},\n")
            if rec.get("doi"):
                f.write(f"  doi = {{{rec.get('doi')}}},\n")
            f.write(f"  note = {{Source: {rec.get('source', '')}}},\n")
            f.write("}\n\n")
    
    print(f"Exported {len(records)} records to {filepath}")

bibtex_path = OUTPUT_DIR / "search_results_combined.bib"
if all_records:
    to_bibtex(all_records, bibtex_path)

Exported 1649 records to search_results/search_results_combined.bib


In [12]:
# Export to RIS format
def to_ris(records, filepath):
    """Convert records to RIS format."""
    
    with open(filepath, "w", encoding="utf-8") as f:
        for rec in records:
            f.write("TY  - JOUR\n")
            f.write(f"TI  - {rec.get('title', '')}\n")
            
            for author in rec.get("authors", "").split("; "):
                if author:
                    f.write(f"AU  - {author}\n")
            
            f.write(f"PY  - {rec.get('year', '')}\n")
            f.write(f"JO  - {rec.get('journal', '')}\n")
            
            if rec.get("doi"):
                f.write(f"DO  - {rec.get('doi')}\n")
            
            if rec.get("abstract"):
                f.write(f"AB  - {rec.get('abstract', '')[:1000]}\n")
            
            f.write(f"N1  - Source: {rec.get('source', '')}\n")
            f.write("ER  - \n\n")
    
    print(f"Exported {len(records)} records to {filepath}")

ris_path = OUTPUT_DIR / "search_results_combined.ris"
if all_records:
    to_ris(all_records, ris_path)

Exported 1649 records to search_results/search_results_combined.ris


In [13]:
# Save search summary JSON
summary_path = OUTPUT_DIR / "search_summary.json"

with open(summary_path, "w") as f:
    json.dump(search_results, f, indent=2)

print(f"Saved search summary to {summary_path}")

Saved search summary to search_results/search_summary.json


## Deduplication Preview

In [15]:
# Simple deduplication based on DOI
def deduplicate_by_doi(records):
    """Remove duplicates based on DOI."""
    seen_dois = set()
    unique = []
    duplicates = 0
    no_doi = 0
    
    for rec in records:
        doi = (rec.get("doi") or "").strip().lower()
        if not doi:
            no_doi += 1
            unique.append(rec)  # Keep records without DOI
        elif doi not in seen_dois:
            seen_dois.add(doi)
            unique.append(rec)
        else:
            duplicates += 1
    
    return unique, duplicates, no_doi

unique_records, dup_count, no_doi_count = deduplicate_by_doi(all_records)

print(f"\nDEDUPLICATION PREVIEW (DOI-based):")
print(f"  Total records: {len(all_records)}")
print(f"  Duplicates removed: {dup_count}")
print(f"  Records without DOI: {no_doi_count}")
print(f"  Unique records: {len(unique_records)}")
print("\nNote: Full deduplication should be done in Zotero/Rayyan with title matching")


DEDUPLICATION PREVIEW (DOI-based):
  Total records: 1649
  Duplicates removed: 33
  Records without DOI: 240
  Unique records: 1616

Note: Full deduplication should be done in Zotero/Rayyan with title matching


## Update Search Execution JSON

In [17]:
# Path to your search execution JSON file
JSON_CONFIG_PATH = "../inputs/search-execution-quantum-biodiversity.json"  # Update path as needed

def update_search_execution_json(json_path, search_results, unique_count):
    """Update the search execution JSON with actual results."""
    
    try:
        with open(json_path, "r") as f:
            config = json.load(f)
    except FileNotFoundError:
        print(f"JSON file not found: {json_path}")
        return
    
    # Update creation date
    config["search_execution_dataset"]["creation_date"] = datetime.now().strftime("%Y-%m-%d")
    
    # Update database counts
    db_mapping = {
        "OpenAlex": "openalex.org",
        "arXiv": "arxiv.org",
        "Semantic Scholar": "semanticscholar.org",
        "PubMed": "pubmed.ncbi.nlm.nih.gov",
        "Europe PMC": "europepmc.org"
    }
    
    for db_search in config["search_execution_dataset"]["db_searches"]:
        for db_name, url_part in db_mapping.items():
            if url_part in db_search["database_url"]:
                if db_name in search_results["databases"]:
                    db_search["results_count"] = search_results["databases"][db_name]["total_available"]
                break
    
    # Update screening counts (placeholder - actual screening happens later)
    total = sum(d["total_available"] for d in search_results["databases"].values())
    config["search_execution_dataset"]["screened_record_count"] = str(unique_count)
    
    # Save updated JSON
    updated_path = json_path.replace(".json", "_updated.json")
    with open(updated_path, "w") as f:
        json.dump(config, f, indent=2)
    
    print(f"Updated JSON saved to: {updated_path}")

# Uncomment to update your JSON file:
update_search_execution_json(JSON_CONFIG_PATH, search_results, len(unique_records))

Updated JSON saved to: search-execution-quantum-biodiversity_updated.json


## Next Steps

1. **Import to reference manager**: Load `search_results_combined.ris` into Zotero or Rayyan
2. **Full deduplication**: Use Zotero's duplicate detection or Rayyan's deduplication
3. **Title/Abstract screening**: Apply inclusion/exclusion criteria
4. **Full-text screening**: Review remaining candidates
5. **Update counts**: Fill in final numbers in search execution JSON
6. **Generate nanopub**: Run search-execution-nanopub-from-json.ipynb