# Gather SRA Datasets
2025-04-21 ZD

Initial exploration of gathering SRA Datasets associated with Publications associated with NCI-supported studies. This will be very closely modeled off of the process to gather geo datasets. 

In [1]:
import io
import gzip
import os
import sys
import json
import re
import time
import uuid
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
from collections import defaultdict
import xml.etree.ElementTree as ET

from ftplib import FTP
import pandas as pd
import requests
import concurrent.futures as cf
from urllib.parse import urlparse
from tqdm import tqdm  # for progress bars
from Bio import Entrez  # for e-Utils API

## Initial Exploration

In [2]:
pmids = [
        #'10637239', 
         '38738472'
         ]

In [3]:
def fetch_sra_ids(pmid: str) -> Tuple[str, List[str]]:
    """
    Fetch SRA IDs for a single PubMed ID
    
    Args:
        pmid: PubMed ID to query
    
    Returns:
        Tuple of (pmid, list_of_sra_ids)
    """

    try:
        # Small delay to control API rate
        time.sleep(0.1)
        
        link_handle = Entrez.elink(
            dbfrom="pubmed",
            db="sra",
            id=pmid,
            linkname="pubmed_sra"
        )
        
        link_record = Entrez.read(link_handle)
        link_handle.close()
        
        sra_ids = [
            link['Id']
            for link_set in link_record
            for link in link_set.get('LinkSetDb', [])
            for link in link.get('Link', [])
        ]
        
        return (pmid, sra_ids)
    
    except Exception as e:
        print(f"Error processing PMID {pmid}: {e}")
        return (pmid, [])



def get_sra_ids_for_pubmed_ids(pubmed_ids: List[str]) -> Dict[str, List[str]]:
    """
    Retrieve SRA dataset IDs associated with each PubMed ID in a list of PMIDs. 
    
    Args:
        pubmed_ids: List of PubMed IDs to query
    
    Returns:
        Dictionary mapping PubMed IDs to their associated SRA IDs
    """

    # Configure Entrez
    # Email and api key from hidden local env file. Use default if not defined
    Entrez.email = os.environ.get('NCBI_EMAIL', 'your-email@example.com')
    Entrez.api_key = os.environ.get('NCBI_API_KEY', '')
    if not Entrez.api_key: 
        print(f"WARNING: No NCBI API key in use. Check readme and local .env file."
              f"\nNCBI E-Utilities rate will be limited and may cause errors.")
    
    # Configure API rate limiting and max parallel threads
    Entrez.max_tries = 3
    Entrez.sleep_between_tries = 2
    max_workers = 10
    
    # Get counts for progress tracking
    pmid_count = len(pubmed_ids)
    
    # Use ThreadPoolExecutor with rate-limited concurrency
    # This will run multiple API-calling threads while waiting for responses
    pmid_sra_links = {}
    with cf.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit tasks (PMIDs) to the executor and store futures in dict
        futures = {executor.submit(fetch_sra_ids, pmid): pmid for pmid in pubmed_ids}
        
        # Iterate through futures (PMIDs) as they become available
        for future in tqdm(cf.as_completed(futures),
                          unit="PMID", total=pmid_count, ncols=80,
                          desc="Fetching SRA IDs"):
            pmid, sra_ids = future.result()
            pmid_sra_links[pmid] = sra_ids
    
    return pmid_sra_links

In [4]:
pmid_sra_map = get_sra_ids_for_pubmed_ids(pmids)
pmid_sra_map

Fetching SRA IDs: 100%|█████████████████████████| 1/1 [00:02<00:00,  2.01s/PMID]


{'38738472': ['31172113',
  '31172112',
  '31172111',
  '31172110',
  '31172109',
  '31172108']}

In [5]:
def get_full_sra_record(sra_id: str) -> Optional[ET.Element]:
    """
    Get the full NCBI EFetch record for a given SRA Accession, handling XML parsing with ElementTree.

    Args:
        sra_id: SRA ID from Accession (e.g., '31172108').

    Returns:
        An ElementTree Element representing the root of the XML record, or None if an error occurs.
    """
    try:
        handle = Entrez.efetch(db="sra", id=sra_id, retmode="xml")
        xml_content = handle.read().decode('utf-8')
        handle.close()
        root = ET.fromstring(xml_content)
        return root
    except Exception as e:
        print(f"Error fetching or parsing information for SRA Accession {sra_id}: {e}")
        return None


def get_all_sra_efetch_records(sra_id_list: List[str]) -> List[Optional[ET.Element]]:
    """
    Get NCBI EFetch metadata for each SRA accession in a list of SRA IDs.

    Args:
        sra_id_list: List of SRA IDs

    Returns:
        List of ElementTree Elements representing the root of the metadata records.
    """

    # Configure Entrez
    # Email and api key from hidden local env file. Use default if not defined
    Entrez.email = os.environ.get('NCBI_EMAIL', 'your-email@example.com')
    Entrez.api_key = os.environ.get('NCBI_API_KEY', '')
    if not Entrez.api_key:
        print(f"WARNING: No NCBI API key in use. Check readme and local .env file."
              f"\nNCBI E-Utilities rate will be limited and may cause errors.")

    # Build empty list to hold gathered records
    all_records = []

    # Iterate through list of ids to get metadata for each
    for sra_id in tqdm(sra_id_list, ncols=80, desc="Fetching SRA metadata"):
        record = get_full_sra_record(sra_id)

        if record is not None:
            all_records.append(record)
        else:
            print(f"Error retrieving {sra_id} metadata. Skipping.")

    return all_records

In [6]:
sra_list = []
for pmid, sra_list_items in pmid_sra_map.items():
    for sra_id in sra_list_items:
        print(sra_id)


31172113
31172112
31172111
31172110
31172109
31172108


In [7]:
# sra_record = get_full_sra_record('31172108')
# if sra_record is not None:
#     # Now you can parse the XML using ElementTree methods
#     # The root element is 'EXPERIMENT_PACKAGE_SET'
#     experiment_packages = sra_record.findall('EXPERIMENT_PACKAGE')
#     print(f"Number of experiment packages found: {len(experiment_packages)}")
#     for package in experiment_packages:
#         experiment = package.find('EXPERIMENT')
#         if experiment is not None:
#             accession = experiment.get('accession')
#             title = experiment.find('TITLE').text if experiment.find('TITLE') is not None else "No Title"
#             print(f"Experiment Accession: {accession}, Title: {title}")

#         study_ref = package.find('STUDY_REF')
#         if study_ref is not None:
#             study_accession = study_ref.get('accession')
#             print(f"  Study Accession: {study_accession}")

#         submission = package.find('SUBMISSION')
#         if submission is not None:
#             submission_accession = submission.get('accession')
#             print(f"  Submission Accession: {submission_accession}")

#         study = package.find('STUDY')
#         if study is not None:
#             study_title = study.find('DESCRIPTOR/STUDY_TITLE').text if study.find('DESCRIPTOR/STUDY_TITLE') is not None else "No Study Title"
#             print(f"  Study Title: {study_title}")
# else:
#     print("Failed to retrieve or parse SRA record.")

## Reopening this exploration after a short break...
ZD 2025-09-28  

SRA IDs are too granular and will cause repetiion in INS that is not useful. Instead, try looking at the study-level identifier, SRP#. 


In [8]:
pmid_sra_map

{'38738472': ['31172113',
  '31172112',
  '31172111',
  '31172110',
  '31172109',
  '31172108']}

In [32]:
def get_srp_ids_for_sra_id(sra_id: str) -> List[str]:
    """
    Get all SRP (study-level) IDs associated with a SRA (run-level) ID.

    This parses the XML structure returned by NCBI SRA EFetch and searches for
    Study accessions (SRP numbers from NCBI or ERP numbers from ENA/EBI).

    Args:
        sra_id: String of a Sequence Read Archive run ID (e.g. '31172113')

    Returns:
        List of SRP/ERP accessions e.g. ['SRP123456', 'ERP001234', ...]
        Returns [] if none found or on error.
    """
    # Entrez setup (matches your typical pattern)
    Entrez.email = os.environ.get("NCBI_EMAIL", "your-email@example.com")
    Entrez.api_key = os.environ.get("NCBI_API_KEY", "")
    Entrez.max_tries = 3
    Entrez.sleep_between_tries = 2

    try:
        # EFetch returns full XML for this SRA ID
        handle = Entrez.efetch(db="sra", id=str(sra_id), rettype="xml", retmode="text")
        xml_data = handle.read()
        handle.close()

        if not xml_data.strip():
            return []

        # Use ElementTree (ET) to parse XML for SRP/ERP
        root = ET.fromstring(xml_data)

        study_ids: List[str] = []
        for elem in root.iter():
            # Check attributes for SRP or ERP accessions
            for attr_val in elem.attrib.values():
                if isinstance(attr_val, str):
                    # Match SRP (NCBI) or ERP (ENA/EBI) study accessions
                    if re.match(r"^(SRP|ERP)\d+$", attr_val.strip()):
                        study_ids.append(attr_val.strip())
            # Occasionally appears in text content too
            if elem.text:
                if re.match(r"^(SRP|ERP)\d+$", elem.text.strip()):
                    study_ids.append(elem.text.strip())

        # Deduplicate while preserving order
        seen = set()
        out = []
        for s in study_ids:
            if s not in seen:
                seen.add(s)
                out.append(s)
        return out

    except Exception as e:
        print(f"Error fetching SRP/ERP for SRA ID {sra_id}: {e}")
        return []


In [33]:
get_srp_ids_for_sra_id(31172113)

['SRP480992']

In [34]:
def get_srp_ids_for_sra_ids(
    sra_ids: List[str],
    max_workers: int = 10,
    sleep_between_calls: float = 0.0,
) -> Dict[str, List[str]]:
    """
    Batch map SRA IDs → SRP accessions using EFetch(db='sra').
    - Concurrency via ThreadPoolExecutor (default 10 workers)
    - Optional small sleep between calls to be polite
    - Returns dict like: { '31172113': ['SRP123456'], ... }
    """
    # one-time Entrez init (kept flat to match your style)
    Entrez.email = os.environ.get("NCBI_EMAIL", "your-email@example.com")
    Entrez.api_key = os.environ.get("NCBI_API_KEY", "")
    Entrez.max_tries = 3
    Entrez.sleep_between_tries = 2
    if not Entrez.api_key:
        print("WARNING: No NCBI_API_KEY set; requests may be rate limited.")

    sra_ids = [str(u).strip() for u in sra_ids if str(u).strip()]
    result: Dict[str, List[str]] = {}

    def worker(sra_id: str) -> (str, List[str]):
        if sleep_between_calls > 0:
            time.sleep(sleep_between_calls)
        srps = get_srp_ids_for_sra_id(sra_id)
        return sra_id, srps

    with cf.ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = {ex.submit(worker, sra_id): sra_id for sra_id in sra_ids}
        for fut in tqdm(
            cf.as_completed(futures),
            total=len(futures),
            unit="ID",
            ncols=80,
            desc="Fetching SRP IDs for SRA IDs",
        ):
            uid, srps = fut.result()
            result[uid] = srps

    return result

In [35]:
sra_ids = ["31172113", "31172112", 31172111, "test_bad_sra_id"]  # example UIDs
sra_id_to_srp = get_srp_ids_for_sra_ids(sra_ids, 
                                      max_workers=10, 
                                      sleep_between_calls=0.05
                                      )
for sra_id, srps in sra_id_to_srp.items():
    print(sra_id, "→", srps)

Fetching SRP IDs for SRA IDs:  25%|███▊           | 1/4 [00:01<00:03,  1.14s/ID]

Error fetching SRP/ERP for SRA ID test_bad_sra_id: HTTP Error 400: Bad Request


Fetching SRP IDs for SRA IDs: 100%|███████████████| 4/4 [00:01<00:00,  2.52ID/s]

test_bad_sra_id → []
31172112 → ['SRP480992']
31172111 → ['SRP480992']
31172113 → ['SRP480992']





In [13]:
test_sra_list = list(pmid_sra_map.values())[0]
test_sra_list

['31172113', '31172112', '31172111', '31172110', '31172109', '31172108']

In [36]:
sra_id_to_srp = get_srp_ids_for_sra_ids(test_sra_list, 
                                      max_workers=10, 
                                      sleep_between_calls=0.05
                                      )
for sra_id, srps in sra_id_to_srp.items():
    print(sra_id, "→", srps)

Fetching SRP IDs for SRA IDs: 100%|███████████████| 6/6 [00:01<00:00,  3.71ID/s]

31172111 → ['SRP480992']
31172110 → ['SRP480992']
31172112 → ['SRP480992']
31172109 → ['SRP480992']
31172108 → ['SRP480992']
31172113 → ['SRP480992']





In [15]:
def compose_pmid_to_srp(
    pmid_to_sra_ids: Dict[str, List[str]],
    sra_id_to_srps: Dict[str, List[str]],
) -> Dict[str, List[str]]:
    """
    Compose a strict PMID → SRP mapping by chaining:
      PMID → [sra_id, ...]   (run-level)
      sra_id → [srp_id, ...] (study-level)

    Returns:
        { pmid: sorted unique [srp_id, ...] }
    """

    pmid_to_srps: Dict[str, set[str]] = defaultdict(set)

    for pmid, sra_ids in pmid_to_sra_ids.items():
        for sra_id in sra_ids:
            srp_list = sra_id_to_srps.get(str(sra_id), [])
            for srp_id in srp_list:
                if srp_id:
                    pmid_to_srps[pmid].add(srp_id)

    # deterministic sorted lists
    return {pmid: sorted(srps) for pmid, srps in pmid_to_srps.items()}


def invert_to_srp_to_pmids(
    pmid_to_srps: Dict[str, List[str]]
) -> Dict[str, List[str]]:
    """
    Invert a PMID → SRP mapping into SRP → PMIDs.
    
    Returns:
        { srp_id: sorted unique [pmid, ...] }
    """

    srp_to_pmids: Dict[str, set[str]] = defaultdict(set)
    for pmid, srp_ids in pmid_to_srps.items():
        for srp_id in srp_ids:
            if srp_id:
                srp_to_pmids[srp_id].add(pmid)
    return {srp_id: sorted(pmids) for srp_id, pmids in srp_to_pmids.items()}

In [16]:
pmids = ['38738472',
         '38227896',
         'bad_pmid'
         ]

In [17]:
pmid_to_sra_ids = get_sra_ids_for_pubmed_ids(pmids)
pmid_to_sra_ids

Fetching SRA IDs: 100%|█████████████████████████| 3/3 [00:02<00:00,  1.27PMID/s]


{'38738472': ['31172113',
  '31172112',
  '31172111',
  '31172110',
  '31172109',
  '31172108'],
 '38227896': ['31173335', '31173334', '31173333', '31173332'],
 'bad_pmid': []}

In [18]:
all_sra_ids = [item for sublist in pmid_to_sra_ids.values() for item in sublist]
unique_sra_ids = list(set(all_sra_ids))

unique_sra_ids

['31172111',
 '31173333',
 '31173332',
 '31172113',
 '31172108',
 '31173334',
 '31172110',
 '31173335',
 '31172109',
 '31172112']

In [19]:
srp_ids = get_srp_ids_for_sra_ids(unique_sra_ids)
srp_ids

Fetching SRP IDs for SRA IDs: 100%|█████████████| 10/10 [00:02<00:00,  3.44ID/s]


{'31172111': ['SRP480992'],
 '31173332': ['SRP480996'],
 '31172113': ['SRP480992'],
 '31172108': ['SRP480992'],
 '31172112': ['SRP480992'],
 '31173333': ['SRP480996'],
 '31173334': ['SRP480996'],
 '31172110': ['SRP480992'],
 '31173335': ['SRP480996'],
 '31172109': ['SRP480992']}

In [20]:
test_map = compose_pmid_to_srp(pmid_to_sra_ids, sra_id_to_srp)
test_map

{'38738472': ['SRP480992']}

## Reopening Again
2026-01-08 ZD

Need to run a quick test to get all DCEG SRA studies. Start with download of publications from DCEG programs page on production INS 2026-01-08.

In [21]:
dceg_ins_pubs = pd.read_csv('21_sra_scrap_data/DCEG_Publications_INS_2.2.0.csv')
dceg_ins_pubs

Unnamed: 0,PubMed ID,Project(s),Title,Authors,Publication Date,Cited By,Relative Citation Ratio
0,18160098,ZIACP010144,The role of telomere biology in bone marrow fa...,Sharon A Savage;Blanche P Alter,2007-11-19,52,1.02
1,18162814,ZIACP010124,Spectroscopic imaging as triage test for cervi...,Julia C Gage;Mahboobeh Safaeian;Jose Jeronimo;...,2008-01-01,2,0.09
2,18256926,ZIACP010182;ZIACP010183,Human epidermal growth factor receptor-2 and e...,W F Anderson;S Luo;N Chatterjee;P S Rosenberg;...,2008-02-07,35,0.71
3,18291036,ZIACP010125,Household vacuum cleaners vs. the high-volume ...,Joanne S Colt;Robert B Gunier;Catherine Metaye...,2008-02-21,62,2.31
4,18308129,ZIACP010126,"Re: Nguyen MM, Ellison LM: Testicular cancer p...",Katherine A McGlynn;Susan S Devesa,2008-02-01,0,0
...,...,...,...,...,...,...,...
4773,39095780,ZIACP010196,Methods and participant characteristics in the...,Yashvee Dunneram;Jia Yi Lee;Cody Z Watling;Gar...,2024-08-02,7,2.6
4774,39100747,ZIACP000185,Volatile organic compounds and mortality from ...,Mahdi Nalini;Hossein Poustchi;Deepak Bhandari;...,2024-06-23,13,4.66
4775,39141862,ZIACP010131;ZIACP010170,T-Cell Neoplasms after B-Cell Neoplasms - The ...,Graca M Dores;Lindsay M Morton,2024-08-15,0,
4776,39236816,ZIABC012134;ZIACP010170,A novel approach to triazole fungicides risk c...,Luiz P A Marciano;Nicole Kleinstreuer;Xiaoqing...,2024-09-03,19,7.23


In [22]:
dceg_ins_pubs['PubMed ID'].iloc[:100]

0     18160098
1     18162814
2     18256926
3     18291036
4     18308129
        ...   
95    19066394
96    19067193
97    19073792
98    19074211
99    19077439
Name: PubMed ID, Length: 100, dtype: int64

In [23]:
dceg_sra_ids_0_100 = get_sra_ids_for_pubmed_ids(dceg_ins_pubs['PubMed ID'].iloc[:100])
dceg_sra_ids_0_100

Fetching SRA IDs: 100%|█████████████████████| 100/100 [00:40<00:00,  2.48PMID/s]


{18160098: [],
 18162814: [],
 18414202: [],
 18386141: [],
 18414504: [],
 18291036: [],
 18384551: [],
 18308129: [],
 18385720: [],
 18256926: [],
 18444153: [],
 18415690: [],
 18445826: [],
 18511430: [],
 18543071: [],
 18523885: [],
 18632753: [],
 18504144: [],
 18619701: [],
 18452848: [],
 18575953: [],
 18633131: [],
 18676680: [],
 18703425: [],
 18691756: [],
 18685556: [],
 18691788: [],
 18704714: [],
 18711000: [],
 18722574: [],
 18752252: [],
 18716224: [],
 18757663: [],
 18767034: [],
 18712724: [],
 18766187: [],
 18727930: [],
 18768506: [],
 18791192: [],
 18786276: [],
 18796628: [],
 18797424: [],
 18805886: [],
 18805876: [],
 18798002: [],
 18801887: [],
 18830263: [],
 18812546: [],
 18830676: [],
 18838890: [],
 18941914: [],
 18831063: [],
 18948386: [],
 18849014: [],
 18843021: [],
 18953052: [],
 18971419: [],
 18842992: [],
 18976449: [],
 18974125: [],
 18950631: [],
 18989634: [],
 18990766: [],
 18980285: [],
 18844222: [],
 18978339: [],
 18990770:

## Main Function: PMID → SRA → SRP Pipeline

Consolidate the workflow into a single function that takes PMIDs and outputs a structured CSV.

In [37]:
def pmid_to_sra_srp_pipeline(
    pmid_list: List[str],
    max_workers: int = 10,
    sleep_between_calls: float = 0.05
) -> pd.DataFrame:
    """
    Main pipeline: Convert PMIDs → SRA IDs → SRP IDs and output structured DataFrame.
    
    Each row in the output represents a PMID with its associated SRA and SRP IDs.
    
    Args:
        pmid_list: List of PubMed IDs to process
        max_workers: Number of concurrent threads for API calls
        sleep_between_calls: Delay between API calls (seconds)
    
    Returns:
        DataFrame with columns: ['PMID', 'SRP_IDs', 'SRA_IDs']
        where SRP_IDs and SRA_IDs are semicolon-separated strings
    """
    
    print(f"Processing {len(pmid_list)} PMIDs...")
    
    # Step 1: PMID → SRA IDs
    print("\n[Step 1/3] Fetching SRA IDs for PMIDs...")
    pmid_to_sra_ids = get_sra_ids_for_pubmed_ids(pmid_list)
    print(f"DEBUG: pmid_to_sra_ids: {pmid_to_sra_ids}")
    
    # Step 2: Get unique SRA IDs and fetch their SRP IDs
    all_sra_ids = [item for sublist in pmid_to_sra_ids.values() for item in sublist]
    unique_sra_ids = list(set(all_sra_ids))
    print(f"\n[Step 2/3] Found {len(unique_sra_ids)} unique SRA IDs. Fetching SRP IDs...")
    print(f"DEBUG: unique_sra_ids: {unique_sra_ids}")
    
    if unique_sra_ids:
        sra_to_srp_ids = get_srp_ids_for_sra_ids(
            unique_sra_ids,
            max_workers=max_workers,
            sleep_between_calls=sleep_between_calls
        )
    else:
        sra_to_srp_ids = {}
    
    print(f"DEBUG: sra_to_srp_ids: {sra_to_srp_ids}")
    
    # Check for SRA IDs that did not return any SRP IDs
    sra_without_srp = [sra_id for sra_id, srp_list in sra_to_srp_ids.items() if not srp_list]
    if sra_without_srp:
        print(f"\nWARNING: {len(sra_without_srp)} SRA ID(s) did not return any SRP IDs:")
        for sra_id in sra_without_srp[:10]:  # Show first 10
            print(f"  - {sra_id}")
        if len(sra_without_srp) > 10:
            print(f"  ... and {len(sra_without_srp) - 10} more")
    
    # Step 3: Build structured output
    print("\n[Step 3/3] Building output DataFrame...")
    rows = []
    
    for pmid in pmid_list:
        sra_ids = pmid_to_sra_ids.get(pmid, [])
        
        # Gather all SRP IDs associated with this PMID's SRA IDs
        srp_ids_set = set()
        for sra_id in sra_ids:
            srp_list = sra_to_srp_ids.get(str(sra_id), [])
            srp_ids_set.update(srp_list)
        
        # Convert to sorted lists for consistent output
        sra_ids_sorted = sorted([str(x) for x in sra_ids])
        srp_ids_sorted = sorted(list(srp_ids_set))
        
        rows.append({
            'PMID': pmid,
            'SRP_IDs': '; '.join(srp_ids_sorted) if srp_ids_sorted else '',
            'SRA_IDs': '; '.join(sra_ids_sorted) if sra_ids_sorted else ''
        })
    
    df = pd.DataFrame(rows)
    
    # Summary statistics
    total_pmids = len(df)
    pmids_with_sra = len(df[df['SRA_IDs'] != ''])
    pmids_with_srp = len(df[df['SRP_IDs'] != ''])
    
    print(f"\n{'='*60}")
    print(f"Pipeline Complete!")
    print(f"{'='*60}")
    print(f"Total PMIDs processed: {total_pmids}")
    print(f"PMIDs with SRA IDs: {pmids_with_sra} ({pmids_with_sra/total_pmids*100:.1f}%)")
    print(f"PMIDs with SRP IDs: {pmids_with_srp} ({pmids_with_srp/total_pmids*100:.1f}%)")
    print(f"{'='*60}\n")
    
    return df

### Test the Pipeline with Sample PMIDs

In [38]:
# Test with a small sample
test_pmids = ['38738472', '38227896', '10637239']

result_df = pmid_to_sra_srp_pipeline(
    pmid_list=test_pmids,
    max_workers=10,
    sleep_between_calls=0.05
)

# Display the results
result_df

Processing 3 PMIDs...

[Step 1/3] Fetching SRA IDs for PMIDs...


Fetching SRA IDs: 100%|█████████████████████████| 3/3 [00:01<00:00,  2.34PMID/s]



DEBUG: pmid_to_sra_ids: {'38227896': ['31173335', '31173334', '31173333', '31173332'], '38738472': ['31172113', '31172112', '31172111', '31172110', '31172109', '31172108'], '10637239': []}

[Step 2/3] Found 10 unique SRA IDs. Fetching SRP IDs...
DEBUG: unique_sra_ids: ['31172111', '31173332', '31172113', '31172112', '31173334', '31172108', '31172110', '31173335', '31172109', '31173333']


Fetching SRP IDs for SRA IDs: 100%|█████████████| 10/10 [00:01<00:00,  5.83ID/s]

DEBUG: sra_to_srp_ids: {'31172111': ['SRP480992'], '31173334': ['SRP480996'], '31173332': ['SRP480996'], '31172110': ['SRP480992'], '31172113': ['SRP480992'], '31172112': ['SRP480992'], '31173333': ['SRP480996'], '31172108': ['SRP480992'], '31172109': ['SRP480992'], '31173335': ['SRP480996']}

[Step 3/3] Building output DataFrame...

Pipeline Complete!
Total PMIDs processed: 3
PMIDs with SRA IDs: 2 (66.7%)
PMIDs with SRP IDs: 2 (66.7%)






Unnamed: 0,PMID,SRP_IDs,SRA_IDs
0,38738472,SRP480992,31172108; 31172109; 31172110; 31172111; 311721...
1,38227896,SRP480996,31173332; 31173333; 31173334; 31173335
2,10637239,,


### Export to CSV

In [33]:
# Save to CSV
output_path = '21_sra_scrap_data/pmid_to_sra_srp_test.csv'
result_df.to_csv(output_path, index=False)
print(f"Results saved to: {output_path}")

Results saved to: 21_sra_scrap_data/pmid_to_sra_srp_test.csv


## Process All DCEG PMIDs in Batches of 1000

Process the complete DCEG dataset in groups of 1000 PMIDs, saving each batch to a separate CSV file.

In [35]:
# Check total number of DCEG PMIDs
total_dceg_pmids = len(dceg_ins_pubs)
print(f"Total DCEG PMIDs: {total_dceg_pmids}")
print(f"Number of batches (1000 per batch): {(total_dceg_pmids + 999) // 1000}")

Total DCEG PMIDs: 4778
Number of batches (1000 per batch): 5


### Batch 1: PMIDs 0-999

In [36]:
# Process PMIDs 0-999
batch_1_pmids = dceg_ins_pubs['PubMed ID'].iloc[0:1000].tolist()

batch_1_df = pmid_to_sra_srp_pipeline(
    pmid_list=batch_1_pmids,
    max_workers=10,
    sleep_between_calls=0.05
)

# Save to CSV
batch_1_output = '21_sra_scrap_data/dceg_batch_0001_pmids_0000-0999.csv'
batch_1_df.to_csv(batch_1_output, index=False)
print(f"Batch 1 saved to: {batch_1_output}")

batch_1_df.head(10)

Processing 1000 PMIDs...

[Step 1/3] Fetching SRA IDs for PMIDs...


Fetching SRA IDs: 100%|███████████████████| 1000/1000 [03:18<00:00,  5.05PMID/s]


[Step 2/3] Found 0 unique SRA IDs. Fetching SRP IDs...

[Step 3/3] Building output DataFrame...

Pipeline Complete!
Total PMIDs processed: 1000
PMIDs with SRA IDs: 0 (0.0%)
PMIDs with SRP IDs: 0 (0.0%)

Batch 1 saved to: 21_sra_scrap_data/dceg_batch_0001_pmids_0000-0999.csv





Unnamed: 0,PMID,SRP_IDs,SRA_IDs
0,18160098,,
1,18162814,,
2,18256926,,
3,18291036,,
4,18308129,,
5,18384551,,
6,18385720,,
7,18386141,,
8,18414202,,
9,18414504,,


### Batch 2: PMIDs 1000-1999

In [37]:
# Process PMIDs 1000-1999
batch_2_pmids = dceg_ins_pubs['PubMed ID'].iloc[1000:2000].tolist()

batch_2_df = pmid_to_sra_srp_pipeline(
    pmid_list=batch_2_pmids,
    max_workers=10,
    sleep_between_calls=0.05
)

# Save to CSV
batch_2_output = '21_sra_scrap_data/dceg_batch_0002_pmids_1000-1999.csv'
batch_2_df.to_csv(batch_2_output, index=False)
print(f"Batch 2 saved to: {batch_2_output}")

batch_2_df.head(10)

Processing 1000 PMIDs...

[Step 1/3] Fetching SRA IDs for PMIDs...


Fetching SRA IDs: 100%|███████████████████| 1000/1000 [03:25<00:00,  4.88PMID/s]



[Step 2/3] Found 0 unique SRA IDs. Fetching SRP IDs...

[Step 3/3] Building output DataFrame...

Pipeline Complete!
Total PMIDs processed: 1000
PMIDs with SRA IDs: 0 (0.0%)
PMIDs with SRP IDs: 0 (0.0%)

Batch 2 saved to: 21_sra_scrap_data/dceg_batch_0002_pmids_1000-1999.csv


Unnamed: 0,PMID,SRP_IDs,SRA_IDs
0,21965309,,
1,21969503,,
2,21971816,,
3,21971817,,
4,21974856,,
5,21975279,,
6,21976309,,
7,21980134,,
8,21981348,,
9,21989791,,


### Batch 3: PMIDs 2000-2999

In [39]:
# Process PMIDs 2000-2999
batch_3_pmids = dceg_ins_pubs['PubMed ID'].iloc[2000:3000].tolist()

batch_3_df = pmid_to_sra_srp_pipeline(
    pmid_list=batch_3_pmids,
    max_workers=10,
    sleep_between_calls=0.05
)

# Save to CSV
batch_3_output = '21_sra_scrap_data/dceg_batch_0003_pmids_2000-2999.csv'
batch_3_df.to_csv(batch_3_output, index=False)
print(f"Batch 3 saved to: {batch_3_output}")

batch_3_df.head(10)

Processing 1000 PMIDs...

[Step 1/3] Fetching SRA IDs for PMIDs...


Fetching SRA IDs: 100%|███████████████████| 1000/1000 [05:36<00:00,  2.97PMID/s]



DEBUG: pmid_to_sra_ids: {25743242: [], 25742478: [], 25757805: [], 25737332: [], 25762808: [], 25739496: [], 25751625: [], 25758095: [], 25748701: [], 25742475: [], 25773928: [], 25776013: [], 25785929: [], 25802341: [], 25802059: [], 25794878: [], 25788956: [], 25799011: [], 25796338: [], 25804953: ['1455908', '1455907', '1455906', '1455905', '1455904', '1455903', '1455902', '1455901', '1455900', '1455899', '1455898', '1455897', '1455896', '1455895', '1455894', '1455893', '1455892', '1455891', '1455890', '1455889', '1455888', '1455887', '1455886', '1455885', '1455884', '1455883', '1455882', '1455881', '1455880', '1455879', '1455878', '1455877', '1455876', '1455875', '1455874', '1455873', '1455872', '1455871', '1455870', '1455869', '1455868', '1455867', '1455866'], 25811150: [], 25823661: [], 25824105: [], 25829162: [], 25845708: [], 25830658: [], 25837669: [], 25849217: [], 25844730: [], 25849327: [], 25851181: [], 25855707: [], 25857409: [], 25867262: [], 25882629: [], 25890600: [], 

Fetching SRP IDs for SRA IDs: 100%|█████████████| 75/75 [00:27<00:00,  2.74ID/s]

DEBUG: sra_to_srp_ids: {'1455908': ['SRP057510'], '1455890': ['SRP057510'], '1455901': ['SRP057510'], '1052145': ['SRP048744'], '1455880': ['SRP057510'], '1455893': ['SRP057510'], '1052142': ['SRP048744'], '1455903': ['SRP057510'], '1052151': ['SRP048744'], '1455871': ['SRP057510'], '1455878': ['SRP057510'], '1455898': ['SRP057510'], '1455889': ['SRP057510'], '1455888': ['SRP057510'], '1455892': ['SRP057510'], '1455874': ['SRP057510'], '1455867': ['SRP057510'], '1052155': ['SRP048744'], '1052146': ['SRP048744'], '1455891': ['SRP057510'], '1052139': ['SRP048744'], '1894428': ['SRP048744'], '1455902': ['SRP057510'], '1052162': ['SRP048744'], '1052158': ['SRP048744'], '1455869': ['SRP057510'], '1455905': ['SRP057510'], '1052143': ['SRP048744'], '1052137': ['SRP048744'], '1052156': ['SRP048744'], '1052138': ['SRP048744'], '1052157': ['SRP048744'], '1052152': ['SRP048744'], '1455900': ['SRP057510'], '1455866': ['SRP057510'], '1052150': ['SRP048744'], '1894429': ['SRP048744'], '1052144': ['S




Unnamed: 0,PMID,SRP_IDs,SRA_IDs
0,25737332,,
1,25739496,,
2,25742475,,
3,25742478,,
4,25743242,,
5,25748701,,
6,25751625,,
7,25757805,,
8,25758095,,
9,25762808,,


### Batch 4: PMIDs 3000-3999

In [40]:
# Process PMIDs 3000-3999
batch_4_pmids = dceg_ins_pubs['PubMed ID'].iloc[3000:4000].tolist()

batch_4_df = pmid_to_sra_srp_pipeline(
    pmid_list=batch_4_pmids,
    max_workers=10,
    sleep_between_calls=0.05
)

# Save to CSV
batch_4_output = '21_sra_scrap_data/dceg_batch_0004_pmids_3000-3999.csv'
batch_4_df.to_csv(batch_4_output, index=False)
print(f"Batch 4 saved to: {batch_4_output}")

batch_4_df.head(10)

Processing 1000 PMIDs...

[Step 1/3] Fetching SRA IDs for PMIDs...


Fetching SRA IDs: 100%|███████████████████| 1000/1000 [06:18<00:00,  2.64PMID/s]



DEBUG: pmid_to_sra_ids: {29801099: [], 29855508: [], 29863827: [], 29807233: [], 29801475: [], 29863445: [], 29809280: [], 29844655: [], 29860330: [], 29852126: [], 29865259: [], 29866411: [], 29873077: [], 29889248: [], 29879518: [], 29878065: [], 29892016: [], 29894512: [], 29899550: [], 29901778: [], 29906417: [], 29904148: [], 29908479: [], 29912394: [], 29920516: [], 29915430: [], 29916399: [], 29917119: [], 29925378: [], 29920322: [], 29931120: [], 29931140: [], 29932357: [], 29933344: [], 29947736: [], 29971434: [], 29950612: [], 29948559: [], 29971781: [], 29968964: [], 29971594: [], 29975184: [], 29981168: [], 29982318: [], 29974477: [], 29982593: [], 29992560: [], 30007509: [], 30019399: [], 29987894: [], 29992993: [], 30020493: [], 29991571: [], 30006586: [], 30026010: [], 30028904: [], 30021753: [], 30030213: [], 30031635: [], 30042151: [], 30048408: [], 30049842: [], 30036595: [], 30047418: [], 30054336: [], 30044713: [], 30059977: [], 30041450: [], 30060076: [], 30073448:

Fetching SRP IDs for SRA IDs: 100%|███████████| 728/728 [04:29<00:00,  2.70ID/s]

DEBUG: sra_to_srp_ids: {'35830676': ['ERP165468'], '35830823': ['ERP165468'], '35830821': ['ERP165468'], '6162127': ['ERP110064'], '35830774': ['ERP165468'], '35830489': ['ERP165468'], '35830537': ['ERP165468'], '35830510': ['ERP165468'], '35830532': ['ERP165468'], '35830825': ['ERP165468'], '35830817': ['ERP165468'], '35830692': ['ERP165468'], '35830612': ['ERP165468'], '6162023': ['ERP110064'], '35830854': ['ERP165468'], '35830446': ['ERP165468'], '35830512': ['ERP165468'], '6162152': ['ERP110064'], '6162178': ['ERP110064'], '6161985': ['ERP110064'], '6161966': ['ERP110064'], '35830501': ['ERP165468'], '6162181': ['ERP110064'], '35830840': ['ERP165468'], '35830719': ['ERP165468'], '35830380': ['ERP165468'], '6162030': ['ERP110064'], '35830857': ['ERP165468'], '35830582': ['ERP165468'], '35830641': ['ERP165468'], '35830631': ['ERP165468'], '35830379': ['ERP165468'], '6162205': ['ERP110064'], '35830657': ['ERP165468'], '35830527': ['ERP165468'], '35830809': ['ERP165468'], '6162104': ['




Unnamed: 0,PMID,SRP_IDs,SRA_IDs
0,29801099,,
1,29801475,,
2,29807233,,
3,29809280,,
4,29844655,,
5,29852126,,
6,29855508,,
7,29860330,,
8,29863445,,
9,29863827,,


### Batch 5: PMIDs 4000-4999

In [41]:
# Process PMIDs 4000-4999
batch_5_pmids = dceg_ins_pubs['PubMed ID'].iloc[4000:5000].tolist()

batch_5_df = pmid_to_sra_srp_pipeline(
    pmid_list=batch_5_pmids,
    max_workers=10,
    sleep_between_calls=0.05
)

# Save to CSV
batch_5_output = '21_sra_scrap_data/dceg_batch_0005_pmids_4000-4999.csv'
batch_5_df.to_csv(batch_5_output, index=False)
print(f"Batch 5 saved to: {batch_5_output}")

batch_5_df.head(10)

Processing 778 PMIDs...

[Step 1/3] Fetching SRA IDs for PMIDs...


Fetching SRA IDs:   0%|                               | 0/778 [00:00<?, ?PMID/s]

Fetching SRA IDs: 100%|█████████████████████| 778/778 [04:22<00:00,  2.96PMID/s]
Fetching SRA IDs: 100%|█████████████████████| 778/778 [04:22<00:00,  2.96PMID/s]


DEBUG: pmid_to_sra_ids: {34146516: [], 34202037: [], 34162655: [], 34156301: [], 34160068: [], 34183723: [], 34157104: [], 34214881: [], 34174935: [], 34153328: [], 34218328: [], 34225255: [], 34222791: [], 34226613: [], 34234117: [], 34244153: [], 34240714: [], 34240448: [], 34236559: [], 34231883: [], 34244895: [], 34245454: [], 34255071: [], 34255164: [], 34262154: [], 34270795: [], 34286851: [], 34289968: [], 34258619: [], 34280845: [], 34294836: [], 34299799: [], 34301443: [], 34301922: ['14586709', '14586708', '14586707', '14586599', '14586598', '14586597', '14586596', '14586595', '14586594', '14586593', '14586592', '14586591', '14586590', '14586589', '14586588'], 34302788: [], 34310489: [], 34325938: [], 34320204: [], 34308104: [], 34324707: [], 34331495: [], 34334719: [], 34341517: [], 34349265: [], 34345026: [], 34351497: [], 34366148: [], 34376486: [], 34379524: [], 34382747: [], 34392847: [], 34398067: [], 34398068: [], 34401930: [], 34404683: [], 34407845: [], 34407489: [],

Fetching SRP IDs for SRA IDs: 100%|█████████████| 25/25 [00:09<00:00,  2.62ID/s]

DEBUG: sra_to_srp_ids: {'25715771': ['SRP412604'], '14586595': ['SRP321002'], '14586589': ['SRP321002'], '10203536': ['SRP250706'], '14586591': ['SRP321002'], '14586592': ['SRP321002'], '14586597': ['SRP321002'], '10203537': ['SRP250706'], '10203534': ['SRP250706'], '25715775': ['SRP412604'], '14586594': ['SRP321002'], '25715774': ['SRP412604'], '10203535': ['SRP250706'], '14586709': ['SRP321005'], '14586707': ['SRP321005'], '14586599': ['SRP321002'], '14586590': ['SRP321002'], '14586598': ['SRP321002'], '14586708': ['SRP321005'], '14586588': ['SRP321002'], '25715772': ['SRP412604'], '14586596': ['SRP321002'], '14586593': ['SRP321002'], '25715770': ['SRP412604'], '25715773': ['SRP412604']}

[Step 3/3] Building output DataFrame...

Pipeline Complete!
Total PMIDs processed: 778
PMIDs with SRA IDs: 3 (0.4%)
PMIDs with SRP IDs: 3 (0.4%)

Batch 5 saved to: 21_sra_scrap_data/dceg_batch_0005_pmids_4000-4999.csv





Unnamed: 0,PMID,SRP_IDs,SRA_IDs
0,34146516,,
1,34153328,,
2,34156301,,
3,34157104,,
4,34160068,,
5,34162655,,
6,34174935,,
7,34183723,,
8,34202037,,
9,34214881,,
