In [1]:
import html
import json
import re
import time
import xml.etree.ElementTree as ET

import pandas as pd
from Bio import Entrez

# Set your email (required by NCBI)
# NOTE(review): this is a hard-coded personal address; consider loading it from
# an environment variable or a local config file so it is not committed with
# the notebook.
Entrez.email = "chandrima.004@gmail.com"  

In [4]:
# Search term: metagenomic + shotgun (WGS strategy) + environmental, excluding
# records whose organism is human
search_term = (
    'metagenomic[All Fields] AND WGS[Strategy] AND environmental[All Fields] NOT human[Organism]'
)

# Upper bound on the number of IDs returned by this single search; increase if needed
RETMAX = 20000

# Search SRA for this query
handle = Entrez.esearch(
    db="sra",
    term=search_term,
    retmax=RETMAX,
)
try:
    record = Entrez.read(handle)
finally:
    # Close the handle even when parsing the response fails
    handle.close()

sra_ids = record["IdList"]
# NOTE: this is the number of IDs *returned* (capped by retmax), not the total
# number of matches; record["Count"], printed below, holds the total.
print(f"Found {len(sra_ids)} datasets.")

print("Search Term:", search_term)
print("Total results found:", record["Count"])
print("Returned IDs:", len(sra_ids))


Found 20000 datasets.
Search Term: metagenomic[All Fields] AND WGS[Strategy] AND environmental[All Fields] NOT human[Organism]
Total results found: 537184
Returned IDs: 20000


In [5]:
# Get summaries for the SRA IDs
#
# Batches are retried with exponential backoff: the recorded output of this
# notebook shows "HTTP Error 429: Too Many Requests" for some batches, and
# without a retry every failed batch is silently missing from `summaries`.
BATCH_SIZE = 50          # number of SRA IDs per esummary request
MAX_ATTEMPTS = 5         # total tries per batch before giving up
RETRY_BASE_DELAY = 1.0   # seconds; doubled after every failed attempt

summaries = []
failed_batches = []      # start offsets of batches that never succeeded

for i in range(0, len(sra_ids), BATCH_SIZE):
    batch = sra_ids[i:i+BATCH_SIZE]
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            handle = Entrez.esummary(db="sra", id=",".join(batch))
            try:
                summary_batch = Entrez.read(handle)
            finally:
                # Close the handle even when parsing the response fails
                handle.close()
            summaries.extend(summary_batch)
            break
        except Exception as e:
            if attempt == MAX_ATTEMPTS:
                # Best effort: report the lost batch and carry on with the rest
                print(f"Error retrieving batch {i}-{i+BATCH_SIZE}: {e}")
                failed_batches.append(i)
            else:
                # Back off before retrying: 1s, 2s, 4s, ...
                time.sleep(RETRY_BASE_DELAY * 2 ** (attempt - 1))

print("Number of summaries retrieved:", len(summaries))

Error retrieving batch 11800-11850: HTTP Error 429: Too Many Requests
Error retrieving batch 17350-17400: HTTP Error 429: Too Many Requests
Number of summaries retrieved: 19900


In [6]:
# Pretty-print the first raw summary record to inspect its fields
first_summary_json = json.dumps(summaries[0], indent=2)
print(first_summary_json)

{
  "Item": [],
  "Id": "38091106",
  "ExpXml": "<Summary><Title>HiSeq X Ten paired end sequencing</Title><Platform instrument_model=\"HiSeq X Ten\">ILLUMINA</Platform><Statistics total_runs=\"1\" total_spots=\"0\" total_bases=\"0\" total_size=\"0\" cluster_name=\"public\"/></Summary><Submitter acc=\"ERA30852561\" center_name=\"Department of Clinical Sciences Lund, Lund Univers\" contact_name=\"European Nucleotide Archive\" lab_name=\"European Nucleotide Archive\"/><Experiment acc=\"ERX13158885\" ver=\"1\" status=\"public\" name=\"HiSeq X Ten paired end sequencing\"/><Study acc=\"ERP163562\" name=\"The Gut Microbiome in Preeclampsia\"/><Organism taxid=\"408170\" ScientificName=\"human gut metagenome\"/><Sample acc=\"ERS21114313\" name=\"\"/><Instrument ILLUMINA=\"HiSeq X Ten\"/><Library_descriptor><LIBRARY_NAME>S21</LIBRARY_NAME><LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY><LIBRARY_SOURCE>METAGENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT> <PAIRED/> </L

In [7]:
def clean_xml(text):
    """Wrap an XML fragment in a single ``<root>`` element after basic sanitising.

    The input may contain several top-level elements, so it is wrapped in
    ``<root>...</root>`` to make it parseable as one document.

    Sanitising steps:
      * A bare ``&`` is escaped to ``&amp;``. The five predefined XML entities
        and numeric character references (``&#38;``, ``&#x26;``) are left
        untouched so they are not double-escaped.
      * Characters that are not legal in XML 1.0 (most control characters,
        lone surrogates, U+FFFE/U+FFFF) are removed. Legal non-ASCII text such
        as accented or CJK characters is preserved.

    Args:
        text: XML fragment as a string; ``None`` is treated as an empty string.

    Returns:
        The sanitised fragment wrapped in ``<root>...</root>``.

    Note:
        A numeric reference to an illegal code point (e.g. ``&#0;``) is kept
        as written and will still make the XML parser raise ``ParseError``.
    """
    if text is None:
        text = ""
    # Escape '&' unless it already starts a predefined entity or a numeric
    # character reference.
    text = re.sub(
        r'&(?!(?:amp|lt|gt|apos|quot);|#[0-9]+;|#x[0-9A-Fa-f]+;)',
        '&amp;',
        text,
    )
    # Keep only code points allowed by the XML 1.0 "Char" production.
    text = re.sub(
        r'[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]',
        '',
        text,
    )
    return f"<root>{text}</root>"

In [8]:
# Turn every esummary record into one metadata row per sequencing run
data = []


def _attr(node, key):
    """Return attribute `key` of `node`, or "" when the node or the attribute is missing."""
    return node.attrib.get(key, "") if node is not None else ""


for item in summaries:
    exp_xml = clean_xml(item.get("ExpXml", ""))
    runs_xml = clean_xml(item.get("Runs", ""))

    # Both fragments must parse; otherwise report the record and skip it
    try:
        exp_tree = ET.fromstring(exp_xml)
        run_tree = ET.fromstring(runs_xml)
    except ET.ParseError as e:
        print(f"⚠️ Failed to parse XML for item: {item.get('Id', 'Unknown')} — Error: {e}")
        continue

    study_node = exp_tree.find(".//Study")
    organism_node = exp_tree.find(".//Organism")
    platform_node = exp_tree.find(".//Platform")

    # Experiment-level fields shared by every run of this record
    # (insertion order here determines the column order of the rows)
    shared_fields = {
        "StudyAccession": _attr(study_node, "acc"),
        "StudyTitle": _attr(study_node, "name"),
        "Bioproject": exp_tree.findtext(".//Bioproject", default=""),
        "Biosample": exp_tree.findtext(".//Biosample", default=""),
        "Organism": _attr(organism_node, "ScientificName"),
        "TaxID": _attr(organism_node, "taxid"),
        "Platform": _attr(platform_node, "instrument_model"),
        "LibraryStrategy": exp_tree.findtext(".//Library_descriptor/LIBRARY_STRATEGY", default=""),
        "LibrarySource": exp_tree.findtext(".//Library_descriptor/LIBRARY_SOURCE", default=""),
        "Title": exp_tree.findtext(".//Title", default=""),
    }

    # One row per <Run>; records without any run contribute no rows
    data.extend(
        {"Run": run_node.attrib.get('acc', ''), **shared_fields}
        for run_node in run_tree.findall(".//Run")
    )


In [13]:
# Create DataFrame outside the loop
# The columns are listed explicitly so that an empty `data` list still yields
# a frame with the expected columns; otherwise the filters below would raise
# KeyError on a column-less frame.
METADATA_COLUMNS = [
    "Run",
    "StudyAccession",
    "StudyTitle",
    "Bioproject",
    "Biosample",
    "Organism",
    "TaxID",
    "Platform",
    "LibraryStrategy",
    "LibrarySource",
    "Title",
]
df = pd.DataFrame(data, columns=METADATA_COLUMNS)

# Filter for environmental WGS metagenomics
# (plain substring matches, so regex interpretation is switched off)
filtered = df[
    df["LibraryStrategy"].str.contains("WGS", na=False, regex=False) &
    df["LibrarySource"].str.contains("METAGENOMIC", na=False, regex=False)
]

# Step 1: Drop unwanted columns (run-level and sample-level identifiers)
filtered_cleaned = filtered.drop(columns=["Run", "Biosample", "TaxID", "Title"])

# Step 2: Remove rows where Organism mentions human (case-insensitive)
filtered_cleaned = filtered_cleaned[
    ~filtered_cleaned["Organism"].str.contains("human", case=False, na=False, regex=False)
]

# Step 3: Drop duplicates
# Rows are compared on all remaining columns, so one study can still appear
# more than once when e.g. its Organism or Platform differs between runs.
study_summary = filtered_cleaned.drop_duplicates()

# Step 4: Show and save
print(study_summary)
study_summary.to_csv("study_level_summary_cleaned.csv", index=False)
print("✅ Saved cleaned summary to 'study_level_summary_cleaned.csv'")


      StudyAccession                                         StudyTitle  \
37         SRP577669                       ARGs Circular Network in QTP   
49         SRP577669                       ARGs Circular Network in QTP   
69         SRP577669                       ARGs Circular Network in QTP   
79         ERP170238  Uncovering the taxonomic and functional divers...   
117        SRP577458  Metagenomic Assembly of Methanotrophic and Met...   
...              ...                                                ...   
18725      SRP555193                  Food metagenomes (CM_UNINA_FFOOD)   
18921      SRP555181  Mediation of systemic exposure to the immunosu...   
19028      SRP555060  Fungal Profiles of Puerto Rican Caves Through ...   
19035      SRP512443   CONSORTIUM VE303 phase 2 study stool metagenomes   
19537      ERP151343  Microbiome response to ketogenic diet in a mou...   

         Bioproject                      Organism                 Platform  \
37     PRJNA1248871  