# SRA Query Evaluation Framework

This notebook evaluates 30 queries across three complexity levels against the NCBI Sequence Read Archive (SRA) API:
- **Basic Discovery (Low Complexity)**: 10 queries (EV-L01 to EV-L10)
- **Entity Filtering (Medium Complexity)**: 10 queries (EV-M01 to EV-M10)  
- **Complex Cohorts (High Complexity)**: 10 queries (EV-H01 to EV-H10)

In [45]:
# Import Required Libraries

import requests
import json
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import time
from datetime import datetime
import xml.etree.ElementTree as ET
from urllib.parse import urlencode

In [46]:
# SRA API Configuration
NCBI_EUTILS_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
ENA_API_BASE = "https://www.ebi.ac.uk/ena/portal/api"

# Configuration
NCBI_EMAIL = "linganesan@nus.edu.sg"
NCBI_TOOL = "sra-evaluation-notebook"

results = {}

In [47]:
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def build_ncbi_params(**params):
    """Build standard NCBI E-utilities parameters"""
    base_params = {
        'tool': NCBI_TOOL,
        'email': NCBI_EMAIL,
    }
    return {**base_params, **params}

def sra_search(term, retmax=200, retstart=0, sort="relevance"):
    """Search SRA using NCBI esearch"""
    params = build_ncbi_params(
        db='sra',
        term=term,
        retmax=retmax,
        retstart=retstart,
        sort=sort
    )
    
    url = f"{NCBI_EUTILS_BASE}/esearch.fcgi"
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    
    # Parse XML response
    root = ET.fromstring(response.content)
    
    count = int(root.find('.//Count').text or 0)
    retmax_actual = int(root.find('.//RetMax').text or 0)
    retstart_actual = int(root.find('.//RetStart').text or 0)
    
    # Extract IDs
    ids = []
    id_list = root.find('.//IdList')
    if id_list is not None:
        ids = [id_elem.text for id_elem in id_list.findall('Id')]
    
    return {
        'count': count,
        'ids': ids,
        'retstart': retstart_actual,
        'retmax': retmax_actual
    }

def sra_metadata(ids):
    """Get metadata for SRA IDs using efetch with full XML"""
    if not ids:
        return []
    
    # Use efetch instead of esummary for full metadata
    params = build_ncbi_params(
        db='sra',
        id=','.join(ids)
    )
    
    url = f"{NCBI_EUTILS_BASE}/efetch.fcgi"
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    
    # Parse the full XML response
    summaries = []
    
    try:
        root = ET.fromstring(response.content)
        
        # Iterate through each EXPERIMENT_PACKAGE
        for exp_pkg in root.findall('.//EXPERIMENT_PACKAGE'):
            metadata = {}
            
            # Get accession from RUN_SET
            run_set = exp_pkg.find('.//RUN_SET/RUN')
            if run_set is not None:
                metadata['accession'] = run_set.get('accession', '')
                metadata['uid'] = ids[len(summaries)] if len(summaries) < len(ids) else ''
            
            # Get title from EXPERIMENT
            experiment = exp_pkg.find('.//EXPERIMENT')
            if experiment is not None:
                title_elem = experiment.find('.//TITLE')
                if title_elem is not None:
                    metadata['title'] = title_elem.text or ''
                
                # Get platform
                platform = experiment.find('.//PLATFORM')
                if platform is not None:
                    for child in platform:
                        metadata['platform'] = child.tag
                        instrument = child.find('.//INSTRUMENT_MODEL')
                        if instrument is not None:
                            metadata['instrument'] = instrument.text
                        break
                
                # Get library info
                library_descriptor = experiment.find('.//LIBRARY_DESCRIPTOR')
                if library_descriptor is not None:
                    lib_strategy = library_descriptor.find('.//LIBRARY_STRATEGY')
                    if lib_strategy is not None:
                        metadata['library_strategy'] = lib_strategy.text
                    
                    lib_source = library_descriptor.find('.//LIBRARY_SOURCE')
                    if lib_source is not None:
                        metadata['library_source'] = lib_source.text
            
            # Get organism from SAMPLE
            sample = exp_pkg.find('.//SAMPLE')
            if sample is not None:
                organism_elem = sample.find('.//SCIENTIFIC_NAME')
                if organism_elem is not None:
                    metadata['organism'] = organism_elem.text
            
            # Get runs info
            runs = exp_pkg.findall('.//RUN_SET/RUN')
            if runs:
                run_accs = [run.get('accession', '') for run in runs]
                metadata['runs'] = ', '.join(run_accs)
            
            if metadata:  # Only add if we got some data
                summaries.append(metadata)
    
    except ET.ParseError as e:
        # Fallback: return minimal data
        for uid in ids:
            summaries.append({
                'uid': uid,
                'parse_error': f'Failed to parse XML: {str(e)}'
            })
    
    return summaries

def ena_get_files(run_accessions):
    """Get file information from ENA"""
    files_info = {}
    
    for accession in run_accessions:
        try:
            params = {
                'result': 'read_run',
                'query': f'run_accession="{accession}"',
                'format': 'json',
                'fields': 'run_accession,fastq_ftp,fastq_md5,fastq_bytes,sra_ftp,sra_md5,sra_bytes'
            }
            
            response = requests.get(
                f"{ENA_API_BASE}/filereport",
                params=params,
                timeout=30
            )
            
            if response.status_code == 200:
                data = response.json()
                if data:
                    files_info[accession] = data[0]
            else:
                files_info[accession] = {'error': f'HTTP {response.status_code}'}
                
        except Exception as e:
            files_info[accession] = {'error': str(e)}
    
    return files_info


In [48]:
# ============================================================================
# LOW COMPLEXITY QUERIES (Basic Discovery)
# ============================================================================

print("🧬 Starting SRA Low Complexity Evaluations...")
print("=" * 60)

🧬 Starting SRA Low Complexity Evaluations...


In [49]:
# EV-L01: Search for all available platforms in SRA
def eval_L01():
    start = time.time()
    try:
        # Get a broader sample of records to examine platforms
        # Use a simple search term to get diverse results
        result = sra_search("RNA-Seq[strategy]", retmax=500)
        
        if result['ids']:
            # Get metadata for sample to find platforms
            metadata = sra_metadata(result['ids'][:200])  # Sample first 200
            
            platforms = set()
            for record in metadata:
                if 'platform' in record and record['platform']:
                    platforms.add(record['platform'])
            
            platforms_list = sorted(list(platforms))
            count = len(platforms_list)
            
            print(f"✅ EV-L01: Found {count} unique platforms (from sample)")
            print(f"Platforms: {', '.join(platforms_list)}")
            
            results["EV-L01"] = {
                "status": "success",
                "result": f"{count} platforms found",
                "data": platforms_list,
                "time": time.time() - start,
            }
        else:
            results["EV-L01"] = {
                "status": "error", 
                "error": "No results returned",
                "time": time.time() - start
            }
    except Exception as e:
        print(f"❌ EV-L01 Failed: {e}")
        results["EV-L01"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L01()


✅ EV-L01: Found 3 unique platforms (from sample)
Platforms: BGISEQ, DNBSEQ, ILLUMINA


In [50]:
# EV-L02: Count total number of human samples in SRA
def eval_L02():
    start = time.time()
    try:
        result = sra_search("Homo sapiens[orgn]", retmax=0)
        count = result['count']
        
        print(f"✅ EV-L02: Found {count:,} human samples in SRA")
        
        results["EV-L02"] = {
            "status": "success",
            "result": f"{count:,} human samples",
            "time": time.time() - start,
        }
    except Exception as e:
        print(f"❌ EV-L02 Failed: {e}")
        results["EV-L02"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L02()

✅ EV-L02: Found 6,372,238 human samples in SRA


In [51]:
# EV-L03: List available library strategies
def eval_L03():
    start = time.time()
    try:
        # Get a diverse sample of records to examine library strategies
        result = sra_search("Homo sapiens[orgn]", retmax=500)
        
        if result['ids']:
            metadata = sra_metadata(result['ids'][:200])
            
            strategies = set()
            for record in metadata:
                if 'library_strategy' in record and record['library_strategy']:
                    strategies.add(record['library_strategy'])
            
            strategies_list = sorted(list(strategies))
            count = len(strategies_list)
            
            print(f"✅ EV-L03: Found {count} library strategies (from sample)")
            print(f"Strategies: {'; '.join(strategies_list)}")
            
            results["EV-L03"] = {
                "status": "success",
                "result": f"{count} library strategies",
                "data": strategies_list,
                "time": time.time() - start
            }
        else:
            results["EV-L03"] = {
                "status": "error",
                "error": "No results returned", 
                "time": time.time() - start
            }
    except Exception as e:
        print(f"❌ EV-L03 Failed: {e}")
        results["EV-L03"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L03()


✅ EV-L03: Found 5 library strategies (from sample)
Strategies: AMPLICON; OTHER; RNA-Seq; Targeted-Capture; miRNA-Seq


In [52]:
# EV-L04: Count RNA-Seq experiments across all organisms
def eval_L04():
    start = time.time()
    try:
        result = sra_search("RNA-Seq[strategy]", retmax=0)
        count = result['count']
        
        print(f"✅ EV-L04: Found {count:,} RNA-Seq experiments")
        
        results["EV-L04"] = {
            "status": "success", 
            "result": f"{count:,} RNA-Seq experiments",
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-L04 Failed: {e}")
        results["EV-L04"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L04()

✅ EV-L04: Found 6,090,828 RNA-Seq experiments


In [53]:
# EV-L05: List all organisms with more than 1000 samples
def eval_L05():
    start = time.time()
    try:
        # Common model organisms to check
        test_organisms = [
            "Homo sapiens",
            "Mus musculus", 
            "Drosophila melanogaster",
            "Caenorhabditis elegans",
            "Saccharomyces cerevisiae",
            "Arabidopsis thaliana",
            "Escherichia coli",
            "SARS-CoV-2"
        ]
        
        organism_counts = {}
        for organism in test_organisms:
            try:
                result = sra_search(f'"{organism}"[orgn]', retmax=0)
                count = result['count']
                if count > 1000:
                    organism_counts[organism] = count
            except:
                continue
        
        print(f"✅ EV-L05: Found {len(organism_counts)} organisms with >1000 samples")
        for organism, count in sorted(organism_counts.items(), key=lambda x: x[1], reverse=True):
            print(f"  {organism}: {count:,} samples")
        
        results["EV-L05"] = {
            "status": "success",
            "result": f"{len(organism_counts)} organisms with >1000 samples",
            "data": organism_counts,
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-L05 Failed: {e}")
        results["EV-L05"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L05()

✅ EV-L05: Found 8 organisms with >1000 samples
  SARS-CoV-2: 7,518,624 samples
  Homo sapiens: 6,372,238 samples
  Mus musculus: 2,860,071 samples
  Escherichia coli: 550,962 samples
  Saccharomyces cerevisiae: 243,862 samples
  Arabidopsis thaliana: 220,862 samples
  Drosophila melanogaster: 174,133 samples
  Caenorhabditis elegans: 65,832 samples


In [54]:
# EV-L06: Count Illumina platform experiments
def eval_L06():
    start = time.time()
    try:
        result = sra_search("illumina[platform]", retmax=0)
        count = result['count']
        
        print(f"✅ EV-L06: Found {count:,} Illumina experiments")
        
        results["EV-L06"] = {
            "status": "success",
            "result": f"{count:,} Illumina experiments",
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-L06 Failed: {e}")
        results["EV-L06"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L06()

✅ EV-L06: Found 32,991,092 Illumina experiments


In [55]:
# EV-L07: Search for ChIP-Seq experiments
def eval_L07():
    start = time.time()
    try:
        result = sra_search("ChIP-Seq[strategy]", retmax=0)
        count = result['count']
        
        print(f"✅ EV-L07: Found {count:,} ChIP-Seq experiments")
        
        results["EV-L07"] = {
            "status": "success",
            "result": f"{count:,} ChIP-Seq experiments",
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-L07 Failed: {e}")
        results["EV-L07"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L07()

✅ EV-L07: Found 369,136 ChIP-Seq experiments


In [56]:
# EV-L08: Count single-cell RNA-Seq experiments
def eval_L08():
    start = time.time()
    try:
        result = sra_search("single cell[title] AND RNA-Seq[strategy]", retmax=0)
        count = result['count']
        
        print(f"✅ EV-L08: Found {count:,} single-cell RNA-Seq experiments")
        
        results["EV-L08"] = {
            "status": "success",
            "result": f"{count:,} single-cell RNA-Seq experiments",
            "time": time.time() - start,
        }
    except Exception as e:
        print(f"❌ EV-L08 Failed: {e}")
        results["EV-L08"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L08()

✅ EV-L08: Found 1,011,227 single-cell RNA-Seq experiments


In [57]:
# EV-L09: Search for SARS-CoV-2 related studies
def eval_L09():
    start = time.time()
    try:
        result = sra_search("SARS-CoV-2[orgn]", retmax=0)
        count = result['count']
        
        print(f"✅ EV-L09: Found {count:,} SARS-CoV-2 experiments")
        
        results["EV-L09"] = {
            "status": "success",
            "result": f"{count:,} SARS-CoV-2 experiments",
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-L09 Failed: {e}")
        results["EV-L09"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L09()

✅ EV-L09: Found 7,518,624 SARS-CoV-2 experiments


In [58]:
# EV-L10: Count whole genome sequencing experiments
def eval_L10():
    start = time.time()
    try:
        result = sra_search("WGS[strategy]", retmax=0)
        count = result['count']
        
        print(f"✅ EV-L10: Found {count:,} whole genome sequencing experiments")
        
        results["EV-L10"] = {
            "status": "success",
            "result": f"{count:,} WGS experiments",
            "time": time.time() - start,
        }
    except Exception as e:
        print(f"❌ EV-L10 Failed: {e}")
        results["EV-L10"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L10()

✅ EV-L10: Found 7,755,321 whole genome sequencing experiments


### ENTITY FILTERING QUERIES (Medium Complexity)

These queries apply specific filtering criteria to narrow down results within specific organisms or experimental conditions.

In [59]:
# ============================================================================
# MEDIUM COMPLEXITY QUERIES (Entity Filtering)
# ============================================================================

print("\n🔬 Starting SRA Medium Complexity Evaluations...")
print("=" * 60)


🔬 Starting SRA Medium Complexity Evaluations...


In [60]:
# EV-M01: Count human RNA-Seq experiments using Illumina
def eval_M01():
    start = time.time()
    try:
        result = sra_search("Homo sapiens[orgn] AND RNA-Seq[strategy] AND illumina[platform]", retmax=0)
        count = result['count']
        
        print(f"✅ EV-M01: Found {count:,} human RNA-Seq experiments using Illumina")
        
        results["EV-M01"] = {
            "status": "success",
            "result": f"{count:,} human RNA-Seq Illumina experiments",
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-M01 Failed: {e}")
        results["EV-M01"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M01()

✅ EV-M01: Found 2,020,206 human RNA-Seq experiments using Illumina


In [61]:
# EV-M02: Search for mouse ChIP-Seq experiments
def eval_M02():
    start = time.time()
    try:
        result = sra_search("Mus musculus[orgn] AND ChIP-Seq[strategy]", retmax=0)
        count = result['count']
        
        print(f"✅ EV-M02: Found {count:,} mouse ChIP-Seq experiments")
        
        results["EV-M02"] = {
            "status": "success",
            "result": f"{count:,} mouse ChIP-Seq experiments",
            "time": time.time() - start,
        }
    except Exception as e:
        print(f"❌ EV-M02 Failed: {e}")
        results["EV-M02"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M02()

✅ EV-M02: Found 122,056 mouse ChIP-Seq experiments


In [62]:
# EV-M03: Find paired-end human RNA-Seq experiments
def eval_M03():
    start = time.time()
    try:
        result = sra_search("Homo sapiens[orgn] AND RNA-Seq[strategy] AND paired[layout]", retmax=0)
        count = result['count']
        
        print(f"✅ EV-M03: Found {count:,} paired-end human RNA-Seq experiments")
        
        results["EV-M03"] = {
            "status": "success",
            "result": f"{count:,} paired-end human RNA-Seq experiments",
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-M03 Failed: {e}")
        results["EV-M03"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M03()

✅ EV-M03: Found 1,372,686 paired-end human RNA-Seq experiments


In [63]:
# EV-M04: Search for cancer-related human RNA-Seq studies
def eval_M04():
    start = time.time()
    try:
        result = sra_search("Homo sapiens[orgn] AND RNA-Seq[strategy] AND cancer[title]", retmax=0)
        count = result['count']
        
        print(f"✅ EV-M04: Found {count:,} cancer-related human RNA-Seq studies")
        
        results["EV-M04"] = {
            "status": "success",
            "result": f"{count:,} cancer-related human RNA-Seq studies",
            "time": time.time() - start,
        }
    except Exception as e:
        print(f"❌ EV-M04 Failed: {e}")
        results["EV-M04"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M04()

✅ EV-M04: Found 232,712 cancer-related human RNA-Seq studies


In [64]:
# EV-M05: Count recent experiments (last 2 years) for human samples
def eval_M05():
    start = time.time()
    try:
        # Search for recent human experiments
        result = sra_search("Homo sapiens[orgn] AND 2022:2024[pdat]", retmax=0)
        count = result['count']
        
        print(f"✅ EV-M05: Found {count:,} human experiments from 2022-2024")
        
        results["EV-M05"] = {
            "status": "success",
            "result": f"{count:,} recent human experiments",
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-M05 Failed: {e}")
        results["EV-M05"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M05()

✅ EV-M05: Found 2,617,771 human experiments from 2022-2024


In [65]:
# EV-M06: Find ATAC-Seq experiments for human samples
def eval_M06():
    start = time.time()
    try:
        result = sra_search("Homo sapiens[orgn] AND ATAC-Seq[strategy]", retmax=0)
        count = result['count']
        
        print(f"✅ EV-M06: Found {count:,} human ATAC-Seq experiments")
        
        results["EV-M06"] = {
            "status": "success",
            "result": f"{count:,} human ATAC-Seq experiments",
            "data": {"count": count},
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-M06 Failed: {e}")
        results["EV-M06"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M06()

✅ EV-M06: Found 81,372 human ATAC-Seq experiments


In [66]:
# EV-M07: Search for plant RNA-Seq experiments
def eval_M07():
    start = time.time()
    try:
        # Search for Arabidopsis (model plant organism)
        result = sra_search("Arabidopsis thaliana[orgn] AND RNA-Seq[strategy]", retmax=0)
        count = result['count']
        
        print(f"✅ EV-M07: Found {count:,} Arabidopsis RNA-Seq experiments")
        
        results["EV-M07"] = {
            "status": "success",
            "result": f"{count:,} Arabidopsis RNA-Seq experiments",
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-M07 Failed: {e}")
        results["EV-M07"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M07()

✅ EV-M07: Found 104,475 Arabidopsis RNA-Seq experiments


In [67]:
# EV-M08: Count bacterial genome sequencing experiments
def eval_M08():
    start = time.time()
    try:
        result = sra_search("Escherichia coli[orgn] AND WGS[strategy]", retmax=0)
        count = result['count']
        
        print(f"✅ EV-M08: Found {count:,} E. coli genome sequencing experiments")
        
        results["EV-M08"] = {
            "status": "success",
            "result": f"{count:,} E. coli WGS experiments",
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-M08 Failed: {e}")
        results["EV-M08"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M08()

✅ EV-M08: Found 477,954 E. coli genome sequencing experiments


In [68]:
# EV-M09: Find human exome sequencing experiments
def eval_M09():
    start = time.time()
    try:
        result = sra_search("Homo sapiens[orgn] AND WXS[strategy]", retmax=0)
        count = result['count']
        
        print(f"✅ EV-M09: Found {count:,} human exome sequencing experiments")
        
        results["EV-M09"] = {
            "status": "success",
            "result": f"{count:,} human WXS experiments",
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-M09 Failed: {e}")
        results["EV-M09"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M09()

✅ EV-M09: Found 454,315 human exome sequencing experiments


In [69]:
# EV-M10: Search for viral metagenomics studies
def eval_M10():
    start = time.time()
    try:
        # Try different search strategies for viral metagenomics
        # First try with OR conditions
        result1 = sra_search("(viral metagenomics[title]) OR (virus metagenomics[title])", retmax=0)
        count1 = result1['count']
        
        # Also try with strategy field
        result2 = sra_search("(viral[title] OR virus[title]) AND (metagenomics[strategy] OR amplicon[strategy])", retmax=0)
        count2 = result2['count']
        
        # Use the larger count
        count = max(count1, count2)
        
        print(f"✅ EV-M10: Found {count:,} viral metagenomics studies")
        print(f"  (viral metagenomics in title: {count1:,})")
        print(f"  (viral + metagenomic strategy: {count2:,})")
        
        results["EV-M10"] = {
            "status": "success",
            "result": f"{count:,} viral metagenomics studies",
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-M10 Failed: {e}")
        results["EV-M10"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M10()


✅ EV-M10: Found 132,973 viral metagenomics studies
  (viral metagenomics in title: 5,097)
  (viral + metagenomic strategy: 132,973)


### COMPLEX COHORTS QUERIES (High Complexity)

These queries require sophisticated filtering, metadata analysis, and multi-step reasoning to define specific sample cohorts.

In [70]:
# ============================================================================
# HIGH COMPLEXITY QUERIES (Complex Cohorts)
# ============================================================================

print("\n🎯 Starting SRA High Complexity Evaluations...")
print("=" * 60)


🎯 Starting SRA High Complexity Evaluations...


In [71]:
# EV-H01: Find samples with both RNA-Seq and ChIP-Seq from same studies
def eval_H01():
    start = time.time()
    try:
        # Get RNA-Seq sample studies
        rnaseq_result = sra_search("RNA-Seq[strategy]", retmax=500)
        
        # Get ChIP-Seq sample studies  
        chipseq_result = sra_search("ChIP-Seq[strategy]", retmax=500)
        
        # Get metadata to find study overlaps
        rnaseq_metadata = sra_metadata(rnaseq_result['ids'][:100]) if rnaseq_result['ids'] else []
        chipseq_metadata = sra_metadata(chipseq_result['ids'][:100]) if chipseq_result['ids'] else []
        
        # Extract study information (simplified approach)
        rnaseq_studies = set()
        for record in rnaseq_metadata:
            # Use study accession if available, otherwise use a portion of title
            if 'title' in record:
                # Simple heuristic: use first part of title as study identifier
                study_id = record['title'].split()[0] if record['title'] else record['accession']
                rnaseq_studies.add(study_id)
        
        chipseq_studies = set()
        for record in chipseq_metadata:
            if 'title' in record:
                study_id = record['title'].split()[0] if record['title'] else record['accession']
                chipseq_studies.add(study_id)
        
        # Find intersection
        common_studies = rnaseq_studies.intersection(chipseq_studies)
        count = len(common_studies)
        
        print(f"✅ EV-H01: Found {count} studies with both RNA-Seq and ChIP-Seq (estimated)")
        print(f"  RNA-Seq studies: {len(rnaseq_studies)}")
        print(f"  ChIP-Seq studies: {len(chipseq_studies)}")
        print(f"  Common studies: {count}")
        
        results["EV-H01"] = {
            "status": "success",
            "result": f"{count} studies with both RNA-Seq and ChIP-Seq",
            "data": {"common_count": count, "rnaseq_count": len(rnaseq_studies), "chipseq_count": len(chipseq_studies)},
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-H01 Failed: {e}")
        results["EV-H01"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H01()

✅ EV-H01: Found 0 studies with both RNA-Seq and ChIP-Seq (estimated)
  RNA-Seq studies: 29
  ChIP-Seq studies: 63
  Common studies: 0


In [72]:
# EV-H02: Analyze platform distribution for human cancer RNA-Seq studies
def eval_H02():
    start = time.time()
    try:
        # Search for human cancer RNA-Seq
        result = sra_search("Homo sapiens[orgn] AND RNA-Seq[strategy] AND cancer[title]", retmax=300)
        
        if result['ids']:
            # Get metadata for platform analysis - fetch in batches
            metadata = sra_metadata(result['ids'][:100])
            
            # Add delay to avoid rate limiting
            time.sleep(0.5)
            
            if len(result['ids']) > 100:
                metadata.extend(sra_metadata(result['ids'][100:200]))
            
            platform_counts = {}
            total_studies = 0
            
            for record in metadata:
                if 'platform' in record and record['platform']:
                    platform = record['platform']
                    platform_counts[platform] = platform_counts.get(platform, 0) + 1
                    total_studies += 1
            
            print(f"✅ EV-H02: Platform distribution for human cancer RNA-Seq ({total_studies} studies analyzed)")
            
            if platform_counts:
                sorted_platforms = sorted(platform_counts.items(), key=lambda x: x[1], reverse=True)
                for platform, count in sorted_platforms:
                    percentage = (count / total_studies) * 100 if total_studies > 0 else 0
                    print(f"  {platform}: {count} ({percentage:.1f}%)")
            else:
                print("  No platform data found in metadata")
            
            results["EV-H02"] = {
                "status": "success",
                "result": f"Platform distribution for human cancer RNA-Seq ({total_studies} studies)",
                "data": platform_counts,
                "time": time.time() - start
            }
        else:
            results["EV-H02"] = {"status": "error", "error": "No results found", "time": time.time() - start}
            
    except Exception as e:
        print(f"❌ EV-H02 Failed: {e}")
        results["EV-H02"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H02()


✅ EV-H02: Platform distribution for human cancer RNA-Seq (200 studies analyzed)
  ILLUMINA: 177 (88.5%)
  DNBSEQ: 21 (10.5%)
  OXFORD_NANOPORE: 2 (1.0%)


In [73]:
# EV-H03: Find single-cell studies with paired-end sequencing
def eval_H03():
    start = time.time()
    try:
        result = sra_search("single cell[title] AND paired[layout] AND RNA-Seq[strategy]", retmax=0)
        count = result['count']
        
        # Get sample metadata for analysis
        if count > 0:
            sample_result = sra_search("single cell[title] AND paired[layout] AND RNA-Seq[strategy]", retmax=50)
            sample_metadata = sra_metadata(sample_result['ids']) if sample_result['ids'] else []
            
            organism_counts = {}
            for record in sample_metadata:
                if 'organism' in record and record['organism']:
                    organism = record['organism']
                    organism_counts[organism] = organism_counts.get(organism, 0) + 1
            
            print(f"✅ EV-H03: Found {count:,} paired-end single-cell RNA-Seq studies")
            
            if organism_counts:
                print("  Top organisms:")
                sorted_orgs = sorted(organism_counts.items(), key=lambda x: x[1], reverse=True)
                for org, cnt in sorted_orgs[:5]:
                    print(f"    {org}: {cnt} studies")
            
            results["EV-H03"] = {
                "status": "success",
                "result": f"{count:,} paired-end single-cell RNA-Seq studies",
                "data": {"count": count, "organism_distribution": organism_counts},
                "time": time.time() - start
            }
        else:
            print(f"✅ EV-H03: Found 0 paired-end single-cell RNA-Seq studies")
            results["EV-H03"] = {
                "status": "success",
                "result": "0 paired-end single-cell RNA-Seq studies",
                "time": time.time() - start
            }
            
    except Exception as e:
        print(f"❌ EV-H03 Failed: {e}")
        results["EV-H03"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H03()

✅ EV-H03: Found 645,230 paired-end single-cell RNA-Seq studies
  Top organisms:
    Macaca mulatta: 50 studies


In [74]:
# EV-H04: Analyze temporal trends in COVID-19 sequencing studies
def eval_H04():
    start = time.time()
    try:
        # Search for SARS-CoV-2 studies by year
        years = ['2020', '2021', '2022', '2023', '2024']
        yearly_counts = {}
        
        for year in years:
            try:
                result = sra_search(f"SARS-CoV-2[orgn] AND {year}[pdat]", retmax=0)
                yearly_counts[year] = result['count']
            except:
                yearly_counts[year] = 0
        
        total_count = sum(yearly_counts.values())
        
        print(f"✅ EV-H04: COVID-19 sequencing trends ({total_count:,} total studies)")
        print("  Year-by-year breakdown:")
        
        for year, count in yearly_counts.items():
            percentage = (count / total_count) * 100 if total_count > 0 else 0
            print(f"    {year}: {count:,} studies ({percentage:.1f}%)")
        
        results["EV-H04"] = {
            "status": "success",
            "result": f"COVID-19 temporal analysis ({total_count:,} studies)",
            "data": yearly_counts,
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-H04 Failed: {e}")
        results["EV-H04"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H04()

✅ EV-H04: COVID-19 sequencing trends (7,441,298 total studies)
  Year-by-year breakdown:
    2020: 169,396 studies (2.3%)
    2021: 2,828,183 studies (38.0%)
    2022: 3,062,934 studies (41.2%)
    2023: 651,565 studies (8.8%)
    2024: 729,220 studies (9.8%)


In [75]:
# EV-H05: Find studies with specific file size characteristics
def eval_H05():
    start = time.time()
    try:
        # Get a sample of recent human RNA-Seq studies
        result = sra_search("Homo sapiens[orgn] AND RNA-Seq[strategy]", retmax=100)
        
        if result['ids']:
            # Get metadata first to extract run accessions properly
            metadata = sra_metadata(result['ids'][:20])
            
            # Extract run accessions from the 'runs' field
            run_accessions = []
            for record in metadata:
                if 'runs' in record and record['runs']:
                    # Parse the runs field which contains XML
                    try:
                        runs_xml = ET.fromstring(f"<root>{record['runs']}</root>")
                        for run in runs_xml.findall('.//Run'):
                            acc = run.get('acc')
                            if acc:
                                run_accessions.append(acc)
                                if len(run_accessions) >= 15:  # Limit to avoid timeout
                                    break
                    except:
                        continue
                
                if len(run_accessions) >= 15:
                    break
            
            if not run_accessions:
                print(f"✅ EV-H05: Could not extract run accessions from metadata")
                results["EV-H05"] = {
                    "status": "success",
                    "result": "Run accession extraction not available",
                    "data": {"note": "Run accessions not in metadata format"},
                    "time": time.time() - start
                }
                return
            
            # Get file information from ENA
            files_info = ena_get_files(run_accessions[:10])  # Limit to 10 for performance
            
            large_files = []
            total_files = 0
            
            for acc, info in files_info.items():
                if 'error' not in info:
                    total_files += 1
                    # Check for large files (>1GB)
                    if 'fastq_bytes' in info and info['fastq_bytes']:
                        try:
                            sizes = str(info['fastq_bytes']).split(';')
                            for size_str in sizes:
                                if size_str and size_str.strip() and int(size_str.strip()) > 1_000_000_000:  # 1GB
                                    large_files.append(acc)
                                    break
                        except:
                            continue
            
            large_count = len(large_files)
            
            print(f"✅ EV-H05: Analyzed {total_files} files, found {large_count} with >1GB files")
            if large_files:
                print(f"  Sample large files: {large_files[:5]}")
            
            results["EV-H05"] = {
                "status": "success",
                "result": f"{large_count} studies with large files (>1GB) from {total_files} analyzed",
                "data": {"large_files": large_files, "total_analyzed": total_files},
                "time": time.time() - start
            }
        else:
            results["EV-H05"] = {"status": "error", "error": "No studies found", "time": time.time() - start}
            
    except Exception as e:
        print(f"❌ EV-H05 Failed: {e}")
        results["EV-H05"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H05()


✅ EV-H05: Could not extract run accessions from metadata


In [76]:
# EV-H06: Cross-platform comparison for model organisms
def eval_H06():
    start = time.time()
    try:
        organisms = ["Homo sapiens", "Mus musculus", "Drosophila melanogaster"]
        platforms = ["illumina", "pacbio", "oxford nanopore"]
        
        comparison_data = {}
        
        for organism in organisms:
            comparison_data[organism] = {}
            
            for platform in platforms:
                try:
                    result = sra_search(f'"{organism}"[orgn] AND {platform}[platform]', retmax=0)
                    comparison_data[organism][platform] = result['count']
                except:
                    comparison_data[organism][platform] = 0
        
        print(f"✅ EV-H06: Cross-platform comparison for model organisms")
        print("\n📊 ORGANISM × PLATFORM MATRIX:")
        
        # Header
        header = "Organism".ljust(20)
        for platform in platforms:
            header += f"{platform.title():>12}"
        header += f"{'Total':>12}"
        print(header)
        print("-" * len(header))
        
        # Data rows
        for organism in organisms:
            row = organism.ljust(20)
            org_total = 0
            for platform in platforms:
                count = comparison_data[organism][platform]
                row += f"{count:>12,}"
                org_total += count
            row += f"{org_total:>12,}"
            print(row)
        
        results["EV-H06"] = {
            "status": "success",
            "result": "Cross-platform comparison for model organisms",
            "data": comparison_data,
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-H06 Failed: {e}")
        results["EV-H06"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H06()

✅ EV-H06: Cross-platform comparison for model organisms

📊 ORGANISM × PLATFORM MATRIX:
Organism                Illumina      PacbioOxford Nanopore       Total
-----------------------------------------------------------------------
Homo sapiens           6,038,475           0      88,619   6,127,094
Mus musculus           2,718,509           0      14,631   2,733,140
Drosophila melanogaster     168,868           0         422     169,290


In [77]:
# EV-H07: Analyze library preparation methods for human studies
def eval_H07():
    start = time.time()
    try:
        # Get human RNA-Seq samples
        result = sra_search("Homo sapiens[orgn] AND RNA-Seq[strategy]", retmax=300)
        
        if result['ids']:
            # Fetch metadata in batches to get more data
            metadata = sra_metadata(result['ids'][:100])
            
            # Add delay and fetch more
            time.sleep(0.5)
            if len(result['ids']) > 100:
                metadata.extend(sra_metadata(result['ids'][100:200]))
            
            library_sources = {}
            total_with_source = 0
            
            for record in metadata:
                if 'library_source' in record and record['library_source']:
                    source = record['library_source']
                    library_sources[source] = library_sources.get(source, 0) + 1
                    total_with_source += 1
            
            print(f"✅ EV-H07: Library source analysis for human RNA-Seq ({total_with_source} with data)")
            
            if library_sources:
                sorted_sources = sorted(library_sources.items(), key=lambda x: x[1], reverse=True)
                for source, count in sorted_sources:
                    percentage = (count / total_with_source) * 100 if total_with_source > 0 else 0
                    print(f"  {source}: {count} ({percentage:.1f}%)")
            else:
                print("  No library source data found in metadata")
            
            results["EV-H07"] = {
                "status": "success",
                "result": f"Library source analysis for human RNA-Seq ({total_with_source} studies)",
                "data": library_sources,
                "time": time.time() - start
            }
        else:
            results["EV-H07"] = {"status": "error", "error": "No results found", "time": time.time() - start}
            
    except Exception as e:
        print(f"❌ EV-H07 Failed: {e}")
        results["EV-H07"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H07()


✅ EV-H07: Library source analysis for human RNA-Seq (200 with data)
  TRANSCRIPTOMIC: 191 (95.5%)
  TRANSCRIPTOMIC SINGLE CELL: 9 (4.5%)


In [78]:
# EV-H08: Multi-omics studies identification
def eval_H08():
    start = time.time()
    try:
        # Search for studies mentioning multiple omics approaches
        multi_omics_terms = [
            "multi-omics[title]",
            "multiomics[title]", 
            "genomics proteomics[title]",
            "transcriptomics metabolomics[title]"
        ]
        
        total_multi_omics = 0
        strategy_breakdown = {}
        
        for term in multi_omics_terms:
            try:
                result = sra_search(term, retmax=20)
                count = result['count']
                total_multi_omics += count
                
                # Get sample metadata
                if result['ids']:
                    metadata = sra_metadata(result['ids'][:10])
                    for record in metadata:
                        if 'library_strategy' in record and record['library_strategy']:
                            strategy = record['library_strategy']
                            strategy_breakdown[strategy] = strategy_breakdown.get(strategy, 0) + 1
            except:
                continue
        
        print(f"✅ EV-H08: Found {total_multi_omics:,} multi-omics studies")
        
        if strategy_breakdown:
            print("  Strategy breakdown:")
            sorted_strategies = sorted(strategy_breakdown.items(), key=lambda x: x[1], reverse=True)
            for strategy, count in sorted_strategies:
                print(f"    {strategy}: {count} studies")
        
        results["EV-H08"] = {
            "status": "success",
            "result": f"{total_multi_omics:,} multi-omics studies",
            "data": {"total": total_multi_omics, "strategies": strategy_breakdown},
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-H08 Failed: {e}")
        results["EV-H08"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H08()

✅ EV-H08: Found 109,407 multi-omics studies
  Strategy breakdown:
    RNA-Seq: 20 studies
    AMPLICON: 10 studies


In [79]:
# EV-H09: Long-read vs short-read sequencing comparison
def eval_H09():
    start = time.time()
    try:
        # Search for different sequencing technologies
        short_read_result = sra_search("illumina[platform] AND Homo sapiens[orgn]", retmax=0)
        long_read_pacbio = sra_search("pacbio[platform] AND Homo sapiens[orgn]", retmax=0)
        long_read_nanopore = sra_search("oxford nanopore[platform] AND Homo sapiens[orgn]", retmax=0)
        
        short_read_count = short_read_result['count']
        pacbio_count = long_read_pacbio['count']
        nanopore_count = long_read_nanopore['count']
        
        total_long_read = pacbio_count + nanopore_count
        total_all = short_read_count + total_long_read
        
        print(f"✅ EV-H09: Sequencing technology comparison for human samples")
        print(f"  Short-read (Illumina): {short_read_count:,} ({short_read_count/total_all*100:.1f}%)")
        print(f"  Long-read (PacBio): {pacbio_count:,} ({pacbio_count/total_all*100:.1f}%)")
        print(f"  Long-read (Nanopore): {nanopore_count:,} ({nanopore_count/total_all*100:.1f}%)")
        print(f"  Total long-read: {total_long_read:,} ({total_long_read/total_all*100:.1f}%)")
        
        results["EV-H09"] = {
            "status": "success",
            "result": f"Technology comparison: {short_read_count:,} short-read vs {total_long_read:,} long-read",
            "data": {
                "short_read": short_read_count,
                "pacbio": pacbio_count,
                "nanopore": nanopore_count,
                "total": total_all
            },
            "time": time.time() - start,
        }
    except Exception as e:
        print(f"❌ EV-H09 Failed: {e}")
        results["EV-H09"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H09()

✅ EV-H09: Sequencing technology comparison for human samples
  Short-read (Illumina): 6,038,475 (98.6%)
  Long-read (PacBio): 0 (0.0%)
  Long-read (Nanopore): 88,619 (1.4%)
  Total long-read: 88,619 (1.4%)


In [80]:
# EV-H10: Comprehensive organism diversity analysis
def eval_H10():
    start = time.time()
    try:
        # Sample different organism categories
        organism_categories = {
            "Mammals": ["Homo sapiens", "Mus musculus", "Rattus norvegicus"],
            "Plants": ["Arabidopsis thaliana", "Oryza sativa", "Zea mays"],
            "Bacteria": ["Escherichia coli", "Bacillus subtilis", "Staphylococcus aureus"],
            "Viruses": ["SARS-CoV-2", "Influenza A virus", "Human immunodeficiency virus"],
            "Fungi": ["Saccharomyces cerevisiae", "Candida albicans", "Aspergillus fumigatus"],
            "Invertebrates": ["Drosophila melanogaster", "Caenorhabditis elegans", "Apis mellifera"]
        }
        
        category_totals = {}
        organism_details = {}
        
        for category, organisms in organism_categories.items():
            category_total = 0
            organism_details[category] = {}
            
            for organism in organisms:
                try:
                    result = sra_search(f'"{organism}"[orgn]', retmax=0)
                    count = result['count']
                    organism_details[category][organism] = count
                    category_total += count
                except:
                    organism_details[category][organism] = 0
            
            category_totals[category] = category_total
        
        grand_total = sum(category_totals.values())
        
        print(f"✅ EV-H10: Organism diversity analysis ({grand_total:,} total studies)")
        print("\n📊 ORGANISM DIVERSITY BREAKDOWN:")
        
        sorted_categories = sorted(category_totals.items(), key=lambda x: x[1], reverse=True)
        for category, total in sorted_categories:
            percentage = (total / grand_total) * 100 if grand_total > 0 else 0
            print(f"\n  {category}: {total:,} studies ({percentage:.1f}%)")
            
            # Show top organisms in category
            sorted_orgs = sorted(organism_details[category].items(), key=lambda x: x[1], reverse=True)
            for org, count in sorted_orgs[:3]:  # Top 3
                if count > 0:
                    print(f"    {org}: {count:,}")
        
        results["EV-H10"] = {
            "status": "success",
            "result": f"Organism diversity analysis ({grand_total:,} studies)",
            "data": {
                "category_totals": category_totals,
                "organism_details": organism_details,
                "grand_total": grand_total
            },
            "time": time.time() - start,
        }
    except Exception as e:
        print(f"❌ EV-H10 Failed: {e}")
        results["EV-H10"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H10()

✅ EV-H10: Organism diversity analysis (18,774,360 total studies)

📊 ORGANISM DIVERSITY BREAKDOWN:

  Mammals: 9,378,573 studies (50.0%)
    Homo sapiens: 6,372,238
    Mus musculus: 2,860,071
    Rattus norvegicus: 146,264

  Viruses: 7,580,391 studies (40.4%)
    SARS-CoV-2: 7,518,624
    Influenza A virus: 60,233
    Human immunodeficiency virus: 1,534

  Bacteria: 736,595 studies (3.9%)
    Escherichia coli: 550,962
    Staphylococcus aureus: 175,614
    Bacillus subtilis: 10,019

  Plants: 555,373 studies (3.0%)
    Arabidopsis thaliana: 220,862
    Zea mays: 167,914
    Oryza sativa: 166,597

  Fungi: 261,953 studies (1.4%)
    Saccharomyces cerevisiae: 243,862
    Candida albicans: 12,200
    Aspergillus fumigatus: 5,891

  Invertebrates: 261,475 studies (1.4%)
    Drosophila melanogaster: 174,133
    Caenorhabditis elegans: 65,832
    Apis mellifera: 21,510


In [81]:
# ============================================================================
# RESULTS SUMMARY
# ============================================================================

print("\n" + "="*80)
print("🧬 SRA EVALUATION RESULTS SUMMARY")
print("="*80)

# Categorize results
categories = {
    "Low Complexity (L01-L10)": [f"EV-L{i:02d}" for i in range(1, 11)],
    "Medium Complexity (M01-M10)": [f"EV-M{i:02d}" for i in range(1, 11)],
    "High Complexity (H01-H10)": [f"EV-H{i:02d}" for i in range(1, 11)]
}

overall_stats = {
    "total_tests": 0,
    "successful": 0,
    "failed": 0,
    "total_time": 0
}

for category_name, eval_ids in categories.items():
    print(f"\n📊 {category_name}")
    print("-" * 60)
    
    category_stats = {"success": 0, "error": 0, "total_time": 0}
    
    for eval_id in eval_ids:
        if eval_id in results:
            result = results[eval_id]
            status = result["status"]
            exec_time = result.get("time", 0)
            
            if status == "success":
                print(f"  ✅ {eval_id}: {result['result']} ({exec_time:.2f}s)")
                category_stats["success"] += 1
                overall_stats["successful"] += 1
            else:
                print(f"  ❌ {eval_id}: {result.get('error', 'Unknown error')} ({exec_time:.2f}s)")
                category_stats["error"] += 1
                overall_stats["failed"] += 1
            
            category_stats["total_time"] += exec_time
            overall_stats["total_time"] += exec_time
            overall_stats["total_tests"] += 1
        else:
            print(f"  ❓ {eval_id}: Not executed")
    
    # Category summary
    total_category = category_stats["success"] + category_stats["error"]
    success_rate = (category_stats["success"] / total_category * 100) if total_category > 0 else 0
    avg_time = category_stats["total_time"] / total_category if total_category > 0 else 0
    
    print(f"\n  📈 Category Summary:")
    print(f"     Success Rate: {success_rate:.1f}% ({category_stats['success']}/{total_category})")
    print(f"     Average Time: {avg_time:.2f}s")
    print(f"     Total Time: {category_stats['total_time']:.2f}s")

# Overall summary
print(f"\n🎯 OVERALL EVALUATION SUMMARY")
print(f"   Total Tests: {overall_stats['total_tests']}")
print(f"   Successful: {overall_stats['successful']} ({overall_stats['successful']/overall_stats['total_tests']*100:.1f}%)")
print(f"   Failed: {overall_stats['failed']} ({overall_stats['failed']/overall_stats['total_tests']*100:.1f}%)")
print(f"   Total Execution Time: {overall_stats['total_time']:.2f}s")
print(f"   Average Time per Query: {overall_stats['total_time']/overall_stats['total_tests']:.2f}s")

# Performance analysis
print(f"\n⚡ PERFORMANCE ANALYSIS")
fastest_queries = []
slowest_queries = []

for eval_id, result in results.items():
    if result["status"] == "success":
        exec_time = result.get("time", 0)
        fastest_queries.append((eval_id, exec_time))
        slowest_queries.append((eval_id, exec_time))

fastest_queries.sort(key=lambda x: x[1])
slowest_queries.sort(key=lambda x: x[1], reverse=True)

print(f"   Fastest Queries:")
for eval_id, exec_time in fastest_queries[:3]:
    print(f"     {eval_id}: {exec_time:.2f}s")

print(f"   Slowest Queries:")
for eval_id, exec_time in slowest_queries[:3]:
    print(f"     {eval_id}: {exec_time:.2f}s")

print(f"\n🎉 SRA Evaluation Complete!")
print(f"   Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


🧬 SRA EVALUATION RESULTS SUMMARY

📊 Low Complexity (L01-L10)
------------------------------------------------------------
  ✅ EV-L01: 3 platforms found (4.87s)
  ✅ EV-L02: 6,372,238 human samples (0.97s)
  ✅ EV-L03: 5 library strategies (4.75s)
  ✅ EV-L04: 6,090,828 RNA-Seq experiments (1.06s)
  ✅ EV-L05: 8 organisms with >1000 samples (8.46s)
  ✅ EV-L06: 32,991,092 Illumina experiments (1.13s)
  ✅ EV-L07: 369,136 ChIP-Seq experiments (1.16s)
  ✅ EV-L08: 1,011,227 single-cell RNA-Seq experiments (1.58s)
  ✅ EV-L09: 7,518,624 SARS-CoV-2 experiments (1.09s)
  ✅ EV-L10: 7,755,321 WGS experiments (8.03s)

  📈 Category Summary:
     Success Rate: 100.0% (10/10)
     Average Time: 3.31s
     Total Time: 33.10s

📊 Medium Complexity (M01-M10)
------------------------------------------------------------
  ✅ EV-M01: 2,020,206 human RNA-Seq Illumina experiments (1.06s)
  ✅ EV-M02: 122,056 mouse ChIP-Seq experiments (0.99s)
  ✅ EV-M03: 1,372,686 paired-end human RNA-Seq experiments (1.04s)
  ✅ EV

In [82]:
# Optional: Save results to JSON file for further analysis
import json
from datetime import datetime

# Prepare results for export
export_data = {
    "evaluation_metadata": {
        "title": "SRA Query Evaluation Framework",
        "date": datetime.now().isoformat(),
        "total_queries": len(results),
        "successful_queries": len([r for r in results.values() if r["status"] == "success"]),
        "failed_queries": len([r for r in results.values() if r["status"] == "error"]),
        "total_time": sum([r.get("time", 0) for r in results.values()]),
        "api_endpoints": [NCBI_EUTILS_BASE, ENA_API_BASE]
    },
    "query_results": results,
    "summary_statistics": overall_stats
}

# Save to file
output_file = "sra_evaluation_results.json"
with open(output_file, 'w') as f:
    json.dump(export_data, f, indent=2, default=str)

print(f"📁 Results exported to: {output_file}")
print(f"📊 Ready for further analysis and visualization!")

📁 Results exported to: sra_evaluation_results.json
📊 Ready for further analysis and visualization!
