# Analyse the ENA Records

Records retrieved from ENA on 2-Mar-2025 using the broad search term 'ascaris'. 

All experiment records were exported as XML and saved under `data/ENA_ascaris_metadata/ena_sra-experiment_20250302-1102.xml`



In [1]:
import time
import xml.etree.ElementTree as ET
from collections import OrderedDict

import pandas as pd

In [2]:
# helper functions from sra_src/parser.py modified for the ENA experiment xml
def return_text(element: ET.Element) -> str | None:
    return None if element == None else element.text


def extract_data(record: ET.Element) -> OrderedDict:
    """Extract the important information from a single EXPERIMENT record.

    Every field is looked up defensively: a missing element yields ``None``
    for the corresponding key instead of raising.

    Args:
        record (ET.Element): xml element of the EXPERIMENT record

    Returns:
        OrderedDict: one entry per extracted field, in this order:
        Experiment, LibraryStrategy, LibrarySelection, LibrarySource,
        LibraryLayout, LibraryProtocol, Sequencer, StudyId, SampleId,
        BioSample.
    """
    data_rec = OrderedDict()
    data_rec["Experiment"] = record.attrib.get("accession")

    # library info
    lib_descriptor = record.find("DESIGN/LIBRARY_DESCRIPTOR")
    if lib_descriptor is None:
        # Empty stand-in so every lookup below simply finds nothing.
        lib_descriptor = ET.Element("LIBRARY_DESCRIPTOR")
    data_rec["LibraryStrategy"] = return_text(lib_descriptor.find("LIBRARY_STRATEGY"))
    data_rec["LibrarySelection"] = return_text(lib_descriptor.find("LIBRARY_SELECTION"))
    data_rec["LibrarySource"] = return_text(lib_descriptor.find("LIBRARY_SOURCE"))

    # The layout is encoded as the tag name of the first child of
    # LIBRARY_LAYOUT (e.g. PAIRED), not as element text.
    layout = lib_descriptor.find("LIBRARY_LAYOUT")
    data_rec["LibraryLayout"] = (
        None if layout is None else next((child.tag for child in layout), None)
    )
    data_rec["LibraryProtocol"] = return_text(
        lib_descriptor.find("LIBRARY_CONSTRUCTION_PROTOCOL")
    )

    # sequencer: text of the first grandchild of PLATFORM
    # (presumably PLATFORM/<vendor>/INSTRUMENT_MODEL -- verify against schema)
    platform = record.find("PLATFORM")
    sequencer = None
    if platform is not None:
        vendor = next(iter(platform), None)
        if vendor is not None:
            sequencer = return_text(next(iter(vendor), None))
    data_rec["Sequencer"] = sequencer

    # study info
    data_rec["StudyId"] = return_text(record.find("STUDY_REF/IDENTIFIERS/PRIMARY_ID"))

    # sample info
    sample_block = record.find("DESIGN/SAMPLE_DESCRIPTOR")
    if sample_block is None:
        # Empty stand-in, same reasoning as for the library descriptor.
        sample_block = ET.Element("SAMPLE_DESCRIPTOR")
    data_rec["SampleId"] = sample_block.attrib.get("accession")
    data_rec["BioSample"] = return_text(sample_block.find("IDENTIFIERS/EXTERNAL_ID"))
    return data_rec

In [3]:
# ENA experiment XML export to parse
ena_xmls = "../data/ENA_ascaris_metadata/ena_sra-experiment_20250302-1102.xml"

start_run = time.time()

record = []  # lines of the EXPERIMENT element currently being collected
count = 0  # number of EXPERIMENT records parsed so far
records = []  # header row, then one row of string values per experiment

# Parse each xml record of an experiment. Lines are streamed from the file
# and buffered until a closing </EXPERIMENT> tag is seen; this relies on that
# tag sitting alone on its line.
# NOTE(review): assumes the export is UTF-8 encoded -- confirm.
with open(ena_xmls, "r", encoding="utf-8") as handle:
    for line in handle:
        line = line.strip()
        # Skip the opening wrapper tag so each buffered experiment is parsed
        # as a standalone document.
        if line == "<EXPERIMENT_SET>":
            continue
        record.append(line)

        # Keep buffering until the end of the experiment xml record.
        if line != "</EXPERIMENT>":
            continue

        parsed_records = extract_data(ET.fromstringlist(record))
        record = []
        count += 1
        if count == 1:
            # The first parsed record supplies the column names.
            records.append(list(parsed_records.keys()))

        # NOTE: str() turns missing values (None) into the string "None".
        records.append([str(value) for value in parsed_records.values()])

        if count % 1000 == 0:
            print(f"{count} records parsed in {round(time.time()-start_run)}s")

    print(f"Run records parsed = {count}")

1000 records parsed in 0s
Run records parsed = 1172


In [4]:
# The first row of `records` holds the column names; the rest are data rows.
header, *rows = records
df = pd.DataFrame.from_records(rows, columns=header)

In [5]:
# Show the column names of the parsed experiment table.
df.columns

Index(['Experiment', 'LibraryStrategy', 'LibrarySelection', 'LibrarySource',
       'LibraryLayout', 'LibraryProtocol', 'Sequencer', 'StudyId', 'SampleId',
       'BioSample'],
      dtype='object')

In [6]:
# Number of experiments per distinct Sequencer value.
df["Sequencer"].value_counts()

Sequencer
Illumina MiSeq                  747
Illumina NovaSeq 6000           190
Illumina HiSeq 2000             110
Illumina Genome Analyzer         51
Illumina Genome Analyzer II      26
Illumina HiSeq 2500              20
Illumina HiSeq 4000              18
Illumina HiSeq 1000               6
Illumina Genome Analyzer IIx      2
454 GS FLX Titanium               1
Illumina HiSeq 3000               1
Name: count, dtype: int64

In [7]:
# Number of experiments per distinct LibrarySelection value.
df["LibrarySelection"].value_counts()

LibrarySelection
PCR                   739
size fractionation    188
ChIP                   62
RANDOM                 52
unspecified            44
cDNA                   40
other                  40
RT-PCR                  7
Name: count, dtype: int64

In [8]:
# Cross-tabulate library strategy against library selection; the RANDOM
# selection column is the one of interest here.
pd.crosstab(df["LibraryStrategy"], df["LibrarySelection"])

LibrarySelection,ChIP,PCR,RANDOM,RT-PCR,cDNA,other,size fractionation,unspecified
LibraryStrategy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AMPLICON,0,739,0,0,0,0,0,0
ChIP-Seq,62,0,0,0,0,0,0,0
OTHER,0,0,0,0,0,40,0,0
RNA-Seq,0,0,21,7,40,0,53,32
WGS,0,0,31,0,0,0,0,12
ncRNA-Seq,0,0,0,0,0,0,135,0


In [9]:
# Cross-tabulate library source against library selection. Single-cell
# libraries are avoided for now; only bulk sources are used.
pd.crosstab(df["LibrarySource"], df["LibrarySelection"])

LibrarySelection,ChIP,PCR,RANDOM,RT-PCR,cDNA,other,size fractionation,unspecified
LibrarySource,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GENOMIC,62,0,31,0,0,40,0,12
METAGENOMIC,0,671,0,0,0,0,0,0
OTHER,0,68,0,0,0,0,0,0
TRANSCRIPTOMIC,0,0,12,7,40,0,188,32
TRANSCRIPTOMIC SINGLE CELL,0,0,9,0,0,0,0,0


In [10]:
# Keep experiments with RANDOM library selection, a GENOMIC or TRANSCRIPTOMIC
# library source, and a sequencer name containing "Illumina".
is_random = df["LibrarySelection"] == "RANDOM"
is_bulk_source = df["LibrarySource"].isin(["GENOMIC", "TRANSCRIPTOMIC"])
is_illumina = df["Sequencer"].str.contains("Illumina")
selected_ENA_experiments = df.loc[is_random & is_bulk_source & is_illumina]

In [11]:
# Number of selected experiments per (Sequencer, LibrarySource) combination.
selected_ENA_experiments[["Sequencer", "LibrarySource"]].value_counts()

Sequencer                    LibrarySource 
Illumina HiSeq 2000          GENOMIC           13
Illumina HiSeq 4000          TRANSCRIPTOMIC    12
Illumina NovaSeq 6000        GENOMIC           11
Illumina Genome Analyzer II  GENOMIC            4
Illumina HiSeq 1000          GENOMIC            1
Illumina HiSeq 3000          GENOMIC            1
Name: count, dtype: int64

In [12]:
# Display the selected experiments.
selected_ENA_experiments

Unnamed: 0,Experiment,LibraryStrategy,LibrarySelection,LibrarySource,LibraryLayout,LibraryProtocol,Sequencer,StudyId,SampleId,BioSample
0,ERX015104,WGS,RANDOM,GENOMIC,PAIRED,No PCR,Illumina Genome Analyzer II,ERP000532,ERS006615,SAMEA915801
1,ERX015105,WGS,RANDOM,GENOMIC,PAIRED,No PCR,Illumina Genome Analyzer II,ERP000532,ERS006615,SAMEA915801
2,ERX015106,WGS,RANDOM,GENOMIC,PAIRED,No PCR,Illumina HiSeq 2000,ERP000532,ERS006615,SAMEA915801
3,ERX044035,WGS,RANDOM,GENOMIC,PAIRED,No PCR,Illumina HiSeq 2000,ERP000904,ERS056112,SAMEA1034744
4,ERX044036,WGS,RANDOM,GENOMIC,PAIRED,No PCR,Illumina HiSeq 2000,ERP000904,ERS056111,SAMEA1034755
5,ERX046218,WGS,RANDOM,GENOMIC,PAIRED,Long range,Illumina HiSeq 2000,ERP000904,ERS067638,SAMEA1034750
6,ERX087586,WGS,RANDOM,GENOMIC,PAIRED,No PCR,Illumina HiSeq 2000,ERP000904,ERS083920,SAMEA1463534
7,ERX092220,WGS,RANDOM,GENOMIC,PAIRED,Long range,Illumina HiSeq 2000,ERP000904,ERS094834,SAMEA1463511
15,ERX284224,WGS,RANDOM,GENOMIC,PAIRED,Custom,Illumina HiSeq 2000,ERP000904,ERS083922,SAMEA1984308
17,SRX039003,WGS,RANDOM,GENOMIC,PAIRED,,Illumina Genome Analyzer II,SRP005397,SRS155755,
