In [1]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.2/3.2 MB 15.6 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.84


In [2]:
# Import necessary libraries
from Bio import Entrez
import pandas as pd
from io import StringIO

In [4]:
# Set your email (required by NCBI Entrez)
Entrez.email = 'bryan.vannimwegen@wur.nl'  # Replace with your email

# Define the search query
# The query searches for RNA-Seq experiments involving HEK or HeLa cell lines
query = '("HEK" OR "HeLa") AND "RNA-Seq"[Strategy]'

# Upper bound on the number of IDs requested from a single esearch call (retmax)
num_records = 50000  # You can adjust this number as needed

# Search the SRA database
print("Searching the SRA database for RNA-Seq data from HEK and HeLa cell lines...")
handle = Entrez.esearch(db="sra", term=query, retmax=num_records)
try:
    record = Entrez.read(handle)
finally:
    # Release the response handle even if parsing the result fails
    handle.close()
id_list = record['IdList']
print(f"Retrieved {len(id_list)} SRR IDs.")

# 'Count' is the total number of matches for the query. If it is larger than
# the number of IDs returned, the ID list was truncated and everything
# downstream would silently work on a subset.
total_matches = int(record['Count'])
if total_matches > len(id_list):
    print(
        f"Warning: the query matched {total_matches} records but only "
        f"{len(id_list)} IDs were returned. Increase num_records or page "
        "through the results with retstart to retrieve all of them."
    )

# Define a function to fetch run information
def fetch_run_info(id_list, batch_size=500, efetch=None):
    """Fetch SRA run-info tables for ``id_list`` and combine them into one DataFrame.

    The IDs are requested in consecutive batches of ``batch_size``. Each batch
    is fetched and parsed independently: a failure in one batch is reported and
    skipped, so the rows already retrieved from other batches are kept instead
    of being thrown away.

    Parameters
    ----------
    id_list : sequence of str
        IDs to look up; they are joined with commas for each request.
    batch_size : int, default 500
        Maximum number of IDs sent per request.
    efetch : callable, optional
        Function called as ``efetch(db=..., id=..., rettype=..., retmode=...)``
        that returns an object with ``read()`` and ``close()``. Defaults to
        ``Entrez.efetch``; mainly useful for substituting a stub in tests.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all successfully parsed batches (fresh 0..n-1 index),
        or an empty DataFrame when no batch produced any data.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if efetch is None:
        # Resolved at call time so the default follows the module-level import
        efetch = Entrez.efetch

    print("Fetching run information for the retrieved SRR IDs...")
    runinfo_df_list = []
    failed_batches = 0
    for start in range(0, len(id_list), batch_size):
        end = min(start + batch_size, len(id_list))
        batch_ids = id_list[start:end]
        print(f"Fetching batch {start + 1} to {end}...")
        try:
            handle = efetch(db='sra', id=','.join(batch_ids), rettype='runinfo', retmode='text')
            try:
                runinfo = handle.read()
            finally:
                # Release the response handle whether or not the read succeeded
                handle.close()

            # Decode bytes to string if necessary
            if isinstance(runinfo, bytes):
                runinfo = runinfo.decode('utf-8')

            # A whitespace-only payload cannot be parsed as CSV; report and move on
            if runinfo.strip():
                runinfo_df_list.append(pd.read_csv(StringIO(runinfo)))
            else:
                print(f"No run information retrieved for batch {start + 1} to {end}.")
        except Exception as e:
            # Best effort: losing one batch must not discard the others
            failed_batches += 1
            print(f"An error occurred while fetching run information for batch {start + 1} to {end}: {e}")

    if failed_batches:
        print(f"Warning: {failed_batches} batch(es) failed; the returned table is incomplete.")

    # Concatenate all batches
    if runinfo_df_list:
        return pd.concat(runinfo_df_list, ignore_index=True)

    print("No run information retrieved for any batch.")
    return pd.DataFrame()

# Fetch run information for all SRR IDs
run_info_df = fetch_run_info(id_list)

# Process and extract the required information
if not run_info_df.empty:
    print("Run information fetched successfully.")
    print("Available columns:")
    print(run_info_df.columns.tolist())

    # Display the first few rows of the fetched data
    display(run_info_df.head())

    # Columns that the extraction, filtering and export steps below rely on
    required_columns = [
        'Run',
        'ReleaseDate',
        'avgLength',
        'Model',
        'LibraryLayout',
        'LibrarySelection',
        'LibraryStrategy',
        'Experiment',
        'TaxID',
        'BioProject',
        'SampleName',
        'size_MB'
    ]

    # Check which required columns are available
    available_columns = [col for col in required_columns if col in run_info_df.columns]
    missing_cols = [col for col in required_columns if col not in run_info_df.columns]

    # Continue only when every required column is present: the code below
    # indexes 'Run', 'Size_MB' and 'TaxID' directly and would raise KeyError
    # on a partially matching table.
    if not missing_cols:
        # Extract the required columns and rename a few for clarity.
        # rename() returns a new DataFrame, so run_info_df is left untouched.
        result_df = run_info_df[available_columns].rename(columns={
            'avgLength': 'ReadLength',
            'Model': 'SequencerUsed',
            'size_MB': 'Size_MB'
        })

        # Convert 'Size_MB' and 'TaxID' to numeric, coerce errors to NaN
        result_df['Size_MB'] = pd.to_numeric(result_df['Size_MB'], errors='coerce')
        result_df['TaxID'] = pd.to_numeric(result_df['TaxID'], errors='coerce')

        # Report whether every row refers to a distinct 'Run'; duplicates are
        # only flagged here, not removed.
        if result_df['Run'].is_unique:
            print("\nEach 'Run' is unique. Proceeding with individual run size and species filtering.")
        else:
            print("\nWarning: 'Run' IDs are not unique. There might be duplicate entries.")
            # Optionally, handle duplicates here if necessary

        # Display the first few rows of the extracted information
        print("\nFirst few SRR numbers with their corresponding metadata:")
        display(result_df.head())

        # Define filtering criteria
        size_lower_bound = 2000  # 2 GB in MB
        size_upper_bound = 8000  # 8 GB in MB
        human_taxid = 9606       # TaxID for Homo sapiens

        # Apply filters:
        # 1. Size_MB between 2000 and 8000 (inclusive on both ends)
        # 2. TaxID equal to 9606 (Human)
        # Rows whose Size_MB or TaxID became NaN above fail these comparisons
        # and are therefore excluded.
        filtered_df = result_df[
            (result_df['Size_MB'] >= size_lower_bound) &
            (result_df['Size_MB'] <= size_upper_bound) &
            (result_df['TaxID'] == human_taxid)
        ]
        print(f"\nAfter filtering, {len(filtered_df)} samples remain with individual size between 2 GB and 8 GB and from human.")

        # Display the first few rows of the filtered data
        display(filtered_df.head())

        # Save the filtered results to a CSV file
        filtered_df.to_csv('srr_metadata_filtered.csv', index=False)
        print("\nSaved the filtered SRR numbers and metadata to 'srr_metadata_filtered.csv'.")

        # Provide a download link for the CSV file (for Google Colab).
        # The import is kept local because the package only exists on Colab.
        try:
            from google.colab import files
            files.download('srr_metadata_filtered.csv')
        except ImportError:
            print("Not running in Google Colab. Please download the 'srr_metadata_filtered.csv' file manually.")
    else:
        print(f"\nThe following required columns are missing: {missing_cols}")
        print("Please verify the available columns and adjust the script accordingly.")
else:
    print("No run information available to process.")

Searching the SRA database for RNA-Seq data from HEK and HeLa cell lines...
Retrieved 20046 SRR IDs.
Fetching run information for the retrieved SRR IDs...
Fetching batch 1 to 500...
Fetching batch 501 to 1000...
Fetching batch 1001 to 1500...
Fetching batch 1501 to 2000...
Fetching batch 2001 to 2500...
Fetching batch 2501 to 3000...
Fetching batch 3001 to 3500...
Fetching batch 3501 to 4000...
Fetching batch 4001 to 4500...
Fetching batch 4501 to 5000...
Fetching batch 5001 to 5500...
Fetching batch 5501 to 6000...
Fetching batch 6001 to 6500...
Fetching batch 6501 to 7000...
Fetching batch 7001 to 7500...
Fetching batch 7501 to 8000...
Fetching batch 8001 to 8500...
Fetching batch 8501 to 9000...
Fetching batch 9001 to 9500...
Fetching batch 9501 to 10000...
Fetching batch 10001 to 10500...
Fetching batch 10501 to 11000...
Fetching batch 11001 to 11500...
Fetching batch 11501 to 12000...
Fetching batch 12001 to 12500...
Fetching batch 12501 to 13000...
Fetching batch 13001 to 13500...

Unnamed: 0,Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,AssemblyName,download_path,...,Affection_Status,Analyte_Type,Histological_Type,Body_Site,CenterName,Submission,dbgap_study_accession,Consent,RunHash,ReadHash
0,SRR30719219,2024-09-19 09:03:19,2024-09-19 08:59:15,22490377,2377903535,0,105,1116,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,,,,,LOMONOSOV MOSCOW STATE UNIVERSITY,SRA1975263,,public,3CE4408188C615503F1AD76BDA8D82A0,8B83DAC496E7DBA8F8E26F863173CE7D
1,SRR30719220,2024-09-19 09:03:19,2024-09-19 08:58:19,24430370,2626964208,0,107,1222,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,,,,,LOMONOSOV MOSCOW STATE UNIVERSITY,SRA1975263,,public,A1142314D7B3769AF08527F283B79D7C,3BD6BB0047BA595AD5821A307A33FFF7
2,SRR30719218,2024-09-19 09:32:48,2024-09-19 09:01:37,35039478,5485174003,0,156,2643,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,,,,,LOMONOSOV MOSCOW STATE UNIVERSITY,SRA1975263,,public,8F8340569AA52AB32E846CF1A9A57CBF,9DB0CF2A80BE67B21F0341610AEC65FF
3,SRR30719216,2024-09-19 09:12:25,2024-09-19 09:01:22,40787917,6283228666,0,154,3027,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,,,,,LOMONOSOV MOSCOW STATE UNIVERSITY,SRA1975263,,public,CC525D99D9D6224C010A1634188E7385,17F4E99CC5BE7C73E3A59DB12AD512B2
4,SRR30719215,2024-09-19 09:03:21,2024-09-19 08:59:58,36269386,4264299205,0,117,2022,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,...,,,,,LOMONOSOV MOSCOW STATE UNIVERSITY,SRA1975263,,public,967240EB55DFCCF401FF95B5A322A112,DF05F640DC88B69A39543D2496FAB138



Each 'Run' is unique. Proceeding with individual run size and species filtering.

First few SRR numbers with their corresponding metadata:


Unnamed: 0,Run,ReleaseDate,ReadLength,SequencerUsed,LibraryLayout,LibrarySelection,LibraryStrategy,Experiment,TaxID,BioProject,SampleName,Size_MB
0,SRR30719219,2024-09-19 09:03:19,105,Illumina HiSeq 1500,SINGLE,other,RNA-Seq,SRX26122396,9606,PRJNA1162735,3rd EU-RNA-Seq of METTL4 KO: 10 minutes after ...,1116
1,SRR30719220,2024-09-19 09:03:19,107,Illumina HiSeq 1500,SINGLE,other,RNA-Seq,SRX26122395,9606,PRJNA1162735,2nd EU-RNA-Seq of METTL4 KO: 10 minutes after ...,1222
2,SRR30719218,2024-09-19 09:32:48,156,Illumina HiSeq 1500,SINGLE,other,RNA-Seq,SRX26122397,9606,PRJNA1162735,1st EU-RNA-Seq of METTL4 KO: 20 minutes after ...,2643
3,SRR30719216,2024-09-19 09:12:25,154,Illumina HiSeq 1500,SINGLE,other,RNA-Seq,SRX26122399,9606,PRJNA1162735,3rd EU-RNA-Seq of METTL4 KO: 20 minutes after ...,3027
4,SRR30719215,2024-09-19 09:03:21,117,Illumina HiSeq 1500,SINGLE,Oligo-dT,RNA-Seq,SRX26122400,9606,PRJNA1162735,1st EU-RNA-Seq of METTL4 KO: polyA+,2022



After filtering, 5454 samples remain with individual size between 2 GB and 8 GB and from human.


Unnamed: 0,Run,ReleaseDate,ReadLength,SequencerUsed,LibraryLayout,LibrarySelection,LibraryStrategy,Experiment,TaxID,BioProject,SampleName,Size_MB
2,SRR30719218,2024-09-19 09:32:48,156,Illumina HiSeq 1500,SINGLE,other,RNA-Seq,SRX26122397,9606,PRJNA1162735,1st EU-RNA-Seq of METTL4 KO: 20 minutes after ...,2643
3,SRR30719216,2024-09-19 09:12:25,154,Illumina HiSeq 1500,SINGLE,other,RNA-Seq,SRX26122399,9606,PRJNA1162735,3rd EU-RNA-Seq of METTL4 KO: 20 minutes after ...,3027
4,SRR30719215,2024-09-19 09:03:21,117,Illumina HiSeq 1500,SINGLE,Oligo-dT,RNA-Seq,SRX26122400,9606,PRJNA1162735,1st EU-RNA-Seq of METTL4 KO: polyA+,2022
5,SRR30719217,2024-09-19 09:12:25,155,Illumina HiSeq 1500,SINGLE,other,RNA-Seq,SRX26122398,9606,PRJNA1162735,2nd EU-RNA-Seq of METTL4 KO: 20 minutes after ...,3108
10,SRR30860038,2024-10-02 05:56:24,300,Illumina HiSeq 4000,PAIRED,PolyA,RNA-Seq,SRX26258167,9606,PRJNA1167917,HEK-RUNX1T1-E1,4521



Saved the filtered SRR numbers and metadata to 'srr_metadata_filtered.csv'.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>