# Getting data from NCBI

> 1. Reference Genome Sequence (fasta)
> 2. RefSeq Reference Genome Annotation (gff3)
> 3. RefSeq Transcripts (fasta)
> 4. RefSeq Proteins (fasta)

In [1]:
#| default_exp data.download

In [2]:
#| hide
from nbdev.showdoc import *

In [2]:
#| export
from yaml import safe_load
from pathlib import Path
import typing
import subprocess
from tqdm import tqdm
import gzip
import pandas as pd
import warnings

warnings.simplefilter("ignore")

from Bio import SeqIO, SeqRecord


## Load the config file

In [4]:
#| export
def load_config(path: Path = Path("../config.yml")) -> dict[typing.Any, typing.Any]:
    "Load the YAML config at `path`; an empty file yields an empty dict."
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(path, encoding="utf-8") as f:
        # `safe_load` returns None for an empty document; fall back to {} so the
        # declared dict return type holds and callers can use `.get(...)` directly.
        return safe_load(f) or {}

In [5]:
#| hide
# Load the project config and derive the directory layout used by the cells below.
config = load_config()

# NOTE(review): Path(None) raises TypeError if "data_path" is absent from the config.
data_path = Path(config.get("data_path"))
data_raw_path = data_path / "raw"
# Same raw/assemblies location that download_data writes to.
assemblies_path = data_raw_path / "assemblies"

## Get our data from NCBI

In [6]:
#| export
def download_data(data_path: Path) -> int:
    """Mirror the NCBI RefSeq human reference assembly into `data_path`/raw/assemblies.

    Creates the target directory if needed, then runs `rsync` against the NCBI
    server, skipping the excluded sub-directories listed below.

    Returns the rsync exit status (0 means success). A non-zero status is
    returned rather than raised so that a partially failed transfer can be
    detected by the caller without aborting.
    """
    assemblies_path = data_path / "raw" / "assemblies"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    assemblies_path.mkdir(parents=True, exist_ok=True)
    # Get latest reference genome data
    return subprocess.call([
        "rsync", "--copy-links", "--recursive", "--times", "--verbose",
        "--exclude=Annotation_comparison",
        "--exclude=*_assembly_structure",
        "--exclude=*_major_release_seqs_for_alignment_pipelines",
        "--exclude=RefSeq_historical_alignments",
        "--exclude=RefSeq_transcripts_alignments",
        "rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/reference/",
        str(assemblies_path.resolve())
    ])

In [7]:
#| hide
# Runs rsync against the NCBI server (see download_data); the transfer log is printed below.
download_data(data_path)



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


receiving incremental file list
./
GCF_000001405.40_GRCh38.p14/
GCF_000001405.40_GRCh38.p14/assembly_status.txt
GCF_000001405.40_GRCh38.p14/md5checksums.txt

sent 1,943 bytes  received 2,213 bytes  8,312.00 bytes/sec
total size is 3,601,107,022  speedup is 866,483.88


rsync: [receiver] rename "/mnt/e/Data/llm-mito-scanner-data/data/raw/assemblies/GCF_000001405.40_GRCh38.p14/.md5checksums.txt.ws1moi" -> "GCF_000001405.40_GRCh38.p14/md5checksums.txt": Permission denied (13)
rsync error: some files/attrs were not transferred (see previous errors) (code 23) at main.c(1865) [generator=3.2.7]


## Utils: locating assembly files

In [8]:
#| export
def get_latest_assembly_path(
        assemblies_path: Path # Path for downloaded assemblies
        ) -> Path: # Path of the latest assembly
    "Get the latest annotation path."
    def _sort_key(assembly_dir: Path):
        # Directory names look like "<accession>.<version>_<assembly name>",
        # e.g. "GCF_000001405.40_GRCh38.p14".
        prefix, _, remainder = assembly_dir.name.partition(".")
        version_text = remainder.split("_", 1)[0]
        # Compare versions numerically so ".40" ranks above ".9"; names without
        # a numeric version sort below any versioned sibling.
        version = int(version_text) if version_text.isdigit() else -1
        # The full name is the last tie-breaker to keep the result deterministic.
        return (prefix, version, assembly_dir.name)

    assembly_dirs = [d for d in assemblies_path.iterdir() if d.is_dir()]
    if not assembly_dirs:
        raise FileNotFoundError(
            f"No assembly directories found in {assemblies_path}"
        )
    # Highest accession prefix wins, then the highest version of that accession.
    return max(assembly_dirs, key=_sort_key)

In [32]:
#| hide
# Pick the assembly directory that the remaining cells operate on, and display it.
latest_assembly_path = get_latest_assembly_path(assemblies_path)
latest_assembly_path

Path('/mnt/e/Data/llm-mito-scanner-data/data/raw/assemblies/GCF_000001405.40_GRCh38.p14')

In [33]:
#| export
def get_genomic_genbank_path(
        assembly_path: Path # Assembly path
        ) -> typing.Optional[Path]: # Genomic genbank path, or None when no file matches
    "Get the genomic genbank file."
    # A glob class like `[!from]` only tests ONE character, so it would also
    # reject any assembly whose name ends in f, r, o or m. Filter on the full
    # "_from_genomic" suffix instead.
    candidates = sorted(
        p for p in assembly_path.glob("*_genomic.gbff.gz")
        if not p.name.endswith("_from_genomic.gbff.gz")
    )
    # Sorted so the choice is deterministic if several files match.
    return candidates[0] if candidates else None

In [34]:
#| hide
# Locate the gzipped genomic GenBank file inside the selected assembly, and display it.
genomic_genbank_path = get_genomic_genbank_path(latest_assembly_path)
genomic_genbank_path

Path('/mnt/e/Data/llm-mito-scanner-data/data/raw/assemblies/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gbff.gz')

In [37]:
#| hide
# Accession prefixes (first three characters of the record id) to skip while
# looking for a sample record.
ignore_prefixes = {
    "NG_", "NT_", "NW_", "NZ_",
    "XM_", "XR_",
    "AP_", "NP_", "YP_", "XP_", "WP_",
}
# Stream the gzipped GenBank file and stop at the first record whose prefix is
# not ignored AND whose description does not mention "chromosome".
# NOTE(review): the saved output below shows NC_000001.11, whose description
# contains "chromosome" and would be skipped by this condition — the output
# looks stale; re-run to confirm.
with gzip.open(str(genomic_genbank_path.resolve()), mode='rt') as f:
    for sample_record in SeqIO.parse(f, "genbank"):
        pref = sample_record.id[:3]
        if pref in ignore_prefixes or "chromosome" in sample_record.description.lower():
            continue
        break

sample_record

NC


SeqRecord(seq=Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN'), id='NC_000001.11', name='NC_000001', description='Homo sapiens chromosome 1, GRCh38.p14 Primary Assembly', dbxrefs=['BioProject:PRJNA168', 'Assembly:GCF_000001405.40'])

In [39]:
#| hide
# Number of entries in the sample record's `features` list.
len(sample_record.features)

51489

In [13]:
#| hide
# Display the sample record again (same object as selected above).
sample_record

SeqRecord(seq=Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN'), id='NC_000001.11', name='NC_000001', description='Homo sapiens chromosome 1, GRCh38.p14 Primary Assembly', dbxrefs=['BioProject:PRJNA168', 'Assembly:GCF_000001405.40'])

## Break out the chromosome sequence records

In [14]:
#| export
def extract_accession_sequence_records(
        genomic_genbank_path: Path, # Path to the genomic genbank file for an assembly
        assembly_path: Path, # Path to write the files to
        expected_accessions: int = 24 # Progress-bar total; a non-int value (e.g. None) disables the bar
):
    "Write each `NC_` record of the genomic GenBank file to `assembly_path`/chromosomes/<record id>.gb."
    write_path = assembly_path / "chromosomes"
    # Idempotent: safe to re-run when the directory already exists.
    write_path.mkdir(parents=True, exist_ok=True)
    pbar = None
    if isinstance(expected_accessions, int):
        pbar = tqdm(total=expected_accessions, ncols=80, leave=True)
    try:
        with gzip.open(str(genomic_genbank_path.resolve()), mode='rt') as f:
            for record in SeqIO.parse(f, "genbank"):
                # Only write complete genomic molecules
                if not record.id.startswith("NC_"):
                    continue
                record_write_path = write_path / f"{record.id}.gb"
                if not record_write_path.exists():
                    # Write to a sibling temp file and rename it into place, so an
                    # interrupted run never leaves a truncated `.gb` that the
                    # exists() check above would treat as finished next time.
                    partial_path = record_write_path.with_name(
                        record_write_path.name + ".partial"
                    )
                    SeqIO.write(record, partial_path, "genbank")
                    partial_path.replace(record_write_path)
                if pbar is not None:
                    pbar.update(1)
    finally:
        # Errors propagate unchanged; just make sure the bar is closed.
        if pbar is not None:
            pbar.close()

In [15]:
#| hide
# Split the assembly's genomic GenBank file into one file per NC_ record.
# The third argument is only the progress-bar total; the run shown below
# processed 25 records against a total of 24.
extract_accession_sequence_records(
    genomic_genbank_path,
    latest_assembly_path,
    24
)

  0%|                                                    | 0/24 [00:00<?, ?it/s]

25it [03:14,  7.77s/it]                                                         


## Utils: loading chromosome records

In [16]:
#| hide
# Directory that extract_accession_sequence_records writes the per-record .gb files to.
chromosomes_path = latest_assembly_path / "chromosomes"

In [19]:
#| export
def get_chromosome_sequence_record(
        chromosomes_path: Path, # Directory holding the per-record `.gb` files (a str is accepted too)
        refseq: str # Record id used as the file stem, e.g. NC_000001.11
) -> typing.Optional[SeqRecord.SeqRecord]: # First record in the file, or None if the file holds none
    "Read the GenBank record stored at `chromosomes_path`/`refseq`.gb."
    # Coerce to Path so a plain string directory also supports the `/` operator.
    refseq_path = Path(chromosomes_path) / f"{refseq}.gb"
    with refseq_path.open("r") as f:
        # Only the first record of the file is returned.
        return next(SeqIO.parse(f, "genbank"), None)

In [20]:
#| hide
# Load one of the extracted records back from disk as a smoke test.
example_accession = get_chromosome_sequence_record(
    chromosomes_path,
    "NC_000001.11"
)

In [21]:
#| hide
# Display the record that was read back.
example_accession

SeqRecord(seq=Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN'), id='NC_000001.11', name='NC_000001', description='Homo sapiens chromosome 1, GRCh38.p14 Primary Assembly', dbxrefs=['BioProject:PRJNA168', 'Assembly:GCF_000001405.40'])

In [3]:
#| hide
# Run the nbdev export for this project.
# NOTE(review): the saved output below is an IndentationError from this call — re-run and resolve before relying on the export.
import nbdev; nbdev.nbdev_export()

IndentationError: expected an indented block after function definition on line 18 (<unknown>, line 19)