# Making the Reference Genome more easily usable

> "Using BioPython to split the human reference genome into one FASTA file per sequence record."

In [None]:
#| default_exp features.genome

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from pathlib import Path
from Bio import SeqIO
import sqlite3
from yaml import safe_load
from tqdm.auto import tqdm



In [None]:
#| hide
# Read the project configuration from `../config.yml` (resolved relative to
# the current working directory) and parse it with yaml's `safe_load`.
with open("../config.yml") as f:
    config = safe_load(f)

In [None]:
#| hide
# Root directory for the data files, taken from the `data_path` config entry.
# NOTE(review): assuming `config` is a mapping, `.get` returns None for a
# missing key and `Path(None)` raises TypeError — confirm the key is always set.
data_path = Path(config.get("data_path"))

In [None]:
#| hide
genome_path = data_path / "raw/GRCh38_latest_genomic.fna"
if not (data_path / "raw/GRCh38_latest_genomic.fna").exists():
    !gzip ../data/raw/GRCh38_latest_genomic.fna.gz -d

In [None]:
#| hide
genome_annotations_path = data_path / "raw/GRCh38_latest_genomic.gff"
if not (data_path / "raw/GRCh38_latest_genomic.gff").exists():
    !gzip ../data/raw/GRCh38_latest_genomic.gff.gz -d

In [None]:
#| export
def break_genome_into_fasta_files(
    genome_file_path: Path, 
    write_path: Path,
):
    """
    Split a multi-record FASTA file into one FASTA file per sequence record.

    Every record parsed from `genome_file_path` is written to
    `write_path / "<record id>.fasta"`. Records whose output file already
    exists are skipped, so the function can be re-run to resume an
    interrupted split.

    Args:
        genome_file_path: Path of the FASTA file to read.
        write_path: Directory that receives the per-record FASTA files; it is
            created (including parents) if it does not exist yet.
    """
    write_path.mkdir(parents=True, exist_ok=True)
    for seq_record in tqdm(SeqIO.parse(genome_file_path, "fasta")):
        seq_record_write_path = write_path / f"{seq_record.id}.fasta"
        if seq_record_write_path.exists():
            continue
        # Write to a temporary sibling and rename it once complete. An
        # interrupted run therefore never leaves a truncated "<id>.fasta"
        # behind that the exists() check above would skip on the next run.
        partial_path = seq_record_write_path.with_name(
            seq_record_write_path.name + ".partial"
        )
        with partial_path.open("w") as out:
            SeqIO.write([seq_record], out, "fasta")
        partial_path.replace(seq_record_write_path)

In [None]:
#| hide
genome_fasta_path = data_path / "genome"

# `parents=True` lets this work on a fresh checkout where `data_path` itself
# may not exist yet; `exist_ok=True` replaces the separate existence check.
genome_fasta_path.mkdir(parents=True, exist_ok=True)

# Reuse `genome_fasta_path` so the directory created above is the one written to.
break_genome_into_fasta_files(genome_path, genome_fasta_path)

0it [00:00, ?it/s]

In [None]:
#| export
def validate_written_genome_fasta_files(
    write_path: Path
) -> int:
    """
    Delete zero-byte `*.fasta` files directly inside `write_path`.

    Args:
        write_path: Directory whose `*.fasta` files are checked
            (subdirectories are not searched).

    Returns:
        The number of empty files that were deleted.
    """
    empty_fasta_files = [
        candidate
        for candidate in write_path.glob("*.fasta")
        if candidate.stat().st_size == 0
    ]
    for empty_file in empty_fasta_files:
        empty_file.unlink()
    return len(empty_fasta_files)

In [None]:
#| hide
# Delete any zero-byte FASTA files in the output directory; the value shown
# is the number of files that were removed.
validate_written_genome_fasta_files(genome_fasta_path)

0

In [None]:
#| hide
# Sanity check: the split must have produced at least one FASTA file.
num_files = sum(1 for _ in genome_fasta_path.glob("*.fasta"))
assert num_files > 0

In [None]:
#| hide
num_files

705

In [None]:
#| hide
# glob() yields files in arbitrary directory order, so taking "the first one"
# is not reproducible across machines. Use the lexicographically smallest
# path instead so the same record is sampled on every run.
sample_fasta_path = min(genome_fasta_path.glob("*.fasta"))
# Each split file holds a single record, so the first parsed record is the file's content.
sample_sequence_record = next(SeqIO.parse(sample_fasta_path, "fasta"))

In [None]:
#| hide
from collections import Counter

# Tally how often each distinct character occurs in the sample record's
# sequence (upper- and lower-case letters are counted separately).
test_sequence_counter = Counter(sample_sequence_record.seq)

In [None]:
#| hide
test_sequence_counter

Counter({'T': 41321614,
         'A': 41260191,
         'C': 31266202,
         'G': 31256045,
         't': 25922550,
         'a': 25810086,
         'N': 18475408,
         'g': 16855483,
         'c': 16788841,
         'M': 1,
         'R': 1})

In [None]:
#| hide
# Run nbdev's export step for this notebook project.
import nbdev; nbdev.nbdev_export()