# Making the Reference Genome more easily usable

> "Using BioPython to convert the reference human genome into a sqlite database."

In [None]:
#| default_exp features.genome

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from pathlib import Path
from Bio import SeqIO
import sqlite3
from llm_mito_scanner.features import database



In [None]:
#| hide
# !gzip ../data/raw/GRCh38_latest_genomic.fna.gz -d

In [None]:
#| hide
# !gzip ../data/raw/GRCh38_latest_genomic.gff.gz -d

In [None]:
#| hide
# Locations of the raw reference inputs, all resolved relative to the
# shared data directory.
data_path = Path("../data")
genome_file_path = data_path.joinpath("raw/GRCh38_latest_genomic.fna")
annotations_file_path = data_path.joinpath("raw/GRCh38_latest_genomic.gff")

In [None]:
#| hide
# Open the SQLite database file that the genome records are written to
# (sqlite3 creates the file if it does not exist yet).
conn = sqlite3.connect(data_path / "genome.db")

In [None]:
#| export
def insert_genome_into_sqlite(
    genome_file_path: Path,
    sqlite_connection: sqlite3.Connection,
    insert_batch_size: int = 10
) -> None:
    """
    Insert the FASTA sequence records from our reference genome
    into our sqlite3 database.

    The file is parsed lazily with `SeqIO.parse` and the records are handed
    to `database.insert_sequences` in batches, so at most
    `insert_batch_size` parsed records are buffered here at any time.

    Args:
        genome_file_path: Path to the FASTA file to read.
        sqlite_connection: Open connection passed through to
            `database.insert_sequences`.
        insert_batch_size: Number of records to accumulate before each
            insert call. Must be at least 1.

    Raises:
        ValueError: If `insert_batch_size` is smaller than 1.
    """
    if insert_batch_size < 1:
        raise ValueError(
            f"insert_batch_size must be a positive integer, got {insert_batch_size}"
        )
    records = []
    for seq_record in SeqIO.parse(genome_file_path, "fasta"):
        records.append(seq_record)
        # Flush on the buffer length rather than on the record index: an
        # index-modulo test fires on the very first record (index 0) and
        # leaves every later batch offset from the requested size.
        if len(records) >= insert_batch_size:
            database.insert_sequences(records, sqlite_connection)
            records = []
    # Write whatever is left over after the final full batch.
    if records:
        database.insert_sequences(records, sqlite_connection)

In [None]:
#| hide
# Load the reference genome FASTA into the database, using the default
# insert batch size.
insert_genome_into_sqlite(genome_file_path, conn)

In [None]:
#| hide
conn.close()

In [None]:
#| hide
# Reopen the database on a new connection (the previous one was closed
# above) and create a cursor for the verification queries below.
conn = sqlite3.connect(data_path / "genome.db")
cursor = conn.cursor()

In [None]:
#| hide
# Sanity check: the `sequence` table must contain at least one distinct ID
# after the ingest.
num_rows = cursor.execute("SELECT COUNT(DISTINCT ID) FROM sequence;").fetchone()[0]
assert num_rows > 0

In [None]:
#| hide
num_rows

705

In [None]:
#| hide
# Pull a single row from `sequence` so the cells below can spot-check each
# column's type and content.
test_sequence = cursor.execute("SELECT * FROM sequence LIMIT 1;").fetchone()

In [None]:
#| hide
# ID is string of non-zero length
assert isinstance(test_sequence[0], str) and len(test_sequence[0]) > 0
test_sequence[0]

'NC_000001.11'

In [None]:
#| hide
# Name is string
assert isinstance(test_sequence[1], str)

In [None]:
#| hide
test_sequence[1]

'NC_000001.11'

In [None]:
#| hide
# Description is string
assert isinstance(test_sequence[2], str)

In [None]:
#| hide
test_sequence[2]

'NC_000001.11 Homo sapiens chromosome 1, GRCh38.p14 Primary Assembly'

In [None]:
#| hide
# Sequence is string and of non-zero length
assert isinstance(test_sequence[3], str) and len(test_sequence[3]) > 0

In [None]:
#| hide
len(test_sequence[3])

248956422

In [None]:
#| hide
from collections import Counter

# Tally how often each character occurs in the sampled row's sequence
# column (index 3), to see which symbols and letter cases are present.
test_sequence_counter = Counter(test_sequence[3])
# test_sequence_counter.keys()

In [None]:
#| hide
test_sequence_counter

Counter({'T': 41321614,
         'A': 41260191,
         'C': 31266202,
         'G': 31256045,
         't': 25922550,
         'a': 25810086,
         'N': 18475408,
         'g': 16855483,
         'c': 16788841,
         'M': 1,
         'R': 1})

In [None]:
#| hide
# Done with the verification queries; release the cursor.
cursor.close()

In [None]:
#| hide
# Close the verification connection to the database.
conn.close()

In [None]:
#| hide
# Run nbdev's export step so the cells marked `#| export` are written out
# to the library module.
import nbdev; nbdev.nbdev_export()