# Making the Reference Protein Fasta more easily usable

> "Using BioPython to convert the reference human genome proteins into individual fasta files."

In [None]:
#| default_exp features.protein

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from Bio import SeqIO
from pathlib import Path
from yaml import safe_load
from tqdm.auto import tqdm

In [None]:
#| hide
# Read the project configuration so later cells know where the data lives
config_file = Path("../config.yml")
with config_file.open("r") as config_handle:
    config = safe_load(config_handle)

data_path = Path(config.get("data_path"))

In [None]:
#| export
def break_proteins_into_fasta_files(
    protein_file_path: Path,
    write_path: Path,
) -> None:
    """
    Split a multi-record protein fasta file into one fasta file per record.

    Every record parsed from `protein_file_path` is written to
    `write_path / f"{record.id}.fasta"`. A record whose output file already
    exists is skipped rather than rewritten.

    Parameters
    ----------
    protein_file_path : Path
        Fasta file holding the protein records to split.
    write_path : Path
        Directory that receives the per-record files; created if missing.
    """
    # Make sure the output directory is there before the first record is written.
    write_path.mkdir(parents=True, exist_ok=True)

    # Count the records up front so the progress bar has a total.
    # This only works for fasta files, where each record starts with ">".
    # Iterating over the handle streams the file line by line instead of
    # loading every line into memory at once.
    with protein_file_path.open("r") as f:
        sequences = sum(1 for line in f if line.startswith(">"))

    # The context manager closes the bar even if parsing or writing fails.
    with tqdm(total=sequences) as progress_bar:
        for seq_record in SeqIO.parse(protein_file_path, "fasta"):
            seq_record_write_path = write_path / f"{seq_record.id}.fasta"
            if not seq_record_write_path.exists():
                with seq_record_write_path.open("w") as out:
                    SeqIO.write([seq_record], out, "fasta")
            progress_bar.update(1)

In [None]:
#| hide
# Directory that will hold one fasta file per reference protein
protein_fasta_write_path = data_path / "protein"
# exist_ok avoids the check-then-create race of a separate exists() test
protein_fasta_write_path.mkdir(exist_ok=True)

# Split the reference protein fasta into the per-protein files
break_proteins_into_fasta_files(
    data_path / "raw/GRCh38_latest_protein.faa",
    protein_fasta_write_path,
)

  0%|          | 0/136193 [00:00<?, ?it/s]

In [None]:
#| hide
# Take the first per-protein fasta file the glob yields and tally its residues
from collections import Counter

sample_fasta_path = next((data_path / "protein").glob("*.fasta"))
# Each per-protein file is read for its first record only
sample_protein_sequence = next(SeqIO.parse(sample_fasta_path, "fasta"))
protein_sequence_counter = Counter(sample_protein_sequence.seq)
protein_sequence_counter

Counter({'L': 143,
         'V': 139,
         'S': 123,
         'E': 104,
         'T': 101,
         'A': 93,
         'G': 91,
         'K': 89,
         'P': 78,
         'Q': 73,
         'N': 64,
         'F': 62,
         'I': 57,
         'Y': 56,
         'D': 53,
         'R': 46,
         'H': 40,
         'M': 26,
         'C': 25,
         'W': 11})

In [None]:
#| hide
# Show the whole SeqRecord (sequence, id, name, description) of the sample
sample_protein_sequence

SeqRecord(seq=Seq('MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEKGCVLLSYL...GNA'), id='NP_000005.3', name='NP_000005.3', description='NP_000005.3 alpha-2-macroglobulin isoform a precursor [Homo sapiens]', dbxrefs=[])

In [None]:
#| hide
# Run nbdev's export step so cells marked `#| export` are written to the library
import nbdev

nbdev.nbdev_export()