# Validate Extracted Sequences

> "The sequences we extracted could have introns in them. Let's validate that the DNA maps to the protein as expected before we train."

In [None]:
#| default_exp features.validation

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from Bio import SeqIO



In [None]:
#| hide
from yaml import safe_load

# Enable tqdm's pandas integration with an 80-column bar that is cleared when done.
tqdm.pandas(leave=False, ncols=80)

# Parse the YAML settings file found at ../config.yml (relative to the working directory).
with open("../config.yml") as f:
    config = safe_load(f)

In [None]:
#| hide
import json

# Resolve the data root from the project config, then the folders beneath it.
data_path = Path(config.get("data_path"))
annotations_path = data_path.joinpath("annotations")
genome_path = data_path.joinpath("genome")
sequences_path = data_path.joinpath("entrez_genes_sequences")
protein_path = data_path.joinpath("protein")

# Mapping read from JSON; later cells use its keys as gene ids and its values
# as protein ids.
with data_path.joinpath("entrez_gene_to_protein_map.json").open("r") as f:
    gene_to_protein_map = json.load(f)

# Display how many entries the map holds.
len(gene_to_protein_map)

20080

In [None]:
#| export
def _first_fasta_record(fasta_file: Path) -> SeqIO.SeqRecord:
    """Return the first record of `fasta_file`, closing the file afterwards."""
    # `SeqIO.parse` is lazy: when given a path and stopped after one record it
    # leaves closing the file to garbage collection. Handing it a handle we
    # opened ourselves lets the `with` block close it deterministically.
    with open(fasta_file) as handle:
        return next(SeqIO.parse(handle, "fasta"))


def load_gene_and_protein(
    gene_id: str,
    protein_id: str,
    gene_path: Path,
    protein_path: Path
    # Quoted so the annotation is also valid on Python versions without
    # subscriptable builtin `tuple`.
) -> "tuple[SeqIO.SeqRecord, SeqIO.SeqRecord]":
    """
    Load the first FASTA record for a gene and for its protein.

    Parameters:
        gene_id: File stem of the gene FASTA, read from `gene_path / f"{gene_id}.fasta"`.
        protein_id: File stem of the protein FASTA, read from
            `protein_path / f"{protein_id}.fasta"`.
        gene_path: Directory containing the gene FASTA files.
        protein_path: Directory containing the protein FASTA files.

    Returns:
        A `(gene, protein)` pair of records. Only the first record of each
        file is read; any further records are ignored.

    Raises:
        FileNotFoundError: If either FASTA file does not exist.
        StopIteration: If either FASTA file contains no records.
    """
    gene = _first_fasta_record(gene_path / f"{gene_id}.fasta")
    protein = _first_fasta_record(protein_path / f"{protein_id}.fasta")
    return gene, protein

In [None]:
#| hide
# Use the first entry of the gene -> protein map as a sample pair.
gene_ids = list(gene_to_protein_map)
test_gene_id = gene_ids[0]
test_protein_id = gene_to_protein_map.get(test_gene_id)

# Display the chosen pair.
(test_gene_id, test_protein_id)

('79501', 'NP_001005484.2')

In [None]:
#| hide
# Load the sample gene and protein records from their FASTA directories.
test_gene, test_protein = load_gene_and_protein(
    gene_id=test_gene_id,
    protein_id=test_protein_id,
    gene_path=sequences_path,
    protein_path=protein_path,
)

In [None]:
#| export
def validate_gene_and_protein(gene: SeqIO.SeqRecord, protein: SeqIO.SeqRecord) -> bool:
    """
    Return True if a DNA sequence maps directly to a protein sequence.

    The gene is translated with `cds=True`, i.e. it is treated as a complete
    coding sequence. Biopython rejects sequences that are not one (for
    example, a first codon that is not a start codon) by raising
    `TranslationError`; such a gene cannot map directly to the protein, so
    the result is False rather than an exception.

    Parameters:
        gene: Record holding the DNA sequence to translate.
        protein: Record holding the expected amino-acid sequence.

    Returns:
        True when the translated gene equals `protein.seq`, otherwise False.
    """
    # Imported here so the function needs no new module-level import.
    from Bio.Data.CodonTable import TranslationError

    try:
        translated_gene = gene.translate(cds=True)
    except TranslationError:
        # Not a valid coding sequence, so it does not map directly to the protein.
        return False
    return translated_gene.seq == protein.seq

In [None]:
#| hide
# Run the validator on the sample gene/protein pair.
validate_gene_and_protein(test_gene, test_protein)

TranslationError: First codon 'CCA' is not a start codon

In [None]:
# Translate the sample gene without the `cds=True` check and show the resulting sequence.
test_gene.translate().seq

Seq('PDLFRYI*SIHKGLLINQVVY*KGQFTTYYSLTVFMPHSVKIAVVSSSYEEGRW...SKK')

In [None]:
# Test whether the gene's sequence occurs inside the protein's sequence.
# NOTE(review): this compares the untranslated gene sequence with the protein
# sequence; a translated sequence may have been intended here. Please confirm.
test_gene.seq in test_protein.seq

False