# Getting our positive sample of sequences from MITOMAP

> "Using pandas and GenBank to get mitochondrial DNA sequences."

In [None]:
#| default_exp features.mitochondria

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import pandas as pd
from Bio import SeqIO, Entrez
from tqdm.auto import tqdm
from yaml import safe_load
from pathlib import Path
import json

In [None]:
#| hide
import collections

In [None]:
#| hide
# Parse the project-level YAML configuration (email, API key, data path).
with Path("../config.yml").open("r") as config_file:
    config = safe_load(config_file)

In [None]:
#| hide
# Configure Biopython's Entrez client from the loaded config.
# `config.get` yields None for a missing key, so absent settings are passed through as None.
Entrez.email = config.get("email")
Entrez.api_key = config.get("nih_api_key")
# Delay between retry attempts (seconds, per the Bio.Entrez documentation).
Entrez.sleep_between_tries = 1

In [None]:
#| hide
# Root directory for every file this notebook reads or writes.
data_path = Path(config.get("data_path"))
# Displayed as the cell output; this only reports existence, it does not raise when missing.
data_path.exists()

True

In [None]:
#| export
# Download sequences from MITOMAP
def get_structural_nuclear_genes_for_mitochondrial_diseases() -> pd.DataFrame:
    """
    Get structural nuclear genes for mitochondrial diseases from MITOMAP.

    Reads the first HTML table on the MITOMAP ``NuclearGenesStructural`` page
    that contains the text "Complex", drops rows that are entirely empty,
    forward-fills the ``Complex`` column (only the first row of each group
    carries a value) and renders ``OMIM`` as a string of digits.

    Returns
    -------
    pd.DataFrame
        One row per gene, re-indexed from 0, with ``OMIM`` holding strings.

    Raises
    ------
    ValueError
        If an ``OMIM`` value cannot be converted to an integer.
    """
    genes = pd.read_html(
        "https://www.mitomap.org/foswiki/bin/view/MITOMAP/NuclearGenesStructural",
        match="Complex"
    )[0].dropna(how='all').reset_index(drop=True)
    # `fillna(method='ffill')` is deprecated; `ffill()` is the supported spelling.
    genes['Complex'] = genes['Complex'].ffill()
    # Replace the column outright: `.loc[:, 'OMIM'] = ...` writes into the
    # existing numeric column in place and is not a reliable dtype change.
    genes['OMIM'] = genes['OMIM'].astype(int).astype(str)
    return genes

In [None]:
#| hide
structural_genes_csv = data_path / "raw/structural_nuclear_genes_for_mitochondrial_diseases.csv"
if not structural_genes_csv.exists():
    # Cold cache: scrape MITOMAP once and persist the table.
    get_structural_nuclear_genes_for_mitochondrial_diseases().to_csv(structural_genes_csv, index=False)
# Always load from the CSV so the frame has the same dtypes on a cold and a warm
# cache. The scraper returns OMIM as strings while `read_csv` parses it as int64,
# and the later merge on OMIM needs the int64 form.
structural_genes = pd.read_csv(structural_genes_csv)
display(structural_genes.head())
structural_genes.Complex.value_counts()

Unnamed: 0,Complex,Name,OMIM,Function,Chromosome,Inheritance,Clinical Phenotype,References
0,Complex I,NDUFS1,157655,IP fraction,2q33-q34,AR,LS,[i]
1,Complex I,NDUFS2,602985,IP fraction,1q23,AR,"Encephalopathy, Cardiomyopathy",[ii]
2,Complex I,NDUFS3,603846,IP fraction,11p11.11,AR,LS,[iii]
3,Complex I,NDUFS4,602694,IP fraction,5q11.1,AR,LS,[iv]
4,Complex I,NDUFS6,603848,IP fraction,5pter-p15.33,AR,Fatal Infantile Lactic Acidosis,[v]


Complex
Complex I      20
Complex II      4
Complex IV      4
Complex V       3
Complex III     2
Name: count, dtype: int64

In [None]:
#| export
def get_non_structural_nuclear_genes_for_mitochondrial_diseases() -> pd.DataFrame:
    """
    Get non-structural nuclear genes for mitochondrial diseases from MITOMAP.

    Reads the first HTML table on the MITOMAP ``NuclearGenesNonStructural``
    page that contains the text "Complex", drops all-empty rows, discards the
    first remaining row, forward-fills ``Complex`` and emits one row per OMIM
    id (a cell may list several ids separated by whitespace).

    Returns
    -------
    pd.DataFrame
        One row per (gene, OMIM id) pair with a fresh 0..n-1 index; ``OMIM``
        holds strings.
    """
    genes = pd.read_html(
        "https://www.mitomap.org/foswiki/bin/view/MITOMAP/NuclearGenesNonStructural",
        match="Complex"
    )[0].dropna(how='all')
    # NOTE(review): the first remaining row is discarded -- presumably a repeated
    # header row in the scraped table; confirm against the live page.
    # Copy the slice so the column assignments below operate on an independent frame.
    genes = genes.iloc[1:, :].copy().reset_index(drop=True)
    # `fillna(method='ffill')` is deprecated; `ffill()` is the supported spelling.
    genes['Complex'] = genes['Complex'].ffill()
    # Split on any run of whitespace so repeated spaces do not produce empty ids.
    genes['OMIM'] = genes['OMIM'].str.split()
    # Expand the OMIM column; renumber so exploded rows do not share an index label.
    return genes.explode('OMIM', ignore_index=True)

In [None]:
#| hide
non_structural_genes_csv = data_path / "raw/non_structural_nuclear_genes_for_mitochondrial_diseases.csv"
if not non_structural_genes_csv.exists():
    # Cold cache: scrape MITOMAP once and persist the table.
    get_non_structural_nuclear_genes_for_mitochondrial_diseases().to_csv(non_structural_genes_csv, index=False)
# Always load from the CSV so the frame has the same dtypes on a cold and a warm
# cache. The scraper returns OMIM as strings while `read_csv` parses it as int64,
# and the later merge on OMIM needs the int64 form.
non_structural_genes = pd.read_csv(non_structural_genes_csv)
display(non_structural_genes.head())
non_structural_genes.Complex.value_counts()

Unnamed: 0,Complex,Name,OMIM,Function,Chromosome,Inheritance,Clinical Phenotype,References
0,Complex I,NDUFAF1(CIA30),606934,Assembly,15q13.3,AR,Cardioencephalomyopathy,[i]
1,Complex I,NDUFAF2 (B17.2L),609653,Assembly,5q12.1,AR,Early onset progressive encephalopathy,[ii]
2,Complex I,NDUFAF3,612911,Assembly,3p21.31,AR,Neonatal encephalopathy,[iii]
3,Complex I,NDUFAF4 (HRPAP2),611776,Assembly,6q16.1,AR,Infantile encephalopathy,[iv]
4,Complex I,NDUFAF5 (C20orf7),612360,Assembly,20p12.1,AR,LS,[v]


Complex
Mitochondrial Protein Synthesis    37
MtDNA Maintenance                  16
Complex IV                         14
Iron Homeostasis                   11
Coenzyme Q10 biogenesis            10
Complex I                           9
Mitochondrial Metabolism            4
Complex III                         3
Complex V                           3
Mitochondrial Integrity             3
Complex II                          2
Mitochondrial Import                2
Chaperone Function                  2
Name: count, dtype: int64

Now that we've got the sequence references (via protein names), let's get their FASTA sequences from Entrez.

NOTE: I would use OMIM if I had a valid institutional email - but unfortunately they limit access to institutional researchers only.

In [None]:
#| export
def load_omim_map(data_path: Path) -> pd.DataFrame:
    """
    Load the OMIM-to-gene mapping from ``<data_path>/raw/mim2gene.txt``.

    The file is read as tab-separated text. The first four lines are skipped
    and the fifth is consumed as the header row, whose labels are then
    replaced with short snake_case names.

    Parameters
    ----------
    data_path : Path
        Data root directory containing ``raw/mim2gene.txt``.

    Returns
    -------
    pd.DataFrame
        Columns ``OMIM`` (integer), ``entry_type``, ``entrez_gene_id``,
        ``gene_symbol`` and ``ensembl_gene_id``.
    """
    omim_map_df = pd.read_csv(data_path / "raw/mim2gene.txt", sep="\t", skiprows=4)
    omim_map_df.columns = ["OMIM", "entry_type", "entrez_gene_id", "gene_symbol", "ensembl_gene_id"]
    # Replace the column outright: `.loc[:, 'OMIM'] = ...` writes values into the
    # existing column in place and does not guarantee the dtype actually changes.
    omim_map_df["OMIM"] = omim_map_df["OMIM"].astype(int)
    return omim_map_df

In [None]:
#| hide
# Load the OMIM -> Entrez/Ensembl mapping table and preview its first rows.
omim_map_df = load_omim_map(data_path)
omim_map_df.head()

Unnamed: 0,OMIM,entry_type,entrez_gene_id,gene_symbol,ensembl_gene_id
0,100050,predominantly phenotypes,,,
1,100070,phenotype,100329167.0,,
2,100100,phenotype,,,
3,100200,predominantly phenotypes,,,
4,100300,phenotype,,,


In [None]:
# The three OMIM columns must share a dtype, because the frames are merged on OMIM below.
omim_map_df.OMIM.dtype, structural_genes.OMIM.dtype, non_structural_genes.OMIM.dtype

(dtype('int64'), dtype('int64'), dtype('int64'))

In [None]:
# True when every structural-gene OMIM id appears in the OMIM mapping table.
set(structural_genes.OMIM).issubset(set(omim_map_df.OMIM))

True

In [None]:
# True when every non-structural-gene OMIM id appears in the OMIM mapping table.
set(non_structural_genes.OMIM).issubset(set(omim_map_df.OMIM))

True

In [None]:
#| hide
# Unique OMIM ids across both MITOMAP tables.
mitomap_omim_ids = pd.concat(
    [structural_genes[['OMIM']], non_structural_genes[['OMIM']]],
    axis=0
).drop_duplicates("OMIM")
# Attach Entrez/Ensembl ids and keep only the OMIM ids that map to an Entrez gene.
omim_to_gene_ids = omim_map_df[['OMIM', 'entrez_gene_id', 'ensembl_gene_id']]
mito_entrez_df = mitomap_omim_ids.merge(
    omim_to_gene_ids,
    on='OMIM'
).dropna(subset='entrez_gene_id')
mito_entrez_df.shape[0]

124

In [None]:
#| hide
# These are OMIM ids from MITOMAP that don't have an Entrez gene id in the mapping.
# Looking through some of these, they do point to UniProt.
# Without API access to OMIM, I can't map these anywhere.
unique_mito_omim_ids = set(non_structural_genes.OMIM.values)
unique_mito_omim_ids.update(set(structural_genes.OMIM))
# (The former bare `abs(len(...) - ...)` line was removed: its value was never
# displayed or stored, and the count is reported by the final expression.)
missing_omim_ids = unique_mito_omim_ids - set(mito_entrez_df.OMIM)
len(missing_omim_ids), list(missing_omim_ids)[:5]

(24, [609283, 609286, 250250, 615438, 600462])

Now that we have Entrez gene IDs, it should be simple to fetch FASTA sequences in bulk from Entrez.

In [None]:
#| hide
# The ids are stored as floats (the column contains NaN before filtering), so go
# through int to get plain digit strings.
entrez_ids = [str(int(gene_id)) for gene_id in mito_entrez_df.entrez_gene_id.tolist()]
len(entrez_ids), entrez_ids[:5]

(124, ['4719', '4720', '4722', '4724', '4726'])

In [None]:
#| export
def fetch_entrez_id_documents(
    gene_ids: list[str], 
    write_path: Path,
    tqdm_kwargs: dict = None
) -> None:
    """
    Fetch Entrez Gene records for the given gene ids and cache each as JSON.

    Ids that already have a ``<id>.json`` file in ``write_path`` are skipped.
    The remaining ids are requested from the Entrez ``gene`` database in a
    single ``efetch`` call, and every parsed record is written to
    ``write_path / "<gene id>.json"``. Nothing is returned; the files are the
    result.

    Parameters
    ----------
    gene_ids : list[str]
        Entrez gene ids. Non-string ids are converted with ``str`` and
        duplicates are requested only once.
    write_path : Path
        Existing directory that holds the per-gene JSON files.
    tqdm_kwargs : dict, optional
        Extra keyword arguments forwarded to ``tqdm``. ``None`` (the default)
        means no extra arguments.

    Raises
    ------
    ValueError
        If a returned record does not carry a gene id.
    """
    already_written_gene_ids = {p.stem for p in write_path.glob("*.json")}
    # De-duplicate while keeping the caller's order; normalise to str so the join below is safe.
    search_gene_ids = [
        gid for gid in dict.fromkeys(str(gid) for gid in gene_ids)
        if gid not in already_written_gene_ids
    ]
    if not search_gene_ids:
        return
    gene_id_efetch_response = Entrez.efetch(
        "gene",
        id=",".join(search_gene_ids),
        retmode="xml"
    )
    try:
        # `tqdm_kwargs` defaults to None, which cannot be unpacked with `**`.
        write_pbar = tqdm(total=len(search_gene_ids), **(tqdm_kwargs or {}))
        try:
            for result in Entrez.parse(gene_id_efetch_response):
                result_gene_id = result.get("Entrezgene_track-info", {}).get("Gene-track", {}).get("Gene-track_geneid", None)
                if result_gene_id is None:
                    raise ValueError("Failed to get gene id from result")
                with (write_path / f"{result_gene_id}.json").open("w") as f_out:
                    json.dump(result, f_out)
                write_pbar.update(1)
        finally:
            write_pbar.close()
    finally:
        # Release the response handle even when parsing or writing fails.
        gene_id_efetch_response.close()


def get_gene_loci_from_entrez_document(path: Path, heading_filter: str = 'Reference GRCh38.p14 Primary Assembly') -> list[dict]:
    """
    Read the locus entries from a cached Entrez gene JSON document.

    Parameters
    ----------
    path : Path
        JSON file holding one Entrez gene record.
    heading_filter : str, optional
        When not None, keep only the entries whose ``Gene-commentary_heading``
        equals this value. Pass None to return every entry.

    Returns
    -------
    list[dict]
        The matching entries of the record's ``Entrezgene_locus`` list; an
        empty list when the record has no such key.
    """
    with path.open("r") as f:
        doc = json.load(f)
    # A record without locus data yields an empty list instead of failing on None.
    gene_loci = doc.get("Entrezgene_locus") or []
    if heading_filter is not None:
        gene_loci = [loci for loci in gene_loci if loci.get("Gene-commentary_heading") == heading_filter]
    return gene_loci

In [None]:
#| hide
# Directory that caches one JSON document per Entrez gene id.
entrez_gene_path = data_path / "entrez_genes"
entrez_gene_path.mkdir(parents=True, exist_ok=True)

# Pass `tqdm_kwargs` explicitly: the function's default is None, which it
# unpacks with `**` and which therefore fails whenever there is something to fetch.
fetch_entrez_id_documents(
    entrez_ids,
    entrez_gene_path,
    tqdm_kwargs={"desc": "Fetching Entrez gene documents"}
)

In [None]:
#| hide
# Take whichever cached document the glob yields first as a worked example.
# NOTE(review): glob order is not guaranteed, and `next` raises StopIteration if no file was cached.
sample_entrez_document = next(entrez_gene_path.glob("*.json"))

In [None]:
# The file stem is the Entrez gene id the document was saved under.
sample_entrez_document.stem

'10059'

In [None]:
#| hide
# Keep only the locus entries of the sample document whose heading matches the GRCh38.p14 primary assembly.
sample_entrez_document_loci = get_gene_loci_from_entrez_document(
    sample_entrez_document, 
    heading_filter='Reference GRCh38.p14 Primary Assembly'
)
len(sample_entrez_document_loci)

1

In [None]:
# Inspect the structure of the single matching locus entry.
sample_entrez_document_loci[0]

{'Gene-commentary_type': '1',
 'Gene-commentary_heading': 'Reference GRCh38.p14 Primary Assembly',
 'Gene-commentary_label': 'Chromosome 12 Reference GRCh38.p14 Primary Assembly',
 'Gene-commentary_accession': 'NC_000012',
 'Gene-commentary_version': '12',
 'Gene-commentary_seqs': [{'Seq-loc_int': {'Seq-interval': {'Seq-interval_from': '32679300',
     'Seq-interval_to': '32745649',
     'Seq-interval_strand': {'Na-strand': ''},
     'Seq-interval_id': {'Seq-id': {'Seq-id_gi': '568815586'}}}}}],
 'Gene-commentary_products': [{'Gene-commentary_type': '3',
   'Gene-commentary_heading': 'Reference',
   'Gene-commentary_label': 'transcript variant 5',
   'Gene-commentary_accession': 'NM_001278464',
   'Gene-commentary_version': '2',
   'Gene-commentary_genomic-coords': [{'Seq-loc_mix': {'Seq-loc-mix': [{'Seq-loc_int': {'Seq-interval': {'Seq-interval_from': '32679300',
          'Seq-interval_to': '32679464',
          'Seq-interval_strand': {'Na-strand': ''},
          'Seq-interval_id': {

In [None]:
# Accession and version together name the reference FASTA file (`<accession>.<version>.fasta`) read during extraction.
sample_entrez_document_loci[0].get("Gene-commentary_accession"), sample_entrez_document_loci[0].get("Gene-commentary_version")

('NC_000012', '12')

In [None]:
# The interval list carries the from/to coordinates and GI that the extraction step reads.
sample_entrez_document_loci[0].get("Gene-commentary_seqs")

[{'Seq-loc_int': {'Seq-interval': {'Seq-interval_from': '32679300',
    'Seq-interval_to': '32745649',
    'Seq-interval_strand': {'Na-strand': ''},
    'Seq-interval_id': {'Seq-id': {'Seq-id_gi': '568815586'}}}}}]

In [None]:
#| hide
# Histogram of how many GRCh38.p14 primary-assembly loci each cached document has.
loci_doc_count = collections.Counter(
    len(get_gene_loci_from_entrez_document(doc_path, "Reference GRCh38.p14 Primary Assembly"))
    for doc_path in entrez_gene_path.glob("*.json")
)

loci_doc_count

Counter({1: 124})

In [None]:
#| export
def get_sequences_from_loci(
    genome_fasta_path: Path, 
    sequence_write_path: Path, 
    entrez_gene_id: str, 
    sequence_docs: list[dict]
) -> None:
    """
    Cut each locus interval out of its reference FASTA and save it per gene.

    For every locus document, the reference file
    ``genome_fasta_path / "<accession>.<version>.fasta"`` is looked up, and each
    interval in ``Gene-commentary_seqs`` is extracted and written to
    ``sequence_write_path / <entrez_gene_id> / "<gi>.fasta"``. Intervals whose
    output file already exists are skipped, as are empty extracted sequences.
    Nothing is returned; the files are the result.

    Parameters
    ----------
    genome_fasta_path : Path
        Directory containing the reference FASTA files.
    sequence_write_path : Path
        Root output directory; a sub-directory per gene id is created.
    entrez_gene_id : str
        Gene id, used for the output directory name and the record id.
    sequence_docs : list[dict]
        Locus entries as returned by ``get_gene_loci_from_entrez_document``.

    Raises
    ------
    FileNotFoundError
        If the reference FASTA for a locus is missing.
    ValueError
        If an interval lacks its GI, or lacks start/end when extraction is needed.
    """
    entrez_gene_id = str(entrez_gene_id)
    gene_sequence_write_path = sequence_write_path / entrez_gene_id
    gene_sequence_write_path.mkdir(parents=True, exist_ok=True)
    for doc in sequence_docs:
        sequence_accession = doc.get("Gene-commentary_accession", None)
        sequence_version = doc.get("Gene-commentary_version", None)
        sequence_reference_path = genome_fasta_path / f"{sequence_accession}.{sequence_version}.fasta"
        if not sequence_reference_path.exists():
            raise FileNotFoundError(f"Could not find reference fasta at {sequence_reference_path.resolve()}")
        for loci in doc.get("Gene-commentary_seqs", []):
            loci_interval = loci.get("Seq-loc_int", {}).get("Seq-interval", {})
            loci_start = loci_interval.get("Seq-interval_from", None)
            loci_to = loci_interval.get("Seq-interval_to", None)
            loci_gi = loci_interval.get("Seq-interval_id", {}).get("Seq-id", {}).get("Seq-id_gi", None)
            # Validate the GI before it becomes part of a file name; otherwise a
            # stale "None.fasta" would make a missing GI pass silently.
            if loci_gi is None:
                raise ValueError("GenInfo Identifier is None")
            loci_sequence_path = gene_sequence_write_path / f"{loci_gi}.fasta"
            if loci_sequence_path.exists():
                continue
            if loci_start is None:
                raise ValueError("Sequence start is None")
            if loci_to is None:
                raise ValueError("Sequence end is None")
            # NOTE(review): `get_sequence_from_file` is not defined or imported in
            # this notebook -- confirm it is in scope in the exported module.
            # NOTE(review): `Seq-interval_strand` is not consulted, so the sequence
            # is taken as stored in the reference -- confirm that is intended.
            loci_sequence = get_sequence_from_file(
                sequence_reference_path,
                start=int(loci_start),
                end=int(loci_to)
            )
            loci_sequence_record = SeqIO.SeqRecord(
                seq=loci_sequence,
                id=f"gid|{entrez_gene_id}|gi|{loci_gi}"
            )
            if len(loci_sequence_record.seq) > 0:
                with loci_sequence_path.open("w") as out:
                    SeqIO.write([loci_sequence_record], out, "fasta")

In [None]:
#| hide
# Sample run
# Extract the sample gene's locus from the reference genome FASTA files under
# `data_path / "genome"` and write it beneath `data_path / "entrez_genes_sequences"`.
get_sequences_from_loci(
    genome_fasta_path = data_path / "genome", 
    sequence_write_path = data_path / "entrez_genes_sequences",
    entrez_gene_id = sample_entrez_document.stem,
    sequence_docs = sample_entrez_document_loci
)

In [None]:
#| hide
# Run the extraction for every cached Entrez gene document.
genome_dir = data_path / "genome"
sequences_dir = data_path / "entrez_genes_sequences"
for entrez_gene_doc in tqdm(list(entrez_gene_path.glob("*.json"))):
    get_sequences_from_loci(
        genome_fasta_path=genome_dir,
        sequence_write_path=sequences_dir,
        entrez_gene_id=entrez_gene_doc.stem,
        sequence_docs=get_gene_loci_from_entrez_document(
            entrez_gene_doc,
            heading_filter='Reference GRCh38.p14 Primary Assembly'
        )
    )

  0%|          | 0/124 [00:00<?, ?it/s]

In [None]:
#| hide
# Write the cells marked `#| export` out to the module named by `default_exp` (nbdev convention).
import nbdev; nbdev.nbdev_export()