# Getting our positive sample of sequences from MITOMAP

> "Using pandas and GenBank to get mitochondrial DNA sequences."

In [None]:
#| default_exp features.mitochondria

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import pandas as pd
from xml.etree import ElementTree
from Bio import SeqIO, Entrez
from io import StringIO
import sqlite3
from lxml import etree
from tqdm.auto import tqdm
from tqdm import contrib as tqdm_contrib
import time
import functools
from yaml import safe_load
from pathlib import Path

from llm_mito_scanner.features import database

In [None]:
#| hide
# Parse the repository-level YAML settings file into `config`.
with open("../config.yml", 'r') as config_file:
    config = safe_load(config_file)

In [None]:
#| hide
# Configure Biopython's Entrez client from config.yml. `config.get` yields
# None for a missing key, so an incomplete config is not reported here.
Entrez.email = config.get("email")
Entrez.api_key = config.get("nih_api_key")
# NOTE(review): presumably the delay, in seconds, before Biopython retries a
# failed request — confirm against the Bio.Entrez documentation.
Entrez.sleep_between_tries = 1

In [None]:
#| hide
# Directory that the cells below read from and write to (under "raw/"),
# taken from the `data_path` entry of config.yml.
data_path = Path(config.get("data_path"))
# Displayed as a sanity check that the configured directory exists.
data_path.exists()

True

In [None]:
#| export
# Download sequences from MITOMAP
def get_structural_nuclear_genes_for_mitochondrial_diseases() -> pd.DataFrame:
    """
    Get structural nuclear genes for mitochondrial diseases from MITOMAP.

    Downloads the MITOMAP `NuclearGenesStructural` page and keeps the first
    HTML table whose text matches "Complex".

    Returns
    -------
    pd.DataFrame
        The table with all-empty rows removed and a fresh 0..n-1 index.
        Missing `Complex` cells are filled with the nearest value above them,
        and `OMIM` is converted to strings of digits (e.g. "157655").

    Raises
    ------
    ValueError
        If an `OMIM` cell is missing or cannot be converted to an integer.
    """
    genes = pd.read_html(
        "https://www.mitomap.org/foswiki/bin/view/MITOMAP/NuclearGenesStructural",
        match="Complex"
    )[0].dropna(how='all').reset_index(drop=True)
    # `Series.ffill()` replaces `fillna(method='ffill')`, which pandas has
    # deprecated.
    genes['Complex'] = genes['Complex'].ffill()
    # Replace the column outright: writing strings through `.loc[:, 'OMIM']`
    # is an in-place set into a numeric column, which pandas deprecates.
    # Going through int first drops any ".0" left by float parsing.
    genes['OMIM'] = genes['OMIM'].astype(int).astype(str)
    return genes

In [None]:
#| hide
# Cache the scraped table on disk so re-running the notebook does not hit
# MITOMAP again.
structural_genes_path = data_path / "raw/structural_nuclear_genes_for_mitochondrial_diseases.csv"
if structural_genes_path.exists():
    # Read OMIM back as text. The scraper returns string ids and
    # `load_omim_map` casts its OMIM column to str, so letting read_csv infer
    # integers here would break the later merge and set comparisons.
    structural_genes = pd.read_csv(structural_genes_path, dtype={"OMIM": str})
else:
    structural_genes = get_structural_nuclear_genes_for_mitochondrial_diseases()
    # Make sure "raw/" exists before the first write on a fresh checkout.
    structural_genes_path.parent.mkdir(parents=True, exist_ok=True)
    structural_genes.to_csv(structural_genes_path, index=False)
display(structural_genes.head())
structural_genes.Complex.value_counts()

Unnamed: 0,Complex,Name,OMIM,Function,Chromosome,Inheritance,Clinical Phenotype,References
0,Complex I,NDUFS1,157655,IP fraction,2q33-q34,AR,LS,[i]
1,Complex I,NDUFS2,602985,IP fraction,1q23,AR,"Encephalopathy, Cardiomyopathy",[ii]
2,Complex I,NDUFS3,603846,IP fraction,11p11.11,AR,LS,[iii]
3,Complex I,NDUFS4,602694,IP fraction,5q11.1,AR,LS,[iv]
4,Complex I,NDUFS6,603848,IP fraction,5pter-p15.33,AR,Fatal Infantile Lactic Acidosis,[v]


Complex
Complex I      20
Complex II      4
Complex IV      4
Complex V       3
Complex III     2
Name: count, dtype: int64

In [None]:
#| export
def get_non_structural_nuclear_genes_for_mitochondrial_diseases() -> pd.DataFrame:
    """
    Get non-structural nuclear genes for mitochondrial diseases from MITOMAP.

    Downloads the MITOMAP `NuclearGenesNonStructural` page and keeps the first
    HTML table whose text matches "Complex".

    Returns
    -------
    pd.DataFrame
        One row per (table row, OMIM id) pair, indexed 0..n-1. All-empty rows
        and the first remaining row are removed. Missing `Complex` cells are
        filled with the nearest value above them. A cell listing several
        whitespace-separated OMIM ids is expanded into one row per id, with
        the other columns repeated.
    """
    genes = pd.read_html(
        "https://www.mitomap.org/foswiki/bin/view/MITOMAP/NuclearGenesNonStructural",
        match="Complex"
    )[0].dropna(how='all')
    # NOTE(review): the first surviving row is discarded — presumably a
    # repeated header row in the scraped table; confirm against the page.
    # `reset_index` also hands back a fresh frame, so the column assignments
    # below do not write into a slice of the scraped table.
    genes = genes.iloc[1:, :].reset_index(drop=True)
    # `Series.ffill()` replaces `fillna(method='ffill')`, which pandas has
    # deprecated.
    genes['Complex'] = genes['Complex'].ffill()
    # Split on any run of whitespace so doubled or non-breaking spaces in the
    # scraped HTML do not produce empty OMIM ids.
    genes['OMIM'] = genes['OMIM'].str.split()
    # Expand the OMIM column; renumber so exploded rows get unique labels.
    genes = genes.explode('OMIM', ignore_index=True)
    return genes

In [None]:
#| hide
# Cache the scraped table on disk so re-running the notebook does not hit
# MITOMAP again.
non_structural_genes_path = data_path / "raw/non_structural_nuclear_genes_for_mitochondrial_diseases.csv"
if non_structural_genes_path.exists():
    # Read OMIM back as text. The scraper returns string ids and
    # `load_omim_map` casts its OMIM column to str, so letting read_csv infer
    # integers here would break the later merge and set comparisons.
    non_structural_genes = pd.read_csv(non_structural_genes_path, dtype={"OMIM": str})
else:
    non_structural_genes = get_non_structural_nuclear_genes_for_mitochondrial_diseases()
    # Make sure "raw/" exists before the first write on a fresh checkout.
    non_structural_genes_path.parent.mkdir(parents=True, exist_ok=True)
    non_structural_genes.to_csv(non_structural_genes_path, index=False)
display(non_structural_genes.head())
non_structural_genes.Complex.value_counts()

Unnamed: 0,Complex,Name,OMIM,Function,Chromosome,Inheritance,Clinical Phenotype,References
0,Complex I,NDUFAF1(CIA30),606934,Assembly,15q13.3,AR,Cardioencephalomyopathy,[i]
1,Complex I,NDUFAF2 (B17.2L),609653,Assembly,5q12.1,AR,Early onset progressive encephalopathy,[ii]
2,Complex I,NDUFAF3,612911,Assembly,3p21.31,AR,Neonatal encephalopathy,[iii]
3,Complex I,NDUFAF4 (HRPAP2),611776,Assembly,6q16.1,AR,Infantile encephalopathy,[iv]
4,Complex I,NDUFAF5 (C20orf7),612360,Assembly,20p12.1,AR,LS,[v]


Complex
Mitochondrial Protein Synthesis    37
MtDNA Maintenance                  16
Complex IV                         14
Iron Homeostasis                   11
Coenzyme Q10 biogenesis            10
Complex I                           9
Mitochondrial Metabolism            4
Complex III                         3
Complex V                           3
Mitochondrial Integrity             3
Complex II                          2
Mitochondrial Import                2
Chaperone Function                  2
Name: count, dtype: int64

Now that we've got the sequence references (via protein names), let's get their FASTA sequences from Entrez.

NOTE: I would use OMIM if I had a valid institutional email - but unfortunately they limit access to institutional researchers only.

In [None]:
#| export
def load_omim_map(data_path: Path) -> pd.DataFrame:
    """
    Load the OMIM-to-gene mapping table from `data_path / "raw/mim2gene.txt"`.

    The file is read as tab-separated text. Its first four lines are skipped
    and the following line is consumed as the header, whose labels are then
    replaced by the column names below.

    Parameters
    ----------
    data_path : Path
        Directory containing `raw/mim2gene.txt`.

    Returns
    -------
    pd.DataFrame
        Columns `OMIM`, `entry_type`, `entrez_gene_id`, `gene_symbol`,
        `ensembl_gene_id`, with `OMIM` cast to str. The other columns keep
        whatever dtype `read_csv` inferred.

    Raises
    ------
    ValueError
        If the file does not have exactly five columns.
    """
    omim_map_df = pd.read_csv(data_path / "raw/mim2gene.txt", sep="\t", skiprows=4)
    omim_map_df.columns = ["OMIM", "entry_type", "entrez_gene_id", "gene_symbol", "ensembl_gene_id"]
    # Replace the column outright: writing strings through `.loc[:, 'OMIM']`
    # is an in-place set into an integer column, which pandas deprecates.
    omim_map_df['OMIM'] = omim_map_df['OMIM'].astype(str)
    return omim_map_df

In [None]:
#| hide
# Load the OMIM map from the configured data directory and preview it.
omim_map_df = load_omim_map(data_path)
omim_map_df.head()

Unnamed: 0,OMIM,entry_type,entrez_gene_id,gene_symbol,ensembl_gene_id
0,100050,predominantly phenotypes,,,
1,100070,phenotype,100329167.0,,
2,100100,phenotype,,,
3,100200,predominantly phenotypes,,,
4,100300,phenotype,,,


In [None]:
# True when every structural-gene OMIM id also appears in the OMIM map.
set(structural_genes.OMIM) <= set(omim_map_df.OMIM)

True

In [None]:
# True when every non-structural-gene OMIM id also appears in the OMIM map.
set(non_structural_genes.OMIM) <= set(omim_map_df.OMIM)

True

In [None]:
#| hide
# Stack the OMIM ids from both MITOMAP tables and keep one row per id.
mitomap_omim_ids = pd.concat(
    [structural_genes[['OMIM']], non_structural_genes[['OMIM']]],
    axis=0,
).drop_duplicates("OMIM")
# Attach the gene identifiers from the OMIM map (inner join on OMIM), then
# drop ids that have no Entrez gene id.
omim_gene_links = omim_map_df[['OMIM', 'entrez_gene_id', 'ensembl_gene_id']]
mito_entrez_df = mitomap_omim_ids.merge(omim_gene_links, on='OMIM').dropna(subset='entrez_gene_id')
mito_entrez_df.shape[0]

124

In [None]:
#| hide
# These are OMIM ids from MITOMAP that don't have an Entrez gene id in the OMIM map.
# Looking through some of these, they do point to UniProt.
# Without API access to OMIM, I can't map these anywhere.
unique_mito_omim_ids = set(non_structural_genes.OMIM) | set(structural_genes.OMIM)
missing_omim_ids = unique_mito_omim_ids - set(mito_entrez_df.OMIM)
# Sort before sampling: set iteration order changes between kernel restarts,
# so an unsorted slice would show different ids on every run.
len(missing_omim_ids), sorted(missing_omim_ids, key=str)[:5]

(24, ['250250', '609260', '302060', '205950', '602473'])

Now that we have Entrez gene IDs, it should be simple to fetch FASTA sequences in bulk from Entrez.

In [None]:
#| hide
# The ids were parsed as floats (e.g. 100329167.0), so go through int to get
# plain digit strings.
entrez_ids = [str(int(gene_id)) for gene_id in mito_entrez_df.entrez_gene_id.tolist()]
len(entrez_ids)

124

In [None]:
#| export
def get_fasta_from_entrez_ids(ids: list[str]) -> list:
    """
    Fetch the Entrez `gene` records for `ids` with a single `efetch` request.

    Despite the name, the result is whatever `Entrez.read` parses from the
    XML response, not FASTA text or `SeqRecord` objects.

    Parameters
    ----------
    ids : list[str]
        Entrez gene ids; they are sent comma-joined in one request.

    Returns
    -------
    list
        The parsed response, or an empty list when `ids` is empty (no request
        is made in that case).
    """
    if not ids:
        return []
    # `Entrez.read` only parses XML from a binary handle. Without
    # `retmode="xml"` it fails with "file should be opened in binary mode".
    id_fasta_response = Entrez.efetch(
        "gene",
        id=",".join(ids),
        retmode="xml",
    )
    try:
        return Entrez.read(id_fasta_response)
    finally:
        # Release the HTTP response even if parsing fails.
        id_fasta_response.close()

In [None]:
#| hide
# Fetch a single gene record as XML to inspect the response structure.
test_response = Entrez.efetch(
    "gene",
    id=entrez_ids[0],
    retmode="xml"
)

KeyboardInterrupt: 

In [None]:
#| hide
# Materialise every record that `Entrez.parse` yields from the response.
parsed_test_response = list(Entrez.parse(test_response))

In [None]:
#| hide
# Display the first parsed record in full.
parsed_test_response[0]

In [None]:
#| hide
# List the fields on the first "Entrezgene_locus" entry of the first record.
parsed_test_response[0].get("Entrezgene_locus")[0].keys()

dict_keys(['Gene-commentary_type', 'Gene-commentary_heading', 'Gene-commentary_label', 'Gene-commentary_accession', 'Gene-commentary_version', 'Gene-commentary_seqs', 'Gene-commentary_products'])

In [None]:
#| hide
# Try the bulk helper on the first five ids only.
sample_entrez_id_result = get_fasta_from_entrez_ids(entrez_ids[:5])

TypeError: file should be opened in binary mode

In [None]:
# Number of items in the sample response.
len(sample_entrez_id_result)

In [None]:
#| export
def search_entrez_protein_names(protein_names: list[str]) -> list[str]:
    """
    Search the Entrez `gene` database for each name and collect the hit ids.

    One `esearch` request is made per name, with the name passed unchanged as
    the search term. The ids from each response's "IdList" are appended in
    input order, so the result may contain duplicates and several ids per name.

    NOTE(review): each request uses `esearch` defaults, so the number of ids
    per name is capped by NCBI's default `retmax`, and the term is not
    restricted to an organism or field. Tighten the query if needed.

    Parameters
    ----------
    protein_names : list[str]
        Search terms, e.g. gene or protein symbols.

    Returns
    -------
    list[str]
        Entrez ids for every hit; empty when `protein_names` is empty.
    """
    entrez_ids = []
    for name in protein_names:
        search_response = Entrez.esearch(db="gene", term=name)
        try:
            search_result = Entrez.read(search_response)
        finally:
            # Release the HTTP response even if parsing fails.
            search_response.close()
        # Convert Biopython's string elements to plain `str`.
        entrez_ids.extend(str(entrez_id) for entrez_id in search_result["IdList"])
    return entrez_ids

In [None]:
#| export    
def add_mitochondria_sequences_to_database(
    protein_names: list[str],
    connection: sqlite3.Connection
):
    """
    Look up sequence records for each protein name and store them.

    For every name, `get_fasta_from_protein_name` is called. When it returns
    at least one record, the records are passed to `database.insert_sequences`
    and then to `database.insert_mitochondria_related_sequences`. Names with
    no records are skipped. A progress bar tracks the names processed.

    NOTE(review): `get_fasta_from_protein_name` is not defined or imported in
    the visible part of this notebook — confirm it exists before exporting.
    No commit is issued here; whether the `database` helpers commit is not
    visible from this function.

    Parameters
    ----------
    protein_names : list[str]
        Names to look up, processed in order.
    connection : sqlite3.Connection
        Open connection handed to the `database` insert helpers.
    """
    # Add structural genes
    # The context manager closes the bar even when a lookup or insert raises,
    # so a failed run does not leave a stale progress bar behind.
    with tqdm(total=len(protein_names), ncols=80, leave=False) as protein_pbar:
        for name in protein_names:
            protein_pbar.set_description(f"{name} | searching")
            name_sequence_records = get_fasta_from_protein_name(name)
            if len(name_sequence_records) > 0:
                protein_pbar.set_description(f"{name} | inserting sequences")
                database.insert_sequences(name_sequence_records, connection)
                protein_pbar.set_description(f"{name} | inserting mito indices")
                database.insert_mitochondria_related_sequences(name_sequence_records, connection)
            protein_pbar.update(1)

In [None]:
#| hide
# Open the SQLite database used by the cells below.
# NOTE(review): this path is hard-coded rather than derived from `data_path`;
# confirm the two point at the same directory.
connection = sqlite3.connect("../data/genome.db")

In [None]:
#| hide
#add_mitochondria_sequences_to_database(mitochondria_protein_names, connection)

In [None]:
#| hide
# Cursor for the ad-hoc check query below.
cursor = connection.cursor()

In [None]:
#| hide
# Count the rows currently stored in the `mitochondria_related` table.
test_query = cursor.execute("SELECT COUNT(*) FROM mitochondria_related")
# The single result row is a one-element tuple holding the count.
test_query.fetchone()

(190,)

In [None]:
#| hide
# Done querying; release the cursor.
cursor.close()

In [None]:
#| hide
# Close the database connection opened above.
connection.close()

In [None]:
#| hide
# Export this notebook's `#| export` cells to the module named by `default_exp`.
import nbdev; nbdev.nbdev_export()