## Explore mitochondrial impairment in tumors

**Aim**: this notebook walks through all the steps for collecting data and constructing a knowledge graph (KG) to explore mitochondrial impairment in tumors (mouse dataset).

### Import required libraries

In [1]:
# Import modules
import gzip
import os
import pickle
from io import BytesIO

import numpy as np
import pandas as pd
import requests
from gprofiler.gprofiler import GProfiler

from pyBiodatafuse import id_mapper
from pyBiodatafuse.annotators import (
    bgee,
    disgenet,
    minerva,
    molmedb,
    opentargets,
    pubchem,
    stringdb,
    wikipathways,
)
from pyBiodatafuse.constants import BGEE_GENE_EXPRESSION_LEVELS_COL, DISGENET_DISEASE_COL, GPROFILER
from pyBiodatafuse.graph import generator
from pyBiodatafuse.utils import (
    combine_sources,
    create_harmonized_input_file,
    create_or_append_to_metadata,
    get_identifier_of_interest,
)

# The caches and exports in this notebook are written under "data/mice", so create
# that nested folder up front (creating only "data" would make the first write fail).
os.makedirs(os.path.join("data", "mice"), exist_ok=True)
base_dir = os.path.abspath(os.getcwd())

### Load the input files

In [2]:
# Load columns B-J of the input workbook and give the two unnamed leading
# columns readable names.
all_genes_mice = pd.read_excel(
    "datasets/Copy of 344P_cachexia_vs_sham_FDR_0.01_all_genes.xlsx",
    usecols="B:J",
).rename(columns={"Unnamed: 1": "identifier", "Unnamed: 2": "GENE_SYMBOL"})
all_genes_mice.head()

Unnamed: 0,identifier,GENE_SYMBOL,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,minus_log10_pvalue
0,ENSMUSG00000021903,Galnt15,2770.437508,2.521182,0.099941,25.226601,2.0461570000000001e-140,2.373133e-136,139.689061
1,ENSMUSG00000059824,Dbp,2816.783019,-2.985239,0.19176,-15.567614,1.208303e-54,7.006949e-51,53.917824
2,ENSMUSG00000041417,Pik3r1,2934.327264,2.047483,0.136776,14.969629,1.1596579999999999e-50,4.483238e-47,49.93567
3,ENSMUSG00000004939,Nmrk2,675.068734,-3.981474,0.274358,-14.511978,1.017437e-47,2.950057e-44,46.992493
4,ENSMUSG00000028834,Trim63,44995.104413,3.738275,0.261289,14.307048,1.977304e-46,4.586555e-43,45.703926


In [3]:
# Reload columns B-J of the workbook, then keep the differentially expressed genes:
# adjusted p-value below 0.01 and an absolute log2 fold change above 1.
all_genes_mice = pd.read_excel(
    "datasets/Copy of 344P_cachexia_vs_sham_FDR_0.01_all_genes.xlsx",
    usecols="B:J",
).rename(columns={"Unnamed: 1": "identifier", "Unnamed: 2": "GENE_SYMBOL"})

passes_padj = all_genes_mice["padj"] < 0.01
passes_fold_change = all_genes_mice["log2FoldChange"].abs() > 1
deg_mice = all_genes_mice[passes_padj & passes_fold_change]

n_genes_mice = len(all_genes_mice["identifier"].unique())
n_degs_mice = len(deg_mice["identifier"].unique())
print("Number of genes:", n_genes_mice)
print("Number of DEGs:", n_degs_mice)
deg_mice.head(1)

Number of genes: 11599
Number of DEGs: 846


Unnamed: 0,identifier,GENE_SYMBOL,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,minus_log10_pvalue
0,ENSMUSG00000021903,Galnt15,2770.437508,2.521182,0.099941,25.226601,2.0461570000000001e-140,2.373133e-136,139.689061


### Entity resolution with BridgeDB

In [4]:
pickle_path_mice = os.path.join(base_dir, "data/mice/bridgedb_df_mice.pkl")
metadata_path_mice = os.path.join(base_dir, "data/mice/bridgedb_metadata_mice.pkl")

# Make sure the cache folder exists before the (slow) mapping call, so the
# results can actually be written afterwards.
os.makedirs(os.path.dirname(pickle_path_mice), exist_ok=True)

# The cache is only usable when BOTH files are present; if either one is missing,
# redo the mapping instead of failing on the missing file.
if os.path.exists(pickle_path_mice) and os.path.exists(metadata_path_mice):
    # NOTE: unpickling executes arbitrary code - only load cache files created by this notebook.
    bridgedb_df_mice = pd.read_pickle(pickle_path_mice)
    with open(metadata_path_mice, "rb") as file:
        bridgedb_metadata_mice = pickle.load(file)
else:
    bridgedb_df_mice, bridgedb_metadata_mice = id_mapper.bridgedb_xref(
        identifiers=all_genes_mice,
        input_species="Mouse",
        input_datasource="Ensembl",
        output_datasource="All",
    )
    bridgedb_df_mice.to_pickle(pickle_path_mice)
    with open(metadata_path_mice, "wb") as file:
        pickle.dump(bridgedb_metadata_mice, file)

In [None]:
# Count the distinct input identifiers present in the mapping table.
n_mapped_genes_mice = len(bridgedb_df_mice["identifier"].unique())
print("Number of genes with mapping in BridgeDb:", n_mapped_genes_mice)
bridgedb_df_mice.head(1)

Number of genes with mapping in BridgeDb: 11588


Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea
0,ENSMUSG00000021903,Ensembl,Q9D2N8,Uniprot-TrEMBL,Galnt15,2770.437508,2.521182,0.099941,25.226601,2.0461570000000001e-140,2.373133e-136,139.689061


### Enrichment analysis using g:Profiler
All pathways and annotations are added, regardless of whether they are significant.

In [6]:
# Number of distinct input identifiers in the BridgeDb mapping table.
len(bridgedb_df_mice["identifier"].unique())

11588

In [7]:
# Number of distinct identifiers with adjusted p-value < 0.01 and |log2 fold change| > 1.
padj_ok = bridgedb_df_mice["padj_dea"] < 0.01
fold_change_ok = bridgedb_df_mice["log2FoldChange_dea"].abs() > 1
len(bridgedb_df_mice.loc[padj_ok & fold_change_ok, "identifier"].unique())

846

In [8]:
gp = GProfiler(return_dataframe=True)
filename_mice = "data/mice/gprofiler_mice_0.01.pkl"
if not os.path.exists(filename_mice):
    # Create the cache folder first so a missing directory cannot make the
    # write fail after the remote enrichment call has already completed.
    os.makedirs(os.path.dirname(filename_mice), exist_ok=True)
    gprofiler_mice = gp.profile(
        organism="mmusculus",
        all_results=True,
        # Query: identifiers with adjusted p-value < 0.01 (no fold-change cut-off here).
        query=bridgedb_df_mice[
            (bridgedb_df_mice['padj_dea'] < 0.01)]["identifier"]
        .unique()
        .tolist(),
        # Background: every identifier present in the mapping table.
        background=bridgedb_df_mice["identifier"].unique().tolist(),
        no_evidences=False,
        significance_threshold_method="fdr",
        user_threshold=0.05,
    )
    gprofiler_mice.rename(columns={"native": "id"}, inplace=True)
    gprofiler_mice["datasource"] = "g:Profiler"
    gprofiler_mice.to_pickle(filename_mice)
else:
    # Read the cache with the pandas counterpart of `to_pickle`.
    # NOTE: unpickling executes arbitrary code - only load cache files created by this notebook.
    gprofiler_mice = pd.read_pickle(filename_mice)

In [9]:
# Print the (rows, columns) of the enrichment table, then preview its first row.
print(gprofiler_mice.shape)
gprofiler_mice.head(1)

(30481, 17)


Unnamed: 0,source,id,name,p_value,significant,description,term_size,query_size,intersection_size,effective_domain_size,precision,recall,query,parents,intersections,evidences,datasource
0,GO:BP,GO:0032774,RNA biosynthetic process,0.001777,True,"""The chemical reactions and pathways resulting...",2922,3953,1112,11577,0.281305,0.380561,query_1,"[GO:0016070, GO:0141187]","[ENSMUSG00000059824, ENSMUSG00000041417, ENSMU...","[[ISO, IBA, TAS, IEA], [IDA, IMP, ISS, ISO, IE...",g:Profiler


In [10]:
# Function to count upregulated and downregulated genes
def count_up_down(genes, bridgedb_df_mice):
    """Count how many of the given genes are up- and down-regulated.

    Parameters
    ----------
    genes : list-like of gene identifiers
        Identifiers are converted to ``str`` and stripped before matching.
        Anything that is not list-like (e.g. ``NaN`` or a plain string) yields ``(0, 0)``.
    bridgedb_df_mice : pandas.DataFrame
        Table with an ``identifier`` column and a ``log2FoldChange_dea`` column.

    Returns
    -------
    tuple of (int, int)
        ``(upregulated, downregulated)``: number of matched genes whose
        ``log2FoldChange_dea`` is > 0 and < 0, respectively.
    """
    if not pd.api.types.is_list_like(genes):
        return 0, 0

    gene_ids = [str(g).strip() for g in genes]
    if not gene_ids:
        return 0, 0

    matched = bridgedb_df_mice[bridgedb_df_mice["identifier"].isin(gene_ids)]
    # Count genes, not rows: the mapping table can hold several rows per
    # identifier, which would otherwise inflate both counts.
    matched = matched.drop_duplicates(subset="identifier")

    upregulated = int((matched["log2FoldChange_dea"] > 0).sum())
    downregulated = int((matched["log2FoldChange_dea"] < 0).sum())

    return upregulated, downregulated

In [11]:
# Keep only significant results.
# `.copy()` makes this an independent frame, so adding columns to it later does not
# operate on a slice of `gprofiler_mice` (which triggers pandas' SettingWithCopyWarning).
gprofiler_sig_mice = gprofiler_mice[gprofiler_mice["significant"] == True].copy()

In [12]:
# Rows of the mapping table whose target source is Ensembl. Built once here
# instead of being re-filtered inside the lambda for every enrichment term.
ensembl_rows_mice = bridgedb_df_mice[bridgedb_df_mice["target.source"] == "Ensembl"]

# For each term, count the up-/down-regulated genes among its intersecting genes.
gprofiler_sig_mice[["no_upregulated_genes", "no_downregulated_genes"]] = gprofiler_sig_mice["intersections"].apply(
    lambda genes: pd.Series(count_up_down(genes, ensembl_rows_mice))
)
gprofiler_sig_mice.head(2)

Unnamed: 0,source,id,name,p_value,significant,description,term_size,query_size,intersection_size,effective_domain_size,precision,recall,query,parents,intersections,evidences,datasource,no_upregulated_genes,no_downregulated_genes
0,GO:BP,GO:0032774,RNA biosynthetic process,0.001777,True,"""The chemical reactions and pathways resulting...",2922,3953,1112,11577,0.281305,0.380561,query_1,"[GO:0016070, GO:0141187]","[ENSMUSG00000059824, ENSMUSG00000041417, ENSMU...","[[ISO, IBA, TAS, IEA], [IDA, IMP, ISS, ISO, IE...",g:Profiler,783,329
1,GO:BP,GO:0016070,RNA metabolic process,0.002043,True,"""The cellular chemical reactions and pathways ...",3068,3953,1160,11577,0.293448,0.378096,query_1,[GO:0090304],"[ENSMUSG00000059824, ENSMUSG00000041417, ENSMU...","[[ISO, IBA, TAS, IEA], [IDA, IMP, ISS, ISO, IE...",g:Profiler,814,346


In [13]:
# Tally of terms per value of the "significant" flag.
gprofiler_mice['significant'].value_counts()

significant
False    30417
True        64
Name: count, dtype: int64

In [14]:
# (rows, columns) of the significant-only table.
gprofiler_sig_mice.shape

(64, 19)

In [15]:
# Columns written to the CSV summary of significant terms.
selected_columns = [
    "source",
    "id",
    "name",
    "p_value",
    "term_size",
    "intersection_size",
    "parents",
    "no_upregulated_genes",
    "no_downregulated_genes",
]
gprofiler_sig_export = gprofiler_sig_mice[selected_columns]
gprofiler_sig_export.to_csv("data/mice/gprofiler_sig_mice_0.01.csv", index=False)


#### For the graph

In [None]:
# Show the intersecting gene identifiers recorded for term HP:0011805.
gprofiler_mice[gprofiler_mice["id"] == "HP:0011805"]["intersections"].values

array([list(['ENSMUSG00000022270', 'ENSMUSG00000039242', 'ENSMUSG00000035606', 'ENSMUSG00000073418', 'ENSMUSG00000051627', 'ENSMUSG00000055435', 'ENSMUSG00000096054', 'ENSMUSG00000063275', 'ENSMUSG00000026012', 'ENSMUSG00000005360', 'ENSMUSG00000001627', 'ENSMUSG00000020593', 'ENSMUSG00000043639', 'ENSMUSG00000047407', 'ENSMUSG00000018076', 'ENSMUSG00000041235', 'ENSMUSG00000016559', 'ENSMUSG00000041798', 'ENSMUSG00000022094', 'ENSMUSG00000027274', 'ENSMUSG00000028766', 'ENSMUSG00000029009', 'ENSMUSG00000026179', 'ENSMUSG00000013539', 'ENSMUSG00000007613', 'ENSMUSG00000047635', 'ENSMUSG00000025509', 'ENSMUSG00000056537', 'ENSMUSG00000006307', 'ENSMUSG00000020744', 'ENSMUSG00000028414', 'ENSMUSG00000031381', 'ENSMUSG00000005686', 'ENSMUSG00000022510', 'ENSMUSG00000034083', 'ENSMUSG00000040363', 'ENSMUSG00000041695', 'ENSMUSG00000022617', 'ENSMUSG00000042548', 'ENSMUSG00000017491', 'ENSMUSG00000029683', 'ENSMUSG00000024589', 'ENSMUSG00000033107', 'ENSMUSG00000051456', 'ENSMUSG00000048126

In [None]:
# Remove the root terms, i.e. rows whose "parents" entry is an empty list.
is_root_term = gprofiler_mice["parents"].apply(lambda parents: parents == [])
gprofiler_mice = gprofiler_mice[~is_root_term]

In [None]:
# from pyBiodatafuse.constants import GPROFILER


def create_path_info(row):
    """Collect a row's attributes into a dict, leaving out the grouping columns.

    Parameters
    ----------
    row : pandas.Series
        One row of the enrichment table (as passed by ``DataFrame.apply(axis=1)``).

    Returns
    -------
    dict
        Every ``label -> value`` pair of ``row`` except "intersections" and "source".
    """
    # Read the labels from the row itself rather than from the global
    # `gprofiler_mice` frame, so the result depends only on the input.
    path_info = {
        col: value for col, value in row.items() if col not in ("intersections", "source")
    }
    return path_info


# Pack each term's attributes into one dict per row, then keep only the
# columns needed to regroup the terms by gene.
gprofiler_mice["gprofiler"] = gprofiler_mice.apply(create_path_info, axis=1)
kept_columns = ["source", "id", "intersections", "gprofiler"]
gprofiler_mice = gprofiler_mice.drop(
    columns=[name for name in gprofiler_mice.columns if name not in kept_columns]
)

# One row per (term, gene) pair.
gprofiler_mice = gprofiler_mice.explode("intersections").reset_index(drop=True)
unique_sources = sorted(gprofiler_mice["source"].unique())

# One row per gene, plus one list-of-dicts column per source.
gprofiler_mice_final = pd.DataFrame({"intersections": gprofiler_mice["intersections"].unique()})
for source in unique_sources:
    terms_by_gene = (
        gprofiler_mice[gprofiler_mice["source"] == source]
        .groupby("intersections")["gprofiler"]
        .apply(list)
        .to_dict()
    )
    column_name = f"{GPROFILER}_{source.lower()}"
    gprofiler_mice_final[column_name] = gprofiler_mice_final["intersections"].map(terms_by_gene)
gprofiler_mice_final.head(1)

Unnamed: 0,intersections,g:Profiler_corum,g:Profiler_go:bp,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp
0,ENSMUSG00000027947,,"[{'id': 'GO:0051239', 'name': 'regulation of m...","[{'id': 'GO:0005615', 'name': 'extracellular s...","[{'id': 'GO:0005102', 'name': 'signaling recep...","[{'id': 'HP:0030386', 'name': 'Abnormal propor...","[{'id': 'KEGG:04060', 'name': 'Cytokine-cytoki...","[{'id': 'MIRNA:mmu-miR-155-5p', 'name': 'mmu-m...","[{'id': 'REAC:R-MMU-162582', 'name': 'Signal T...","[{'id': 'TF:M04454_1', 'name': 'Factor: AR; mo...","[{'id': 'WP:WP2841', 'name': 'Focal adhesion P..."


In [None]:
## get version


def get_data_versions(organism, timeout=30):
    """Fetch the g:Profiler data-version record for an organism.

    Parameters
    ----------
    organism : str
        Organism identifier sent as the ``organism`` query parameter (e.g. "mmusculus").
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    dict
        The decoded JSON response on HTTP 200; otherwise ``{"error": <message>}``.
    """
    url = "https://biit.cs.ut.ee/gprofiler/api/util/data_versions"
    params = {"organism": organism}
    try:
        response = requests.get(url, params=params, timeout=timeout)
        if response.status_code != 200:
            return {"error": f"Failed to retrieve data: {response.status_code}"}
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # Network failures and undecodable JSON bodies are reported, not raised.
        return {"error": str(e)}


get_data_versions("mmusculus")

{'biomart': 'Ensembl',
 'biomart_version': '111',
 'display_name': 'Mouse',
 'genebuild': 'GRCm39',
 'gprofiler_version': 'e111_eg58_p18_f463989d',
 'organism': 'mmusculus',
 'sources': {'CORUM': {'name': 'CORUM protein complexes',
   'version': '28.11.2022 Corum 4.1'},
  'GO:BP': {'name': 'biological process',
   'version': 'annotations: BioMart\nclasses: releases/2024-01-17'},
  'GO:CC': {'name': 'cellular component',
   'version': 'annotations: BioMart\nclasses: releases/2024-01-17'},
  'GO:MF': {'name': 'molecular function',
   'version': 'annotations: BioMart\nclasses: releases/2024-01-17'},
  'HP': {'name': 'Human Phenotype Ontology',
   'version': 'annotations: 01.2024\nclasses: None'},
  'KEGG': {'name': 'Kyoto Encyclopedia of Genes and Genomes',
   'version': 'KEGG FTP Release 2024-01-22'},
  'MIRNA': {'name': 'miRTarBase', 'version': 'Release 9.0'},
  'REAC': {'name': 'Reactome',
   'version': 'annotations: BioMart\nclasses: 2024-1-25'},
  'TF': {'name': 'Transfac',
   'versi

In [None]:
# Outer-join the per-gene g:Profiler annotations onto the TF-annotated table.
# NOTE(review): `ncbi_tf_df_mice` is not defined in any cell above this one - confirm
# the intended execution order.
ncbi_tf_gprofiler_df_mice = ncbi_tf_df_mice.merge(
    gprofiler_mice_final,
    how="outer",
    left_on="identifier",
    right_on="intersections",
)

In [None]:
# Preview the first two rows of the merged table.
ncbi_tf_gprofiler_df_mice.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_corum,g:Profiler_go:bp,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp
0,ENSMUSG00000021903,Ensembl,78754,NCBI Gene,Galnt15,2770.437508,2.521182,0.099941,25.226601,2.0461570000000001e-140,...,,"[{'id': 'GO:0006493', 'name': 'protein O-linke...","[{'id': 'GO:0030133', 'name': 'transport vesic...","[{'id': 'GO:0030246', 'name': 'carbohydrate bi...",,"[{'id': 'KEGG:01100', 'name': 'Metabolic pathw...","[{'id': 'MIRNA:mmu-miR-34b-5p', 'name': 'mmu-m...","[{'id': 'REAC:R-MMU-5173105', 'name': 'O-linke...","[{'id': 'TF:M04454_1', 'name': 'Factor: AR; mo...",
1,ENSMUSG00000059824,Ensembl,13170,NCBI Gene,Dbp,2816.783019,-2.985239,0.19176,-15.567614,1.208303e-54,...,,"[{'id': 'GO:0048518', 'name': 'positive regula...","[{'id': 'GO:0005667', 'name': 'transcription r...","[{'id': 'GO:0000981', 'name': 'DNA-binding tra...",,"[{'id': 'KEGG:04710', 'name': 'Circadian rhyth...","[{'id': 'MIRNA:mmu-miR-122-5p', 'name': 'mmu-m...",,"[{'id': 'TF:M03882', 'name': 'Factor: RelB:p50...",


In [None]:
# First row that has a non-missing "g:Profiler_reac" annotation.
ncbi_tf_gprofiler_df_mice[ncbi_tf_gprofiler_df_mice["g:Profiler_reac"].notna()].head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_corum,g:Profiler_go:bp,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp
0,ENSMUSG00000021903,Ensembl,78754,NCBI Gene,Galnt15,2770.437508,2.521182,0.099941,25.226601,2.0461570000000001e-140,...,,"[{'id': 'GO:0006493', 'name': 'protein O-linke...","[{'id': 'GO:0030133', 'name': 'transport vesic...","[{'id': 'GO:0030246', 'name': 'carbohydrate bi...",,"[{'id': 'KEGG:01100', 'name': 'Metabolic pathw...","[{'id': 'MIRNA:mmu-miR-34b-5p', 'name': 'mmu-m...","[{'id': 'REAC:R-MMU-5173105', 'name': 'O-linke...","[{'id': 'TF:M04454_1', 'name': 'Factor: AR; mo...",


In [None]:
# NOTE(review): this cell repeats the expression of the cell directly above it.
ncbi_tf_gprofiler_df_mice[ncbi_tf_gprofiler_df_mice["g:Profiler_reac"].notna()].head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_corum,g:Profiler_go:bp,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp
0,ENSMUSG00000021903,Ensembl,78754,NCBI Gene,Galnt15,2770.437508,2.521182,0.099941,25.226601,2.0461570000000001e-140,...,,"[{'id': 'GO:0006493', 'name': 'protein O-linke...","[{'id': 'GO:0030133', 'name': 'transport vesic...","[{'id': 'GO:0030246', 'name': 'carbohydrate bi...",,"[{'id': 'KEGG:01100', 'name': 'Metabolic pathw...","[{'id': 'MIRNA:mmu-miR-34b-5p', 'name': 'mmu-m...","[{'id': 'REAC:R-MMU-5173105', 'name': 'O-linke...","[{'id': 'TF:M04454_1', 'name': 'Factor: AR; mo...",


### Add MitoCarta data

In [16]:
# URL of the mouse MitoCarta 3.0 workbook
mitocarta_url_mice = (
    "https://personal.broadinstitute.org/scalvo/MitoCarta3.0/Mouse.MitoCarta3.0.xls"
)


# Function to download an Excel workbook once and read one of its sheets into a dataframe
def download_save_and_read(url, filename):
    """Download the workbook at ``url`` to ``filename`` (if absent) and load it.

    Parameters
    ----------
    url : str
        Address of the Excel workbook.
    filename : str
        Local path used as the download cache.

    Returns
    -------
    pandas.DataFrame
        Contents of the sheet named "A Mouse MitoCarta3.0", read from the local copy.

    Raises
    ------
    requests.HTTPError
        If the download returns an error status (nothing is written in that case).
    """
    if not os.path.exists(filename):
        response = requests.get(url, timeout=60)
        # Fail loudly instead of caching an HTTP error page as the workbook.
        response.raise_for_status()
        os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
        with open(filename, "wb") as file:
            file.write(response.content)
    else:
        print(f"{filename} already exists, skipping download.")

    # Read the saved file (not the URL) so the local copy is what gets used
    # and the data is not downloaded again on every run.
    df = pd.read_excel(filename, sheet_name="A Mouse MitoCarta3.0")
    return df


# Download and read the mice MitoCarta dataset
mitocarta_mice_df = download_save_and_read(mitocarta_url_mice, "data/mice/mice_mitocarta3.0.xls")

mitocarta_mice_df.head(1)

data/mice/mice_mitocarta3.0.xls already exists, skipping download.


Unnamed: 0,MouseGeneID,HumanOrthologGeneID,Symbol,Synonyms,Description,MitoCarta3.0_List,MitoCarta3.0_Evidence,MitoCarta3.0_SubMitoLocalization,MitoCarta3.0_MitoPathways,TrainingDataset,...,liver_total_peak_intensity_log10,heart_total_peak_intensity_log10,skeletalmuscle_total_peak_intensity_log10,adipose_total_peak_intensity_log10,smallintestine_total_peak_intensity_log10,largeintestine_total_peak_intensity_log10,stomach_total_peak_intensity_log10,placenta_total_peak_intensity_log10,testis_total_peak_intensity_log10,HPA_Main_Location_2020 (Reliability)
0,66445,1537.0,Cyc1,2610002H19Rik|AA408921|Cyct1,cytochrome c-1,MitoCarta3.0,"literature, APEX_IMS, APEX_matrix, targetP sig...",MIM,OXPHOS > Complex III > CIII subunits | Metabol...,Tmito,...,10.0,10.4,9.8,10.1,10.2,10.2,10.0,10.0,9.8,Mitochondria (Supported)


In [17]:
# Select relevant columns for inclusion in the graph
selected_columns_mice = [
    "EnsemblGeneID",
    "Description",
    "MitoCarta3.0_Evidence",
    "MitoCarta3.0_SubMitoLocalization",
    "MitoCarta3.0_MitoPathways",
    "HPA_Main_Location_2020 (Reliability)",
    "Tissues",
]

# Rename columns for clarity. `rename` without `inplace=True` returns a new frame,
# so nothing is modified on a column-subset slice of `mitocarta_mice_df`
# (in-place edits on such a slice are what trigger pandas' SettingWithCopyWarning).
mitocarta_mice_selected = mitocarta_mice_df[selected_columns_mice].rename(
    columns={
        "EnsemblGeneID": "ensembl_id",
        "Description": "gene_description",
        "MitoCarta3.0_Evidence": "evidence",
        "MitoCarta3.0_SubMitoLocalization": "sub_mito_localization",
        "MitoCarta3.0_MitoPathways": "mito_pathways",
        "HPA_Main_Location_2020 (Reliability)": "hpa_location",
        "Tissues": "tissue_expression",
    }
)

In [18]:
# Reduce each pathway string to a single label: take the text after the last ">",
# keep the part before the first "|" of that, and trim surrounding whitespace.
after_last_arrow = mitocarta_mice_selected["mito_pathways"].str.split(">").str[-1]
before_first_pipe = after_last_arrow.str.split("|").str[0]
mitocarta_mice_selected["mito_pathways"] = before_first_pipe.str.strip()
mitocarta_mice_selected.head(1)

Unnamed: 0,ensembl_id,gene_description,evidence,sub_mito_localization,mito_pathways,hpa_location,tissue_expression
0,ENSMUSG00000022551,cytochrome c-1,"literature, APEX_IMS, APEX_matrix, targetP sig...",MIM,OXPHOS subunits,Mitochondria (Supported),all 14


In [19]:
# Drop rows without a pathway label, then split "|"-separated Ensembl IDs
# into one row per ID.
mitocarta_mice_selected = (
    mitocarta_mice_selected.dropna(subset=["mito_pathways"])
    .reset_index(drop=True)
    .assign(ensembl_id=lambda frame: frame["ensembl_id"].str.split("|"))
    .explode("ensembl_id")
    .reset_index(drop=True)
)


In [20]:
# Keep only MitoCarta rows whose Ensembl ID belongs to a gene in our dataset
# with adjusted p-value < 0.01.
significant_ids_mice = bridgedb_df_mice.loc[bridgedb_df_mice["padj_dea"] < 0.01, "identifier"]
in_significant_set = mitocarta_mice_selected["ensembl_id"].isin(significant_ids_mice)
mitocarta_mice_selected = mitocarta_mice_selected[in_significant_set]

In [21]:
# Preview the first row of the mapping table.
bridgedb_df_mice.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea
0,ENSMUSG00000021903,Ensembl,Q9D2N8,Uniprot-TrEMBL,Galnt15,2770.437508,2.521182,0.099941,25.226601,2.0461570000000001e-140,2.373133e-136,139.689061


In [22]:
# Preview the first three rows of the filtered MitoCarta table.
mitocarta_mice_selected.head(3)

Unnamed: 0,ensembl_id,gene_description,evidence,sub_mito_localization,mito_pathways,hpa_location,tissue_expression
0,ENSMUSG00000022551,cytochrome c-1,"literature, APEX_IMS, APEX_matrix, targetP sig...",MIM,OXPHOS subunits,Mitochondria (Supported),all 14
2,ENSMUSG00000003072,"ATP synthase, H+ transporting, mitochondrial F...","literature, APEX_matrix, targetP signal, yeast...",MIM,OXPHOS subunits,,all 14
7,ENSMUSG00000039640,mitochondrial ribosomal protein L12,"literature, targetP signal+, yeast mito homolo...",Matrix,Mitochondrial ribosome,Mitochondria (Supported),all 14


In [27]:
# One row per pathway label, holding the list of its Ensembl IDs and the list length.
aggregated_mitocarta_mice_selected = (
    mitocarta_mice_selected.groupby("mito_pathways")["ensembl_id"].agg(list).reset_index()
)
aggregated_mitocarta_mice_selected["intersections"] = (
    aggregated_mitocarta_mice_selected["ensembl_id"].map(len)
)


In [30]:
# Preview the first four pathway rows.
# NOTE(review): the up/down count columns in the saved output are added by a cell that
# appears further down; on a top-to-bottom run they would not exist yet at this point.
aggregated_mitocarta_mice_selected.head(4)

Unnamed: 0,mito_pathways,ensembl_id,intersections,no_upregulated_genes,no_downregulated_genes
0,ABC transporters,"[ENSMUSG00000031974, ENSMUSG00000055782, ENSMU...",3,0,3
1,Apoptosis,"[ENSMUSG00000036932, ENSMUSG00000058927, ENSMU...",9,5,4
2,Autophagy,"[ENSMUSG00000025040, ENSMUSG00000029500, ENSMU...",4,3,1
3,Biotin utilizing proteins,[ENSMUSG00000042010],1,1,0


In [29]:
# Rows of the mapping table whose target source is Ensembl. Built once here
# instead of being re-filtered inside the lambda for every pathway.
ensembl_rows_mice = bridgedb_df_mice[bridgedb_df_mice["target.source"] == "Ensembl"]

# For each pathway, count the up-/down-regulated genes among its Ensembl IDs.
aggregated_mitocarta_mice_selected[["no_upregulated_genes", "no_downregulated_genes"]] = aggregated_mitocarta_mice_selected["ensembl_id"].apply(
    lambda genes: pd.Series(count_up_down(genes, ensembl_rows_mice))
)

In [31]:
# Write the per-pathway summary to CSV without the row index.
aggregated_mitocarta_mice_selected.to_csv("data/mice/mitocarta_mice_0.01.csv", index=False)


In [32]:
# Display the per-pathway summary table.
aggregated_mitocarta_mice_selected

Unnamed: 0,mito_pathways,ensembl_id,intersections,no_upregulated_genes,no_downregulated_genes
0,ABC transporters,"[ENSMUSG00000031974, ENSMUSG00000055782, ENSMU...",3,0,3
1,Apoptosis,"[ENSMUSG00000036932, ENSMUSG00000058927, ENSMU...",9,5,4
2,Autophagy,"[ENSMUSG00000025040, ENSMUSG00000029500, ENSMU...",4,3,1
3,Biotin utilizing proteins,[ENSMUSG00000042010],1,1,0
4,Branched-chain amino acid dehydrogenase complex,[ENSMUSG00000037826],1,0,1
...,...,...,...,...,...
76,mtDNA repair,[ENSMUSG00000029591],1,0,1
77,mtDNA replication,[ENSMUSG00000002814],1,1,0
78,mtDNA stability and decay,"[ENSMUSG00000042787, ENSMUSG00000027424]",2,0,2
79,mtRNA granules,[ENSMUSG00000029624],1,1,0


: 

#### For the graph

In [None]:
from pyBiodatafuse.constants import MITOCARTA

def _mitocarta_row_to_pair(row):
    """Return [ensembl_id, [dict of the row's remaining fields]] for one MitoCarta row."""
    annotation = row.drop("ensembl_id").to_dict()
    return [row["ensembl_id"], [annotation]]


# One [ensembl_id, [annotation dict]] pair per row, turned into a two-column frame.
df_melted = mitocarta_mice_selected.apply(_mitocarta_row_to_pair, axis=1)
df_transformed = pd.DataFrame(df_melted.tolist(), columns=["ensembl_id", MITOCARTA])
df_transformed.head()

Unnamed: 0,ensembl_id,MitoCarta
0,ENSMUSG00000022551,"[{'gene_description': 'cytochrome c-1', 'evide..."
1,ENSMUSG00000031299,[{'gene_description': 'pyruvate dehydrogenase ...
2,ENSMUSG00000003072,"[{'gene_description': 'ATP synthase, H+ transp..."
3,ENSMUSG00000021241,[{'gene_description': 'iron-sulfur cluster ass...
4,ENSMUSG00000021748,[{'gene_description': 'pyruvate dehydrogenase ...


In [None]:
# Attach the MitoCarta annotations to the graph table (left join on the Ensembl ID),
# then drop the now-redundant join key coming from the MitoCarta side.
merged_with_mitocarta = ncbi_tf_gprofiler_df_mice.merge(
    df_transformed,
    how="left",
    left_on="identifier",
    right_on="ensembl_id",
)
ncbi_tf_gprofiler_mitocarta_df_mice = merged_with_mitocarta.drop(columns=["ensembl_id"])

ncbi_tf_gprofiler_mitocarta_df_mice.head(67)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:bp,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta
0,ENSMUSG00000021903,Ensembl,78754,NCBI Gene,Galnt15,2770.437508,2.521182,0.099941,25.226601,2.046157e-140,...,"[{'id': 'GO:0006493', 'name': 'protein O-linke...","[{'id': 'GO:0030133', 'name': 'transport vesic...","[{'id': 'GO:0030246', 'name': 'carbohydrate bi...",,"[{'id': 'KEGG:01100', 'name': 'Metabolic pathw...","[{'id': 'MIRNA:mmu-miR-34b-5p', 'name': 'mmu-m...","[{'id': 'REAC:R-MMU-5173105', 'name': 'O-linke...","[{'id': 'TF:M04454_1', 'name': 'Factor: AR; mo...",,
1,ENSMUSG00000059824,Ensembl,13170,NCBI Gene,Dbp,2816.783019,-2.985239,0.191760,-15.567614,1.208303e-54,...,"[{'id': 'GO:0048518', 'name': 'positive regula...","[{'id': 'GO:0005667', 'name': 'transcription r...","[{'id': 'GO:0000981', 'name': 'DNA-binding tra...",,"[{'id': 'KEGG:04710', 'name': 'Circadian rhyth...","[{'id': 'MIRNA:mmu-miR-122-5p', 'name': 'mmu-m...",,"[{'id': 'TF:M03882', 'name': 'Factor: RelB:p50...",,
2,ENSMUSG00000041417,Ensembl,18708,NCBI Gene,Pik3r1,2934.327264,2.047483,0.136776,14.969629,1.159658e-50,...,"[{'id': 'GO:0051239', 'name': 'regulation of m...","[{'id': 'GO:0005942', 'name': 'phosphatidylino...","[{'id': 'GO:0005102', 'name': 'signaling recep...","[{'id': 'HP:0025539', 'name': 'Abnormal B cell...","[{'id': 'KEGG:04933', 'name': 'AGE-RAGE signal...","[{'id': 'MIRNA:mmu-miR-340-5p', 'name': 'mmu-m...","[{'id': 'REAC:R-MMU-6806834', 'name': 'Signali...","[{'id': 'TF:M00446_1', 'name': 'Factor: Spz1; ...","[{'id': 'WP:WP2841', 'name': 'Focal adhesion P...",
3,ENSMUSG00000004939,Ensembl,69564,NCBI Gene,Nmrk2,675.068734,-3.981474,0.274358,-14.511978,1.017437e-47,...,"[{'id': 'GO:0048869', 'name': 'cellular develo...","[{'id': 'GO:0071944', 'name': 'cell periphery'...","[{'id': 'GO:0005515', 'name': 'protein binding...",,"[{'id': 'KEGG:00760', 'name': 'Nicotinate and ...","[{'id': 'MIRNA:mmu-miR-709', 'name': 'mmu-miR-...","[{'id': 'REAC:R-MMU-196854', 'name': 'Metaboli...","[{'id': 'TF:M00446_1', 'name': 'Factor: Spz1; ...",,
4,ENSMUSG00000028834,Ensembl,433766,NCBI Gene,Trim63,44995.104413,3.738275,0.261289,14.307048,1.977304e-46,...,"[{'id': 'GO:0051239', 'name': 'regulation of m...","[{'id': 'GO:0043292', 'name': 'contractile fib...","[{'id': 'GO:0005515', 'name': 'protein binding...",,,,,"[{'id': 'TF:M10174_1', 'name': 'Factor: AR; mo...","[{'id': 'WP:WP5024', 'name': 'Hypoxia dependen...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,ENSMUSG00000044938,Ensembl,244923,NCBI Gene,Klhl31,14539.906824,-1.154039,0.120236,-9.598146,8.139554e-22,...,"[{'id': 'GO:0050896', 'name': 'response to sti...","[{'id': 'GO:0005737', 'name': 'cytoplasm', 'p_...","[{'id': 'GO:0005515', 'name': 'protein binding...",,,"[{'id': 'MIRNA:mmu-miR-1897-5p', 'name': 'mmu-...",,"[{'id': 'TF:M00407', 'name': 'Factor: RSRFC4; ...",,
63,ENSMUSG00000025612,Ensembl,12013,NCBI Gene,Bach1,2266.828985,1.633857,0.170924,9.558977,1.189265e-21,...,"[{'id': 'GO:0050896', 'name': 'response to sti...","[{'id': 'GO:0005667', 'name': 'transcription r...","[{'id': 'GO:0005515', 'name': 'protein binding...",,,"[{'id': 'MIRNA:mmu-miR-122-5p', 'name': 'mmu-m...","[{'id': 'REAC:R-MMU-9707616', 'name': 'Heme si...","[{'id': 'TF:M07094', 'name': 'Factor: FOXO1A; ...",,
64,ENSMUSG00000027496,Ensembl,20878,NCBI Gene,Aurka,311.745006,-1.896545,0.198482,-9.555240,1.232977e-21,...,"[{'id': 'GO:0050896', 'name': 'response to sti...","[{'id': 'GO:0071944', 'name': 'cell periphery'...","[{'id': 'GO:0140677', 'name': 'molecular funct...","[{'id': 'HP:0001442', 'name': 'Typified by som...","[{'id': 'KEGG:04114', 'name': 'Oocyte meiosis'...",,"[{'id': 'REAC:R-MMU-6804114', 'name': 'TP53 Re...","[{'id': 'TF:M08998_1', 'name': 'Factor: GR; mo...",,
65,ENSMUSG00000075327,Ensembl,381990,NCBI Gene,Zbtb2,454.438141,1.730660,0.182292,9.493914,2.225187e-21,...,"[{'id': 'GO:0051239', 'name': 'regulation of m...","[{'id': 'GO:0005622', 'name': 'intracellular a...","[{'id': 'GO:0005515', 'name': 'protein binding...",,,"[{'id': 'MIRNA:mmu-miR-431-5p', 'name': 'mmu-m...",,"[{'id': 'TF:M10351', 'name': 'Factor: PTF1; mo...",,


In [None]:
# Inspect the MitoCarta annotation stored at row label 11242.
ncbi_tf_gprofiler_mitocarta_df_mice[MITOCARTA][11242]

[{'gene_description': 'lactate dehydrogenase A-like 6B',
  'evidence': 'targetP signal, Rickettsial homolog, mito protein domain, MS/MS',
  'sub_mito_localization': 'Matrix',
  'mito_pathways': 'Carbohydrate metabolism',
  'hpa_location': nan,
  'tissue_expression': 'cerebrum, cerebellum, brainstem, spinalcord, kidney, liver, adipose, smallintestine, largeintestine, stomach, placenta, testis'}]

In [None]:
# Relabel the literal text "NCBI Gene" in the target-source column as "ncbi_gene_id".
target_source_labels = ncbi_tf_gprofiler_mitocarta_df_mice["target.source"]
ncbi_tf_gprofiler_mitocarta_df_mice["target.source"] = target_source_labels.str.replace(
    "NCBI Gene", "ncbi_gene_id", regex=False
)

ncbi_tf_gprofiler_mitocarta_df_mice.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:bp,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta
0,ENSMUSG00000021903,Ensembl,78754,ncbi_gene_id,Galnt15,2770.437508,2.521182,0.099941,25.226601,2.0461570000000001e-140,...,"[{'id': 'GO:0006493', 'name': 'protein O-linke...","[{'id': 'GO:0030133', 'name': 'transport vesic...","[{'id': 'GO:0030246', 'name': 'carbohydrate bi...",,"[{'id': 'KEGG:01100', 'name': 'Metabolic pathw...","[{'id': 'MIRNA:mmu-miR-34b-5p', 'name': 'mmu-m...","[{'id': 'REAC:R-MMU-5173105', 'name': 'O-linke...","[{'id': 'TF:M04454_1', 'name': 'Factor: AR; mo...",,


In [None]:
# Dump the whole MitoCarta column as a {row label: value} dict (one entry per row).
ncbi_tf_gprofiler_mitocarta_df_mice[MITOCARTA].to_dict()

{0: nan,
 1: nan,
 2: nan,
 3: nan,
 4: nan,
 5: nan,
 6: nan,
 7: nan,
 8: nan,
 9: nan,
 10: nan,
 11: nan,
 12: nan,
 13: nan,
 14: nan,
 15: nan,
 16: nan,
 17: nan,
 18: nan,
 19: nan,
 20: nan,
 21: nan,
 22: nan,
 23: nan,
 24: nan,
 25: nan,
 26: nan,
 27: nan,
 28: nan,
 29: nan,
 30: [{'gene_description': 'acyl-CoA thioesterase 2',
   'evidence': 'literature, targetP signal, mito protein domain, induction, coexpression, MS/MS++',
   'sub_mito_localization': 'Matrix',
   'mito_pathways': 'Lipid metabolism',
   'hpa_location': 'Mitochondria (Approved)',
   'tissue_expression': 'all 14'}],
 31: nan,
 32: nan,
 33: nan,
 34: nan,
 35: nan,
 36: nan,
 37: nan,
 38: [{'gene_description': 'pyruvate dehydrogenase kinase, isoenzyme 4',
   'evidence': 'literature, targetP signal, yeast mito homolog+, mito protein domain+, induction, coexpression, MS/MS++',
   'sub_mito_localization': 'MIM',
   'mito_pathways': 'Pyruvate metabolism',
   'hpa_location': nan,
   'tissue_expression': 'cere

In [None]:
# NOTE(review): the result of the first expression is neither stored nor displayed; only the
# last expression of the cell (the "its_target" value at row label 1) is shown.
ncbi_tf_gprofiler_mitocarta_df_mice["its_target"][ncbi_tf_gprofiler_mitocarta_df_mice["its_target"].notna()]
ncbi_tf_gprofiler_mitocarta_df_mice["its_target"][1]

[{'NCBI.GeneID.Target': '18626',
  'Ensembl.GeneID.Target': 'ENSMUSG00000020893',
  'Name.Target': 'Per1',
  'UniprotID.Target': 'O35973',
  'Target.TFLink.ortho': 'Hs:O15534;Dr:B3DK47;Rn:A0A0H2UHZ7',
  'Target.nonTFLink.ortho': '-',
  'Detection.method': 'inferred by curator',
  'PubmedID': '10848603;29087512',
  'Source.database': 'TRRUST',
  'Small-scale.evidence': 'Yes'}]

Extract "NCBI Gene" ids for adding transcription factor (TF) and target interaction 

In [8]:
# Subset of the mapping table for the "NCBI Gene" datasource, plus a lookup
# from each mapped target value to its original identifier.
ncbi_df_mice = get_identifier_of_interest(bridgedb_df_mice, "NCBI Gene")
ncbi_to_ensembl = {
    ncbi_id: ensembl_id
    for ncbi_id, ensembl_id in zip(ncbi_df_mice["target"], ncbi_df_mice["identifier"])
}
ncbi_df_mice.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea
0,ENSMUSG00000021903,Ensembl,78754,NCBI Gene,Galnt15,2770.437508,2.521182,0.099941,25.226601,2.0461570000000001e-140,2.373133e-136,139.689061


In [9]:
# Look up the mapping rows whose target equals the string "78754".
bridgedb_df_mice[bridgedb_df_mice["target"] == "78754"]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea
1,ENSMUSG00000021903,Ensembl,78754,NCBI Gene,Galnt15,2770.437508,2.521182,0.099941,25.226601,2.0461570000000001e-140,2.373133e-136,139.689061


In [10]:
# Report how many distinct NCBI Gene targets were mapped. The data in this notebook is
# mouse data, so the message names mouse genes (it previously read "humman genes").
print("Number of mouse genes with mapping for NCBI Gene:", len(ncbi_df_mice["target"].unique()))

Number of humman genes with mapping for NCBI Gene: 11219


### Download TF-target interactions

You can download the Gene-TF interactions dataset from **TFLink**. Please visit the following page for the download:

[TFLink Download Page](https://tflink.net/download/)

The datasets you need can be downloaded from the following links:

**For Human (Homo sapiens):**
- [TFLink_Homo_sapiens_interactions_All_simpleFormat_v1.0.tsv.gz](https://cdn.netbiol.org/tflink/download_files/TFLink_Homo_sapiens_interactions_All_simpleFormat_v1.0.tsv.gz)

**For Mice (Mus musculus):**
- [TFLink_Mus_musculus_interactions_All_simpleFormat_v1.0.tsv.gz](https://cdn.netbiol.org/tflink/download_files/TFLink_Mus_musculus_interactions_All_simpleFormat_v1.0.tsv.gz)

These files contain the TF-target interaction data in a simple format for each species.


In [11]:
# URLs for the TF datasets
url_mice = "https://cdn.netbiol.org/tflink/download_files/TFLink_Mus_musculus_interactions_All_simpleFormat_v1.0.tsv.gz"

# Function to download (once) and read a gzipped TSV file into a dataframe
def download_save_and_read(url, filename):
    """Download ``url`` to ``filename`` unless it already exists, then load it.

    :param url: address of a gzip-compressed, tab-separated file.
    :param filename: local path used as the download target and as the cache.
    :returns: DataFrame parsed from the (possibly cached) file.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    if not os.path.exists(filename):
        # The target folder may not exist yet on a fresh checkout
        os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
        response = requests.get(url, timeout=60)
        # Fail loudly instead of caching an HTTP error page as the dataset
        response.raise_for_status()
        # Write to a temporary name and rename afterwards, so an interrupted
        # download is never mistaken for a complete cache file on the next run
        partial_path = f"{filename}.part"
        with open(partial_path, "wb") as file:
            file.write(response.content)
        os.replace(partial_path, filename)
    else:
        print(f"{filename} already exists, skipping download.")

    with gzip.open(filename, "rt") as f:
        df = pd.read_csv(f, sep="\t")
    return df

# Download (or reuse the cached copy of) the mouse TFLink dataset
tf_df_mice = download_save_and_read(url_mice, "data/mice/tflink_mice.tsv.gz")

# Retain interactions whose TF and target are both among our NCBI-mapped genes
known_ncbi_ids_mice = ncbi_df_mice["target"]
tf_df_mice = tf_df_mice[
    tf_df_mice["NCBI.GeneID.TF"].isin(known_ncbi_ids_mice)
    & tf_df_mice["NCBI.GeneID.Target"].isin(known_ncbi_ids_mice)
]

tf_df_mice.head(1)

data/mice/tflink_mice.tsv.gz already exists, skipping download.


Unnamed: 0,UniprotID.TF,UniprotID.Target,NCBI.GeneID.TF,NCBI.GeneID.Target,Name.TF,Name.Target,Detection.method,PubmedID,Organism,Source.database,Small-scale.evidence,TF.TFLink.ortho,TF.nonTFLink.ortho,Target.TFLink.ortho,Target.nonTFLink.ortho
0,Q9JL61,P14483,53970,14961,Rfx5,H2-Ab1,inferred by curator,29087512;11258423,Mus musculus,TRRUST,Yes,Hs:P48382;Rn:D3ZHD7,-,Rn:Q6AYB1,Hs:Q5SU54


In [12]:
# Size of the TF-target table after restricting it to genes present in our dataset
tf_df_mice.shape

(2014896, 15)

In [13]:
# Collect the NCBI IDs that occur as a TF and as a TF target in TFLink
tf_list_mice = tf_df_mice["NCBI.GeneID.TF"].tolist()
target_list_mice = tf_df_mice["NCBI.GeneID.Target"].tolist()

# Flag every gene of our dataset as TF and/or target
ncbi_ids_mice = ncbi_df_mice["target"]
ncbi_df_mice["is_tf"] = ncbi_ids_mice.isin(tf_list_mice)
ncbi_df_mice["is_target"] = ncbi_ids_mice.isin(target_list_mice)

# How many genes are targets / TFs
(ncbi_df_mice["is_target"].value_counts(), ncbi_df_mice["is_tf"].value_counts())

(is_target
 True     10759
 False      472
 Name: count, dtype: int64,
 is_tf
 False    10557
 True       674
 Name: count, dtype: int64)

In [14]:
# Keep only rows where the target is a DEG
# (adjusted p-value < 0.01 and |log2 fold change| > 1)
deg_mask_mice = (ncbi_df_mice["padj_dea"] < 0.01) & (
    (ncbi_df_mice["log2FoldChange_dea"] > 1) | (ncbi_df_mice["log2FoldChange_dea"] < -1)
)
deg_ncbi_ids_mice = ncbi_df_mice.loc[deg_mask_mice, "target"]
# .copy() makes the filtered table independent of tf_df_mice, so the column
# assignments below cannot trigger SettingWithCopyWarning or be silently lost
tf_sig_df_mice = tf_df_mice[tf_df_mice["NCBI.GeneID.Target"].isin(deg_ncbi_ids_mice)].copy()
# Add the Ensembl gene ID for the TF and the target
tf_sig_df_mice["Ensembl.GeneID.TF"] = tf_sig_df_mice["NCBI.GeneID.TF"].map(ncbi_to_ensembl)
tf_sig_df_mice["Ensembl.GeneID.Target"] = tf_sig_df_mice["NCBI.GeneID.Target"].map(ncbi_to_ensembl)
tf_sig_df_mice.head(1)

Unnamed: 0,UniprotID.TF,UniprotID.Target,NCBI.GeneID.TF,NCBI.GeneID.Target,Name.TF,Name.Target,Detection.method,PubmedID,Organism,Source.database,Small-scale.evidence,TF.TFLink.ortho,TF.nonTFLink.ortho,Target.TFLink.ortho,Target.nonTFLink.ortho,Ensembl.GeneID.TF,Ensembl.GeneID.Target
0,Q9JL61,P14483,53970,14961,Rfx5,H2-Ab1,inferred by curator,29087512;11258423,Mus musculus,TRRUST,Yes,Hs:P48382;Rn:D3ZHD7,-,Rn:Q6AYB1,Hs:Q5SU54,ENSMUSG00000005774,ENSMUSG00000073421


In [15]:
# Spot-check a single TF-target pair (Hif1a -> Sertad2) in the DEG-restricted table
tf_sig_df_mice[(tf_sig_df_mice["Name.TF"] == "Hif1a") & (tf_sig_df_mice["Name.Target"] == "Sertad2")]

Unnamed: 0,UniprotID.TF,UniprotID.Target,NCBI.GeneID.TF,NCBI.GeneID.Target,Name.TF,Name.Target,Detection.method,PubmedID,Organism,Source.database,Small-scale.evidence,TF.TFLink.ortho,TF.nonTFLink.ortho,Target.TFLink.ortho,Target.nonTFLink.ortho,Ensembl.GeneID.TF,Ensembl.GeneID.Target
3614494,Q61221,Q9JJG5,15251,58172,Hif1a,Sertad2,chromatin immunoprecipitation assay,27924024,Mus musculus,GTRD,No,Hs:Q16665;Dr:Q6EHI4;Rn:D4A8P8,-,Hs:Q14140;Dr:Q7ZZ27,Rn:F7F8X3,ENSMUSG00000021109,ENSMUSG00000049800


In [16]:
# Targets listed for Pdhx as a TF, sorted by name; an empty Series means
# tf_sig_df_mice has no rows with Pdhx in the Name.TF column
tf_sig_df_mice[tf_sig_df_mice["Name.TF"] == "Pdhx"]["Name.Target"].sort_values()

Series([], Name: Name.Target, dtype: object)

In [17]:
# Number of TF-target rows left after the DEG filter (plus the two added Ensembl columns)
tf_sig_df_mice.shape

(123350, 17)

In [18]:
def _records_by_gene(tf_df, key_col, info_cols):
    """Group ``tf_df`` by ``key_col`` and map each key to its ``info_cols`` rows as record dicts."""
    return {
        gene: rows[info_cols].to_dict(orient="records")
        for gene, rows in tf_df.groupby(key_col, sort=False)
    }


# Function to add targets and TFs to each row (gene)
def add_target_and_tf_interaction(ncbi_df, tf_df, filename):
    """Annotate every gene with the targets it regulates and the TFs regulating it.

    Two columns are added to a copy of ``ncbi_df``:

    * ``its_target``: for rows with a truthy ``is_tf``, the list of target
      records found in ``tf_df`` (``[]`` when none are found); ``None`` otherwise.
    * ``its_tf``: for rows with a truthy ``is_target`` that also pass
      ``padj_dea <= 0.01`` and ``|log2FoldChange_dea| > 1``, the list of TF
      records found in ``tf_df`` (``[]`` when none are found); ``None`` otherwise.

    The result is pickled to ``filename``. If ``filename`` already exists it is
    loaded and returned as-is and ``ncbi_df`` / ``tf_df`` are ignored, so delete
    the file after changing the inputs.

    :param ncbi_df: gene table with columns ``target``, ``is_tf``, ``is_target``,
        ``padj_dea`` and ``log2FoldChange_dea``; it is not modified.
    :param tf_df: TF-target table with the ``*.TF`` / ``*.Target`` columns listed below.
    :param filename: path of the pickle cache.
    :returns: the annotated DataFrame.
    """
    if os.path.exists(filename):
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that were written by this notebook.
        with open(filename, "rb") as f:
            return pickle.load(f)

    shared_cols = ["Detection.method", "PubmedID", "Source.database", "Small-scale.evidence"]
    target_cols = [
        "NCBI.GeneID.Target",
        "Ensembl.GeneID.Target",
        "Name.Target",
        "UniprotID.Target",
        "Target.TFLink.ortho",
        "Target.nonTFLink.ortho",
    ] + shared_cols
    tf_cols = [
        "NCBI.GeneID.TF",
        "Ensembl.GeneID.TF",
        "Name.TF",
        "UniprotID.TF",
        "TF.TFLink.ortho",
        "TF.nonTFLink.ortho",
    ] + shared_cols

    # Work on a copy: the cached branch above never touches the caller's frame,
    # so the computing branch should not either.
    annotated = ncbi_df.copy()

    print(f"Processing targets. {annotated.shape}")
    # Group tf_df once instead of re-filtering the whole table for every gene
    targets_by_tf = _records_by_gene(tf_df, "NCBI.GeneID.TF", target_cols)
    its_target = [
        list(targets_by_tf.get(gene, ())) if is_tf else None
        for gene, is_tf in zip(annotated["target"], annotated["is_tf"])
    ]
    # dtype=object keeps each list / None as a single cell value
    annotated["its_target"] = pd.Series(its_target, index=annotated.index, dtype=object)

    print("Processing TFs...")
    tfs_by_target = _records_by_gene(tf_df, "NCBI.GeneID.Target", tf_cols)
    its_tf = []
    for gene, is_target, padj, lfc in zip(
        annotated["target"],
        annotated["is_target"],
        annotated["padj_dea"],
        annotated["log2FoldChange_dea"],
    ):
        # Only differentially expressed targets get their TFs attached
        if is_target and padj <= 0.01 and (lfc > 1 or lfc < -1):
            its_tf.append(list(tfs_by_target.get(gene, ())))
        else:
            its_tf.append(None)
    annotated["its_tf"] = pd.Series(its_tf, index=annotated.index, dtype=object)

    # The cache folder may not exist yet on a fresh checkout
    os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
    annotated.to_pickle(filename)

    return annotated

# Add target and TF annotations for every gene; the result is cached in the
# pickle below and reloaded from it whenever that file already exists
ncbi_tf_df_mice = add_target_and_tf_interaction(ncbi_df_mice, tf_sig_df_mice, filename="data/mice/ncbi_tf_df_mice.pkl")

In [19]:
# Preview the annotated table (is_tf / is_target flags plus the its_target / its_tf lists)
ncbi_tf_df_mice.head()

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea,is_tf,is_target,its_target,its_tf
0,ENSMUSG00000021903,Ensembl,78754,NCBI Gene,Galnt15,2770.437508,2.521182,0.099941,25.226601,2.0461570000000001e-140,2.373133e-136,139.689061,False,True,,"[{'NCBI.GeneID.TF': '14461', 'Ensembl.GeneID.T..."
1,ENSMUSG00000059824,Ensembl,13170,NCBI Gene,Dbp,2816.783019,-2.985239,0.19176,-15.567614,1.208303e-54,7.006949e-51,53.917824,True,True,"[{'NCBI.GeneID.Target': '18626', 'Ensembl.Gene...","[{'NCBI.GeneID.TF': '11865', 'Ensembl.GeneID.T..."
2,ENSMUSG00000041417,Ensembl,18708,NCBI Gene,Pik3r1,2934.327264,2.047483,0.136776,14.969629,1.1596579999999999e-50,4.483238e-47,49.93567,False,True,,"[{'NCBI.GeneID.TF': '17342', 'Ensembl.GeneID.T..."
3,ENSMUSG00000004939,Ensembl,69564,NCBI Gene,Nmrk2,675.068734,-3.981474,0.274358,-14.511978,1.017437e-47,2.950057e-44,46.992493,False,True,,"[{'NCBI.GeneID.TF': '17257', 'Ensembl.GeneID.T..."
4,ENSMUSG00000028834,Ensembl,433766,NCBI Gene,Trim63,44995.104413,3.738275,0.261289,14.307048,1.977304e-46,4.586555e-43,45.703926,False,True,,"[{'NCBI.GeneID.TF': '226442', 'Ensembl.GeneID...."


In [20]:
# First gene whose its_target entry is not null, i.e. a gene annotated as a TF
ncbi_tf_df_mice[ncbi_tf_df_mice['its_target'].notna()].head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea,is_tf,is_target,its_target,its_tf
1,ENSMUSG00000059824,Ensembl,13170,NCBI Gene,Dbp,2816.783019,-2.985239,0.19176,-15.567614,1.208303e-54,7.006949e-51,53.917824,True,True,"[{'NCBI.GeneID.Target': '18626', 'Ensembl.Gene...","[{'NCBI.GeneID.TF': '11865', 'Ensembl.GeneID.T..."


In [21]:
# Look up NCBI Gene ID 1647.
# NOTE(review): this returns no rows for the mouse data; 1647 is shown as human
# GADD45A in a later recorded output, so this looks like a leftover from a
# human analysis — confirm and replace with the mouse gene ID if still needed.
ncbi_tf_df_mice[ncbi_tf_df_mice['target']=="1647"]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea,is_tf,is_target,its_target,its_tf


In [22]:
# TFs recorded for NCBI Gene ID 1647.
# NOTE(review): empty for the mouse data; 1647 appears to be a human gene ID — confirm.
ncbi_tf_df_mice[ncbi_tf_df_mice['target']=="1647"]["its_tf"].to_dict()

{}

In [23]:
# NOTE(review): this cell repeats the previous lookup of NCBI Gene ID 1647 unchanged
# and also returns nothing; it is probably redundant — confirm before removing.
ncbi_tf_df_mice[ncbi_tf_df_mice['target']=="1647"]["its_tf"].to_dict()

{}

In [24]:
# Targets recorded for NCBI Gene ID 3725.
# NOTE(review): empty for the mouse data; presumably another human gene ID — confirm.
ncbi_tf_df_mice[ncbi_tf_df_mice['target']=="3725"]["its_target"].to_dict()

{}

### Protein-Protein Interactions from STRING

In [46]:
string_path = os.path.join(base_dir, "data/mice/string.pkl")
string_metadata_path = os.path.join(base_dir, "data/mice/string_metadata.pkl")

# Reuse the cache only when BOTH files are present; if either is missing
# (e.g. an interrupted earlier run) query STRING again and rewrite both.
if os.path.exists(string_path) and os.path.exists(string_metadata_path):
    ppi_df = pd.read_pickle(string_path)
    with open(string_metadata_path, "rb") as file:
        ppi_metadata = pickle.load(file)
else:
    # Query interactions for the DEGs only
    # (adjusted p-value < 0.01 and |log2 fold change| > 1)
    deg_bridgedb_df_mice = bridgedb_df_mice[
        (bridgedb_df_mice["padj_dea"] < 0.01)
        & (
            (bridgedb_df_mice["log2FoldChange_dea"] > 1)
            | (bridgedb_df_mice["log2FoldChange_dea"] < -1)
        )
    ]
    ppi_df, ppi_metadata = stringdb.get_ppi(
        bridgedb_df=deg_bridgedb_df_mice,
        species="Mouse",
    )
    # The cache folder may not exist yet on a fresh checkout
    os.makedirs(os.path.dirname(string_path), exist_ok=True)
    ppi_df.to_pickle(string_path)
    with open(string_metadata_path, "wb") as file:
        pickle.dump(ppi_metadata, file)

ppi_df.head()


Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea,StringDB_ppi
0,ENSMUSG00000021903,Ensembl,ENSMUSG00000021903,Ensembl,Galnt15,2770.437508,2.521182,0.099941,25.226601,2.0461570000000001e-140,2.373133e-136,139.689061,"[{'stringdb_link_to': 'ENSMUSG00000021906', 'E..."
1,ENSMUSG00000059824,Ensembl,ENSMUSG00000059824,Ensembl,Dbp,2816.783019,-2.985239,0.19176,-15.567614,1.208303e-54,7.006949e-51,53.917824,"[{'stringdb_link_to': 'ENSMUSG00000020893', 'E..."
2,ENSMUSG00000041417,Ensembl,ENSMUSG00000041417,Ensembl,Pik3r1,2934.327264,2.047483,0.136776,14.969629,1.1596579999999999e-50,4.483238e-47,49.93567,"[{'stringdb_link_to': 'ENSMUSG00000000290', 'E..."
3,ENSMUSG00000004939,Ensembl,ENSMUSG00000004939,Ensembl,Nmrk2,675.068734,-3.981474,0.274358,-14.511978,1.017437e-47,2.950057e-44,46.992493,"[{'stringdb_link_to': 'ENSMUSG00000079243', 'E..."
4,ENSMUSG00000028834,Ensembl,ENSMUSG00000028834,Ensembl,Trim63,44995.104413,3.738275,0.261289,14.307048,1.977304e-46,4.586555e-43,45.703926,"[{'stringdb_link_to': 'ENSMUSG00000020475', 'E..."


In [47]:
# Number of genes (rows) returned by the STRING annotator
ppi_df.shape

(846, 13)

In [48]:
from pyBiodatafuse.constants import STRING_PPI_COL

# Inspect the interaction records of the first two genes only; converting the
# whole column to a dict floods the notebook with output
ppi_df[STRING_PPI_COL].head(2).to_dict()

{0: [{'stringdb_link_to': 'ENSMUSG00000021906',
   'Ensembl': 'ENSMUSP00000022462',
   'score': 0.505,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None}],
 1: [{'stringdb_link_to': 'ENSMUSG00000020893',
   'Ensembl': 'ENSMUSP00000021271',
   'score': 0.7,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSMUSG00000023087',
   'Ensembl': 'ENSMUSP00000023849',
   'score': 0.425,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSMUSG00000030103',
   'Ensembl': 'ENSMUSP00000032194',
   'score': 0.47,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSMUSG00000032010',
   'Ensembl': 'ENSMUSP00000034508',
   'score': 0.415,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSMUSG00000068742',
   'Ensembl': 'ENSMUSP00000088047',
   'score': 0.704,
   'Uniprot-TrEMBL': 'Q9R194',
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSMUSG0

In [49]:
# The two columns of the STRING result that are merged into the combined table below
ppi_df[["target", "StringDB_ppi"]].head()

Unnamed: 0,target,StringDB_ppi
0,ENSMUSG00000021903,"[{'stringdb_link_to': 'ENSMUSG00000021906', 'E..."
1,ENSMUSG00000059824,"[{'stringdb_link_to': 'ENSMUSG00000020893', 'E..."
2,ENSMUSG00000041417,"[{'stringdb_link_to': 'ENSMUSG00000000290', 'E..."
3,ENSMUSG00000004939,"[{'stringdb_link_to': 'ENSMUSG00000079243', 'E..."
4,ENSMUSG00000028834,"[{'stringdb_link_to': 'ENSMUSG00000020475', 'E..."


In [50]:
# STRING interaction partners recorded for the gene ENSMUSG00000032010
ppi_df[ppi_df["target"] == "ENSMUSG00000032010"]["StringDB_ppi"].values

array([list([{'stringdb_link_to': 'ENSMUSG00000020893', 'Ensembl': 'ENSMUSP00000021271', 'score': 0.768, 'Uniprot-TrEMBL': None, 'Uniprot-TrEMBL_link': None}, {'stringdb_link_to': 'ENSMUSG00000051705', 'Ensembl': 'ENSMUSP00000149463', 'score': 0.401, 'Uniprot-TrEMBL': None, 'Uniprot-TrEMBL_link': None}, {'stringdb_link_to': 'ENSMUSG00000059824', 'Ensembl': 'ENSMUSP00000079693', 'score': 0.415, 'Uniprot-TrEMBL': None, 'Uniprot-TrEMBL_link': None}, {'stringdb_link_to': 'ENSMUSG00000068742', 'Ensembl': 'ENSMUSP00000088047', 'score': 0.503, 'Uniprot-TrEMBL': 'Q9R194', 'Uniprot-TrEMBL_link': None}, {'stringdb_link_to': 'ENSMUSG00000030967', 'Ensembl': 'ENSMUSP00000101763', 'score': 0.551, 'Uniprot-TrEMBL': None, 'Uniprot-TrEMBL_link': None}, {'stringdb_link_to': 'ENSMUSG00000055116', 'Ensembl': 'ENSMUSP00000147989', 'score': 0.688, 'Uniprot-TrEMBL': None, 'Uniprot-TrEMBL_link': None}, {'stringdb_link_to': 'ENSMUSG00000008348', 'Ensembl': 'ENSMUSP00000114180', 'score': 0.856, 'Uniprot-TrEMBL

In [51]:
# Preview the table carrying the TF, g:Profiler and MitoCarta annotations.
# NOTE(review): ncbi_tf_gprofiler_mitocarta_df_mice is not defined in any cell
# visible above this point — confirm it is created earlier, otherwise this
# notebook cannot pass "Restart Kernel -> Run All".
ncbi_tf_gprofiler_mitocarta_df_mice.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:bp,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta
0,ENSMUSG00000021903,Ensembl,78754,ncbi_gene_id,Galnt15,2770.437508,2.521182,0.099941,25.226601,2.0461570000000001e-140,...,"[{'id': 'GO:0006493', 'name': 'protein O-linke...","[{'id': 'GO:0030133', 'name': 'transport vesic...","[{'id': 'GO:0030246', 'name': 'carbohydrate bi...",,"[{'id': 'KEGG:01100', 'name': 'Metabolic pathw...","[{'id': 'MIRNA:mmu-miR-34b-5p', 'name': 'mmu-m...","[{'id': 'REAC:R-MMU-5173105', 'name': 'O-linke...","[{'id': 'TF:M04454_1', 'name': 'Factor: AR; mo...",,


In [52]:
# Attach the STRING interactions to the annotated gene table. The STRING
# result is keyed by Ensembl ID in its "target" column, so that column is
# renamed to "identifier" to serve as the join key; genes without STRING
# data keep a missing StringDB_ppi value (left join).
ppi_by_identifier = ppi_df[["target", "StringDB_ppi"]].rename(columns={"target": "identifier"})
combined_df = ncbi_tf_gprofiler_mitocarta_df_mice.merge(
    ppi_by_identifier, how="left", on="identifier"
)

In [53]:
# Check that StringDB_ppi was appended next to the existing annotation columns
combined_df.columns

Index(['identifier', 'identifier.source', 'target', 'target.source',
       'GENE_SYMBOL_dea', 'baseMean_dea', 'log2FoldChange_dea', 'lfcSE_dea',
       'stat_dea', 'pvalue_dea', 'padj_dea', 'minus_log10_pvalue_dea', 'is_tf',
       'is_target', 'its_target', 'its_tf', 'intersections',
       'g:Profiler_corum', 'g:Profiler_go:bp', 'g:Profiler_go:cc',
       'g:Profiler_go:mf', 'g:Profiler_hp', 'g:Profiler_kegg',
       'g:Profiler_mirna', 'g:Profiler_reac', 'g:Profiler_tf', 'g:Profiler_wp',
       'MitoCarta', 'StringDB_ppi'],
      dtype='object')

In [54]:
# Preview the first row of the merged table
combined_df.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
0,ENSMUSG00000021903,Ensembl,78754,ncbi_gene_id,Galnt15,2770.437508,2.521182,0.099941,25.226601,2.0461570000000001e-140,...,"[{'id': 'GO:0030133', 'name': 'transport vesic...","[{'id': 'GO:0030246', 'name': 'carbohydrate bi...",,"[{'id': 'KEGG:01100', 'name': 'Metabolic pathw...","[{'id': 'MIRNA:mmu-miR-34b-5p', 'name': 'mmu-m...","[{'id': 'REAC:R-MMU-5173105', 'name': 'O-linke...","[{'id': 'TF:M04454_1', 'name': 'Factor: AR; mo...",,,"[{'stringdb_link_to': 'ENSMUSG00000021906', 'E..."


In [55]:
# Inspect the merged STRING records of the first two genes only; converting the
# whole column to a dict floods the notebook with output
combined_df["StringDB_ppi"].head(2).to_dict()

{0: [{'stringdb_link_to': 'ENSMUSG00000021906',
   'Ensembl': 'ENSMUSP00000022462',
   'score': 0.505,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None}],
 1: [{'stringdb_link_to': 'ENSMUSG00000020893',
   'Ensembl': 'ENSMUSP00000021271',
   'score': 0.7,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSMUSG00000023087',
   'Ensembl': 'ENSMUSP00000023849',
   'score': 0.425,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSMUSG00000030103',
   'Ensembl': 'ENSMUSP00000032194',
   'score': 0.47,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSMUSG00000032010',
   'Ensembl': 'ENSMUSP00000034508',
   'score': 0.415,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSMUSG00000068742',
   'Ensembl': 'ENSMUSP00000088047',
   'score': 0.704,
   'Uniprot-TrEMBL': 'Q9R194',
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSMUSG0

In [58]:
# Persist the merged annotation table so the graph section can reload it
filename_mice = "data/mice/combined_df.pkl"

combined_df.to_pickle(filename_mice)

### Build the graph

In [56]:
# Reload the merged annotation table written with DataFrame.to_pickle
filename_mice = "data/mice/combined_df.pkl"
combined_df = pd.read_pickle(filename_mice)

In [57]:
# Collect every distinct 'NCBI.GeneID.TF' found in the its_tf lists, in order of
# first appearance; rows whose its_tf is not a list contribute nothing
ncbi_gene_ids = list(
    dict.fromkeys(
        tf_record["NCBI.GeneID.TF"]
        for tf_records in combined_df["its_tf"]
        if isinstance(tf_records, list)
        for tf_record in tf_records
        if pd.notna(tf_record["NCBI.GeneID.TF"])
    )
)

len(ncbi_gene_ids)

529

In [59]:
# Genes of our dataset that act as a TF of at least one DEG
is_listed_tf = combined_df["target"].isin(ncbi_gene_ids)
combined_df_tf = combined_df[is_listed_tf]

# DEGs (adjusted p-value < 0.01 and |log2 fold change| > 1) that are not
# already part of the TF subset above
log2fc_mice = combined_df["log2FoldChange_dea"]
is_deg = (combined_df["padj_dea"] < 0.01) & ((log2fc_mice > 1) | (log2fc_mice < -1))
combined_df_sig = combined_df[is_deg & ~is_listed_tf]
combined_df_sig.shape

(755, 29)

In [60]:
# Stack the non-TF DEGs on top of the TF genes and renumber the rows from 0
combined_df_tf_sig = pd.concat((combined_df_sig, combined_df_tf), ignore_index=True)
combined_df_tf_sig.shape

(1285, 29)

In [61]:
# Does Coq10a carry a list of TFs? Gene symbols in this table use mouse
# capitalisation (e.g. "Galnt15"), so the upper-case symbol is matched
# case-insensitively instead of with an exact comparison that never matches.
combined_df_tf_sig.loc[
    combined_df_tf_sig["GENE_SYMBOL_dea"].str.upper() == "COQ10A", "its_tf"
].apply(lambda x: isinstance(x, list))

Series([], Name: its_tf, dtype: object)

In [62]:
# TFs recorded for Coq10a; matched case-insensitively because gene symbols in
# this table use mouse capitalisation (e.g. "Galnt15")
combined_df_tf_sig.loc[
    combined_df_tf_sig["GENE_SYMBOL_dea"].str.upper() == "COQ10A", "its_tf"
].to_dict()

{}

In [63]:
# TF genes whose NCBI ID is absent from the DEG subset (combined_df_sig)
combined_df_tf[~combined_df_tf["target"].isin(combined_df_sig["target"].tolist())]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
1,ENSMUSG00000059824,Ensembl,13170,ncbi_gene_id,Dbp,2816.783019,-2.985239,0.191760,-15.567614,1.208303e-54,...,"[{'id': 'GO:0005667', 'name': 'transcription r...","[{'id': 'GO:0000981', 'name': 'DNA-binding tra...",,"[{'id': 'KEGG:04710', 'name': 'Circadian rhyth...","[{'id': 'MIRNA:mmu-miR-122-5p', 'name': 'mmu-m...",,"[{'id': 'TF:M03882', 'name': 'Factor: RelB:p50...",,,"[{'stringdb_link_to': 'ENSMUSG00000020893', 'E..."
9,ENSMUSG00000037465,Ensembl,21847,ncbi_gene_id,Klf10,2481.208552,2.189743,0.166819,13.126470,2.322309e-39,...,"[{'id': 'GO:0005622', 'name': 'intracellular a...","[{'id': 'GO:0005515', 'name': 'protein binding...",,,"[{'id': 'MIRNA:mmu-miR-301b-3p', 'name': 'mmu-...",,"[{'id': 'TF:M00002', 'name': 'Factor: E47; mot...",,,"[{'stringdb_link_to': 'ENSMUSG00000025880', 'E..."
21,ENSMUSG00000042292,Ensembl,223701,ncbi_gene_id,Mkl1,1080.501466,-2.232213,0.191789,-11.638880,2.614271e-31,...,"[{'id': 'GO:0005737', 'name': 'cytoplasm', 'p_...","[{'id': 'GO:0005515', 'name': 'protein binding...","[{'id': 'HP:0031378', 'name': 'Abnormal lympho...",,"[{'id': 'MIRNA:mmu-miR-1a-3p', 'name': 'mmu-mi...","[{'id': 'REAC:R-MMU-162582', 'name': 'Signal T...","[{'id': 'TF:M02024_1', 'name': 'Factor: mef-2A...",,,"[{'stringdb_link_to': 'ENSMUSG00000044167', 'E..."
29,ENSMUSG00000018143,Ensembl,17135,ncbi_gene_id,Mafk,2165.985925,2.109705,0.193895,10.880657,1.425327e-27,...,"[{'id': 'GO:0005622', 'name': 'intracellular a...","[{'id': 'GO:0005515', 'name': 'protein binding...",,,"[{'id': 'MIRNA:mmu-miR-3473c', 'name': 'mmu-mi...","[{'id': 'REAC:R-MMU-9707616', 'name': 'Heme si...","[{'id': 'TF:M10174_1', 'name': 'Factor: AR; mo...","[{'id': 'WP:WP4265', 'name': 'Ethanol metaboli...",,"[{'stringdb_link_to': 'ENSMUSG00000025612', 'E..."
34,ENSMUSG00000040943,Ensembl,214133,ncbi_gene_id,Tet2,841.463638,1.321798,0.125819,10.505543,8.145301e-26,...,"[{'id': 'GO:0005622', 'name': 'intracellular a...","[{'id': 'GO:0005515', 'name': 'protein binding...","[{'id': 'HP:0030386', 'name': 'Abnormal propor...",,"[{'id': 'MIRNA:mmu-miR-466d-5p', 'name': 'mmu-...",,"[{'id': 'TF:M03882', 'name': 'Factor: RelB:p50...",,,"[{'stringdb_link_to': 'ENSMUSG00000022346', 'E..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11084,ENSMUSG00000000440,Ensembl,19016,ncbi_gene_id,Pparg,104.582903,-0.012934,0.259905,-0.049763,9.603111e-01,...,,,,,,,,,,
11131,ENSMUSG00000055866,Ensembl,18627,ncbi_gene_id,Per2,1058.682323,0.006802,0.207956,0.032707,9.739080e-01,...,,,,,,,,,,
11136,ENSMUSG00000033249,Ensembl,26386,ncbi_gene_id,Hsf4,367.784805,0.005086,0.162026,0.031391,9.749574e-01,...,,,,,,,,,,
11143,ENSMUSG00000026021,Ensembl,22218,ncbi_gene_id,Sumo1,1372.979132,-0.002733,0.094426,-0.028941,9.769115e-01,...,,,,,,,,,,


In [64]:
# Look up two genes by Ensembl ID.
# NOTE(review): both IDs carry the human "ENSG" prefix while this table holds
# mouse "ENSMUSG" identifiers, so the result is empty — presumably a leftover
# from a human analysis; confirm.
combined_df_tf_sig[combined_df_tf_sig["identifier"].isin(["ENSG00000001167", "ENSG00000116717"])]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi


In [55]:
# Look up ENSG00000001167 (NFYA in the recorded output).
# NOTE(review): the recorded output shows human data and an out-of-order
# execution count, so it was presumably not produced by this mouse run — confirm.
combined_df_tf_sig[combined_df_tf_sig["identifier"] == "ENSG00000001167"]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
205,ENSG00000001167,Ensembl,4800,ncbi_gene_id,NFYA,198.963515,0.339027,0.133075,2.547627,0.010846,...,,,,,,,,,,


In [56]:
# Look up GADD45A by gene symbol.
# NOTE(review): the recorded output shows a human Ensembl ID; gene symbols in the
# mouse table use mouse capitalisation (e.g. "Galnt15"), so this exact upper-case
# match presumably finds nothing on the mouse data — confirm.
combined_df_tf_sig[combined_df_tf_sig["GENE_SYMBOL_dea"] == "GADD45A"]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
55,ENSG00000116717,Ensembl,1647,ncbi_gene_id,GADD45A,448.20828,1.432184,0.377628,3.792579,0.000149,...,"[{'id': 'GO:0019899', 'name': 'enzyme binding'...",,,"[{'id': 'KEGG:05220', 'name': 'Chronic myeloid...","[{'id': 'MIRNA:hsa-miR-26b-5p', 'name': 'hsa-m...","[{'id': 'REAC:R-HSA-6791312', 'name': 'TP53 Re...","[{'id': 'TF:M07322_1', 'name': 'Factor: HSF4; ...","[{'id': 'WP:WP3640', 'name': 'Imatinib and chr...",,"[{'stringdb_link_to': 'ENSG00000120129', 'Ense..."


In [57]:
# Targets recorded for ENSG00000001167 (NFYA in the recorded output).
# NOTE(review): human identifier with a recorded output from a human dataset;
# presumably stale for this mouse notebook — confirm.
combined_df_tf_sig[combined_df_tf_sig["identifier"] == "ENSG00000001167"]["its_target"].to_dict()

{205: [{'NCBI.GeneID.Target': '1647',
   'Ensembl.GeneID.Target': 'ENSG00000116717',
   'Name.Target': 'GADD45A',
   'UniprotID.Target': 'P24522',
   'Target.TFLink.ortho': 'Dr:Q6GMM1;Mm:P48316;Rn:Q66HL6',
   'Target.nonTFLink.ortho': '-',
   'Detection.method': 'chromatin immunoprecipitation assay;inferred by curator',
   'PubmedID': '29126285;11525640;11420680;27924024;29087512',
   'Source.database': 'GTRD;ReMap;TRRUST',
   'Small-scale.evidence': 'Yes'},
  {'NCBI.GeneID.Target': '84271',
   'Ensembl.GeneID.Target': 'ENSG00000100227',
   'Name.Target': 'POLDIP3',
   'UniprotID.Target': 'Q9BY77',
   'Target.TFLink.ortho': 'Mm:Q8BG81;Rn:D4A2B0',
   'Target.nonTFLink.ortho': 'Dr:A0A0R4ILC0',
   'Detection.method': 'chromatin immunoprecipitation assay',
   'PubmedID': '29126285;27924024',
   'Source.database': 'GTRD;ReMap',
   'Small-scale.evidence': 'No'},
  {'NCBI.GeneID.Target': '4043',
   'Ensembl.GeneID.Target': 'ENSG00000163956',
   'Name.Target': 'LRPAP1',
   'UniprotID.Target': 

In [65]:
# its_target entry of the row labelled 10 (nothing is displayed when it is None)
combined_df_tf_sig.loc[10, "its_target"]

In [66]:
# Collect the distinct NCBI IDs of all targets listed for NFYA, in order of
# first appearance.
# NOTE(review): ENSG00000001167 is a human Ensembl ID; the recorded count of 0
# suggests it does not occur in the mouse table — confirm.
nfya_its_target = combined_df_tf_sig.loc[
    combined_df_tf_sig["identifier"] == "ENSG00000001167", "its_target"
]
ncbi_gene_ids_NFYA_targets = list(
    dict.fromkeys(
        target_record["NCBI.GeneID.Target"]
        for target_records in nfya_its_target
        if isinstance(target_records, list)
        for target_record in target_records
        if pd.notna(target_record["NCBI.GeneID.Target"])
    )
)

len(ncbi_gene_ids_NFYA_targets)

0

In [50]:
# combined_df = combine_sources(
#     merged_df_mice,
#     [
#     ppi_df,
#     ],
# )

In [30]:
# combined_metadata = create_or_append_to_metadata(
#     bridgedb_metadata_mice,
#     [
#     ppi_metadata,
#     get_data_versions("hsapiens")
#     ],
# )

In [55]:
# combined_metadata

[{'datasource': 'StringDB',
  'metadata': {'source_version': {'source_version': '12.0'}},
  'query': {'size': 222,
   'input_type': 'HGNC',
   'number_of_added_edges': 532,
   'time': '0:00:00.651298',
   'date': '2024-12-18 14:43:18',
   'url': 'https://string-db.org/api'}},
 {'biomart': 'Ensembl',
  'biomart_version': '111',
  'display_name': 'Human',
  'genebuild': 'GRCh38.p14',
  'gprofiler_version': 'e111_eg58_p18_f463989d',
  'organism': 'hsapiens',
  'sources': {'CORUM': {'name': 'CORUM protein complexes',
    'version': '28.11.2022 Corum 4.1'},
   'GO:BP': {'name': 'biological process',
    'version': 'annotations: BioMart\nclasses: releases/2024-01-17'},
   'GO:CC': {'name': 'cellular component',
    'version': 'annotations: BioMart\nclasses: releases/2024-01-17'},
   'GO:MF': {'name': 'molecular function',
    'version': 'annotations: BioMart\nclasses: releases/2024-01-17'},
   'HP': {'name': 'Human Phenotype Ontology',
    'version': 'annotations: 01.2024\nclasses: None'},
 

In [68]:
# STRING interactions of Cdkn1a. The "identifier" column holds Ensembl IDs, so
# the gene has to be looked up through its symbol; the match is case-insensitive
# because symbols in this table use mouse capitalisation (e.g. "Galnt15").
combined_df.loc[
    combined_df["GENE_SYMBOL_dea"].str.upper() == "CDKN1A", "StringDB_ppi"
].to_dict()

{}

In [69]:
# Case-insensitive search for gene symbols containing "OTF6" (missing symbols count as no match)
combined_df_tf_sig[combined_df_tf_sig["GENE_SYMBOL_dea"].str.contains("OTF6", case=False, na=False)]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi


In [70]:
# Look up the mitochondrial gene ND4 by symbol; matched case-insensitively
# because symbols in this table use mouse capitalisation (e.g. "Galnt15"), which
# the exact upper-case comparison never matches
combined_df[combined_df["GENE_SYMBOL_dea"].str.upper() == "MT-ND4"]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi


In [63]:
# STRING interactions recorded for ENSG00000198886.
# NOTE(review): human Ensembl ID with a recorded output from a human dataset and
# an out-of-order execution count; presumably stale for this mouse notebook — confirm.
combined_df[combined_df["identifier"] == "ENSG00000198886"]["StringDB_ppi"].to_dict()

{10722: [{'stringdb_link_to': 'ENSG00000010256',
   'Ensembl': 'ENSP00000203407',
   'score': 0.971,
   'Uniprot-TrEMBL': 'MT-ND4'},
  {'stringdb_link_to': 'ENSG00000108179',
   'Ensembl': 'ENSP00000225174',
   'score': 0.404,
   'Uniprot-TrEMBL': 'MT-ND4'},
  {'stringdb_link_to': 'ENSG00000110955',
   'Ensembl': 'ENSP00000262030',
   'score': 0.553,
   'Uniprot-TrEMBL': 'MT-ND4'},
  {'stringdb_link_to': 'ENSG00000154518',
   'Ensembl': 'ENSP00000284727',
   'score': 0.513,
   'Uniprot-TrEMBL': 'MT-ND4'},
  {'stringdb_link_to': 'ENSG00000005700',
   'Ensembl': 'ENSP00000305721',
   'score': 0.457,
   'Uniprot-TrEMBL': 'MT-ND4'},
  {'stringdb_link_to': 'ENSG00000198804',
   'Ensembl': 'ENSP00000354499',
   'score': 0.999,
   'Uniprot-TrEMBL': 'MT-ND4'},
  {'stringdb_link_to': 'ENSG00000198727',
   'Ensembl': 'ENSP00000354554',
   'score': 0.999,
   'Uniprot-TrEMBL': 'MT-ND4'},
  {'stringdb_link_to': 'ENSG00000198899',
   'Ensembl': 'ENSP00000354632',
   'score': 0.999,
   'Uniprot-TrEMB

In [64]:
# STRING interactions recorded for ENSG00000005700.
# NOTE(review): human Ensembl ID with a recorded output from a human dataset;
# presumably stale for this mouse notebook — confirm.
combined_df[combined_df["identifier"] == "ENSG00000005700"]["StringDB_ppi"].to_dict()

{81: [{'stringdb_link_to': 'ENSG00000198886',
   'Ensembl': 'Ensembl:ENSP00000354961',
   'score': 0.457,
   'Uniprot-TrEMBL': 'IBTK'}]}

In [65]:
# Look up ENSG00000108387.
# NOTE(review): human Ensembl ID; the recorded output shows a timestamp in
# GENE_SYMBOL_dea where a gene symbol is expected, which looks like a symbol
# converted to a date upstream (possibly by the spreadsheet) — confirm and check
# the mouse input for the same problem.
combined_df[combined_df["identifier"] == "ENSG00000108387"]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
2558,ENSG00000108387,Ensembl,5414,ncbi_gene_id,2004-09-01 00:00:00,193.471021,0.073477,0.13493,0.544556,0.586059,...,,,,,,,,,"[{'gene_description': 'septin 4', 'evidence': ...",


In [67]:
# Build the graph from the DEG + TF table and save it under ./data/mice/graph_mice
# (the recorded log lists the DataFrame, metadata, graph pickle and .gml files).
# NOTE(review): only the BridgeDb metadata is passed here; ppi_metadata from the
# STRING section is not included because the cell that would build
# combined_metadata is commented out — confirm this is intended.
pygraph = generator.save_graph(
    combined_df=combined_df_tf_sig,
    combined_metadata=bridgedb_metadata_mice,
    graph_name="graph_mice",
    graph_dir="./data/mice",
)

Combined DataFrame saved in ./data/mice/graph_mice/graph_mice_df.pkl
Metadata saved in ./data/mice/graph_mice/graph_mice_metadata.pkl
Building graph: 100%|██████████| 1285/1285 [00:04<00:00, 303.40it/s]
Graph is built successfully
Graph saved in ./data/mice/graph_mice/graph_mice_graph.pkl and ./data/mice/graph_mice/graph_mice_graph.gml


In [68]:
# Summarise the graph: its type plus node and edge counts
print(pygraph)

MultiDiGraph with 17327 nodes and 275684 edges


In [69]:
# Export the graph as a GraphML file via pyBiodatafuse's neo4j helper module
# (presumably for loading into Neo4j — confirm)
from pyBiodatafuse.graph import neo4j

neo4j.save_graph_to_graphml(pygraph, "data/mice/graph_mice/networkx_graph_mice.graphml")