## Explore mitochondrial impairment in tumors

**Aim**: this notebook walks through all the steps for collecting data and constructing a knowledge graph (KG) to explore mitochondrial impairment in tumors (in both human and mouse).

### Import required libraries

In [1]:
# Import modules
import gzip
import os
import pickle
from io import BytesIO

import numpy as np
import pandas as pd
import requests
from gprofiler.gprofiler import GProfiler

from pyBiodatafuse import id_mapper
from pyBiodatafuse.annotators import (
    bgee,
    disgenet,
    minerva,
    molmedb,
    opentargets,
    pubchem,
    stringdb,
    wikipathways,
)
from pyBiodatafuse.constants import BGEE_GENE_EXPRESSION_LEVELS_COL, DISGENET_DISEASE_COL
from pyBiodatafuse.graph import generator
from pyBiodatafuse.utils import (
    combine_sources,
    create_harmonized_input_file,
    create_or_append_to_metadata,
    get_identifier_of_interest,
)

# Every cache/export in this notebook is written under data/human/, so create
# the nested directory up front; creating only "data" leaves the first write
# to data/human/... failing on a fresh checkout.
os.makedirs(os.path.join("data", "human"), exist_ok=True)
base_dir = os.path.abspath(os.getcwd())

### Load the input files

In [2]:
# Load the differential-expression table and give the two unnamed leading
# columns the names used in the rest of the notebook.
all_genes_human = pd.read_excel("datasets/cachexia_vs_control_all_genes.xlsx").rename(
    columns={"Unnamed: 0": "identifier", "Unnamed: 1": "GENE_SYMBOL"}
)

# Differentially expressed genes: adjusted p-value strictly below 0.05.
is_deg_human = all_genes_human["padj"] < 0.05
deg_human = all_genes_human[is_deg_human]

print("Number of genes:", len(all_genes_human["identifier"].unique()))
deg_human.head(1)

Number of genes: 12322


Unnamed: 0,identifier,GENE_SYMBOL,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,minus_log10_pvalue
6,ENSG00000001084,GCLC,412.449345,0.454035,0.1302,3.487222,0.000488,0.018659,3.311521


In [3]:
# NOTE(review): the rendered output of this cell shows a row for this gene,
# which contradicts the inline remark — confirm which DEG table the remark
# refers to.
deg_human[deg_human["identifier"] == "ENSG00000159713"]  # does not exist in the deg table shared

Unnamed: 0,identifier,GENE_SYMBOL,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,minus_log10_pvalue
7049,ENSG00000159713,TPPP3,665.53875,1.436198,0.360226,3.986934,6.7e-05,0.005344,4.174362


### Entity resolution with BridgeDB

In [4]:
pickle_path_human = os.path.join(base_dir, "data/human/bridgedb_df_human.pkl")
metadata_path_human = os.path.join(base_dir, "data/human/bridgedb_metadata_human.pkl")

# Reuse the cache only when BOTH artefacts exist; testing the frame alone
# crashes on open() whenever the metadata pickle is missing.
# NOTE: pickle can execute arbitrary code on load — only load files written by
# this notebook.
if os.path.exists(pickle_path_human) and os.path.exists(metadata_path_human):
    bridgedb_df_human = pd.read_pickle(pickle_path_human)
    with open(metadata_path_human, "rb") as file:
        bridgedb_metadata_human = pickle.load(file)
else:
    bridgedb_df_human, bridgedb_metadata_human = id_mapper.bridgedb_xref(
        identifiers=all_genes_human,
        input_species="Human",
        input_datasource="Ensembl",
        output_datasource="All",
    )
    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(pickle_path_human), exist_ok=True)
    bridgedb_df_human.to_pickle(pickle_path_human)
    with open(metadata_path_human, "wb") as file:
        pickle.dump(bridgedb_metadata_human, file)

In [5]:
print("Number of genes with mapping in BridgeDb:", len(bridgedb_df_human["identifier"].unique()))
bridgedb_df_human.head(1)

Number of genes with mapping in BridgeDb: 12310


Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea
0,ENSG00000000003,Ensembl,HGNC:11858,HGNC Accession Number,TSPAN6,106.899174,-0.401659,0.143909,-2.791057,0.005254,0.0779,2.279542


### Enrichment analysis using g:Profiler
All pathways and annotations are added, regardless of their significance.

In [16]:
gp = GProfiler(return_dataframe=True)

# Single source of truth for the DEG cutoff: it selects the query genes AND
# names the cache file, so changing it can never silently reuse a cache that
# was computed with a different cutoff.
padj_cutoff_human = 0.01
filename_human = f"data/human/gprofiler_human_{padj_cutoff_human}.pkl"

if not os.path.exists(filename_human):
    # Query: genes at or below the cutoff; background: every gene mapped by BridgeDb.
    query_genes_human = (
        bridgedb_df_human.loc[bridgedb_df_human["padj_dea"] <= padj_cutoff_human, "identifier"]
        .unique()
        .tolist()
    )
    gprofiler_human = gp.profile(
        organism="hsapiens",
        all_results=True,
        query=query_genes_human,
        background=bridgedb_df_human["identifier"].unique().tolist(),
        no_evidences=False,
        significance_threshold_method="fdr",
        # Threshold handed to g:Profiler; independent of the DEG cutoff above.
        user_threshold=0.05,
    )
    gprofiler_human.rename(columns={"native": "id"}, inplace=True)
    gprofiler_human["datasource"] = "g:Profiler"
    gprofiler_human.to_pickle(filename_human)
else:
    # Counterpart of DataFrame.to_pickle, and consistent with how the BridgeDb
    # cache is read earlier in the notebook.
    gprofiler_human = pd.read_pickle(filename_human)

In [17]:
print(gprofiler_human.shape)
gprofiler_human.head(1)

(18779, 17)


Unnamed: 0,source,id,name,p_value,significant,description,term_size,query_size,intersection_size,effective_domain_size,precision,recall,query,parents,intersections,evidences,datasource
0,GO:CC,GO:0005739,mitochondrion,5.957169e-12,True,"""A semiautonomous, self replicating organelle ...",1434,226,69,12294,0.30531,0.048117,query_1,"[GO:0005737, GO:0043231]","[ENSG00000010256, ENSG00000023228, ENSG0000002...","[[IDA, HDA, IBA, TAS, IEA], [IDA, TAS, IEA], [...",g:Profiler


In [8]:
# Function to count upregulated and downregulated genes
def count_up_down(genes, bridgedb_df_human):
    """Count how many of ``genes`` are up- and down-regulated.

    Parameters
    ----------
    genes : list
        Gene identifiers; each entry is converted to ``str`` and stripped
        before matching. Anything that is not a non-empty list yields
        ``(0, 0)``.
    bridgedb_df_human : pandas.DataFrame
        Frame with an ``identifier`` column and a ``log2FoldChange_dea``
        column.

    Returns
    -------
    tuple[int, int]
        ``(upregulated, downregulated)``: number of distinct matched
        identifiers with a positive / negative ``log2FoldChange_dea``.
        A fold change of exactly 0 counts as neither.
    """
    if not isinstance(genes, list) or len(genes) == 0:  # Ensure genes is a list
        return 0, 0

    wanted = {str(g).strip() for g in genes}
    matched = bridgedb_df_human[bridgedb_df_human["identifier"].isin(wanted)]

    # The mapping table can hold several rows per identifier (one per mapped
    # target); count each gene once so duplicated rows cannot inflate the totals.
    per_gene = matched.drop_duplicates(subset="identifier")
    fold_change = per_gene["log2FoldChange_dea"]

    upregulated = int((fold_change > 0).sum())
    downregulated = int((fold_change < 0).sum())

    return upregulated, downregulated

In [18]:
# Keep only significant results. The explicit copy makes this an independent
# frame, so adding columns to it later does not write into a slice of
# gprofiler_human (SettingWithCopyWarning).
gprofiler_sig_human = gprofiler_human[gprofiler_human["significant"].eq(True)].copy()

In [19]:
# Restrict the mapping table to its Ensembl rows once, instead of re-filtering
# the whole frame for every enriched term.
ensembl_rows_human = bridgedb_df_human[bridgedb_df_human["target.source"] == "Ensembl"]

# One (up, down) pair per term; pd.Series expands the tuple into columns 0 and 1.
up_down_counts_human = gprofiler_sig_human["intersections"].apply(
    lambda genes: pd.Series(count_up_down(genes, ensembl_rows_human))
)

# assign() builds a new frame, so this never writes into a slice of gprofiler_human.
gprofiler_sig_human = gprofiler_sig_human.assign(
    no_upregulated_genes=up_down_counts_human[0],
    no_downregulated_genes=up_down_counts_human[1],
)

In [20]:
gprofiler_sig_human.shape

(410, 19)

In [18]:
# Columns exported for the enrichment summary table.
selected_columns = [
    "source",
    "id",
    "name",
    "p_value",
    "term_size",
    "intersection_size",
    "parents",
    "no_upregulated_genes",
    "no_downregulated_genes",
]
# NOTE(review): the enrichment query earlier in this notebook selects genes
# with padj_dea <= 0.01 and caches to gprofiler_human_0.01.pkl, yet this export
# is tagged 0.05 — confirm the suffix matches the cutoff of the current run.
gprofiler_sig_human[selected_columns].to_csv(
    "data/human/gprofiler_sig_human_0.05.csv",
    index=False,
)


#### For the graph

In [None]:
# rm the root terms, i.e. terms whose "parents" list is empty
is_root_term = gprofiler_human["parents"].apply(lambda parents: parents == [])
gprofiler_human = gprofiler_human[~is_root_term]

In [None]:
from pyBiodatafuse.constants import GPROFILER


def create_path_info(row):
    """Collect one enrichment row into a plain dict.

    Parameters
    ----------
    row : pandas.Series
        One row of the enrichment table, as passed by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    dict
        Every field of ``row`` except ``"intersections"`` and ``"source"``,
        keyed by column name.
    """
    excluded = {"intersections", "source"}
    # Iterate over the row's own labels rather than a global frame's columns,
    # so the helper works on any frame it is applied to.
    return {col: row[col] for col in row.index if col not in excluded}


# Pack each term's annotation fields into a dict, then keep only what is needed
# to attach the terms to genes.
gprofiler_human["gprofiler"] = gprofiler_human.apply(create_path_info, axis=1)
gprofiler_human = gprofiler_human.drop(
    columns=[
        col
        for col in gprofiler_human.columns
        if col not in ["source", "id", "intersections", "gprofiler"]
    ]
)

# One row per (term, gene). explode() turns an empty intersections list into
# NaN; drop those rows so no NaN "gene" ends up in the per-gene table (and in
# the later outer merge).
gprofiler_human = (
    gprofiler_human.explode("intersections")
    .dropna(subset=["intersections"])
    .reset_index(drop=True)
)

unique_sources = sorted(gprofiler_human["source"].unique())
gprofiler_human_final = pd.DataFrame(
    {"intersections": gprofiler_human["intersections"].unique()}
)

# One column per annotation source, each cell holding the list of term dicts
# for that gene (NaN when the gene has no term from that source).
for source in unique_sources:
    source_subset = gprofiler_human[gprofiler_human["source"] == source]
    source_dictionaries = source_subset.groupby("intersections")["gprofiler"].apply(list).to_dict()
    gprofiler_human_final[f"{GPROFILER}_{source.lower()}"] = gprofiler_human_final[
        "intersections"
    ].map(source_dictionaries)

gprofiler_human_final.head(1)

Unnamed: 0,intersections,g:Profiler_corum,g:Profiler_go:bp,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp
0,ENSG00000001084,"[{'id': 'CORUM:7186', 'name': 'Glutamate-cyste...","[{'id': 'GO:0044281', 'name': 'small molecule ...","[{'id': 'GO:0005739', 'name': 'mitochondrion',...","[{'id': 'GO:0003824', 'name': 'catalytic activ...","[{'id': 'HP:0012337', 'name': 'Abnormal homeos...","[{'id': 'HPA:0440341', 'name': 'skeletal muscl...","[{'id': 'KEGG:01100', 'name': 'Metabolic pathw...","[{'id': 'MIRNA:hsa-miR-30c-5p', 'name': 'hsa-m...","[{'id': 'REAC:R-HSA-1430728', 'name': 'Metabol...","[{'id': 'TF:M07428', 'name': 'Factor: Six-3; m...","[{'id': 'WP:WP2882', 'name': 'Nuclear receptor..."


In [14]:
## get version
def get_data_versions(organism, timeout=30):
    """Fetch the g:Profiler data-version record for ``organism``.

    Parameters
    ----------
    organism : str
        Organism identifier sent as the ``organism`` query parameter
        (e.g. ``"hsapiens"``).
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    dict
        The decoded JSON payload on HTTP 200; otherwise ``{"error": <message>}``.
        This function reports failures through the returned dict instead of
        raising for HTTP/connection/decoding problems.
    """
    url = "https://biit.cs.ut.ee/gprofiler/api/util/data_versions"
    params = {"organism": organism}
    try:
        response = requests.get(url, params=params, timeout=timeout)
        if response.status_code != 200:
            return {"error": f"Failed to retrieve data: {response.status_code}"}
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException: connection/timeout problems; ValueError: body is not valid JSON.
        return {"error": str(e)}


get_data_versions("hsapiens")

{'biomart': 'Ensembl',
 'biomart_version': '112',
 'display_name': 'Human',
 'genebuild': 'GRCh38.p14',
 'gprofiler_version': 'e112_eg59_p19_25aa4782',
 'organism': 'hsapiens',
 'sources': {'CORUM': {'name': 'CORUM protein complexes',
   'version': '28.11.2022 Corum 4.1'},
  'GO:BP': {'name': 'biological process',
   'version': 'annotations: BioMart\nclasses: releases/2024-10-27'},
  'GO:CC': {'name': 'cellular component',
   'version': 'annotations: BioMart\nclasses: releases/2024-10-27'},
  'GO:MF': {'name': 'molecular function',
   'version': 'annotations: BioMart\nclasses: releases/2024-10-27'},
  'HP': {'name': 'Human Phenotype Ontology',
   'version': 'annotations: 02.2025\nclasses: None'},
  'HPA': {'name': 'Human Protein Atlas',
   'version': 'annotations: HPA website: 23-07-17\nclasses: script: 24-01-02'},
  'KEGG': {'name': 'Kyoto Encyclopedia of Genes and Genomes',
   'version': 'KEGG FTP Release 2024-01-22'},
  'MIRNA': {'name': 'miRTarBase', 'version': 'Release 9.0'},
  'R

In [None]:
# Attach the per-gene g:Profiler annotation columns to the gene table, matching
# the gene identifier against the "intersections" key.
# NOTE(review): ncbi_tf_df_human is not defined in any cell visible above this
# one — confirm the notebook still runs top to bottom on a fresh kernel.
ncbi_tf_gprofiler_df_human = ncbi_tf_df_human.merge(
    gprofiler_human_final,
    how="outer",
    left_on="identifier",
    right_on="intersections",
)

In [None]:
ncbi_tf_gprofiler_df_human.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:bp,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp
0,ENSG00000000003,Ensembl,7105,NCBI Gene,TSPAN6,106.899174,-0.401659,0.143909,-2.791057,0.005254,...,,,,,,,,,,
1,ENSG00000000419,Ensembl,8813,NCBI Gene,DPM1,601.924666,-0.104654,0.161936,-0.64627,0.518105,...,,,,,,,,,,


In [None]:
ncbi_tf_gprofiler_df_human[ncbi_tf_gprofiler_df_human["g:Profiler_reac"].notna()].head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:bp,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp
6,ENSG00000001084,Ensembl,2729,NCBI Gene,GCLC,412.449345,0.454035,0.1302,3.487222,0.000488,...,"[{'id': 'GO:0044281', 'name': 'small molecule ...","[{'id': 'GO:0005739', 'name': 'mitochondrion',...","[{'id': 'GO:0003824', 'name': 'catalytic activ...","[{'id': 'HP:0012337', 'name': 'Abnormal homeos...","[{'id': 'HPA:0440341', 'name': 'skeletal muscl...","[{'id': 'KEGG:01100', 'name': 'Metabolic pathw...","[{'id': 'MIRNA:hsa-miR-30c-5p', 'name': 'hsa-m...","[{'id': 'REAC:R-HSA-1430728', 'name': 'Metabol...","[{'id': 'TF:M07428', 'name': 'Factor: Six-3; m...","[{'id': 'WP:WP2882', 'name': 'Nuclear receptor..."


In [None]:
ncbi_tf_gprofiler_df_human[ncbi_tf_gprofiler_df_human["g:Profiler_reac"].notna()].head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:bp,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp
6,ENSG00000001084,Ensembl,2729,NCBI Gene,GCLC,412.449345,0.454035,0.1302,3.487222,0.000488,...,"[{'id': 'GO:0044281', 'name': 'small molecule ...","[{'id': 'GO:0005739', 'name': 'mitochondrion',...","[{'id': 'GO:0003824', 'name': 'catalytic activ...","[{'id': 'HP:0012337', 'name': 'Abnormal homeos...","[{'id': 'HPA:0440341', 'name': 'skeletal muscl...","[{'id': 'KEGG:01100', 'name': 'Metabolic pathw...","[{'id': 'MIRNA:hsa-miR-30c-5p', 'name': 'hsa-m...","[{'id': 'REAC:R-HSA-1430728', 'name': 'Metabol...","[{'id': 'TF:M07428', 'name': 'Factor: Six-3; m...","[{'id': 'WP:WP2882', 'name': 'Nuclear receptor..."


### Add MitoCarta data

In [21]:
# URLs for the Human MitoCarta Dataset
mitocarta_url_human = (
    "https://personal.broadinstitute.org/scalvo/MitoCarta3.0/Human.MitoCarta3.0.xls"
)
# Function to download (once) and read the MitoCarta Excel workbook into a dataframe.
# NOTE(review): a later cell defines another function with this same name for
# gzipped TSV files; whichever definition runs last shadows the other.
def download_save_and_read(url, filename, sheet_name="A Human MitoCarta3.0", timeout=60):
    """Download ``url`` to ``filename`` unless it already exists, then load one sheet.

    Parameters
    ----------
    url : str
        Location of the Excel workbook.
    filename : str
        Local path used as the download cache.
    sheet_name : str, optional
        Sheet to load; defaults to the sheet the notebook has always read.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        Contents of ``sheet_name`` read from the local copy.

    Raises
    ------
    requests.HTTPError
        If the download returns an error status (nothing is written then, so
        an error page is never cached as if it were the workbook).
    """
    if not os.path.exists(filename):
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
        with open(filename, "wb") as file:
            file.write(response.content)
    else:
        print(f"{filename} already exists, skipping download.")

    # Read the cached local copy; reading from `url` here would download the
    # workbook again on every run and make the cache pointless.
    df = pd.read_excel(filename, sheet_name=sheet_name)
    return df


# Download and read the Human MitoCarta dataset
mitocarta_human_df = download_save_and_read(mitocarta_url_human, "data/human/human_mitocarta3.0.xls")

mitocarta_human_df.head(1)

data/human/human_mitocarta3.0.xls already exists, skipping download.


Unnamed: 0,HumanGeneID,MouseOrthologGeneID,Symbol,Synonyms,Description,MitoCarta3.0_List,MitoCarta3.0_Evidence,MitoCarta3.0_SubMitoLocalization,MitoCarta3.0_MitoPathways,TrainingDataset,...,liver_total_peak_intensity_log10,heart_total_peak_intensity_log10,skeletalmuscle_total_peak_intensity_log10,adipose_total_peak_intensity_log10,smallintestine_total_peak_intensity_log10,largeintestine_total_peak_intensity_log10,stomach_total_peak_intensity_log10,placenta_total_peak_intensity_log10,testis_total_peak_intensity_log10,HPA_Main_Location_2020 (Reliability)
0,1537,66445.0,CYC1,MC3DN6|UQCR4,cytochrome c1,MitoCarta3.0,"literature, APEX_IMS, APEX_matrix, targetP sig...",MIM,OXPHOS > Complex III > CIII subunits | Metabol...,Tmito,...,10.0,10.4,9.8,10.1,10.2,10.2,10.0,10.0,9.8,Mitochondria (Supported)


In [22]:
# Select relevant columns for inclusion in the graph
selected_columns_human = [
    "EnsemblGeneID_mapping_version_20200130",
    "Description",
    "MitoCarta3.0_Evidence",
    "MitoCarta3.0_SubMitoLocalization",
    "MitoCarta3.0_MitoPathways",
    "HPA_Main_Location_2020 (Reliability)",
    "Tissues",
]

# Select and rename in one chained expression. The result is a fresh frame, so
# later column assignments on it do not raise SettingWithCopyWarning the way an
# in-place rename on a column subset does.
mitocarta_human_selected = mitocarta_human_df[selected_columns_human].rename(
    columns={
        "EnsemblGeneID_mapping_version_20200130": "ensembl_id",
        "Description": "gene_description",
        "MitoCarta3.0_Evidence": "evidence",
        "MitoCarta3.0_SubMitoLocalization": "sub_mito_localization",
        "MitoCarta3.0_MitoPathways": "mito_pathways",
        "HPA_Main_Location_2020 (Reliability)": "hpa_location",
        "Tissues": "tissue_expression",
    }
)

In [23]:
# Reduce the MitoPathways string to a single label: take the text after the
# last ">" and, within that, the text before the first "|", then trim spaces.
# NOTE(review): only one label per gene survives this — confirm that dropping
# the other listed pathways is intended.
text_after_last_level = mitocarta_human_selected["mito_pathways"].str.split(">").str[-1]
text_before_first_bar = text_after_last_level.str.split("|").str[0]
mitocarta_human_selected["mito_pathways"] = text_before_first_bar.str.strip()

In [24]:
# Drop rows with missing mito_pathways, then split the "|"-separated Ensembl
# ids so that every id gets its own row.
mitocarta_human_selected = (
    mitocarta_human_selected.dropna(subset=["mito_pathways"])
    .assign(ensembl_id=lambda frame: frame["ensembl_id"].str.split("|"))
    .explode("ensembl_id")
    .reset_index(drop=True)
)


In [25]:
# Keep only significant genes existing in our dataset (padj_dea < 0.05)
significant_ids_human = bridgedb_df_human.loc[bridgedb_df_human["padj_dea"] < 0.05, "identifier"]
is_significant_mito_gene = mitocarta_human_selected["ensembl_id"].isin(significant_ids_human)
mitocarta_human_selected = mitocarta_human_selected[is_significant_mito_gene]

In [26]:
bridgedb_df_human.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea
0,ENSG00000000003,Ensembl,HGNC:11858,HGNC Accession Number,TSPAN6,106.899174,-0.401659,0.143909,-2.791057,0.005254,0.0779,2.279542


In [27]:
mitocarta_human_selected.head(3)

Unnamed: 0,ensembl_id,gene_description,evidence,sub_mito_localization,mito_pathways,hpa_location,tissue_expression
1,ENSG00000117118,succinate dehydrogenase complex iron sulfur su...,"literature, APEX_matrix, targetP signal+, yeas...",MIM,OXPHOS subunits,Mitochondria (Supported),all 14
4,ENSG00000010256,ubiquinol-cytochrome c reductase core protein 1,"literature, APEX_matrix, targetP signal+, yeas...",MIM,OXPHOS subunits,Mitochondria (Supported),all 14
7,ENSG00000088682,coenzyme Q9,"APEX_matrix, targetP signal+, yeast mito homol...",MIM,Coenzyme Q metabolism,,all 14


In [28]:
# One row per MitoCarta pathway label, with the list of member genes and the
# size of that list (stored under "intersections").
aggregated_mitocarta_human_selected = (
    mitocarta_human_selected.groupby("mito_pathways")["ensembl_id"].agg(list).reset_index()
)
aggregated_mitocarta_human_selected["intersections"] = (
    aggregated_mitocarta_human_selected["ensembl_id"].map(len)
)


In [29]:
aggregated_mitocarta_human_selected.head(4)

Unnamed: 0,mito_pathways,ensembl_id,intersections
0,ABC transporters,[ENSG00000115657],1
1,Amino acid metabolism,[ENSG00000133943],1
2,Apoptosis,[ENSG00000176171],1
3,Autophagy,[ENSG00000146729],1


In [30]:
# Restrict the mapping table to its Ensembl rows once, instead of re-filtering
# the whole frame for every pathway.
ensembl_rows_human = bridgedb_df_human[bridgedb_df_human["target.source"] == "Ensembl"]

# One (up, down) pair per pathway; pd.Series expands the tuple into columns 0 and 1.
mito_up_down_counts = aggregated_mitocarta_human_selected["ensembl_id"].apply(
    lambda genes: pd.Series(count_up_down(genes, ensembl_rows_human))
)

aggregated_mitocarta_human_selected = aggregated_mitocarta_human_selected.assign(
    no_upregulated_genes=mito_up_down_counts[0],
    no_downregulated_genes=mito_up_down_counts[1],
)

In [42]:
aggregated_mitocarta_human_selected.to_csv("data/human/mitocarta_human_0.05.csv", index=False)


In [31]:
aggregated_mitocarta_human_selected

Unnamed: 0,mito_pathways,ensembl_id,intersections,no_upregulated_genes,no_downregulated_genes
0,ABC transporters,[ENSG00000115657],1,1,0
1,Amino acid metabolism,[ENSG00000133943],1,0,1
2,Apoptosis,[ENSG00000176171],1,1,0
3,Autophagy,[ENSG00000146729],1,0,1
4,Biotin utilizing proteins,[ENSG00000175198],1,0,1
5,Branched-chain amino acid dehydrogenase complex,"[ENSG00000137992, ENSG00000083123]",2,0,2
6,Branched-chain amino acid metabolism,"[ENSG00000148090, ENSG00000198130, ENSG0000019...",5,0,5
7,Calcium uniporter,[ENSG00000107745],1,0,1
8,Carnitine shuttle,[ENSG00000157184],1,0,1
9,Carnitine synthesis and transport,[ENSG00000185973],1,0,1


#### For the graph

In [7]:
import pandas as pd

# Pathway summary for the 0.01 run, with the count columns tagged by cutoff and
# the gene list dropped.
# NOTE(review): no cell visible in this notebook writes
# mitocarta_human_0.01.csv — presumably it comes from an earlier run with a
# 0.01 cutoff; confirm.
mitocarta_human_1 = (
    pd.read_csv("data/human/mitocarta_human_0.01.csv")
    .rename(
        columns={
            "intersections": "intersection_size_0.01",
            "no_upregulated_genes": "no_upregulated_genes_0.01",
            "no_downregulated_genes": "no_downregulated_genes_0.01",
        }
    )
    .drop(columns=["ensembl_id"])
)
mitocarta_human_1.head()

Unnamed: 0,mito_pathways,intersection_size_0.01,no_upregulated_genes_0.01,no_downregulated_genes_0.01
0,ABC transporters,1,1,0
1,Biotin utilizing proteins,1,0,1
2,Branched-chain amino acid dehydrogenase complex,1,0,1
3,Branched-chain amino acid metabolism,3,0,3
4,Calcium uniporter,1,0,1


In [8]:
# Pathway summary for the 0.05 run, with the count columns tagged by cutoff and
# the gene list dropped.
mitocarta_human_5 = (
    pd.read_csv("data/human/mitocarta_human_0.05.csv")
    .rename(
        columns={
            "intersections": "intersection_size_0.05",
            "no_upregulated_genes": "no_upregulated_genes_0.05",
            "no_downregulated_genes": "no_downregulated_genes_0.05",
        }
    )
    .drop(columns=["ensembl_id"])
)
mitocarta_human_5.head()

Unnamed: 0,mito_pathways,intersection_size_0.05,no_upregulated_genes_0.05,no_downregulated_genes_0.05
0,ABC transporters,1,1,0
1,Amino acid metabolism,1,0,1
2,Apoptosis,1,1,0
3,Autophagy,1,0,1
4,Biotin utilizing proteins,1,0,1


In [9]:
mitocarta_human = pd.merge(mitocarta_human_1, mitocarta_human_5, on="mito_pathways", how="outer")
mitocarta_human.head()

Unnamed: 0,mito_pathways,intersection_size_0.01,no_upregulated_genes_0.01,no_downregulated_genes_0.01,intersection_size_0.05,no_upregulated_genes_0.05,no_downregulated_genes_0.05
0,ABC transporters,1.0,1.0,0.0,1,1,0
1,Biotin utilizing proteins,1.0,0.0,1.0,1,0,1
2,Branched-chain amino acid dehydrogenase complex,1.0,0.0,1.0,2,0,2
3,Branched-chain amino acid metabolism,3.0,0.0,3.0,5,0,5
4,Calcium uniporter,1.0,0.0,1.0,1,0,1


In [10]:
mitocarta_human.to_csv("data/human/mitocarta_human_0.05&0.01.csv", index=False)


In [None]:
aggregated_mitocarta_human_selected.to_csv("data/human/mitocarta_human_0.05.csv", index=False)
gprofiler_sig_human[selected_columns].to_csv("data/human/gprofiler_sig_human_0.05.csv", index=False)


In [11]:
gprofiler_sig_human_1 = pd.read_csv("data/human/gprofiler_sig_human_0.01.csv")
gprofiler_sig_human_1.head()

Unnamed: 0,source,id,name,p_value,term_size,intersection_size,parents,no_upregulated_genes,no_downregulated_genes
0,GO:CC,GO:0005739,mitochondrion,5.957169e-12,1434,69,"['GO:0005737', 'GO:0043231']",6,63
1,KEGG,KEGG:01100,Metabolic pathways,1.486192e-11,1097,58,['KEGG:00000'],7,51
2,REAC,REAC:R-HSA-1428517,Aerobic respiration and respiratory electron t...,3.919927e-11,170,23,['REAC:R-HSA-1430728'],0,23
3,GO:BP,GO:0045333,cellular respiration,6.969748e-09,220,24,['GO:0015980'],0,24
4,GO:BP,GO:0009060,aerobic respiration,2.760689e-08,181,21,['GO:0045333'],0,21


In [13]:
import pandas as pd

# Enrichment summary for the 0.01 run, with the per-run columns tagged by
# cutoff and the "parents" column dropped.
# NOTE(review): no cell visible in this notebook writes
# gprofiler_sig_human_0.01.csv — confirm where this file is produced.
gprofiler_sig_human_1 = (
    pd.read_csv("data/human/gprofiler_sig_human_0.01.csv")
    .rename(
        columns={
            "intersection_size": "intersection_size_0.01",
            "p_value": "p_value_0.01",
            "term_size": "term_size_0.01",
            "no_upregulated_genes": "no_upregulated_genes_0.01",
            "no_downregulated_genes": "no_downregulated_genes_0.01",
        }
    )
    .drop(columns=["parents"])
)
gprofiler_sig_human_1.head()

Unnamed: 0,source,id,name,p_value_0.01,term_size_0.01,intersection_size_0.01,no_upregulated_genes_0.01,no_downregulated_genes_0.01
0,GO:CC,GO:0005739,mitochondrion,5.957169e-12,1434,69,6,63
1,KEGG,KEGG:01100,Metabolic pathways,1.486192e-11,1097,58,7,51
2,REAC,REAC:R-HSA-1428517,Aerobic respiration and respiratory electron t...,3.919927e-11,170,23,0,23
3,GO:BP,GO:0045333,cellular respiration,6.969748e-09,220,24,0,24
4,GO:BP,GO:0009060,aerobic respiration,2.760689e-08,181,21,0,21


In [14]:
# Enrichment summary for the 0.05 run, with the per-run columns tagged by
# cutoff and the "parents" column dropped.
gprofiler_sig_human_5 = (
    pd.read_csv("data/human/gprofiler_sig_human_0.05.csv")
    .rename(
        columns={
            "intersection_size": "intersection_size_0.05",
            "p_value": "p_value_0.05",
            "term_size": "term_size_0.05",
            "no_upregulated_genes": "no_upregulated_genes_0.05",
            "no_downregulated_genes": "no_downregulated_genes_0.05",
        }
    )
    .drop(columns=["parents"])
)
gprofiler_sig_human_5.head()

Unnamed: 0,source,id,name,p_value_0.05,term_size_0.05,intersection_size_0.05,no_upregulated_genes_0.05,no_downregulated_genes_0.05
0,GO:BP,GO:0045333,cellular respiration,4.6550930000000005e-17,220,50,2,48
1,GO:CC,GO:0005739,mitochondrion,1.790872e-16,1434,145,14,131
2,REAC,REAC:R-HSA-1428517,Aerobic respiration and respiratory electron t...,5.063851e-16,170,42,0,42
3,GO:BP,GO:0006091,generation of precursor metabolites and energy,1.355359e-15,423,68,7,61
4,GO:BP,GO:0015980,energy derivation by oxidation of organic comp...,8.888125e-15,292,54,3,51


In [15]:
gprofiler_sig_human = pd.merge(gprofiler_sig_human_1, gprofiler_sig_human_5, on=["source", "id", "name"], how="outer")
gprofiler_sig_human.head()

Unnamed: 0,source,id,name,p_value_0.01,term_size_0.01,intersection_size_0.01,no_upregulated_genes_0.01,no_downregulated_genes_0.01,p_value_0.05,term_size_0.05,intersection_size_0.05,no_upregulated_genes_0.05,no_downregulated_genes_0.05
0,GO:CC,GO:0005739,mitochondrion,5.957169e-12,1434.0,69.0,6.0,63.0,1.790872e-16,1434.0,145.0,14.0,131.0
1,KEGG,KEGG:01100,Metabolic pathways,1.486192e-11,1097.0,58.0,7.0,51.0,7.790632e-14,1097.0,115.0,16.0,99.0
2,REAC,REAC:R-HSA-1428517,Aerobic respiration and respiratory electron t...,3.919927e-11,170.0,23.0,0.0,23.0,5.063851e-16,170.0,42.0,0.0,42.0
3,GO:BP,GO:0045333,cellular respiration,6.969748e-09,220.0,24.0,0.0,24.0,4.6550930000000005e-17,220.0,50.0,2.0,48.0
4,GO:BP,GO:0009060,aerobic respiration,2.760689e-08,181.0,21.0,0.0,21.0,4.666298e-14,181.0,41.0,2.0,39.0


In [16]:
gprofiler_sig_human.to_csv("data/human/gprofiler_sig_human_0.05&0.01.csv", index=False)


: 

In [None]:
from pyBiodatafuse.constants import MITOCARTA


def _to_mitocarta_record(row):
    """Return ``[ensembl_id, [annotation dict]]`` for one MitoCarta row.

    The dict holds every field of the row except ``ensembl_id``; it is wrapped
    in a one-element list.
    """
    annotation = row.drop("ensembl_id").to_dict()
    return [row["ensembl_id"], [annotation]]


df_melted = mitocarta_human_selected.apply(_to_mitocarta_record, axis=1)
df_transformed = pd.DataFrame(df_melted.tolist(), columns=["ensembl_id", MITOCARTA])
df_transformed.head()

Unnamed: 0,ensembl_id,MitoCarta
0,ENSG00000179091,"[{'gene_description': 'cytochrome c1', 'eviden..."
1,ENSG00000117118,[{'gene_description': 'succinate dehydrogenase...
2,ENSG00000167186,"[{'gene_description': 'coenzyme Q7, hydroxylas..."
3,ENSG00000073578,[{'gene_description': 'succinate dehydrogenase...
4,ENSG00000010256,[{'gene_description': 'ubiquinol-cytochrome c ...


In [None]:
# Merge Human MitoCarta with the graph dataset: left join on the gene
# identifier, then drop the join key that came from the MitoCarta side.
ncbi_tf_gprofiler_mitocarta_df_human = ncbi_tf_gprofiler_df_human.merge(
    df_transformed,
    how="left",
    left_on="identifier",
    right_on="ensembl_id",
).drop(columns=["ensembl_id"])

ncbi_tf_gprofiler_mitocarta_df_human.head(67)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta
0,ENSG00000000003,Ensembl,7105,NCBI Gene,TSPAN6,106.899174,-0.401659,0.143909,-2.791057,0.005254,...,,,,,,,,,,
1,ENSG00000000419,Ensembl,8813,NCBI Gene,DPM1,601.924666,-0.104654,0.161936,-0.646270,0.518105,...,,,,,,,,,,
2,ENSG00000000457,Ensembl,57147,NCBI Gene,SCYL3,244.623536,-0.285418,0.118343,-2.411784,0.015875,...,,,,,,,,,,
3,ENSG00000000938,Ensembl,2268,NCBI Gene,FGR,91.958767,0.599304,0.342110,1.751786,0.079811,...,,,,,,,,,,
4,ENSG00000000971,Ensembl,3075,NCBI Gene,CFH,499.583125,0.079448,0.255376,0.311101,0.755724,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,ENSG00000005073,Ensembl,3207,NCBI Gene,HOXA11,143.704857,0.301559,0.200188,1.506379,0.131970,...,,,,,,,,,,
63,ENSG00000005075,Ensembl,5439,NCBI Gene,POLR2J,1204.040045,-0.133500,0.107618,-1.240503,0.214789,...,,,,,,,,,,
64,ENSG00000005100,Ensembl,56919,NCBI Gene,DHX33,256.137917,0.166654,0.146994,1.133746,0.256901,...,,,,,,,,,,
65,ENSG00000005102,Ensembl,4222,NCBI Gene,MEOX1,329.566304,-0.395962,0.258980,-1.528929,0.126282,...,,,,,,,,,,


In [None]:
ncbi_tf_gprofiler_mitocarta_df_human[MITOCARTA][66]

[{'gene_description': 'DNA ligase 3',
  'evidence': 'literature, targetP signal, mito protein domain, coexpression',
  'sub_mito_localization': 'Matrix',
  'mito_pathways': 'mtDNA repair',
  'hpa_location': 'Nucleoplasm (Supported)',
  'tissue_expression': nan}]

In [None]:
ncbi_tf_gprofiler_mitocarta_df_human["target.source"] = ncbi_tf_gprofiler_mitocarta_df_human[
    "target.source"
].str.replace("NCBI Gene", "ncbi_gene_id", regex=False)

ncbi_tf_gprofiler_mitocarta_df_human.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta
0,ENSG00000000003,Ensembl,7105,ncbi_gene_id,TSPAN6,106.899174,-0.401659,0.143909,-2.791057,0.005254,...,,,,,,,,,,


In [None]:
ncbi_tf_gprofiler_mitocarta_df_human[MITOCARTA].to_dict()

{0: nan,
 1: nan,
 2: nan,
 3: nan,
 4: nan,
 5: nan,
 6: nan,
 7: nan,
 8: nan,
 9: nan,
 10: nan,
 11: nan,
 12: nan,
 13: nan,
 14: nan,
 15: nan,
 16: [{'gene_description': 'BCL2 associated agonist of cell death',
   'evidence': 'literature, mito protein domain',
   'sub_mito_localization': 'MOM',
   'mito_pathways': 'Apoptosis',
   'hpa_location': 'Mitochondria (Enhanced)',
   'tissue_expression': nan}],
 17: [{'gene_description': 'leucine aminopeptidase 3',
   'evidence': 'APEX_matrix, targetP signal+, Rickettsial homolog, coexpression, MS/MS++',
   'sub_mito_localization': 'Matrix',
   'mito_pathways': 'Proteases',
   'hpa_location': 'Cytosol (Supported)',
   'tissue_expression': 'all 14'}],
 18: nan,
 19: nan,
 20: nan,
 21: nan,
 22: nan,
 23: nan,
 24: nan,
 25: nan,
 26: nan,
 27: nan,
 28: nan,
 29: nan,
 30: nan,
 31: nan,
 32: [{'gene_description': 'NADH:ubiquinone oxidoreductase complex assembly factor 7',
   'evidence': 'APEX_matrix, GFP, targetP signal, Rickettsial hom

In [None]:
# ncbi_tf_gprofiler_mitocarta_df_human["its_target"][ncbi_tf_gprofiler_mitocarta_df_human["its_target"].notna()]
ncbi_tf_gprofiler_mitocarta_df_human["its_target"][7]

[{'NCBI.GeneID.Target': '1647',
  'Ensembl.GeneID.Target': 'ENSG00000116717',
  'Name.Target': 'GADD45A',
  'UniprotID.Target': 'P24522',
  'Target.TFLink.ortho': 'Dr:Q6GMM1;Mm:P48316;Rn:Q66HL6',
  'Target.nonTFLink.ortho': '-',
  'Detection.method': 'chromatin immunoprecipitation assay;inferred by curator',
  'PubmedID': '29126285;11525640;11420680;27924024;29087512',
  'Source.database': 'GTRD;ReMap;TRRUST',
  'Small-scale.evidence': 'Yes'},
 {'NCBI.GeneID.Target': '84271',
  'Ensembl.GeneID.Target': 'ENSG00000100227',
  'Name.Target': 'POLDIP3',
  'UniprotID.Target': 'Q9BY77',
  'Target.TFLink.ortho': 'Mm:Q8BG81;Rn:D4A2B0',
  'Target.nonTFLink.ortho': 'Dr:A0A0R4ILC0',
  'Detection.method': 'chromatin immunoprecipitation assay',
  'PubmedID': '29126285;27924024',
  'Source.database': 'GTRD;ReMap',
  'Small-scale.evidence': 'No'},
 {'NCBI.GeneID.Target': '54842',
  'Ensembl.GeneID.Target': 'ENSG00000151690',
  'Name.Target': 'MFSD6',
  'UniprotID.Target': 'Q6ZSS7',
  'Target.TFLink.or

Extract "NCBI Gene" ids for adding transcription factor (TF) and target interaction 

In [6]:
# from pyBiodatafuse.annotators.tflink import get_tf_target

# tflink_df, tflink_metadata = get_tf_target(
#     tf_file="data/human/TFLink_Homo_sapiens_interactions_All_simpleFormat_v1.0.tsv.gz",
#     filename="data/human/tflink_human.tsv.gz",
#     filter_deg=True,
#     bridgedb_df=bridgedb_df_human,
# )

In [7]:
# from pyBiodatafuse.annotators.gprofiler import get_gene_enrichment

# gprofiler_df, gprofiler_metadata = get_gene_enrichment(
#     bridgedb_df=bridgedb_df_human)

In [8]:
# gprofiler_df.head()

In [9]:
# Rows of the BridgeDb mapping whose target is an "NCBI Gene" id, plus a lookup
# from that id back to the input identifier.
ncbi_df_human = get_identifier_of_interest(bridgedb_df_human, "NCBI Gene")
ncbi_to_ensembl = {
    ncbi_id: ensembl_id
    for ncbi_id, ensembl_id in zip(ncbi_df_human["target"], ncbi_df_human["identifier"])
}
ncbi_df_human.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea
0,ENSG00000000003,Ensembl,7105,NCBI Gene,TSPAN6,106.899174,-0.401659,0.143909,-2.791057,0.005254,0.0779,2.279542


In [10]:
bridgedb_df_human[bridgedb_df_human["target"] == "105378473"]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea
105693,ENSG00000287047,Ensembl,105378473,NCBI Gene,,108.208973,-0.81557,0.391216,-2.084706,0.037096,0.21808,1.430673


In [11]:
# Report how many distinct NCBI Gene ids were mapped (typo in the message fixed).
print("Number of human genes with mapping for NCBI Gene:", len(ncbi_df_human["target"].unique()))

Number of humman genes with mapping for NCBI Gene: 11792


### Download TF-target interactions

You can download the Gene-TF interactions dataset from **TFLink**. Please visit the following page for the download:

[TFLink Download Page](https://tflink.net/download/)

The datasets you need can be downloaded from the following links:

**For Humans (Homo sapiens):**
- [TFLink_Homo_sapiens_interactions_All_simpleFormat_v1.0.tsv.gz](https://cdn.netbiol.org/tflink/download_files/TFLink_Homo_sapiens_interactions_All_simpleFormat_v1.0.tsv.gz)

**For Mice (Mus musculus):**
- [TFLink_Mus_musculus_interactions_All_simpleFormat_v1.0.tsv.gz](https://cdn.netbiol.org/tflink/download_files/TFLink_Mus_musculus_interactions_All_simpleFormat_v1.0.tsv.gz)

These files contain the TF-target interaction data in a simple format for each species.


In [12]:
# URLs for the TF datasets
url_human = "https://cdn.netbiol.org/tflink/download_files/TFLink_Homo_sapiens_interactions_All_simpleFormat_v1.0.tsv.gz"

# Function to download and read a gzipped file into a dataframe
def download_save_and_read(url, filename, timeout=60):
    """Download a gzipped TSV once, cache it at ``filename`` and load it as a DataFrame.

    Parameters
    ----------
    url : str
        Location of the gzipped, tab-separated file.
    filename : str
        Local cache path. If it already exists, no request is made and the
        cached file is read instead.
    timeout : float, optional
        Seconds to wait for the server (connect / between bytes) before
        giving up; passed straight to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (nothing is cached then).
    """
    if os.path.exists(filename):
        print(f"{filename} already exists, skipping download.")
    else:
        # The cache directory may not exist yet on a fresh checkout.
        os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of caching an HTML error page as if it were the dataset.
        response.raise_for_status()
        # Write to a temporary name first so an interrupted write never leaves a
        # truncated file that later runs would mistake for a valid cache.
        partial_path = f"{filename}.part"
        with open(partial_path, "wb") as file:
            file.write(response.content)
        os.replace(partial_path, filename)

    with gzip.open(filename, "rt") as f:
        df = pd.read_csv(f, sep="\t")
    return df

# Fetch the human TFLink table (or reuse the locally cached copy)
tf_df_human = download_save_and_read(url_human, "data/human/tflink_human.tsv.gz")

# Keep only interactions whose TF *and* target both map to a gene in our dataset
mapped_ncbi_ids_human = ncbi_df_human["target"]
tf_is_mapped = tf_df_human["NCBI.GeneID.TF"].isin(mapped_ncbi_ids_human)
target_is_mapped = tf_df_human["NCBI.GeneID.Target"].isin(mapped_ncbi_ids_human)
tf_df_human = tf_df_human[tf_is_mapped & target_is_mapped]

tf_df_human.head(1)

Unnamed: 0,UniprotID.TF,UniprotID.Target,NCBI.GeneID.TF,NCBI.GeneID.Target,Name.TF,Name.Target,Detection.method,PubmedID,Organism,Source.database,Small-scale.evidence,TF.TFLink.ortho,TF.nonTFLink.ortho,Target.TFLink.ortho,Target.nonTFLink.ortho
1,P37231,P10826,5468,5915,PPARG,RARB,chromatin immunoprecipitation assay;inferred b...,17202159;12839938;29087512;27924024,Homo sapiens,GTRD;TRED;TRRUST,Yes,-,-,Mm:P22605;Rn:D3ZFD9,-


In [13]:
tf_df_human.shape

(4067478, 15)

In [14]:
# Gene IDs acting as TF / as target in the filtered TFLink table
tf_list_human, target_list_human = (
    list(tf_df_human[column]) for column in ("NCBI.GeneID.TF", "NCBI.GeneID.Target")
)

# Flag every mapped gene as TF and/or target
for flag_col, gene_ids in (("is_tf", tf_list_human), ("is_target", target_list_human)):
    ncbi_df_human[flag_col] = ncbi_df_human["target"].isin(gene_ids)

ncbi_df_human["is_target"].value_counts(), ncbi_df_human["is_tf"].value_counts()

(is_target
 True     11283
 False      534
 Name: count, dtype: int64,
 is_tf
 False    10633
 True      1184
 Name: count, dtype: int64)

In [15]:
# Keep only interactions whose target is a DEG (adjusted p-value <= 0.05)
deg_ncbi_ids_human = ncbi_df_human.loc[ncbi_df_human["padj_dea"] <= 0.05, "target"]
# .copy() gives an independent frame: the column assignments below would otherwise
# write to a slice of tf_df_human (SettingWithCopyWarning / chained-assignment hazard).
tf_sig_df_human = tf_df_human[tf_df_human["NCBI.GeneID.Target"].isin(deg_ncbi_ids_human)].copy()
# Add the Ensembl gene ID for the TF and for the target
tf_sig_df_human["Ensembl.GeneID.TF"] = tf_sig_df_human["NCBI.GeneID.TF"].map(ncbi_to_ensembl)
tf_sig_df_human["Ensembl.GeneID.Target"] = tf_sig_df_human["NCBI.GeneID.Target"].map(ncbi_to_ensembl)
tf_sig_df_human.head(1)

Unnamed: 0,UniprotID.TF,UniprotID.Target,NCBI.GeneID.TF,NCBI.GeneID.Target,Name.TF,Name.Target,Detection.method,PubmedID,Organism,Source.database,Small-scale.evidence,TF.TFLink.ortho,TF.nonTFLink.ortho,Target.TFLink.ortho,Target.nonTFLink.ortho,Ensembl.GeneID.TF,Ensembl.GeneID.Target
6,P13631,P05412,5916,3725,RARG,JUN,inferred by curator,17202159,Homo sapiens,TRED,Yes,Dr:Q91392;Mm:P18911,Rn:D3ZF61,Dr:Q6NZT5;Mm:P05627;Rn:P17325,-,ENSG00000172819,ENSG00000177606


In [16]:
tf_sig_df_human[tf_sig_df_human["Name.TF"] == "MYC"]["Name.Target"].unique()

array(['PKM', 'TGFB3', 'TUBG1', 'HERPUD1', 'PMVK', 'IGF2R', 'TMPO',
       'AKR1A1', 'MCM7', 'PPIF', 'GAPDH', 'NDUFS1', 'ETFDH', 'GCLC',
       'FECH', 'ZPR1', 'JUN', 'PARP1', 'DCAF11', 'AXIN2', 'MYC', 'ETS2',
       'CD38', 'AP4M1', 'DUSP1', 'GADD45A', 'EIF3B', 'CDCA7', 'TFRC',
       'NOP56', 'NDUFV1', 'IFNAR1', 'ACO2', 'ABCB6', 'CDKN1A', 'DAXX',
       'ETFA', 'UNG', 'PRDX3', 'ARHGEF9', 'EFCAB7', 'CHSY1', 'SLC30A9',
       'ECHDC1', 'MT-CO2', 'COQ6', 'GNG2', 'AGO4', 'GOLGA6L9', 'AOPEP',
       'ECHS1', 'NFE2L2', 'ZFP36', 'ACKR1', 'PM20D2', 'MYLK4', 'SRRT',
       'ING2', 'DBT', 'NUCKS1', 'CERT1', 'TBL2', 'GDI1', 'CIAO2A',
       'HABP4', 'RMDN3', 'FABP3', 'LTBP1', 'INSIG1', 'PAAF1', 'ZNF367',
       'FIBIN', 'NDUFA9', 'COQ9', 'FAM210A', 'LYSMD4', 'SC5D', 'ATP6V0D1',
       'CD2BP2', 'HNRNPAB', 'FTH1', 'DOP1B', 'PNPLA4', 'OLFML3', 'CEP112',
       'CLTCL1', 'OTUD1', 'PLAAT1', 'RBM19', 'IBTK', 'FOXO1', 'ALDH1A2',
       'ADAMTS1', 'TP53BP1', 'LACTB', 'BCL6', 'SPRYD4', 'DCAF6', 'DGKG',

In [17]:
tf_sig_df_human[tf_sig_df_human["Name.TF"] == "PDHX"]["Name.Target"].sort_values()

2509282     AAMDC
5125177    ACADSB
1950997     ACBD5
831514     ACTR1A
3941183      AGO4
            ...  
2962146    WASHC3
3733022     YARS2
707720     ZC3H10
972910      ZCRB1
2592332     ZNRD2
Name: Name.Target, Length: 77, dtype: object

In [18]:
tf_sig_df_human.shape

(192033, 17)

In [19]:
# Function to add targets and TFs to each row (gene)
def add_target_and_tf_interaction(ncbi_df, tf_df, filename, padj_threshold=0.05):
    """Annotate each gene with its TF-target interactions, caching the result on disk.

    If ``filename`` already exists, the cached DataFrame is loaded and returned
    and ``ncbi_df`` / ``tf_df`` are ignored; delete the file to force a
    recomputation after the inputs change.

    Otherwise two object columns are added to ``ncbi_df`` (the frame passed in
    is modified in place and also returned):

    * ``its_target`` -- for rows with a truthy ``is_tf``: the list of target
      records of that TF found in ``tf_df`` (``[]`` if none); ``None`` for
      every other row.
    * ``its_tf`` -- for rows with a truthy ``is_target`` and
      ``padj_dea <= padj_threshold``: the list of TF records regulating that
      gene (``[]`` if none); ``None`` for every other row.

    Parameters
    ----------
    ncbi_df : pandas.DataFrame
        One row per gene; must provide the columns ``target``, ``is_tf``,
        ``is_target`` and ``padj_dea``.
    tf_df : pandas.DataFrame
        TF-target interaction table providing the ``NCBI.GeneID.*``,
        ``Ensembl.GeneID.*``, ``Name.*``, ``UniprotID.*``, ``*.TFLink.ortho``,
        ``*.nonTFLink.ortho`` columns for both TF and Target, plus the
        evidence columns listed in ``evidence_cols`` below.
    filename : str
        Pickle path used as cache; its directory is created when missing.
    padj_threshold : float, optional
        Adjusted p-value cut-off a target gene must satisfy to receive
        ``its_tf`` records.

    Returns
    -------
    pandas.DataFrame
        The annotated gene table (freshly computed or loaded from the cache).
    """
    if os.path.exists(filename):
        print(f"{filename} already exists, loading cached annotations.")
        # read_pickle mirrors to_pickle below (same compression inference).
        # The cache is produced by this function; do not point it at untrusted pickles.
        return pd.read_pickle(filename)

    evidence_cols = ["Detection.method", "PubmedID", "Source.database", "Small-scale.evidence"]
    target_cols = [
        "NCBI.GeneID.Target",
        "Ensembl.GeneID.Target",
        "Name.Target",
        "UniprotID.Target",
        "Target.TFLink.ortho",
        "Target.nonTFLink.ortho",
        *evidence_cols,
    ]
    tf_cols = [
        "NCBI.GeneID.TF",
        "Ensembl.GeneID.TF",
        "Name.TF",
        "UniprotID.TF",
        "TF.TFLink.ortho",
        "TF.nonTFLink.ortho",
        *evidence_cols,
    ]

    def _records_by(key_col, value_cols):
        """Map each value of ``key_col`` to its interaction rows as a list of dicts."""
        # One pass over tf_df instead of a full boolean scan per gene;
        # row order inside each group is the order of appearance in tf_df.
        return {
            key: rows[value_cols].to_dict(orient="records")
            for key, rows in tf_df.groupby(key_col, sort=False)
        }

    print(f"Processing targets. {ncbi_df.shape}")
    targets_by_tf = _records_by("NCBI.GeneID.TF", target_cols)
    ncbi_df["its_target"] = pd.Series(
        [
            # list(...) gives every row its own list object
            list(targets_by_tf.get(gene_id, ())) if is_tf else None
            for gene_id, is_tf in zip(ncbi_df["target"], ncbi_df["is_tf"])
        ],
        index=ncbi_df.index,
        dtype=object,
    )

    print("Processing TFs...")
    tfs_by_target = _records_by("NCBI.GeneID.Target", tf_cols)
    ncbi_df["its_tf"] = pd.Series(
        [
            list(tfs_by_target.get(gene_id, ()))
            if is_target and padj <= padj_threshold
            else None
            for gene_id, is_target, padj in zip(
                ncbi_df["target"], ncbi_df["is_target"], ncbi_df["padj_dea"]
            )
        ],
        index=ncbi_df.index,
        dtype=object,
    )

    # The cache directory may not exist yet on a fresh checkout.
    os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
    ncbi_df.to_pickle(filename)

    return ncbi_df

# Add target and TF
ncbi_tf_df_human = add_target_and_tf_interaction(ncbi_df_human, tf_sig_df_human, filename="data/human/ncbi_tf_df_human.pkl")

Processing targets. (11817, 15)
Processed 2000 rows (targets).
Processed 4000 rows (targets).
Processed 6000 rows (targets).
Processed 8000 rows (targets).
Processed 10000 rows (targets).
Processing TFs...
Processed 2000 rows (TFs).
Processed 4000 rows (TFs).
Processed 6000 rows (TFs).
Processed 8000 rows (TFs).
Processed 10000 rows (TFs).


In [20]:
ncbi_tf_df_human.head()

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea,is_tf,is_target,its_target,its_tf
0,ENSG00000000003,Ensembl,7105,NCBI Gene,TSPAN6,106.899174,-0.401659,0.143909,-2.791057,0.005254,0.0779,2.279542,False,True,,
1,ENSG00000000419,Ensembl,8813,NCBI Gene,DPM1,601.924666,-0.104654,0.161936,-0.64627,0.518105,0.783377,0.285582,False,True,,
2,ENSG00000000457,Ensembl,57147,NCBI Gene,SCYL3,244.623536,-0.285418,0.118343,-2.411784,0.015875,0.141951,1.799295,False,True,,
3,ENSG00000000938,Ensembl,2268,NCBI Gene,FGR,91.958767,0.599304,0.34211,1.751786,0.079811,0.328648,1.09794,False,True,,
4,ENSG00000000971,Ensembl,3075,NCBI Gene,CFH,499.583125,0.079448,0.255376,0.311101,0.755724,0.90734,0.121637,False,True,,


In [21]:
ncbi_tf_df_human[ncbi_tf_df_human['its_target'].notna()].head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea,is_tf,is_target,its_target,its_tf
7,ENSG00000001167,Ensembl,4800,NCBI Gene,NFYA,198.963515,0.339027,0.133075,2.547627,0.010846,0.114127,1.964737,True,True,"[{'NCBI.GeneID.Target': '1647', 'Ensembl.GeneI...",


In [22]:
ncbi_tf_df_human[ncbi_tf_df_human['target']=="1647"]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea,is_tf,is_target,its_target,its_tf
3343,ENSG00000116717,Ensembl,1647,NCBI Gene,GADD45A,448.20828,1.432184,0.377628,3.792579,0.000149,0.008832,3.826549,False,True,,"[{'NCBI.GeneID.TF': '672', 'Ensembl.GeneID.TF'..."


In [23]:
ncbi_tf_df_human[ncbi_tf_df_human['target']=="1647"]["its_tf"].to_dict()

{3343: [{'NCBI.GeneID.TF': '672',
   'Ensembl.GeneID.TF': 'ENSG00000012048',
   'Name.TF': 'BRCA1',
   'UniprotID.TF': 'P38398',
   'TF.TFLink.ortho': 'Mm:P48754;Rn:G3V8S5',
   'TF.nonTFLink.ortho': '-',
   'Detection.method': 'chromatin immunoprecipitation assay;inferred by curator',
   'PubmedID': '17428466;12647291;18350249;11777930;10962562;27924024;18025037;17202159;29087512',
   'Source.database': 'GTRD;TRED;TRRUST',
   'Small-scale.evidence': 'Yes'},
  {'NCBI.GeneID.TF': '4790',
   'Ensembl.GeneID.TF': 'ENSG00000109320',
   'Name.TF': 'NFKB1',
   'UniprotID.TF': 'P19838',
   'TF.TFLink.ortho': 'Dm:Q94527;Mm:P25799;Rn:F1LQH2',
   'TF.nonTFLink.ortho': '-',
   'Detection.method': 'chromatin immunoprecipitation assay;inferred by curator',
   'PubmedID': '29126285;27924024;17202159',
   'Source.database': 'GTRD;ReMap;TRED',
   'Small-scale.evidence': 'Yes'},
  {'NCBI.GeneID.TF': '367',
   'Ensembl.GeneID.TF': 'ENSG00000169083',
   'Name.TF': 'AR',
   'UniprotID.TF': 'P10275',
   'TF

In [24]:
ncbi_tf_df_human[ncbi_tf_df_human['target']=="1647"]["its_tf"].to_dict()

{3343: [{'NCBI.GeneID.TF': '672',
   'Ensembl.GeneID.TF': 'ENSG00000012048',
   'Name.TF': 'BRCA1',
   'UniprotID.TF': 'P38398',
   'TF.TFLink.ortho': 'Mm:P48754;Rn:G3V8S5',
   'TF.nonTFLink.ortho': '-',
   'Detection.method': 'chromatin immunoprecipitation assay;inferred by curator',
   'PubmedID': '17428466;12647291;18350249;11777930;10962562;27924024;18025037;17202159;29087512',
   'Source.database': 'GTRD;TRED;TRRUST',
   'Small-scale.evidence': 'Yes'},
  {'NCBI.GeneID.TF': '4790',
   'Ensembl.GeneID.TF': 'ENSG00000109320',
   'Name.TF': 'NFKB1',
   'UniprotID.TF': 'P19838',
   'TF.TFLink.ortho': 'Dm:Q94527;Mm:P25799;Rn:F1LQH2',
   'TF.nonTFLink.ortho': '-',
   'Detection.method': 'chromatin immunoprecipitation assay;inferred by curator',
   'PubmedID': '29126285;27924024;17202159',
   'Source.database': 'GTRD;ReMap;TRED',
   'Small-scale.evidence': 'Yes'},
  {'NCBI.GeneID.TF': '367',
   'Ensembl.GeneID.TF': 'ENSG00000169083',
   'Name.TF': 'AR',
   'UniprotID.TF': 'P10275',
   'TF

In [25]:
ncbi_tf_df_human[ncbi_tf_df_human['target']=="3725"]["its_target"].to_dict()

{9229: [{'NCBI.GeneID.Target': '2729',
   'Ensembl.GeneID.Target': 'ENSG00000001084',
   'Name.Target': 'GCLC',
   'UniprotID.Target': 'P48506',
   'Target.TFLink.ortho': 'Ce:Q20117;Dm:Q9W3K5;Dr:Q6NV35;Mm:P97494;Rn:P19468;Sc:P32477',
   'Target.nonTFLink.ortho': '-',
   'Detection.method': 'chromatin immunoprecipitation assay;electrophoretic mobility shift assay;inferred by curator',
   'PubmedID': '11912197;16054171;29087512;27924024;11233143;22900683',
   'Source.database': 'GTRD;HTRI;TRRUST',
   'Small-scale.evidence': 'Yes'},
  {'NCBI.GeneID.Target': '3725',
   'Ensembl.GeneID.Target': 'ENSG00000177606',
   'Name.Target': 'JUN',
   'UniprotID.Target': 'P05412',
   'Target.TFLink.ortho': 'Dr:Q6NZT5;Mm:P05627;Rn:P17325',
   'Target.nonTFLink.ortho': '-',
   'Detection.method': 'chromatin immunoprecipitation assay;inferred by curator',
   'PubmedID': '29126285;19671687;26578589;9502786;18971253;27924024;17202159;29087512',
   'Source.database': 'GTRD;ORegAnno;PAZAR;ReMap;TRED;TRRUST',

### Protein-Protein Interactions from STRING

In [45]:
# # Create a mapping dictionary for identifiers
# mapping = bridgedb_df_human[
#     (bridgedb_df_human['target.source'] == "HGNC") & (pd.notna(bridgedb_df_human['target']))
#     ].set_index('identifier')['target'].to_dict()

# # Apply mapping and replace values in identifier and identifier.source
# bridgedb_df_human['identifier'] = bridgedb_df_human['identifier'].map(mapping).combine_first(bridgedb_df_human['identifier'])
# bridgedb_df_human['identifier.source'] = bridgedb_df_human.apply(
#     lambda row: "HGNC" if row['identifier'] in mapping.values() else row['identifier.source'], axis=1
# )

In [46]:
# bridgedb_df_human[["identifier", "GENE_SYMBOL_dea"]][~(bridgedb_df_human["identifier"] == bridgedb_df_human["GENE_SYMBOL_dea"])].drop_duplicates()

In [41]:
string_path = os.path.join(base_dir, "data/human/string.pkl")
string_metadata_path = os.path.join(base_dir, "data/human/string_metadata.pkl")

# Reuse the cache only when BOTH pickles are present; if only the data file
# exists, loading the metadata would fail with FileNotFoundError.
if os.path.exists(string_path) and os.path.exists(string_metadata_path):
    ppi_df = pd.read_pickle(string_path)
    with open(string_metadata_path, "rb") as file:
        ppi_metadata = pickle.load(file)
else:
    # Query STRING only for the rows with adjusted p-value <= 0.05
    ppi_df, ppi_metadata = stringdb.get_ppi(
        bridgedb_df=bridgedb_df_human[bridgedb_df_human["padj_dea"] <= 0.05]
    )
    # The cache directory may not exist yet on a fresh checkout.
    os.makedirs(os.path.dirname(string_path), exist_ok=True)
    ppi_df.to_pickle(string_path)
    with open(string_metadata_path, "wb") as file:
        pickle.dump(ppi_metadata, file)

ppi_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea,StringDB_ppi
0,ENSG00000001084,Ensembl,ENSG00000001084,Ensembl,GCLC,412.449345,0.454035,0.1302,3.487222,0.000488,0.018659,3.311521,"[{'stringdb_link_to': 'ENSG00000100906', 'Ense..."
1,ENSG00000004468,Ensembl,ENSG00000004468,Ensembl,CD38,975.036348,-0.588804,0.173952,-3.384869,0.000712,0.023909,3.147446,"[{'stringdb_link_to': 'ENSG00000089163', 'Ense..."
2,ENSG00000004779,Ensembl,ENSG00000004779,Ensembl,NDUFAB1,2346.167702,-0.432984,0.13567,-3.191446,0.001416,0.036492,2.849052,"[{'stringdb_link_to': 'ENSG00000010256', 'Ense..."
3,ENSG00000005700,Ensembl,ENSG00000005700,Ensembl,IBTK,1511.600789,-0.436326,0.103624,-4.210661,2.5e-05,0.003024,4.594099,"[{'stringdb_link_to': 'ENSG00000198886', 'Ense..."
4,ENSG00000006327,Ensembl,ENSG00000006327,Ensembl,TNFRSF12A,445.737333,1.655781,0.519291,3.188539,0.00143,0.036587,2.844684,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."


In [42]:
from pyBiodatafuse.constants import STRING_PPI_COL

ppi_df[STRING_PPI_COL].to_dict()

{0: [{'stringdb_link_to': 'ENSG00000100906',
   'Ensembl': 'ENSP00000216797',
   'score': 0.556,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000125166',
   'Ensembl': 'ENSP00000245206',
   'score': 0.838,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000126432',
   'Ensembl': 'ENSP00000265462',
   'score': 0.518,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000167996',
   'Ensembl': 'ENSP00000273550',
   'score': 0.508,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000148672',
   'Ensembl': 'ENSP00000277865',
   'score': 0.847,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000165672',
   'Ensembl': 'ENSP00000298510',
   'score': 0.457,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000167815',
   'Ensembl': 'ENSP00000301522'

In [43]:
ppi_df[["target", "StringDB_ppi"]].head()

Unnamed: 0,target,StringDB_ppi
0,ENSG00000001084,"[{'stringdb_link_to': 'ENSG00000100906', 'Ense..."
1,ENSG00000004468,"[{'stringdb_link_to': 'ENSG00000089163', 'Ense..."
2,ENSG00000004779,"[{'stringdb_link_to': 'ENSG00000010256', 'Ense..."
3,ENSG00000005700,"[{'stringdb_link_to': 'ENSG00000198886', 'Ense..."
4,ENSG00000006327,"[{'stringdb_link_to': nan, 'Ensembl': nan, 'sc..."


In [44]:
ppi_df[ppi_df["target"] == "ENSG00000196177"]["StringDB_ppi"].values

array([list([{'stringdb_link_to': 'ENSG00000091140', 'Ensembl': 'ENSP00000205402', 'score': 0.471, 'Uniprot-TrEMBL': None, 'Uniprot-TrEMBL_link': None}, {'stringdb_link_to': 'ENSG00000100209', 'Ensembl': 'ENSP00000216027', 'score': 0.408, 'Uniprot-TrEMBL': None, 'Uniprot-TrEMBL_link': None}, {'stringdb_link_to': 'ENSG00000104325', 'Ensembl': 'ENSP00000220764', 'score': 0.4, 'Uniprot-TrEMBL': None, 'Uniprot-TrEMBL_link': None}, {'stringdb_link_to': 'ENSG00000110435', 'Ensembl': 'ENSP00000227868', 'score': 0.405, 'Uniprot-TrEMBL': None, 'Uniprot-TrEMBL_link': None}, {'stringdb_link_to': 'ENSG00000106049', 'Ensembl': 'ENSP00000265395', 'score': 0.581, 'Uniprot-TrEMBL': None, 'Uniprot-TrEMBL_link': None}, {'stringdb_link_to': 'ENSG00000075239', 'Ensembl': 'ENSP00000265838', 'score': 0.968, 'Uniprot-TrEMBL': None, 'Uniprot-TrEMBL_link': None}, {'stringdb_link_to': 'ENSG00000151093', 'Ensembl': 'ENSP00000280701', 'score': 0.413, 'Uniprot-TrEMBL': None, 'Uniprot-TrEMBL_link': None}, {'stringd

In [45]:
ncbi_tf_gprofiler_mitocarta_df_human.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta
0,ENSG00000000003,Ensembl,7105,ncbi_gene_id,TSPAN6,106.899174,-0.401659,0.143909,-2.791057,0.005254,...,,,,,,,,,,


In [46]:
# STRING annotations keyed by the gene identifier (ppi_df stores it in 'target')
ppi_by_identifier = ppi_df.loc[:, ["target", "StringDB_ppi"]].rename(
    columns={"target": "identifier"}
)
# Left join: every gene of the annotated table is kept, with NaN where STRING has no entry
combined_df = ncbi_tf_gprofiler_mitocarta_df_human.merge(
    ppi_by_identifier, how="left", on="identifier"
)

In [47]:
combined_df.columns

Index(['identifier', 'identifier.source', 'target', 'target.source',
       'GENE_SYMBOL_dea', 'baseMean_dea', 'log2FoldChange_dea', 'lfcSE_dea',
       'stat_dea', 'pvalue_dea', 'padj_dea', 'minus_log10_pvalue_dea', 'is_tf',
       'is_target', 'its_target', 'its_tf', 'intersections',
       'g:Profiler_corum', 'g:Profiler_go:bp', 'g:Profiler_go:cc',
       'g:Profiler_go:mf', 'g:Profiler_hp', 'g:Profiler_hpa',
       'g:Profiler_kegg', 'g:Profiler_mirna', 'g:Profiler_reac',
       'g:Profiler_tf', 'g:Profiler_wp', 'MitoCarta', 'StringDB_ppi'],
      dtype='object')

In [48]:
combined_df.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
0,ENSG00000000003,Ensembl,7105,ncbi_gene_id,TSPAN6,106.899174,-0.401659,0.143909,-2.791057,0.005254,...,,,,,,,,,,


In [49]:
combined_df["StringDB_ppi"].to_dict()

{0: nan,
 1: nan,
 2: nan,
 3: nan,
 4: nan,
 5: nan,
 6: [{'stringdb_link_to': 'ENSG00000100906',
   'Ensembl': 'ENSP00000216797',
   'score': 0.556,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000125166',
   'Ensembl': 'ENSP00000245206',
   'score': 0.838,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000126432',
   'Ensembl': 'ENSP00000265462',
   'score': 0.518,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000167996',
   'Ensembl': 'ENSP00000273550',
   'score': 0.508,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000148672',
   'Ensembl': 'ENSP00000277865',
   'score': 0.847,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000165672',
   'Ensembl': 'ENSP00000298510',
   'score': 0.457,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_t

In [50]:
filename_human = "data/human/combined_df.pkl"

combined_df.to_pickle(filename_human)

### Graph generation

In [5]:
filename_human = "data/human/combined_df.pkl"
with open(filename_human, "rb") as f:
    combined_df = pickle.load(f)

In [6]:
# Distinct NCBI Gene IDs of every TF found in any gene's 'its_tf' records,
# kept in order of first appearance (rows without a record list contribute nothing)
ncbi_gene_ids = list(
    dict.fromkeys(
        tf_record["NCBI.GeneID.TF"]
        for tf_records in combined_df["its_tf"]
        if isinstance(tf_records, list)
        for tf_record in tf_records
        if pd.notna(tf_record["NCBI.GeneID.TF"])
    )
)

len(ncbi_gene_ids)

997

In [7]:
# Row masks over the full table
is_tf_gene = combined_df["target"].isin(ncbi_gene_ids)
is_deg = combined_df["padj_dea"] <= 0.05

# TFs regulating at least one DEG, and the DEGs that are not themselves such a TF
combined_df_tf = combined_df[is_tf_gene]
combined_df_sig = combined_df[is_deg & ~is_tf_gene]
combined_df_sig.shape

(537, 30)

In [8]:
combined_df_tf_sig = pd.concat([combined_df_sig, combined_df_tf], axis=0, ignore_index=True)
combined_df_tf_sig.shape

(1535, 30)

In [9]:
combined_df_tf_sig[combined_df_tf_sig["GENE_SYMBOL_dea"] == "COQ10A"]["its_tf"].apply(
    lambda x: isinstance(x, list)
)

207    True
Name: its_tf, dtype: bool

In [10]:
combined_df_tf_sig[combined_df_tf_sig["GENE_SYMBOL_dea"] == "COQ10A"]["its_tf"].to_dict()

{207: [{'NCBI.GeneID.TF': '112398',
   'Ensembl.GeneID.TF': 'ENSG00000269858',
   'Name.TF': 'EGLN2',
   'UniprotID.TF': 'Q96KS0',
   'TF.TFLink.ortho': 'Mm:Q91YE2;Rn:Q6AYU4',
   'TF.nonTFLink.ortho': '-',
   'Detection.method': 'chromatin immunoprecipitation assay',
   'PubmedID': '29126285',
   'Source.database': 'ReMap',
   'Small-scale.evidence': 'No'},
  {'NCBI.GeneID.TF': '11091',
   'Ensembl.GeneID.TF': 'ENSG00000196363',
   'Name.TF': 'WDR5',
   'UniprotID.TF': 'P61964',
   'TF.TFLink.ortho': 'Dm:Q9V3J8;Dr:Q7ZTX2;Mm:P61965',
   'TF.nonTFLink.ortho': 'Rn:Q498M4',
   'Detection.method': 'chromatin immunoprecipitation assay',
   'PubmedID': '27924024',
   'Source.database': 'GTRD',
   'Small-scale.evidence': 'No'},
  {'NCBI.GeneID.TF': '1024',
   'Ensembl.GeneID.TF': 'ENSG00000132964',
   'Name.TF': 'CDK8',
   'UniprotID.TF': 'P49336',
   'TF.TFLink.ortho': 'Dm:Q9VT57;Dr:A8E4S2;Mm:Q8R3L8',
   'TF.nonTFLink.ortho': '-',
   'Detection.method': 'chromatin immunoprecipitation assay',


In [11]:
combined_df_tf[~combined_df_tf["target"].isin(combined_df_sig["target"].tolist())]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
7,ENSG00000001167,Ensembl,4800,ncbi_gene_id,NFYA,198.963515,0.339027,0.133075,2.547627,0.010846,...,,,,,,,,,,
43,ENSG00000004487,Ensembl,23028,ncbi_gene_id,KDM1A,1310.615484,0.045821,0.067400,0.679830,0.496612,...,,,,,,,,,,
74,ENSG00000005339,Ensembl,1387,ncbi_gene_id,CREBBP,1852.242330,0.116510,0.162612,0.716489,0.473690,...,,,,,,,,,,
88,ENSG00000005889,Ensembl,7543,ncbi_gene_id,ZFX,732.044736,-0.164081,0.159735,-1.027205,0.304324,...,,,,,,,,,,
99,ENSG00000006194,Ensembl,10127,ncbi_gene_id,ZNF263,337.618503,0.141491,0.132231,1.070034,0.284604,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11743,ENSG00000275700,Ensembl,26574,ncbi_gene_id,AATF,405.250619,0.422212,0.134613,3.136484,0.001710,...,"[{'id': 'GO:0019901', 'name': 'protein kinase ...",,,,"[{'id': 'MIRNA:hsa-miR-634', 'name': 'hsa-miR-...","[{'id': 'REAC:R-HSA-162582', 'name': 'Signal T...","[{'id': 'TF:M10696', 'name': 'Factor: HOXA6; m...","[{'id': 'WP:WP5287', 'name': '17q12 copy numbe...",,"[{'stringdb_link_to': 'ENSG00000112578', 'Ense..."
11756,ENSG00000276644,Ensembl,1602,ncbi_gene_id,DACH1,87.387806,-0.001658,0.179004,-0.009262,0.992610,...,,,,,,,,,,
11758,ENSG00000277258,Ensembl,7703,ncbi_gene_id,PCGF2,160.048407,0.002932,0.203806,0.014386,0.988522,...,,,,,,,,,,
11760,ENSG00000277494,Ensembl,338328,ncbi_gene_id,GPIHBP1,585.805576,-0.190933,0.355165,-0.537591,0.590860,...,,,,,,,,,,


In [12]:
combined_df_tf_sig[combined_df_tf_sig["identifier"].isin(["ENSG00000001167", "ENSG00000116717"])]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
139,ENSG00000116717,Ensembl,1647,ncbi_gene_id,GADD45A,448.20828,1.432184,0.377628,3.792579,0.000149,...,"[{'id': 'GO:0042802', 'name': 'identical prote...",,,"[{'id': 'KEGG:05216', 'name': 'Thyroid cancer'...","[{'id': 'MIRNA:hsa-miR-5701', 'name': 'hsa-miR...","[{'id': 'REAC:R-HSA-3700989', 'name': 'Transcr...","[{'id': 'TF:M10696', 'name': 'Factor: HOXA6; m...","[{'id': 'WP:WP1742', 'name': 'TP53 network', '...",,"[{'stringdb_link_to': 'ENSG00000120129', 'Ense..."
537,ENSG00000001167,Ensembl,4800,ncbi_gene_id,NFYA,198.963515,0.339027,0.133075,2.547627,0.010846,...,,,,,,,,,,


In [13]:
combined_df_tf_sig[combined_df_tf_sig["identifier"] == "ENSG00000001167"]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
537,ENSG00000001167,Ensembl,4800,ncbi_gene_id,NFYA,198.963515,0.339027,0.133075,2.547627,0.010846,...,,,,,,,,,,


In [14]:
combined_df_tf_sig[combined_df_tf_sig["GENE_SYMBOL_dea"] == "GADD45A"]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
139,ENSG00000116717,Ensembl,1647,ncbi_gene_id,GADD45A,448.20828,1.432184,0.377628,3.792579,0.000149,...,"[{'id': 'GO:0042802', 'name': 'identical prote...",,,"[{'id': 'KEGG:05216', 'name': 'Thyroid cancer'...","[{'id': 'MIRNA:hsa-miR-5701', 'name': 'hsa-miR...","[{'id': 'REAC:R-HSA-3700989', 'name': 'Transcr...","[{'id': 'TF:M10696', 'name': 'Factor: HOXA6; m...","[{'id': 'WP:WP1742', 'name': 'TP53 network', '...",,"[{'stringdb_link_to': 'ENSG00000120129', 'Ense..."


In [15]:
combined_df_tf_sig[combined_df_tf_sig["identifier"] == "ENSG00000001167"]["its_target"].to_dict()

{537: [{'NCBI.GeneID.Target': '1647',
   'Ensembl.GeneID.Target': 'ENSG00000116717',
   'Name.Target': 'GADD45A',
   'UniprotID.Target': 'P24522',
   'Target.TFLink.ortho': 'Dr:Q6GMM1;Mm:P48316;Rn:Q66HL6',
   'Target.nonTFLink.ortho': '-',
   'Detection.method': 'chromatin immunoprecipitation assay;inferred by curator',
   'PubmedID': '29126285;11525640;11420680;27924024;29087512',
   'Source.database': 'GTRD;ReMap;TRRUST',
   'Small-scale.evidence': 'Yes'},
  {'NCBI.GeneID.Target': '84271',
   'Ensembl.GeneID.Target': 'ENSG00000100227',
   'Name.Target': 'POLDIP3',
   'UniprotID.Target': 'Q9BY77',
   'Target.TFLink.ortho': 'Mm:Q8BG81;Rn:D4A2B0',
   'Target.nonTFLink.ortho': 'Dr:A0A0R4ILC0',
   'Detection.method': 'chromatin immunoprecipitation assay',
   'PubmedID': '29126285;27924024',
   'Source.database': 'GTRD;ReMap',
   'Small-scale.evidence': 'No'},
  {'NCBI.GeneID.Target': '54842',
   'Ensembl.GeneID.Target': 'ENSG00000151690',
   'Name.Target': 'MFSD6',
   'UniprotID.Target': 

In [16]:
combined_df_tf_sig["its_target"][10]

In [17]:
# Distinct NCBI Gene IDs of the targets recorded for NFYA (ENSG00000001167),
# kept in order of first appearance
nfya_target_records = combined_df_tf_sig.loc[
    combined_df_tf_sig["identifier"] == "ENSG00000001167", "its_target"
]
ncbi_gene_ids_NFYA_targets = list(
    dict.fromkeys(
        target_record["NCBI.GeneID.Target"]
        for target_records in nfya_target_records
        if isinstance(target_records, list)
        for target_record in target_records
        if pd.notna(target_record["NCBI.GeneID.Target"])
    )
)

len(ncbi_gene_ids_NFYA_targets)

502

In [70]:
# combined_df = combine_sources(
#     merged_df_human,
#     [
#     ppi_df,
#     ],
# )

In [71]:
# combined_metadata = create_or_append_to_metadata(
#     bridgedb_metadata_human,
#     [
#     ppi_metadata,
#     get_data_versions("hsapiens")
#     ],
# )

In [72]:
# combined_metadata

In [18]:
combined_df[combined_df["identifier"] == "CDKN1A"]["StringDB_ppi"].to_dict()

{}

In [19]:
combined_df_tf_sig[combined_df_tf_sig["GENE_SYMBOL_dea"].str.contains("OTF6", case=False, na=False)]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi


In [20]:
combined_df[combined_df["GENE_SYMBOL_dea"] == "MT-ND4"]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
10722,ENSG00000198886,Ensembl,4538,ncbi_gene_id,MT-ND4,1098081.0,-0.722805,0.164567,-4.392165,1.1e-05,...,"[{'id': 'GO:0016491', 'name': 'oxidoreductase ...","[{'id': 'HP:0001941', 'name': 'Acidosis', 'p_v...","[{'id': 'HPA:0440341', 'name': 'skeletal muscl...","[{'id': 'KEGG:01100', 'name': 'Metabolic pathw...","[{'id': 'MIRNA:hsa-miR-652-3p', 'name': 'hsa-m...","[{'id': 'REAC:R-HSA-1428517', 'name': 'The cit...","[{'id': 'TF:M07428', 'name': 'Factor: Six-3; m...","[{'id': 'WP:WP111', 'name': 'Electron transpor...","[{'gene_description': 'NADH dehydrogenase, sub...","[{'stringdb_link_to': 'ENSG00000010256', 'Ense..."


In [21]:
combined_df[combined_df["identifier"] == "ENSG00000198886"]["StringDB_ppi"].to_dict()

{10722: [{'stringdb_link_to': 'ENSG00000010256',
   'Ensembl': 'ENSP00000203407',
   'score': 0.971,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000108179',
   'Ensembl': 'ENSP00000225174',
   'score': 0.404,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000130414',
   'Ensembl': 'ENSP00000252711',
   'score': 0.999,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000110955',
   'Ensembl': 'ENSP00000262030',
   'score': 0.553,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000213619',
   'Ensembl': 'ENSP00000263774',
   'score': 0.999,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000139180',
   'Ensembl': 'ENSP00000266544',
   'score': 0.999,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None},
  {'stringdb_link_to': 'ENSG00000139631',
   'Ensembl': 'ENSP00000267

In [22]:
combined_df[combined_df["identifier"] == "ENSG00000005700"]["StringDB_ppi"].to_dict()

{81: [{'stringdb_link_to': 'ENSG00000198886',
   'Ensembl': 'ENSP00000354961',
   'score': 0.457,
   'Uniprot-TrEMBL': None,
   'Uniprot-TrEMBL_link': None}]}

In [23]:
combined_df[combined_df["identifier"] == "ENSG00000108387"]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
2558,ENSG00000108387,Ensembl,5414,ncbi_gene_id,2004-09-01 00:00:00,193.471021,0.073477,0.13493,0.544556,0.586059,...,,,,,,,,,"[{'gene_description': 'septin 4', 'evidence': ...",


In [24]:
# Build the graph from the combined DEG + TF table and save it under ./data/human.
# NOTE(review): only the BridgeDb metadata is passed here; the STRING metadata
# (ppi_metadata) loaded in the STRING section is not merged in -- confirm this is intended.
pygraph = generator.save_graph(
    combined_df=combined_df_tf_sig,
    combined_metadata=bridgedb_metadata_human,
    graph_name="graph_human",
    graph_dir="./data/human",
)

Combined DataFrame saved in ./data/human/graph_human/graph_human_df.pkl
Metadata saved in ./data/human/graph_human/graph_human_metadata.pkl
Building graph: 100%|██████████| 1535/1535 [00:04<00:00, 310.14it/s]
Graph is built successfully
Graph saved in ./data/human/graph_human/graph_human_graph.pkl and ./data/human/graph_human/graph_human_graph.gml


In [25]:
print(pygraph)

MultiDiGraph with 17603 nodes and 316748 edges


In [26]:
from pyBiodatafuse.graph import neo4j

neo4j.save_graph_to_graphml(pygraph, "data/human/graph_human/networkx_graph_human.graphml")

: 