## Explore mitochondrial impairment in tumars 

**Aim**: in this notebook, you will see all the steps for collecting data and constructing a KG to explore mitochondrial impairment in tumer (both in human and mice)

### Import required libraries

In [1]:
# Import modules
import os
import pickle

import pandas as pd

from pyBiodatafuse import id_mapper
from pyBiodatafuse.annotators import (
    tflink,
    gprofiler,
    mitocarta,
    stringdb,
)

from pyBiodatafuse.constants import STRING_PPI_COL
from pyBiodatafuse.graph import generator
from pyBiodatafuse.utils import (
    combine_sources,
    create_harmonized_input_file,
    create_or_append_to_metadata,
    get_identifier_of_interest,
)

os.makedirs("data", exist_ok=True)
base_dir = os.path.abspath(os.getcwd())

### Load the input files

In [2]:
# Read only specific columns and skip the first row
all_genes = pd.read_excel("datasets/cachexia_vs_control_all_genes.xlsx")
all_genes.rename(
    columns={"Unnamed: 0": "identifier", "Unnamed: 1": "GENE_SYMBOL"}, inplace=True
)
deg_data = all_genes[all_genes["padj"] < 0.01]
print("Number of genes:", len(all_genes["identifier"].unique()))
deg_data.head(1)

Number of genes: 12322


Unnamed: 0,identifier,GENE_SYMBOL,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,minus_log10_pvalue
80,ENSG00000005700,IBTK,1511.600789,-0.436326,0.103624,-4.210661,2.5e-05,0.003024,4.594099


In [3]:
deg_data[deg_data["identifier"] == "ENSG00000159713"]  # does not exist in the deg table shared

Unnamed: 0,identifier,GENE_SYMBOL,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,minus_log10_pvalue
7049,ENSG00000159713,TPPP3,665.53875,1.436198,0.360226,3.986934,6.7e-05,0.005344,4.174362


### Entity resolution with BridgeDB

In [4]:
pickle_path = os.path.join(base_dir, "data/human/bridgedb_df.pkl")
metadata_path = os.path.join(base_dir, "data/human/bridgedb_metadata.pkl")

if not os.path.exists(pickle_path):
    bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
        identifiers=all_genes,
        input_species="Human",
        input_datasource="Ensembl",
        output_datasource="All",
    )
    bridgedb_df.to_pickle(pickle_path)
    with open(metadata_path, "wb") as file:
        pickle.dump(bridgedb_metadata, file)
else:
    bridgedb_df = pd.read_pickle(pickle_path)
    with open(metadata_path, "rb") as file:
        bridgedb_metadata = pickle.load(file)

In [5]:
print("Number of genes with mapping in BridgeDb:", len(bridgedb_df["identifier"].unique()))
bridgedb_df.head(1)

Number of genes with mapping in BridgeDb: 12310


Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea
0,ENSG00000000003,Ensembl,HGNC:11858,HGNC Accession Number,TSPAN6,106.899174,-0.401659,0.143909,-2.791057,0.005254,0.0779,2.279542


### TF-target interactions

In [6]:
tflink_path = os.path.join(base_dir, "data/human/tflink.pkl")
tflink_metadata_path = os.path.join(base_dir, "data/human/tflink_metadata.pkl")

if not os.path.exists(tflink_path):
    tflink_df, tflink_metadata = tflink.get_tf_target(
        bridgedb_df=bridgedb_df,
        tf_file="TFLink_Homo_sapiens_interactions_All_simpleFormat_v1.0.tsv.gz",
        filename="data/human/tflink_human.tsv.gz",
        filter_deg=True,
    )
    tflink_df.to_pickle(tflink_path)
    with open(tflink_metadata_path, "wb") as file:
        pickle.dump(tflink_metadata, file)
else:
    tflink_df = pd.read_pickle(tflink_path)
    with open(tflink_metadata_path, "rb") as file:
        tflink_metadata = pickle.load(file)

tflink_df.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea,is_tf,is_target,its_target,its_tf
0,ENSG00000000003,Ensembl,7105,NCBI Gene,TSPAN6,106.899174,-0.401659,0.143909,-2.791057,0.005254,0.0779,2.279542,False,True,,


### Enrichment analysis using g:Profiler
all the pathways and annotations are being added despite being significance.

In [7]:
gprofiler_path = os.path.join(base_dir, "data/human/gprofiler.pkl")
gprofiler_metadata_path = os.path.join(base_dir, "data/human/gprofiler_metadata.pkl")

if not os.path.exists(gprofiler_path):
    gprofiler_df, gprofiler_metadata = gprofiler.get_gene_enrichment(
        bridgedb_df=bridgedb_df)
    gprofiler_df.to_pickle(gprofiler_path)
    with open(gprofiler_metadata_path, "wb") as file:
        pickle.dump(gprofiler_metadata, file)
else:
    gprofiler_df = pd.read_pickle(gprofiler_path)
    with open(gprofiler_metadata_path, "rb") as file:
        gprofiler_metadata = pickle.load(file)

gprofiler_df.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:bp,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp
0,ENSG00000000003,Ensembl,ENSG00000000003,Ensembl,TSPAN6,106.899174,-0.401659,0.143909,-2.791057,0.005254,...,,,,,,,,,,


In [8]:
gprofiler_df.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:bp,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp
0,ENSG00000000003,Ensembl,ENSG00000000003,Ensembl,TSPAN6,106.899174,-0.401659,0.143909,-2.791057,0.005254,...,,,,,,,,,,


In [9]:
gprofiler_df[gprofiler_df["g:Profiler_reac"].notna()].head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:bp,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp
182,ENSG00000010256,Ensembl,ENSG00000010256,Ensembl,UQCRC1,12502.431814,-0.379555,0.094841,-4.002025,6.3e-05,...,"[{'id': 'GO:0045333', 'name': 'cellular respir...","[{'id': 'GO:0005739', 'name': 'mitochondrion',...","[{'id': 'GO:0016491', 'name': 'oxidoreductase ...","[{'id': 'HP:0012444', 'name': 'Brain atrophy',...","[{'id': 'HPA:0271133', 'name': 'kidney; distal...","[{'id': 'KEGG:01100', 'name': 'Metabolic pathw...","[{'id': 'MIRNA:hsa-miR-320b', 'name': 'hsa-miR...","[{'id': 'REAC:R-HSA-1428517', 'name': 'The cit...","[{'id': 'TF:M08968_1', 'name': 'Factor: RXR-AL...","[{'id': 'WP:WP111', 'name': 'Electron transpor..."


### Add MitoCarta data

In [10]:
mitocarta_path = os.path.join(base_dir, "data/human/mitocarta.pkl")
mitocarta_metadata_path = os.path.join(base_dir, "data/human/mitocarta_metadata.pkl")

if not os.path.exists(mitocarta_path):
    mitocarta_df, mitocarta_metadata = mitocarta.get_gene_mito_pathways(
        bridgedb_df=bridgedb_df,
        mitocarta_file="Human.MitoCarta3.0.xls",
        filename="data/human/mitocarta3.0_human.xls",
        species="hsapiens",
        sheet_name="A Human MitoCarta3.0"
    )
    mitocarta_df.to_pickle(mitocarta_path)
    with open(mitocarta_metadata_path, "wb") as file:
        pickle.dump(mitocarta_metadata, file)
else:
    mitocarta_df = pd.read_pickle(mitocarta_path)
    with open(mitocarta_metadata_path, "rb") as file:
        mitocarta_metadata = pickle.load(file)

mitocarta_df.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea,MitoCarta
0,ENSG00000000003,Ensembl,ENSG00000000003,Ensembl,TSPAN6,106.899174,-0.401659,0.143909,-2.791057,0.005254,0.0779,2.279542,


In [11]:
mitocarta_df[mitocarta_df["identifier"] == "ENSG00000005156"]["MitoCarta"].to_dict()

{65: [{'gene_description': 'DNA ligase 3',
   'evidence': 'literature, targetP signal, mito protein domain, coexpression',
   'sub_mito_localization': 'Matrix',
   'mito_pathways': 'mtDNA repair',
   'hpa_location': 'Nucleoplasm (Supported)',
   'tissue_expression': nan}]}

In [12]:
mitocarta_df[mitocarta_df["identifier"] == "ENSG00000179091"]["MitoCarta"].to_dict()

{9339: [{'gene_description': 'cytochrome c1',
   'evidence': 'literature, APEX_IMS, APEX_matrix, targetP signal, yeast mito homolog++, Rickettsial homolog, mito protein domain+, induction, coexpression++, MS/MS++',
   'sub_mito_localization': 'MIM',
   'mito_pathways': 'OXPHOS subunits',
   'hpa_location': 'Mitochondria (Supported)',
   'tissue_expression': 'all 14'}]}

In [13]:
mitocarta_df[mitocarta_df["identifier"] == "ENSG00000167186"]["MitoCarta"].to_dict()

{8029: [{'gene_description': 'coenzyme Q7, hydroxylase',
   'evidence': 'literature, targetP signal+, yeast mito homolog++, Rickettsial homolog, mito protein domain+, coexpression++, MS/MS++',
   'sub_mito_localization': 'MIM',
   'mito_pathways': 'Coenzyme Q metabolism',
   'hpa_location': 'Plasma membrane (Approved)',
   'tissue_expression': 'all 14'}]}

### Protein-Protein Interactions from STRING

In [14]:
string_path = os.path.join(base_dir, "data/human/string.pkl")
string_metadata_path = os.path.join(base_dir, "data/human/string_metadata.pkl")

if not os.path.exists(string_path):
    ppi_df, ppi_metadata = stringdb.get_ppi(
        bridgedb_df=bridgedb_df[bridgedb_df["padj_dea"] <= 0.01]
    )
    ppi_df.to_pickle(string_path)
    with open(string_metadata_path, "wb") as file:
        pickle.dump(ppi_metadata, file)
else:
    ppi_df = pd.read_pickle(string_path)
    with open(string_metadata_path, "rb") as file:
        ppi_metadata = pickle.load(file)

ppi_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea,StringDB_ppi
0,ENSG00000005700,Ensembl,ENSG00000005700,Ensembl,IBTK,1511.600789,-0.436326,0.103624,-4.210661,2.5e-05,0.003024,4.594099,"[{'stringdb_link_to': 'ENSG00000198886', 'Ense..."
1,ENSG00000010256,Ensembl,ENSG00000010256,Ensembl,UQCRC1,12502.431814,-0.379555,0.094841,-4.002025,6.3e-05,0.005194,4.202023,"[{'stringdb_link_to': 'ENSG00000083123', 'Ense..."
2,ENSG00000023228,Ensembl,ENSG00000023228,Ensembl,NDUFS1,10790.035279,-0.482229,0.11943,-4.037767,5.4e-05,0.004715,4.26791,"[{'stringdb_link_to': 'ENSG00000010256', 'Ense..."
3,ENSG00000023330,Ensembl,ENSG00000023330,Ensembl,ALAS1,1874.194907,-0.607163,0.161082,-3.769279,0.000164,0.009256,3.785899,"[{'stringdb_link_to': 'ENSG00000100209', 'Ense..."
4,ENSG00000025039,Ensembl,ENSG00000025039,Ensembl,RRAGD,3945.823792,-0.296703,0.067217,-4.414091,1e-05,0.001894,4.993811,"[{'stringdb_link_to': 'ENSG00000123643', 'Ense..."


In [15]:
ppi_df[STRING_PPI_COL].to_dict()

{0: [{'stringdb_link_to': 'ENSG00000198886',
   'Ensembl': 'Ensembl:ENSP00000354961',
   'score': 0.457,
   'Uniprot-TrEMBL': 'IBTK'}],
 1: [{'stringdb_link_to': 'ENSG00000083123',
   'Ensembl': 'Ensembl:ENSP00000318351',
   'score': 0.423,
   'Uniprot-TrEMBL': 'UQCRC1'},
  {'stringdb_link_to': 'ENSG00000198899',
   'Ensembl': 'Ensembl:ENSP00000354632',
   'score': 0.435,
   'Uniprot-TrEMBL': 'UQCRC1'},
  {'stringdb_link_to': 'ENSG00000067225',
   'Ensembl': 'Ensembl:ENSP00000320171',
   'score': 0.442,
   'Uniprot-TrEMBL': 'UQCRC1'},
  {'stringdb_link_to': 'ENSG00000136143',
   'Ensembl': 'Ensembl:ENSP00000494360',
   'score': 0.462,
   'Uniprot-TrEMBL': 'UQCRC1'},
  {'stringdb_link_to': 'ENSG00000110435',
   'Ensembl': 'Ensembl:ENSP00000227868',
   'score': 0.5,
   'Uniprot-TrEMBL': 'UQCRC1'},
  {'stringdb_link_to': 'ENSG00000154518',
   'Ensembl': 'Ensembl:ENSP00000284727',
   'score': 0.614,
   'Uniprot-TrEMBL': 'UQCRC1'},
  {'stringdb_link_to': 'ENSG00000127884',
   'Ensembl': 'En

## Graph generation 

### Combine all data and metadata

In [20]:
ppi_df.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,padj_dea,minus_log10_pvalue_dea,StringDB_ppi
0,ENSG00000005700,Ensembl,ENSG00000005700,Ensembl,IBTK,1511.600789,-0.436326,0.103624,-4.210661,2.5e-05,0.003024,4.594099,"[{'stringdb_link_to': 'ENSG00000198886', 'Ense..."


In [16]:
combined_df = combine_sources(
    bridgedb_df,
    [
        tflink_df,
        mitocarta_df,
        gprofiler_df,
        ppi_df,
    ],
)

In [19]:
combined_df[combined_df["identifier"] == "ENSG00000167186"]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,StringDB_ppi
8070,ENSG00000167186,Ensembl,ENSG00000167186,Ensembl,COQ7,974.044694,-0.162285,0.105202,-1.542607,0.122926,...,,,,,,,,,,


In [20]:
combined_df.to_pickle("data/human/combined_df.pkl")

In [21]:
combined_metadata = create_or_append_to_metadata(
    bridgedb_metadata,
    [
        tflink_metadata,
        mitocarta_metadata,
        gprofiler_metadata,
        ppi_metadata,
    ],
)

In [22]:
combined_metadata

[{'datasource': 'TFLink',
  'metadata': {'download date': '2025-01-09 11:52:11',
   'download link': 'https://cdn.netbiol.org/tflink/download_files/TFLink_Homo_sapiens_interactions_All_simpleFormat_v1.0.tsv.gz'}},
 {'datasource': 'MitoCarta',
  'metadata': {'download date': '2025-01-09 11:55:20',
   'download link': 'https://personal.broadinstitute.org/scalvo/MitoCarta3.0//Human.MitoCarta3.0.xls'}},
 {'datasource': 'g:Profiler',
  'metadata': {'biomart': 'Ensembl',
   'biomart_version': '111',
   'display_name': 'Human',
   'genebuild': 'GRCh38.p14',
   'gprofiler_version': 'e111_eg58_p18_f463989d',
   'organism': 'hsapiens',
   'sources': {'CORUM': {'name': 'CORUM protein complexes',
     'version': '28.11.2022 Corum 4.1'},
    'GO:BP': {'name': 'biological process',
     'version': 'annotations: BioMart\nclasses: releases/2024-01-17'},
    'GO:CC': {'name': 'cellular component',
     'version': 'annotations: BioMart\nclasses: releases/2024-01-17'},
    'GO:MF': {'name': 'molecular fu

In [23]:
with open("data/human/combined_metadata.pkl", "wb") as out:
    pickle.dump(combined_metadata, out)

### Create a graph from the annotated dataframe

In [24]:
combined_df.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:cc,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,StringDB_ppi
0,ENSG00000000003,Ensembl,ENSG00000000003,Ensembl,TSPAN6,106.899174,-0.401659,0.143909,-2.791057,0.005254,...,,,,,,,,,,


In [25]:
pygraph = generator.save_graph(
    combined_df=combined_df,
    combined_metadata=combined_metadata,
    graph_name="examples",
    graph_dir="./data/human",
)

Combined DataFrame saved in ./data/human/examples/examples_df.pkl
Metadata saved in ./data/human/examples/examples_metadata.pkl
Building graph: 100%|██████████| 12404/12404 [00:03<00:00, 3500.27it/s]
Graph is built successfully


NetworkXError: datetime.datetime(2004, 9, 1, 0, 0) is not a string

In [42]:
combined_df = pd.merge(
    ncbi_tf_gprofiler_mitocarta_df_human,
    ppi_df[["target", "StringDB_ppi"]].rename(columns={"target": "identifier"}),
    on="identifier",
    how="left",
)

In [43]:
combined_df.columns

Index(['identifier', 'identifier.source', 'target', 'target.source',
       'GENE_SYMBOL_dea', 'baseMean_dea', 'log2FoldChange_dea', 'lfcSE_dea',
       'stat_dea', 'pvalue_dea', 'padj_dea', 'minus_log10_pvalue_dea', 'is_tf',
       'is_target', 'its_target', 'its_tf', 'intersections',
       'g:Profiler_corum', 'g:Profiler_go:bp', 'g:Profiler_go:cc',
       'g:Profiler_go:mf', 'g:Profiler_hp', 'g:Profiler_hpa',
       'g:Profiler_kegg', 'g:Profiler_mirna', 'g:Profiler_reac',
       'g:Profiler_tf', 'g:Profiler_wp', 'MitoCarta', 'StringDB_ppi'],
      dtype='object')

In [44]:
combined_df.head(1)

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
0,ENSG00000000003,Ensembl,7105,ncbi_gene_id,TSPAN6,106.899174,-0.401659,0.143909,-2.791057,0.005254,...,,,,,,,,,,


In [45]:
combined_df["StringDB_ppi"].to_dict()

{0: nan,
 1: nan,
 2: nan,
 3: nan,
 4: nan,
 5: nan,
 6: nan,
 7: nan,
 8: nan,
 9: nan,
 10: nan,
 11: nan,
 12: nan,
 13: nan,
 14: nan,
 15: nan,
 16: nan,
 17: nan,
 18: nan,
 19: nan,
 20: nan,
 21: nan,
 22: nan,
 23: nan,
 24: nan,
 25: nan,
 26: nan,
 27: nan,
 28: nan,
 29: nan,
 30: nan,
 31: nan,
 32: nan,
 33: nan,
 34: nan,
 35: nan,
 36: nan,
 37: nan,
 38: nan,
 39: nan,
 40: nan,
 41: nan,
 42: nan,
 43: nan,
 44: nan,
 45: nan,
 46: nan,
 47: nan,
 48: nan,
 49: nan,
 50: nan,
 51: nan,
 52: nan,
 53: nan,
 54: nan,
 55: nan,
 56: nan,
 57: nan,
 58: nan,
 59: nan,
 60: nan,
 61: nan,
 62: nan,
 63: nan,
 64: nan,
 65: nan,
 66: nan,
 67: nan,
 68: nan,
 69: nan,
 70: nan,
 71: nan,
 72: nan,
 73: nan,
 74: nan,
 75: nan,
 76: nan,
 77: nan,
 78: nan,
 79: nan,
 80: nan,
 81: [{'stringdb_link_to': 'ENSG00000198886',
   'Ensembl': 'Ensembl:ENSP00000354961',
   'score': 0.457,
   'Uniprot-TrEMBL': 'IBTK'}],
 82: nan,
 83: nan,
 84: nan,
 85: nan,
 86: nan,
 87: nan,
 88

In [46]:
filename_human = "data/human/combined_df.pkl"

combined_df.to_pickle(filename_human)

### graph

In [9]:
filename_human = "data/human/combined_df.pkl"
with open(filename_human, "rb") as f:
    combined_df = pickle.load(f)

In [10]:
# Extract all 'NCBI.GeneID.TF' values into a single list
ncbi_gene_ids = (
    combined_df["its_tf"]
    .apply(lambda x: [d["NCBI.GeneID.TF"] for d in x] if isinstance(x, list) else [])
    .explode()
    .dropna()
    .unique()
    .tolist()
)

len(ncbi_gene_ids)

952

In [11]:
combined_df_tf = combined_df[combined_df["target"].isin(ncbi_gene_ids)]
combined_df_sig = combined_df[combined_df["padj_dea"] <= 0.01]
combined_df_sig = combined_df_sig[~combined_df_sig["target"].isin(ncbi_gene_ids)]
combined_df_sig.shape

(205, 30)

In [12]:
combined_df_tf_sig = pd.concat([combined_df_sig, combined_df_tf], axis=0, ignore_index=True)
combined_df_tf_sig.shape

(1158, 30)

In [13]:
combined_df_tf_sig[combined_df_tf_sig["GENE_SYMBOL_dea"] == "COQ10A"]["its_tf"].apply(
    lambda x: isinstance(x, list)
)

81    True
Name: its_tf, dtype: bool

In [14]:
combined_df_tf_sig[combined_df_tf_sig["GENE_SYMBOL_dea"] == "COQ10A"]["its_tf"].to_dict()

{81: [{'NCBI.GeneID.TF': '112398',
   'Ensembl.GeneID.TF': 'ENSG00000269858',
   'Name.TF': 'EGLN2',
   'UniprotID.TF': 'Q96KS0',
   'TF.TFLink.ortho': 'Mm:Q91YE2;Rn:Q6AYU4',
   'TF.nonTFLink.ortho': '-',
   'Detection.method': 'chromatin immunoprecipitation assay',
   'PubmedID': '29126285',
   'Source.database': 'ReMap',
   'Small-scale.evidence': 'No'},
  {'NCBI.GeneID.TF': '11091',
   'Ensembl.GeneID.TF': 'ENSG00000196363',
   'Name.TF': 'WDR5',
   'UniprotID.TF': 'P61964',
   'TF.TFLink.ortho': 'Dm:Q9V3J8;Dr:Q7ZTX2;Mm:P61965',
   'TF.nonTFLink.ortho': 'Rn:Q498M4',
   'Detection.method': 'chromatin immunoprecipitation assay',
   'PubmedID': '27924024',
   'Source.database': 'GTRD',
   'Small-scale.evidence': 'No'},
  {'NCBI.GeneID.TF': '1024',
   'Ensembl.GeneID.TF': 'ENSG00000132964',
   'Name.TF': 'CDK8',
   'UniprotID.TF': 'P49336',
   'TF.TFLink.ortho': 'Dm:Q9VT57;Dr:A8E4S2;Mm:Q8R3L8',
   'TF.nonTFLink.ortho': '-',
   'Detection.method': 'chromatin immunoprecipitation assay',
 

In [15]:
combined_df_tf[~combined_df_tf["target"].isin(combined_df_sig["target"].tolist())]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
7,ENSG00000001167,Ensembl,4800,ncbi_gene_id,NFYA,198.963515,0.339027,0.133075,2.547627,0.010846,...,,,,,,,,,,
43,ENSG00000004487,Ensembl,23028,ncbi_gene_id,KDM1A,1310.615484,0.045821,0.067400,0.679830,0.496612,...,,,,,,,,,,
74,ENSG00000005339,Ensembl,1387,ncbi_gene_id,CREBBP,1852.242330,0.116510,0.162612,0.716489,0.473690,...,,,,,,,,,,
88,ENSG00000005889,Ensembl,7543,ncbi_gene_id,ZFX,732.044736,-0.164081,0.159735,-1.027205,0.304324,...,,,,,,,,,,
99,ENSG00000006194,Ensembl,10127,ncbi_gene_id,ZNF263,337.618503,0.141491,0.132231,1.070034,0.284604,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11743,ENSG00000275700,Ensembl,26574,ncbi_gene_id,AATF,405.250619,0.422212,0.134613,3.136484,0.001710,...,,,,,,,,,,
11756,ENSG00000276644,Ensembl,1602,ncbi_gene_id,DACH1,87.387806,-0.001658,0.179004,-0.009262,0.992610,...,,,,,,,,,,
11758,ENSG00000277258,Ensembl,7703,ncbi_gene_id,PCGF2,160.048407,0.002932,0.203806,0.014386,0.988522,...,,,,,,,,,,
11760,ENSG00000277494,Ensembl,338328,ncbi_gene_id,GPIHBP1,585.805576,-0.190933,0.355165,-0.537591,0.590860,...,,,,,,,,,,


In [16]:
combined_df_tf_sig[combined_df_tf_sig["identifier"].isin(["ENSG00000001167", "ENSG00000116717"])]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
55,ENSG00000116717,Ensembl,1647,ncbi_gene_id,GADD45A,448.20828,1.432184,0.377628,3.792579,0.000149,...,"[{'id': 'GO:0019899', 'name': 'enzyme binding'...",,,"[{'id': 'KEGG:05220', 'name': 'Chronic myeloid...","[{'id': 'MIRNA:hsa-miR-26b-5p', 'name': 'hsa-m...","[{'id': 'REAC:R-HSA-6791312', 'name': 'TP53 Re...","[{'id': 'TF:M07322_1', 'name': 'Factor: HSF4; ...","[{'id': 'WP:WP3640', 'name': 'Imatinib and chr...",,"[{'stringdb_link_to': 'ENSG00000120129', 'Ense..."
205,ENSG00000001167,Ensembl,4800,ncbi_gene_id,NFYA,198.963515,0.339027,0.133075,2.547627,0.010846,...,,,,,,,,,,


In [17]:
combined_df_tf_sig[combined_df_tf_sig["identifier"] == "ENSG00000001167"]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
205,ENSG00000001167,Ensembl,4800,ncbi_gene_id,NFYA,198.963515,0.339027,0.133075,2.547627,0.010846,...,,,,,,,,,,


In [18]:
combined_df_tf_sig[combined_df_tf_sig["GENE_SYMBOL_dea"] == "GADD45A"]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
55,ENSG00000116717,Ensembl,1647,ncbi_gene_id,GADD45A,448.20828,1.432184,0.377628,3.792579,0.000149,...,"[{'id': 'GO:0019899', 'name': 'enzyme binding'...",,,"[{'id': 'KEGG:05220', 'name': 'Chronic myeloid...","[{'id': 'MIRNA:hsa-miR-26b-5p', 'name': 'hsa-m...","[{'id': 'REAC:R-HSA-6791312', 'name': 'TP53 Re...","[{'id': 'TF:M07322_1', 'name': 'Factor: HSF4; ...","[{'id': 'WP:WP3640', 'name': 'Imatinib and chr...",,"[{'stringdb_link_to': 'ENSG00000120129', 'Ense..."


In [19]:
combined_df_tf_sig[combined_df_tf_sig["identifier"] == "ENSG00000001167"]["its_target"].to_dict()

{205: [{'NCBI.GeneID.Target': '1647',
   'Ensembl.GeneID.Target': 'ENSG00000116717',
   'Name.Target': 'GADD45A',
   'UniprotID.Target': 'P24522',
   'Target.TFLink.ortho': 'Dr:Q6GMM1;Mm:P48316;Rn:Q66HL6',
   'Target.nonTFLink.ortho': '-',
   'Detection.method': 'chromatin immunoprecipitation assay;inferred by curator',
   'PubmedID': '29126285;11525640;11420680;27924024;29087512',
   'Source.database': 'GTRD;ReMap;TRRUST',
   'Small-scale.evidence': 'Yes'},
  {'NCBI.GeneID.Target': '84271',
   'Ensembl.GeneID.Target': 'ENSG00000100227',
   'Name.Target': 'POLDIP3',
   'UniprotID.Target': 'Q9BY77',
   'Target.TFLink.ortho': 'Mm:Q8BG81;Rn:D4A2B0',
   'Target.nonTFLink.ortho': 'Dr:A0A0R4ILC0',
   'Detection.method': 'chromatin immunoprecipitation assay',
   'PubmedID': '29126285;27924024',
   'Source.database': 'GTRD;ReMap',
   'Small-scale.evidence': 'No'},
  {'NCBI.GeneID.Target': '4043',
   'Ensembl.GeneID.Target': 'ENSG00000163956',
   'Name.Target': 'LRPAP1',
   'UniprotID.Target': 

In [20]:
combined_df_tf_sig["its_target"][10]

In [21]:
# Extract all targets for NFYA ('NCBI.GeneID.TF')
ncbi_gene_ids_NFYA_targets = (
    combined_df_tf_sig[combined_df_tf_sig["identifier"] == "ENSG00000001167"]["its_target"]
    .apply(lambda x: [d["NCBI.GeneID.Target"] for d in x] if isinstance(x, list) else [])
    .explode()
    .dropna()
    .unique()
    .tolist()
)

len(ncbi_gene_ids_NFYA_targets)

194

In [50]:
# combined_df = combine_sources(
#     merged_df_human,
#     [
#     ppi_df,
#     ],
# )

In [30]:
# combined_metadata = create_or_append_to_metadata(
#     bridgedb_metadata_human,
#     [
#     ppi_metadata,
#     get_data_versions("hsapiens")
#     ],
# )

In [55]:
# combined_metadata

[{'datasource': 'StringDB',
  'metadata': {'source_version': {'source_version': '12.0'}},
  'query': {'size': 222,
   'input_type': 'HGNC',
   'number_of_added_edges': 532,
   'time': '0:00:00.651298',
   'date': '2024-12-18 14:43:18',
   'url': 'https://string-db.org/api'}},
 {'biomart': 'Ensembl',
  'biomart_version': '111',
  'display_name': 'Human',
  'genebuild': 'GRCh38.p14',
  'gprofiler_version': 'e111_eg58_p18_f463989d',
  'organism': 'hsapiens',
  'sources': {'CORUM': {'name': 'CORUM protein complexes',
    'version': '28.11.2022 Corum 4.1'},
   'GO:BP': {'name': 'biological process',
    'version': 'annotations: BioMart\nclasses: releases/2024-01-17'},
   'GO:CC': {'name': 'cellular component',
    'version': 'annotations: BioMart\nclasses: releases/2024-01-17'},
   'GO:MF': {'name': 'molecular function',
    'version': 'annotations: BioMart\nclasses: releases/2024-01-17'},
   'HP': {'name': 'Human Phenotype Ontology',
    'version': 'annotations: 01.2024\nclasses: None'},
 

In [22]:
combined_df[combined_df["identifier"] == "CDKN1A"]["StringDB_ppi"].to_dict()

{}

In [23]:
combined_df_tf_sig[combined_df_tf_sig["GENE_SYMBOL_dea"].str.contains("OTF6", case=False, na=False)]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi


In [24]:
combined_df[combined_df["GENE_SYMBOL_dea"] == "MT-ND4"]

Unnamed: 0,identifier,identifier.source,target,target.source,GENE_SYMBOL_dea,baseMean_dea,log2FoldChange_dea,lfcSE_dea,stat_dea,pvalue_dea,...,g:Profiler_go:mf,g:Profiler_hp,g:Profiler_hpa,g:Profiler_kegg,g:Profiler_mirna,g:Profiler_reac,g:Profiler_tf,g:Profiler_wp,MitoCarta,StringDB_ppi
10722,ENSG00000198886,Ensembl,4538,ncbi_gene_id,MT-ND4,1098081.0,-0.722805,0.164567,-4.392165,1.1e-05,...,"[{'id': 'GO:0016491', 'name': 'oxidoreductase ...","[{'id': 'HP:0003128', 'name': 'Lactic acidosis...","[{'id': 'HPA:0540053', 'name': 'stomach 1; gla...","[{'id': 'KEGG:01100', 'name': 'Metabolic pathw...","[{'id': 'MIRNA:hsa-miR-320a', 'name': 'hsa-miR...","[{'id': 'REAC:R-HSA-1428517', 'name': 'The cit...","[{'id': 'TF:M07428', 'name': 'Factor: Six-3; m...","[{'id': 'WP:WP111', 'name': 'Electron transpor...","[{'gene_description': 'NADH dehydrogenase, sub...","[{'stringdb_link_to': 'ENSG00000010256', 'Ense..."


In [25]:
combined_df[combined_df["identifier"] == "ENSG00000198886"]["StringDB_ppi"].to_dict()

{10722: [{'stringdb_link_to': 'ENSG00000010256',
   'Ensembl': 'ENSP00000203407',
   'score': 0.971,
   'Uniprot-TrEMBL': 'MT-ND4'},
  {'stringdb_link_to': 'ENSG00000108179',
   'Ensembl': 'ENSP00000225174',
   'score': 0.404,
   'Uniprot-TrEMBL': 'MT-ND4'},
  {'stringdb_link_to': 'ENSG00000110955',
   'Ensembl': 'ENSP00000262030',
   'score': 0.553,
   'Uniprot-TrEMBL': 'MT-ND4'},
  {'stringdb_link_to': 'ENSG00000154518',
   'Ensembl': 'ENSP00000284727',
   'score': 0.513,
   'Uniprot-TrEMBL': 'MT-ND4'},
  {'stringdb_link_to': 'ENSG00000005700',
   'Ensembl': 'ENSP00000305721',
   'score': 0.457,
   'Uniprot-TrEMBL': 'MT-ND4'},
  {'stringdb_link_to': 'ENSG00000198804',
   'Ensembl': 'ENSP00000354499',
   'score': 0.999,
   'Uniprot-TrEMBL': 'MT-ND4'},
  {'stringdb_link_to': 'ENSG00000198727',
   'Ensembl': 'ENSP00000354554',
   'score': 0.999,
   'Uniprot-TrEMBL': 'MT-ND4'},
  {'stringdb_link_to': 'ENSG00000198899',
   'Ensembl': 'ENSP00000354632',
   'score': 0.999,
   'Uniprot-TrEMB

In [26]:
combined_df[combined_df["identifier"] == "ENSG00000005700"]["StringDB_ppi"].to_dict()

{81: [{'stringdb_link_to': 'ENSG00000198886',
   'Ensembl': 'Ensembl:ENSP00000354961',
   'score': 0.457,
   'Uniprot-TrEMBL': 'IBTK'}]}

In [27]:
pygraph = generator.save_graph(
    combined_df=combined_df_tf_sig,
    combined_metadata=bridgedb_metadata_human,
    graph_name="mito_test",
    graph_dir="./data",
)

Combined DataFrame saved in ./data/mito_test/mito_test_df.pkl
Metadata saved in ./data/mito_test/mito_test_metadata.pkl
Building graph:  23%|██▎       | 265/1158 [00:00<00:02, 385.67it/s]

I am tf:  ENSG00000116717 ENSG00000001167
None
I am tf:  ENSG00000100227 ENSG00000001167
None
I am tf:  ENSG00000163956 ENSG00000001167
None
I am tf:  ENSG00000108551 ENSG00000001167
None
I am tf:  ENSG00000101187 ENSG00000001167
None
I am tf:  ENSG00000142188 ENSG00000001167
None
I am tf:  ENSG00000115525 ENSG00000001167
None
I am tf:  ENSG00000157045 ENSG00000001167
None
I am tf:  ENSG00000141084 ENSG00000001167
None
I am tf:  ENSG00000033327 ENSG00000001167
None
I am tf:  ENSG00000167740 ENSG00000001167
None
I am tf:  ENSG00000067704 ENSG00000001167
None
I am tf:  ENSG00000168675 ENSG00000001167
None
I am tf:  ENSG00000166619 ENSG00000001167
None
I am tf:  ENSG00000107745 ENSG00000001167
None
I am tf:  ENSG00000182712 ENSG00000001167
None
I am tf:  ENSG00000023228 ENSG00000001167
None
I am tf:  ENSG00000091140 ENSG00000001167
None
I am tf:  ENSG00000197982 ENSG00000001167
None
I am tf:  ENSG00000071794 ENSG00000001167
None
I am tf:  ENSG00000125863 ENSG00000001167
None
I am tf:  ENS

Building graph:  34%|███▍      | 391/1158 [00:01<00:02, 344.81it/s]

I am tf:  ENSG00000182973 ENSG00000080298
None
I am tf:  ENSG00000103056 ENSG00000080298
None
I am tf:  ENSG00000203965 ENSG00000080298
None
I am tf:  ENSG00000148120 ENSG00000080298
None
I am tf:  ENSG00000154240 ENSG00000080298
None
I am tf:  ENSG00000128918 ENSG00000080298
None
I am tf:  ENSG00000067369 ENSG00000080298
None
I am tf:  ENSG00000113916 ENSG00000080298
None
I am tf:  ENSG00000152904 ENSG00000080298
None
I am tf:  ENSG00000100577 ENSG00000080298
None
I am tf:  ENSG00000143252 ENSG00000080298
None
I am tf:  ENSG00000163956 ENSG00000080298
None
I am tf:  ENSG00000142188 ENSG00000080298
None
I am tf:  ENSG00000141084 ENSG00000080298
None
I am tf:  ENSG00000167740 ENSG00000080298
None
I am tf:  ENSG00000166619 ENSG00000080298
None
I am tf:  ENSG00000154814 ENSG00000080298
None
I am tf:  ENSG00000159884 ENSG00000080298
None
I am tf:  ENSG00000135469 ENSG00000080298
None
I am tf:  ENSG00000141252 ENSG00000080298
None
I am tf:  ENSG00000089101 ENSG00000080298
None
I am tf:  ENS

Building graph:  40%|████      | 464/1158 [00:01<00:02, 326.13it/s]

I am tf:  ENSG00000167740 ENSG00000102935
None
I am tf:  ENSG00000168209 ENSG00000102935
None
I am tf:  ENSG00000135469 ENSG00000102935
None
I am tf:  ENSG00000157557 ENSG00000102935
None
I am tf:  ENSG00000141759 ENSG00000102935
None
I am tf:  ENSG00000120129 ENSG00000102935
None
I am tf:  ENSG00000029534 ENSG00000102935
None
I am tf:  ENSG00000025039 ENSG00000102935
None
I am tf:  ENSG00000163171 ENSG00000102935
None
I am tf:  ENSG00000072274 ENSG00000102935
None
I am tf:  ENSG00000166165 ENSG00000102935
None
I am tf:  ENSG00000197448 ENSG00000102935
None
I am tf:  ENSG00000154734 ENSG00000102935
None
I am tf:  ENSG00000145476 ENSG00000102935
None
I am tf:  ENSG00000100813 ENSG00000102935
None
I am tf:  ENSG00000137571 ENSG00000102935
None
I am tf:  ENSG00000101187 ENSG00000102935
None
I am tf:  ENSG00000071205 ENSG00000102935
None
I am tf:  ENSG00000113916 ENSG00000102974
None
I am tf:  ENSG00000136997 ENSG00000102974
None
I am tf:  ENSG00000089101 ENSG00000102974
None
I am tf:  ENS

Building graph:  43%|████▎     | 498/1158 [00:01<00:02, 324.50it/s]

I am tf:  ENSG00000131873 ENSG00000111880
None
I am tf:  ENSG00000067225 ENSG00000111880
None
I am tf:  ENSG00000148120 ENSG00000111880
None
I am tf:  ENSG00000128016 ENSG00000111880
None
I am tf:  ENSG00000113163 ENSG00000111880
None
I am tf:  ENSG00000166797 ENSG00000111880
None
I am tf:  ENSG00000049323 ENSG00000111880
None
I am tf:  ENSG00000167996 ENSG00000111880
None
I am tf:  ENSG00000154240 ENSG00000111880
None
I am tf:  ENSG00000128918 ENSG00000111880
None
I am tf:  ENSG00000113916 ENSG00000111880
None
I am tf:  ENSG00000115307 ENSG00000111880
None
I am tf:  ENSG00000159388 ENSG00000111880
None
I am tf:  ENSG00000175198 ENSG00000111880
None
I am tf:  ENSG00000078967 ENSG00000111880
None
I am tf:  ENSG00000111641 ENSG00000111880
None
I am tf:  ENSG00000153107 ENSG00000111880
None
I am tf:  ENSG00000196177 ENSG00000111880
None
I am tf:  ENSG00000120802 ENSG00000111880
None
I am tf:  ENSG00000154518 ENSG00000111880
None
I am tf:  ENSG00000100813 ENSG00000111880
None
I am tf:  ENS

Building graph:  49%|████▉     | 569/1158 [00:01<00:01, 300.70it/s]

I am tf:  ENSG00000100906 ENSG00000118260
None
I am tf:  ENSG00000175198 ENSG00000118260
None
I am tf:  ENSG00000152904 ENSG00000118260
None
I am tf:  ENSG00000100209 ENSG00000118260
None
I am tf:  ENSG00000140905 ENSG00000118260
None
I am tf:  ENSG00000078967 ENSG00000118260
None
I am tf:  ENSG00000273540 ENSG00000118260
None
I am tf:  ENSG00000100577 ENSG00000118260
None
I am tf:  ENSG00000125246 ENSG00000118260
None
I am tf:  ENSG00000111641 ENSG00000118260
None
I am tf:  ENSG00000153107 ENSG00000118260
None
I am tf:  ENSG00000243056 ENSG00000118260
None
I am tf:  ENSG00000145476 ENSG00000118260
None
I am tf:  ENSG00000196177 ENSG00000118260
None
I am tf:  ENSG00000120802 ENSG00000118260
None
I am tf:  ENSG00000145362 ENSG00000118260
None
I am tf:  ENSG00000106049 ENSG00000118260
None
I am tf:  ENSG00000123643 ENSG00000118260
None
I am tf:  ENSG00000154518 ENSG00000118260
None
I am tf:  ENSG00000100813 ENSG00000118260
None
I am tf:  ENSG00000163083 ENSG00000118260
None
I am tf:  ENS

Building graph:  54%|█████▍    | 629/1158 [00:02<00:02, 204.62it/s]

 ENSG00000141759 ENSG00000127511
None
I am tf:  ENSG00000160214 ENSG00000127511
None
I am tf:  ENSG00000141252 ENSG00000127511
None
I am tf:  ENSG00000089101 ENSG00000127511
None
I am tf:  ENSG00000125166 ENSG00000127511
None
I am tf:  ENSG00000120129 ENSG00000127511
None
I am tf:  ENSG00000204569 ENSG00000127511
None
I am tf:  ENSG00000138759 ENSG00000127511
None
I am tf:  ENSG00000116704 ENSG00000127511
None
I am tf:  ENSG00000072274 ENSG00000127511
None
I am tf:  ENSG00000174021 ENSG00000127511
None
I am tf:  ENSG00000125148 ENSG00000127511
None
I am tf:  ENSG00000129347 ENSG00000127511
None
I am tf:  ENSG00000161513 ENSG00000127511
None
I am tf:  ENSG00000148672 ENSG00000127511
None
I am tf:  ENSG00000221978 ENSG00000127511
None
I am tf:  ENSG00000166165 ENSG00000127511
None
I am tf:  ENSG00000123124 ENSG00000127511
None
I am tf:  ENSG00000152620 ENSG00000127511
None
I am tf:  ENSG00000175782 ENSG00000127511
None
I am tf:  ENSG00000120029 ENSG00000127511
None
I am tf:  ENSG00000112

Building graph:  65%|██████▌   | 753/1158 [00:02<00:01, 318.15it/s]

I am tf:  ENSG00000104325 ENSG00000137693
None
I am tf:  ENSG00000159884 ENSG00000137693
None
I am tf:  ENSG00000165887 ENSG00000137693
None
I am tf:  ENSG00000184924 ENSG00000137693
None
I am tf:  ENSG00000148677 ENSG00000137693
None
I am tf:  ENSG00000144306 ENSG00000137693
None
I am tf:  ENSG00000221944 ENSG00000137693
None
I am tf:  ENSG00000198763 ENSG00000137693
None
I am tf:  ENSG00000146416 ENSG00000137693
None
I am tf:  ENSG00000141759 ENSG00000137693
None
I am tf:  ENSG00000141252 ENSG00000137693
None
I am tf:  ENSG00000089101 ENSG00000137693
None
I am tf:  ENSG00000125166 ENSG00000137693
None
I am tf:  ENSG00000120129 ENSG00000137693
None
I am tf:  ENSG00000116717 ENSG00000137693
None
I am tf:  ENSG00000139998 ENSG00000137693
None
I am tf:  ENSG00000138759 ENSG00000137693
None
I am tf:  ENSG00000163171 ENSG00000137693
None
I am tf:  ENSG00000116704 ENSG00000137693
None
I am tf:  ENSG00000125148 ENSG00000137693
None
I am tf:  ENSG00000129347 ENSG00000137693
None
I am tf:  ENS

Building graph:  73%|███████▎  | 845/1158 [00:02<00:00, 330.94it/s]


I am tf:  ENSG00000171617 ENSG00000162702
None
I am tf:  ENSG00000180901 ENSG00000162702
None
I am tf:  ENSG00000163961 ENSG00000162702
None
I am tf:  ENSG00000135778 ENSG00000162702
None
I am tf:  ENSG00000164983 ENSG00000162702
None
I am tf:  ENSG00000203965 ENSG00000162702
None
I am tf:  ENSG00000067225 ENSG00000162702
None
I am tf:  ENSG00000128016 ENSG00000162702
None
I am tf:  ENSG00000146281 ENSG00000162702
None
I am tf:  ENSG00000166797 ENSG00000162702
None
I am tf:  ENSG00000167996 ENSG00000162702
None
I am tf:  ENSG00000128918 ENSG00000162702
None
I am tf:  ENSG00000154734 ENSG00000162702
None
I am tf:  ENSG00000067369 ENSG00000162702
None
I am tf:  ENSG00000113916 ENSG00000162702
None
I am tf:  ENSG00000171735 ENSG00000162702
None
I am tf:  ENSG00000115307 ENSG00000162702
None
I am tf:  ENSG00000100906 ENSG00000162702
None
I am tf:  ENSG00000152904 ENSG00000162702
None
I am tf:  ENSG00000111641 ENSG00000162702
None
I am tf:  ENSG00000120802 ENSG00000162702
None
I am tf:  EN

Building graph:  84%|████████▍ | 977/1158 [00:03<00:00, 391.36it/s]

I am tf:  ENSG00000154814 ENSG00000168610
None
I am tf:  ENSG00000104325 ENSG00000168610
None
I am tf:  ENSG00000168209 ENSG00000168610
None
I am tf:  ENSG00000087053 ENSG00000168610
None
I am tf:  ENSG00000091986 ENSG00000168610
None
I am tf:  ENSG00000148090 ENSG00000168610
None
I am tf:  ENSG00000159884 ENSG00000168610
None
I am tf:  ENSG00000165887 ENSG00000168610
None
I am tf:  ENSG00000184924 ENSG00000168610
None
I am tf:  ENSG00000141429 ENSG00000168610
None
I am tf:  ENSG00000148677 ENSG00000168610
None
I am tf:  ENSG00000135469 ENSG00000168610
None
I am tf:  ENSG00000144306 ENSG00000168610
None
I am tf:  ENSG00000157557 ENSG00000168610
None
I am tf:  ENSG00000221944 ENSG00000168610
None
I am tf:  ENSG00000198763 ENSG00000168610
None
I am tf:  ENSG00000189058 ENSG00000168610
None
I am tf:  ENSG00000146416 ENSG00000168610
None
I am tf:  ENSG00000141759 ENSG00000168610
None
I am tf:  ENSG00000160214 ENSG00000168610
None
I am tf:  ENSG00000141252 ENSG00000168610
None
I am tf:  ENS

Building graph:  93%|█████████▎| 1075/1158 [00:03<00:00, 367.89it/s]

I am tf:  ENSG00000157557 ENSG00000178175
None
I am tf:  ENSG00000221944 ENSG00000178175
None
I am tf:  ENSG00000189058 ENSG00000178175
None
I am tf:  ENSG00000141759 ENSG00000178175
None
I am tf:  ENSG00000089101 ENSG00000178175
None
I am tf:  ENSG00000120129 ENSG00000178175
None
I am tf:  ENSG00000116717 ENSG00000178175
None
I am tf:  ENSG00000196440 ENSG00000178175
None
I am tf:  ENSG00000198125 ENSG00000178175
None
I am tf:  ENSG00000138759 ENSG00000178175
None
I am tf:  ENSG00000025039 ENSG00000178175
None
I am tf:  ENSG00000174021 ENSG00000178175
None
I am tf:  ENSG00000129347 ENSG00000178175
None
I am tf:  ENSG00000166165 ENSG00000178175
None
I am tf:  ENSG00000172995 ENSG00000178175
None
I am tf:  ENSG00000142082 ENSG00000178175
None
I am tf:  ENSG00000087884 ENSG00000178175
None
I am tf:  ENSG00000148154 ENSG00000178175
None
I am tf:  ENSG00000124762 ENSG00000178175
None
I am tf:  ENSG00000226742 ENSG00000178175
None
I am tf:  ENSG00000150347 ENSG00000178175
None
I am tf:  ENS

Building graph: 100%|██████████| 1158/1158 [00:03<00:00, 319.26it/s]

I am tf:  ENSG00000120129 ENSG00000197024
None
I am tf:  ENSG00000196440 ENSG00000197024
None
I am tf:  ENSG00000139998 ENSG00000197024
None
I am tf:  ENSG00000138759 ENSG00000197024
None
I am tf:  ENSG00000029534 ENSG00000197024
None
I am tf:  ENSG00000025039 ENSG00000197024
None
I am tf:  ENSG00000112992 ENSG00000197024
None
I am tf:  ENSG00000166165 ENSG00000197024
None
I am tf:  ENSG00000152620 ENSG00000197024
None
I am tf:  ENSG00000109794 ENSG00000197024
None
I am tf:  ENSG00000163637 ENSG00000197024
None
I am tf:  ENSG00000166741 ENSG00000197024
None
I am tf:  ENSG00000087884 ENSG00000197024
None
I am tf:  ENSG00000150347 ENSG00000197024
None
I am tf:  ENSG00000171617 ENSG00000197024
None
I am tf:  ENSG00000067057 ENSG00000197024
None
I am tf:  ENSG00000149532 ENSG00000197024
None
I am tf:  ENSG00000164983 ENSG00000197024
None
I am tf:  ENSG00000067225 ENSG00000197024
None
I am tf:  ENSG00000113163 ENSG00000197024
None
I am tf:  ENSG00000167996 ENSG00000197024
None
I am tf:  ENS


Graph is built successfully
Graph saved in ./data/mito_test/mito_test_graph.pkl and ./data/mito_test/mito_test_graph.gml


In [28]:
print(pygraph)

MultiDiGraph with 12329 nodes and 144183 edges


In [29]:
from pyBiodatafuse.graph import neo4j

neo4j.save_graph_to_graphml(pygraph, "networkx_graph_test_1.graphml")

In [None]:
def add_tflink_gene_tf_subgraph(g, gene_node_label, annot_list):
    """Construct part of the graph by linking the gene to genes.

    :param g: the input graph to extend with new nodes and edges.
    :param gene_node_label: the gene node to be linked to other genes entities.
    :param annot_list: list of protein-protein interactions from StringDb.
    :returns: a NetworkX MultiDiGraph
    """
    for tf in annot_list:
        TFLINK_EDGE_LABEL = "tf_regulates"
        edge_attrs = {
            "datasource": TFLINK,
            "name_target": None,
            "uniprotid_target": None,
            "detection_method": None,
            "pubmedid": None,
            "source_database": None,
            "small_scale_evidence": None,
            "label": TFLINK_EDGE_LABEL,
        }
        edge_attrs["name_target"] = tf["Name.Target"]
        edge_attrs["uniprotid_target"] = tf["UniprotID.Target"]
        edge_attrs["detection_method"] = tf["Detection.method"]
        edge_attrs["pubmedid"] = tf["PubmedID"]
        edge_attrs["source_database"] = tf["Source.database"]
        edge_attrs["small_scale_evidence"] = tf["Small-scale.evidence"]

        edge_hash = hash(frozenset(edge_attrs.items()))
        edge_attrs["edge_hash"] = edge_hash

        edge_data = g.get_edge_data(tf["Name.Target"], gene_node_label)
        print("I am tf: ", tf["Name.Target"], gene_node_label)
        print(edge_data)

        edge_data = {} if edge_data is None else edge_data
        node_exists = [x for x, y in edge_data.items() if y["attr_dict"]["edge_hash"] == edge_hash]
        if len(node_exists) == 0 and not pd.isna(tf["Name.Target"]):
            # g.add_edge(
            #     gene_node_label,
            #     tf["Name.Target"],
            #     label=TFLINK_EDGE_LABEL,
            #     attr_dict=edge_attrs,
            # )
            g.add_edge(
                tf["Name.Target"],
                gene_node_label,
                label="tf_regulates",
                attr_dict=edge_attrs,
            )
    return g