## Explore mitochondrial impairment in tumors

**Aim**: This notebook walks through all the steps for collecting data and constructing a knowledge graph (KG) to explore mitochondrial impairment in tumors (in both human and mice).

### Import required libraries

In [None]:
# Import modules
import os
import pickle

import pandas as pd

from pyBiodatafuse import id_mapper
from pyBiodatafuse.annotators import (
    tflink,
    gprofiler,
    mitocarta,
    stringdb,
)

from pyBiodatafuse.constants import STRING_PPI_COL
from pyBiodatafuse.graph import generator, neo4j
from pyBiodatafuse.utils import (
    combine_sources,
    create_harmonized_input_file,
    create_or_append_to_metadata,
    get_identifier_of_interest,
)

# Resolve the working directory first so every cache path below is absolute.
base_dir = os.path.abspath(os.getcwd())
# All cached files in this notebook are written under data/human, so create
# that directory (and its parent "data") up front; creating only "data" would
# make the first cache write fail on a fresh checkout.
os.makedirs(os.path.join(base_dir, "data", "human"), exist_ok=True)

### Load the input files

In [None]:
# Load the differential-expression table and give the two unnamed leading
# columns meaningful names (rename returns a new frame, so nothing is mutated
# in place).
all_genes = pd.read_excel("datasets/cachexia_vs_control_all_genes.xlsx").rename(
    columns={"Unnamed: 0": "identifier", "Unnamed: 1": "GENE_SYMBOL"}
)
# NOTE(review): this preview uses padj < 0.01, whereas the annotation cells
# further down filter on 0.05 -- confirm which cut-off is intended.
deg_data = all_genes[all_genes["padj"] < 0.01]
print("Number of genes:", len(all_genes["identifier"].unique()))
deg_data.head(1)

In [None]:
# Internal check for this use case: this gene does not exist in the shared DEG table.
deg_data.loc[deg_data["identifier"] == "ENSG00000159713"]

### Entity resolution with BridgeDB

In [None]:
# Entity resolution: map the Ensembl identifiers in all_genes with
# id_mapper.bridgedb_xref. The result and its metadata are cached as pickles
# so bridgedb_xref is only called when the cache is incomplete.
pickle_path = os.path.join(base_dir, "data/human/bridgedb_df.pkl")
metadata_path = os.path.join(base_dir, "data/human/bridgedb_metadata.pkl")

# Require BOTH cache files: checking only the dataframe pickle would crash in
# the load branch whenever the metadata pickle is missing.
if os.path.exists(pickle_path) and os.path.exists(metadata_path):
    bridgedb_df = pd.read_pickle(pickle_path)
    with open(metadata_path, "rb") as file:
        bridgedb_metadata = pickle.load(file)
else:
    bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
        identifiers=all_genes,
        input_species="Human",
        input_datasource="Ensembl",
        output_datasource="All",
    )
    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
    bridgedb_df.to_pickle(pickle_path)
    with open(metadata_path, "wb") as file:
        pickle.dump(bridgedb_metadata, file)

In [None]:
# Report how many distinct input identifiers appear in the BridgeDb result.
mapped_gene_count = len(bridgedb_df["identifier"].unique())
print("Number of genes with mapping in BridgeDb:", mapped_gene_count)
bridgedb_df.head(1)

### TF-target interactions

In [None]:
# TF-target interactions from TFLink, cached as pickles so
# tflink.get_tf_target is only called when the cache is incomplete.
tflink_path = os.path.join(base_dir, "data/human/tflink.pkl")
tflink_metadata_path = os.path.join(base_dir, "data/human/tflink_metadata.pkl")

# Require BOTH cache files: checking only the dataframe pickle would crash in
# the load branch whenever the metadata pickle is missing.
if os.path.exists(tflink_path) and os.path.exists(tflink_metadata_path):
    tflink_df = pd.read_pickle(tflink_path)
    with open(tflink_metadata_path, "rb") as file:
        tflink_metadata = pickle.load(file)
else:
    # The cache pickles and the `filename` passed below both live in
    # data/human, so make sure the directory exists first.
    os.makedirs(os.path.dirname(tflink_path), exist_ok=True)
    tflink_df, tflink_metadata = tflink.get_tf_target(
        bridgedb_df=bridgedb_df,
        tf_file="TFLink_Homo_sapiens_interactions_All_simpleFormat_v1.0.tsv.gz",
        filename="data/human/tflink_human.tsv.gz",
        filter_deg=True,
        padj_colname="padj",
        padj_filter=0.05,
    )
    tflink_df.to_pickle(tflink_path)
    with open(tflink_metadata_path, "wb") as file:
        pickle.dump(tflink_metadata, file)

tflink_df.head(2)

In [None]:
# Look up the rows whose "target" value is "1647".
is_target_1647 = tflink_df["target"] == "1647"
tflink_df.loc[is_target_1647]

In [None]:
# Show the metadata object returned alongside the TFLink annotations.
tflink_metadata

### Enrichment analysis using g:Profiler
All pathways and annotations are added, regardless of their significance.

In [None]:
# Enrichment annotations from g:Profiler, cached as pickles so
# gprofiler.get_gene_enrichment is only called when the cache is incomplete.
gprofiler_path = os.path.join(base_dir, "data/human/gprofiler.pkl")
gprofiler_metadata_path = os.path.join(base_dir, "data/human/gprofiler_metadata.pkl")

# Require BOTH cache files: checking only the dataframe pickle would crash in
# the load branch whenever the metadata pickle is missing.
if os.path.exists(gprofiler_path) and os.path.exists(gprofiler_metadata_path):
    gprofiler_df = pd.read_pickle(gprofiler_path)
    with open(gprofiler_metadata_path, "rb") as file:
        gprofiler_metadata = pickle.load(file)
else:
    gprofiler_df, gprofiler_metadata = gprofiler.get_gene_enrichment(
        bridgedb_df=bridgedb_df,
        species="hsapiens",
        padj_colname="padj",
        padj_filter=0.05,
    )
    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(gprofiler_path), exist_ok=True)
    gprofiler_df.to_pickle(gprofiler_path)
    with open(gprofiler_metadata_path, "wb") as file:
        pickle.dump(gprofiler_metadata, file)

gprofiler_df.head(1)

In [None]:
# Show one row that has a non-missing "g:Profiler_reac" annotation.
has_reac_annotation = gprofiler_df["g:Profiler_reac"].notna()
gprofiler_df.loc[has_reac_annotation].head(1)

### Add MitoCarta data

In [None]:
# MitoCarta annotations, cached as pickles so
# mitocarta.get_gene_mito_pathways is only called when the cache is incomplete.
mitocarta_path = os.path.join(base_dir, "data/human/mitocarta.pkl")
mitocarta_metadata_path = os.path.join(base_dir, "data/human/mitocarta_metadata.pkl")

# Require BOTH cache files: checking only the dataframe pickle would crash in
# the load branch whenever the metadata pickle is missing.
if os.path.exists(mitocarta_path) and os.path.exists(mitocarta_metadata_path):
    mitocarta_df = pd.read_pickle(mitocarta_path)
    with open(mitocarta_metadata_path, "rb") as file:
        mitocarta_metadata = pickle.load(file)
else:
    # The cache pickles and the `filename` passed below both live in
    # data/human, so make sure the directory exists first.
    os.makedirs(os.path.dirname(mitocarta_path), exist_ok=True)
    mitocarta_df, mitocarta_metadata = mitocarta.get_gene_mito_pathways(
        bridgedb_df=bridgedb_df,
        mitocarta_file="Human.MitoCarta3.0.xls",
        filename="data/human/mitocarta3.0_human.xls",
        species="hsapiens",
        sheet_name="A Human MitoCarta3.0",
    )
    mitocarta_df.to_pickle(mitocarta_path)
    with open(mitocarta_metadata_path, "wb") as file:
        pickle.dump(mitocarta_metadata, file)

mitocarta_df.head(1)

In [None]:
# MitoCarta entry for ENSG00000005156, keyed by row index.
mitocarta_df.loc[mitocarta_df["identifier"] == "ENSG00000005156", "MitoCarta"].to_dict()

In [None]:
# MitoCarta entry for ENSG00000179091, keyed by row index.
mitocarta_df.loc[mitocarta_df["identifier"] == "ENSG00000179091", "MitoCarta"].to_dict()

In [None]:
# MitoCarta entry for ENSG00000167186, keyed by row index.
mitocarta_df.loc[mitocarta_df["identifier"] == "ENSG00000167186", "MitoCarta"].to_dict()

### Protein-Protein Interactions from STRING

In [None]:
# Protein-protein interactions from STRING for the rows with padj_dea <= 0.05,
# cached as pickles so stringdb.get_ppi is only called when the cache is
# incomplete.
string_path = os.path.join(base_dir, "data/human/string.pkl")
string_metadata_path = os.path.join(base_dir, "data/human/string_metadata.pkl")

# Require BOTH cache files: checking only the dataframe pickle would crash in
# the load branch whenever the metadata pickle is missing.
if os.path.exists(string_path) and os.path.exists(string_metadata_path):
    ppi_df = pd.read_pickle(string_path)
    with open(string_metadata_path, "rb") as file:
        ppi_metadata = pickle.load(file)
else:
    ppi_df, ppi_metadata = stringdb.get_ppi(
        bridgedb_df=bridgedb_df[bridgedb_df["padj_dea"] <= 0.05]
    )
    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(string_path), exist_ok=True)
    # The dataframe write was previously commented out, so the cache check
    # above could never succeed and get_ppi was re-run on every execution
    # while only the metadata was stored. Write both so the cache is usable.
    ppi_df.to_pickle(string_path)
    with open(string_metadata_path, "wb") as file:
        pickle.dump(ppi_metadata, file)

ppi_df.head()

In [None]:
# Dump the STRING PPI column as a {row index: value} dictionary for inspection.
ppi_df.loc[:, STRING_PPI_COL].to_dict()

In [None]:
# NOTE(review): identical to the previous cell -- likely a leftover duplicate.
ppi_df[STRING_PPI_COL].to_dict()

## Graph generation 

### Combine all data and metadata

In [None]:
# Annotation frames to merge onto the BridgeDb mapping.
annotation_frames = [
    tflink_df,
    mitocarta_df,
    gprofiler_df,
    # ppi_df,
]
combined_df = combine_sources(bridgedb_df, annotation_frames)

In [None]:
# Inspect the combined annotations for ENSG00000167186.
combined_df.loc[combined_df["identifier"] == "ENSG00000167186"]

In [None]:
combined_df_path = os.path.join(base_dir, "data/human/combined_df.pkl")

# NOTE(review): when the pickle already exists, the frame just built by
# combine_sources is replaced by the cached one -- delete the pickle after
# changing any upstream step, otherwise stale data is used.
if os.path.exists(combined_df_path):
    # Use pd.read_pickle like the other cache cells in this notebook.
    combined_df = pd.read_pickle(combined_df_path)
else:
    # Write to the same absolute path that the existence check inspects
    # (previously a separate relative literal was used here).
    os.makedirs(os.path.dirname(combined_df_path), exist_ok=True)
    combined_df.to_pickle(combined_df_path)

combined_df.head()


In [None]:
# Metadata objects of the annotators to merge with the BridgeDb metadata.
annotator_metadata = [
    tflink_metadata,
    mitocarta_metadata,
    gprofiler_metadata,
    # ppi_metadata,
]
combined_metadata = create_or_append_to_metadata(bridgedb_metadata, annotator_metadata)

In [None]:
# Inspect the metadata returned by create_or_append_to_metadata.
combined_metadata

In [None]:
# Persist the combined metadata next to the other cached files.
with open("data/human/combined_metadata.pkl", "wb") as metadata_file:
    pickle.dump(combined_metadata, metadata_file)

### Create a graph from the annotated dataframe

In [None]:
# Preview the first row of the combined annotations.
combined_df.head(1)

#### Subsetting the rows to construct the graph based on the biological question

In [None]:
# Extract all 'NCBI.GeneID.TF' values into a single list
ncbi_gene_ids = (
    combined_df["its_tf"]
    .apply(lambda x: [d["NCBI.GeneID.TF"] for d in x] if isinstance(x, list) else [])
    .explode()
    .dropna()
    .unique()
    .tolist()
)

len(ncbi_gene_ids)

In [None]:
# Rows whose "target" value is one of the collected TF gene IDs.
is_tf_row = combined_df["target"].isin(ncbi_gene_ids)
combined_df_tf = combined_df[is_tf_row]
# Rows with padj_dea <= 0.05 that are not already in the TF subset.
is_significant = combined_df["padj_dea"] <= 0.05
combined_df_sig = combined_df[is_significant & ~is_tf_row]
combined_df_sig.shape

In [None]:
# Stack the significant rows and the TF rows into one frame with a fresh index.
subset_frames = [combined_df_sig, combined_df_tf]
combined_df_tf_sig = pd.concat(subset_frames, ignore_index=True)
combined_df_tf_sig.shape

In [None]:
# MitoCarta annotation of ENSG00000108387 in the combined frame.
combined_df.loc[combined_df["identifier"] == "ENSG00000108387", "MitoCarta"]

In [None]:
# Drop ENSG00000108387 from the frame that is passed to graph generation.
# A preceding `.loc[...] = "SEPTIN4"` assignment on this gene's
# "GENE_SYMBOL_dea" was removed: it had no lasting effect because the row was
# filtered out immediately afterwards.
# NOTE(review): if the intent was to KEEP the gene under the corrected symbol,
# replace this filter with
#   combined_df.loc[combined_df["identifier"] == "ENSG00000108387", "GENE_SYMBOL_dea"] = "SEPTIN4"
combined_df = combined_df[combined_df["identifier"] != "ENSG00000108387"]

In [None]:
# Extract all targets for NFYA ('NCBI.GeneID.TF')
ncbi_gene_ids_NFYA_targets = (
    combined_df_tf_sig[combined_df_tf_sig["identifier"] == "ENSG00000001167"]["its_target"]
    .apply(lambda x: [d["NCBI.GeneID.Target"] for d in x] if isinstance(x, list) else [])
    .explode()
    .dropna()
    .unique()
    .tolist()
)

len(ncbi_gene_ids_NFYA_targets)

In [None]:
# Arguments for generator.save_graph; its return value is printed and exported
# to GraphML in the following cells.
# NOTE(review): the full combined_df is passed here, not the
# combined_df_tf_sig subset assembled above -- confirm this is intended.
graph_options = {
    "combined_df": combined_df,
    "combined_metadata": combined_metadata,
    "graph_name": "examples",
    "graph_dir": "./data/human",
}
pygraph = generator.save_graph(**graph_options)

In [None]:
# Print the graph object returned by generator.save_graph.
print(pygraph)

In [None]:
# Export the graph to GraphML. The `neo4j` helper module is imported with the
# other pyBiodatafuse imports in the first code cell, so all dependencies are
# declared in one place.
neo4j.save_graph_to_graphml(pygraph, "./data/human/networkx_human_graph.graphml")