# Example 3: Differentially Expressed Gene Analysis Workflow

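This notebook shows how to use pyBiodatafuse when your starting point is a file of differentially expressed genes (DEGs), i.e. the results of a differential expression analysis (DEA).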

# Import modules

In [1]:
import os
import pickle

import pandas as pd
from tqdm import tqdm

from pyBiodatafuse import data_loader, id_mapper
from pyBiodatafuse.analyzer.summarize import BioGraph
from pyBiodatafuse.annotators import kegg, molmedb
from pyBiodatafuse.graph import saver
from pyBiodatafuse.utils import combine_sources, create_or_append_to_metadata

# Register tqdm's pandas integration (adds the `progress_apply` family of methods to pandas objects).
tqdm.pandas()

In [2]:
# Working directory of this workflow: the cached query results and the exported
# graph files are all written below it, so make sure it exists before anything runs.
DATA_DIR = "./data/dea_workflow"
os.makedirs(name=DATA_DIR, exist_ok=True)

## Load the DEA results

In [3]:
# Location of the example DEA results table shipped with the notebook.
dea_results_path = "data/dea_example.xls"

# Parse the table into a DataFrame and preview the first rows.
data_input = data_loader.create_df_from_dea(dea_results_path)
data_input.head()

Unnamed: 0,identifier,F value,Pr(>F),FDR(>F),Log2FC,t.ratio,p.value
0,ENSG00000000003,198663.0,0.07389,0.11995,0.21672,267662.0,0.10432
1,ENSG00000000005,532116.0,8e-05,0.00041,-0.34276,-158217.0,0.69388
2,ENSG00000000419,123704.0,0.29357,0.36707,-0.03554,-0.68119,0.99364
3,ENSG00000000457,122821.0,0.29781,0.37146,-0.10078,-168058.0,0.62915
4,ENSG00000000460,0.7331,0.624,0.67303,0.25185,134237.0,0.83158


### Filtering the DEA table

In [4]:
# Two successive filters: first on the FDR column, then on the absolute log2 fold change.
# The intermediate result gets its own name so re-running the cell always starts from
# `data_input`.
# NOTE(review): the outputs captured further down still show FDR values above 0.05, so
# `min_value=0.05` appears to keep genes with FDR >= 0.05 (the non-significant ones).
# Confirm this is intended rather than an upper bound on FDR.
fdr_filtered = data_loader.filter_dea(data_input, column_name="FDR(>F)", min_value=0.05)
data_filtered = data_loader.filter_dea(fdr_filtered, column_name="Log2FC", abs_value=0.5)

## Entity resolution using BridgeDB

In [5]:
# Cache files for the BridgeDb mapping table and its accompanying metadata.
pickle_path = DATA_DIR + "/DEA_gene_list.pkl"
metadata_path = DATA_DIR + "/DEA_gene_list_metadata.pkl"

In [6]:
# Map the filtered Ensembl identifiers with BridgeDb, caching the result on disk.
#
# The cache is only trusted when BOTH the DataFrame pickle and the metadata pickle
# exist. Checking the DataFrame file alone would crash with FileNotFoundError in the
# load branch whenever the metadata file is missing (e.g. after an interrupted run).
# NOTE: pickle files can execute arbitrary code when loaded; only reuse caches that
# were produced locally by this notebook.
bridgedb_cache_complete = os.path.exists(pickle_path) and os.path.exists(metadata_path)

if bridgedb_cache_complete:
    bridgedb_df = pd.read_pickle(pickle_path)
    with open(metadata_path, "rb") as f:
        bridgedb_metadata = pickle.load(f)
else:
    # Cache missing or incomplete: query again and (re)write both files.
    bridgedb_df, bridgedb_metadata = id_mapper.bridgedb_xref(
        identifiers=data_filtered,
        input_species="Human",
        input_datasource="Ensembl",
        output_datasource="All",
    )
    bridgedb_df.to_pickle(pickle_path)
    with open(metadata_path, "wb") as f:
        pickle.dump(bridgedb_metadata, f)
bridgedb_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,F value_dea,Pr(>F)_dea,FDR(>F)_dea,Log2FC_dea,t.ratio_dea,p.value_dea
0,ENSG00000005108,Ensembl,214920_PM_at,Affy,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273
1,ENSG00000005108,Ensembl,Hs.29900.0.S1_3p_at,Affy,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273
2,ENSG00000005108,Ensembl,GO:0016021,Gene Ontology,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273
3,ENSG00000005108,Ensembl,uc064bom.1,UCSC Genome Browser,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273
4,ENSG00000005108,Ensembl,GO:0016020,Gene Ontology,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273


## Pathway annotations using KEGG

In [7]:
# Cache files for the KEGG pathway annotations and their metadata.
kegg_df_path = DATA_DIR + "/kegg_df.pkl"
kegg_metadata_path = DATA_DIR + "/kegg_metadata.pkl"

In [8]:
# Annotate the mapped genes with KEGG pathways, caching the result on disk.
#
# The cache is only trusted when BOTH pickles exist; testing the DataFrame file alone
# would make the load branch fail with FileNotFoundError if the metadata file is
# missing (e.g. after an interrupted run).
# NOTE: pickle files can execute arbitrary code when loaded; only reuse caches that
# were produced locally by this notebook.
kegg_cache_complete = os.path.exists(kegg_df_path) and os.path.exists(kegg_metadata_path)

if kegg_cache_complete:
    kegg_df = pd.read_pickle(kegg_df_path)
    with open(kegg_metadata_path, "rb") as f:
        kegg_metadata = pickle.load(f)
else:
    # Cache missing or incomplete: query again and (re)write both files.
    kegg_df, kegg_metadata = kegg.get_pathways(bridgedb_df=bridgedb_df)
    kegg_df.to_pickle(kegg_df_path)
    with open(kegg_metadata_path, "wb") as f:
        pickle.dump(kegg_metadata, f)
kegg_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,F value_dea,Pr(>F)_dea,FDR(>F)_dea,Log2FC_dea,t.ratio_dea,p.value_dea,KEGG_pathways
0,ENSG00000005108,Ensembl,221981,NCBI Gene,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273,"[{'pathway_id': nan, 'pathway_label': nan, 'ge..."
1,ENSG00000005189,Ensembl,81691,NCBI Gene,250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,"[{'pathway_id': 'path:hsa03008', 'pathway_labe..."
2,ENSG00000008226,Ensembl,9940,NCBI Gene,1751.0,0.11493,0.1717,0.9616,254376.0,0.14795,"[{'pathway_id': nan, 'pathway_label': nan, 'ge..."
3,ENSG00000008517,Ensembl,9235,NCBI Gene,240722.0,0.03233,0.06086,-0.77406,-308262.0,0.03358,"[{'pathway_id': 'path:hsa04060', 'pathway_labe..."
4,ENSG00000021826,Ensembl,1373,NCBI Gene,189207.0,0.08899,0.1396,0.50922,304276.0,0.04013,"[{'pathway_id': 'path:hsa00220', 'pathway_labe..."


## Annotation of inhibitors of gene-encoded transporters using MolMeDB

In [9]:
# Cache files for the MolMeDB transporter-inhibitor annotations and their metadata.
inhibitor_df_path = DATA_DIR + "/inhibitor_df.pkl"
inhibitor_metadata_path = DATA_DIR + "/inhibitor_metadata.pkl"

In [10]:
# Annotate the mapped genes with MolMeDB transporter inhibitors, caching the result on disk.
#
# The cache is only trusted when BOTH pickles exist; testing the DataFrame file alone
# would make the load branch fail with FileNotFoundError if the metadata file is
# missing (e.g. after an interrupted run).
# NOTE: pickle files can execute arbitrary code when loaded; only reuse caches that
# were produced locally by this notebook.
inhibitor_cache_complete = os.path.exists(inhibitor_df_path) and os.path.exists(
    inhibitor_metadata_path
)

if inhibitor_cache_complete:
    inhibitor_df = pd.read_pickle(inhibitor_df_path)
    with open(inhibitor_metadata_path, "rb") as f:
        inhibitor_metadata = pickle.load(f)
else:
    # Cache missing or incomplete: query again and (re)write both files.
    inhibitor_df, inhibitor_metadata = molmedb.get_gene_compound_inhibitor(bridgedb_df=bridgedb_df)
    inhibitor_df.to_pickle(inhibitor_df_path)
    with open(inhibitor_metadata_path, "wb") as f:
        pickle.dump(inhibitor_metadata, f)
inhibitor_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,F value_dea,Pr(>F)_dea,FDR(>F)_dea,Log2FC_dea,t.ratio_dea,p.value_dea,MolMeDB_transporter_inhibitor
0,ENSG00000005108,Ensembl,Q9UPZ6,Uniprot-TrEMBL,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273,"[{'compound_name': nan, 'inchikey': nan, 'smil..."
1,ENSG00000005189,Ensembl,A0A024R390,Uniprot-TrEMBL,250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,"[{'compound_name': nan, 'inchikey': nan, 'smil..."
2,ENSG00000005189,Ensembl,H3BM72,Uniprot-TrEMBL,250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,"[{'compound_name': nan, 'inchikey': nan, 'smil..."
3,ENSG00000005189,Ensembl,H3BPB2,Uniprot-TrEMBL,250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,"[{'compound_name': nan, 'inchikey': nan, 'smil..."
4,ENSG00000005189,Ensembl,H3BQ98,Uniprot-TrEMBL,250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,"[{'compound_name': nan, 'inchikey': nan, 'smil..."


## Combining all the results into a single dataframe

In [11]:
# Annotation tables to be joined onto the BridgeDb mapping table.
annotation_frames = [kegg_df, inhibitor_df]

# NOTE(review): the preview below shows the DEA columns repeated with `_x` / `_y`
# suffixes, presumably because each annotation table also carries them. Verify whether
# the duplicates should be dropped before or after combining.
combined_df = combine_sources(bridgedb_df, df_list=annotation_frames)
combined_df.head(4)

Unnamed: 0,identifier,identifier.source,target,target.source,F value_dea_x,Pr(>F)_dea_x,FDR(>F)_dea_x,Log2FC_dea_x,t.ratio_dea_x,p.value_dea_x,...,t.ratio_dea_y,p.value_dea_y,KEGG_pathways,F value_dea,Pr(>F)_dea,FDR(>F)_dea,Log2FC_dea,t.ratio_dea,p.value_dea,MolMeDB_transporter_inhibitor
0,ENSG00000005108,Ensembl,ENSG00000005108,Ensembl,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273,...,239599.0,0.20273,"[{'pathway_id': nan, 'pathway_label': nan, 'ge...",169738.0,0.12886,0.18847,0.59097,239599.0,0.20273,"[{'compound_name': nan, 'inchikey': nan, 'smil..."
1,ENSG00000005189,Ensembl,ENSG00000005189,Ensembl,250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,...,335289.0,0.01499,"[{'pathway_id': 'path:hsa03008', 'pathway_labe...",250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,"[{'compound_name': nan, 'inchikey': nan, 'smil..."
2,ENSG00000005189,Ensembl,ENSG00000005189,Ensembl,250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,...,335289.0,0.01499,"[{'pathway_id': 'path:hsa03008', 'pathway_labe...",250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,"[{'compound_name': nan, 'inchikey': nan, 'smil..."
3,ENSG00000005189,Ensembl,ENSG00000005189,Ensembl,250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,...,335289.0,0.01499,"[{'pathway_id': 'path:hsa03008', 'pathway_labe...",250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,"[{'compound_name': nan, 'inchikey': nan, 'smil..."


In [12]:
# Gather the metadata returned by each annotator and combine it with the BridgeDb metadata.
annotator_metadata = [kegg_metadata, inhibitor_metadata]
combined_metadata = create_or_append_to_metadata(bridgedb_metadata, annotator_metadata)

# Saving the graph

In [13]:
# Settings for the export: the combined table and metadata go in, and the output files
# are written under DATA_DIR using "dea" as the graph name.
graph_export_options = {
    "combined_df": combined_df,
    "combined_metadata": combined_metadata,
    "graph_name": "dea",
    "graph_dir": DATA_DIR,
}

# Build and save the graph; the returned graph object is kept for the statistics below.
pygraph = saver.save_graph(**graph_export_options)

Combined DataFrame saved in ./data/dea_workflow/dea_df.pkl
Metadata saved in ./data/dea_workflow/dea_metadata.pkl
Building graph: 100%|██████████| 667/667 [00:00<00:00, 19375.98it/s]
Graph is built successfully
Graph saved in ./data/dea_workflow/dea_graph.pkl and ./data/dea_workflow/dea_graph.gml
Graph saved in ./data/dea_workflow/dea_graph.edgelist


# Graph statistics

In [14]:
# Wrap the graph returned by `saver.save_graph` so that summary statistics can be queried.
graph_obj = BioGraph(graph=pygraph)
# Shown as the cell output: node, edge and component counts plus the network density.
graph_obj.graph_summary

0,1
Nodes,422.0
Edges,295.0
Components,199.0
Network Density,0.00166


In [15]:
# Node counts per data source; `plot=True` presumably also draws a chart (no output is captured here) — confirm.
graph_obj.count_nodes_by_data_source(plot=True)

In [16]:
# Edge counts per data source; `plot=True` presumably also draws a chart (no output is captured here) — confirm.
graph_obj.count_edge_by_data_source(plot=True)