# Example: Differentially Expressed Gene Analysis Workflow

This notebook shows how to use the tool when you have a differentially expressed gene (DEG) file.
Our current data sources include:
* DisGeNET
* MolMeDB
* OpenTargets
* WikiPathways
* WikiData
* STRING
* Bgee

In [1]:
# Import modules
import os

from pyBiodatafuse import data_loader, id_mapper
from pyBiodatafuse.annotators import disgenet, molmedb  # Example with just two resource
from pyBiodatafuse.utils import combine_sources

## Load the DEA results

In [2]:
# Read the differential expression analysis (DEA) results file into `data_input`
# using the pyBiodatafuse data loader, then preview the first rows.
# The path is relative, so the file is looked up from the notebook's working directory.
data_input = data_loader.create_df_from_dea("dea_example.xls")
data_input.head()

Unnamed: 0,identifier,F value,Pr(>F),FDR(>F),Log2FC,t.ratio,p.value
0,ENSG00000000003,198663.0,0.07389,0.11995,0.21672,267662.0,0.10432
1,ENSG00000000005,532116.0,8e-05,0.00041,-0.34276,-158217.0,0.69388
2,ENSG00000000419,123704.0,0.29357,0.36707,-0.03554,-0.68119,0.99364
3,ENSG00000000457,122821.0,0.29781,0.37146,-0.10078,-168058.0,0.62915
4,ENSG00000000460,0.7331,0.624,0.67303,0.25185,134237.0,0.83158


### Filtering the DEA table

In [3]:
# Filter the DEA table in two passes: first on the FDR column, then on the
# fold-change column of the already-filtered table.
# NOTE(review): `min_value=0.05` on "FDR(>F)" appears to keep rows with an FDR of
# at least 0.05 (every row in the preview has FDR > 0.05), i.e. the genes that are
# NOT significant at that threshold. Confirm against the `filter_dea` documentation
# whether an upper bound was intended here.
data_filtered = data_loader.filter_dea(data_input, column_name="FDR(>F)", min_value=0.05)
# Presumably keeps rows whose absolute Log2FC is at least 0.5 (consistent with the
# preview, where both positive and negative fold changes survive) - TODO confirm.
data_filtered = data_loader.filter_dea(data_filtered, column_name="Log2FC", abs_value=0.5)
data_filtered.head()

Unnamed: 0,identifier,F value,Pr(>F),FDR(>F),Log2FC,t.ratio,p.value
70,ENSG00000005108,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273
73,ENSG00000005189,250342.0,0.02632,0.05167,0.70124,335289.0,0.01499
168,ENSG00000008226,1751.0,0.11493,0.1717,0.9616,254376.0,0.14795
181,ENSG00000008517,240722.0,0.03233,0.06086,-0.77406,-308262.0,0.03358
337,ENSG00000021826,189207.0,0.08899,0.1396,0.50922,304276.0,0.04013


## Entity resolution using BridgeDB

In [4]:
# Map the filtered human Ensembl gene identifiers to identifiers from other
# data sources through BridgeDb. The call returns two values: the cross-reference
# table and an accompanying metadata object.
# NOTE: the variables are spelled `bridgdb_*` (without the "e"); the annotator
# cells further down pass `bridgdb_df` under exactly this name.
bridgdb_df, bridgdb_metadata = id_mapper.bridgedb_xref(
    identifiers=data_filtered,
    input_species="Human",
    input_datasource="Ensembl",
    output_datasource="All",  # presumably requests every available target source
)
bridgdb_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,F value_dea,Pr(>F)_dea,FDR(>F)_dea,Log2FC_dea,t.ratio_dea,p.value_dea
0,ENSG00000005108,Ensembl,214920_PM_at,Affy,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273
1,ENSG00000005108,Ensembl,Hs.29900.0.S1_3p_at,Affy,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273
2,ENSG00000005108,Ensembl,221981,WikiGenes,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273
3,ENSG00000005108,Ensembl,GO:0016021,Gene Ontology,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273
4,ENSG00000005108,Ensembl,uc064bom.1,UCSC Genome Browser,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273


## Disease annotation using DisGeNET

In [5]:
# Annotate the mapped genes with gene-disease associations from DisGeNET.
# The API key is read from the DISGENET_API_KEY environment variable so that the
# credential is never stored in (or committed together with) the notebook.
api_key = os.environ.get("DISGENET_API_KEY")
if not api_key:
    # Fail early with an actionable message instead of sending an empty key.
    raise RuntimeError(
        "DISGENET_API_KEY is not set; export your DisGeNET API key before running this cell."
    )
params = {"source": "CURATED", "format": "json"}  # only curated data
disgenet_result, disgenet_metadata = disgenet.get_gene_disease(
    bridgedb_df=bridgdb_df, api_key=api_key, params=params
)
disgenet_result.head()

Unnamed: 0,identifier,identifier.source,target,target.source,F value_dea,Pr(>F)_dea,FDR(>F)_dea,Log2FC_dea,t.ratio_dea,p.value_dea,DisGeNET
0,ENSG00000005108,Ensembl,221981,NCBI Gene,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273,"[{'gene_dsi': nan, 'gene_dpi': nan, 'gene_pli'..."
1,ENSG00000005189,Ensembl,81691,NCBI Gene,250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,"[{'gene_dsi': nan, 'gene_dpi': nan, 'gene_pli'..."
2,ENSG00000008226,Ensembl,9940,NCBI Gene,1751.0,0.11493,0.1717,0.9616,254376.0,0.14795,"[{'gene_dsi': nan, 'gene_dpi': nan, 'gene_pli'..."
3,ENSG00000008517,Ensembl,9235,NCBI Gene,240722.0,0.03233,0.06086,-0.77406,-308262.0,0.03358,"[{'gene_dsi': nan, 'gene_dpi': nan, 'gene_pli'..."
4,ENSG00000021826,Ensembl,1373,NCBI Gene,189207.0,0.08899,0.1396,0.50922,304276.0,0.04013,"[{'gene_dsi': nan, 'gene_dpi': nan, 'gene_pli'..."


In [6]:
# Show the DisGeNET annotation stored for the row labelled 0, using a single
# label-based lookup rather than chained indexing.
disgenet_result.loc[0, "DisGeNET"]

[{'gene_dsi': nan,
  'gene_dpi': nan,
  'gene_pli': nan,
  'protein_class': nan,
  'protein_class_name': nan,
  'diseaseid': nan,
  'disease_name': nan,
  'disease_class': nan,
  'disease_class_name': nan,
  'disease_type': nan,
  'disease_semantic_type': nan,
  'score': nan,
  'ei': nan,
  'el': nan,
  'year_initial': nan,
  'year_final': nan,
  'source': nan}]

## Annotating inhibitors of gene-encoded transporters using MolMeDB

In [7]:
# Query MolMeDB for inhibitors of the transporters encoded by the mapped genes.
# The call returns two values: the annotated table (with a `transporter_inhibitor`
# column, as shown in the preview) and an accompanying metadata object.
inhibitor_df, inhibitor_metadata = molmedb.get_gene_mol_inhibitor(bridgedb_df=bridgdb_df)
inhibitor_df.head()

Unnamed: 0,identifier,identifier.source,target,target.source,F value_dea,Pr(>F)_dea,FDR(>F)_dea,Log2FC_dea,t.ratio_dea,p.value_dea,transporter_inhibitor
0,ENSG00000005108,Ensembl,Q9UPZ6,Uniprot-TrEMBL,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273,"[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
1,ENSG00000005189,Ensembl,A0A024R390,Uniprot-TrEMBL,250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,"[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
2,ENSG00000005189,Ensembl,H3BM72,Uniprot-TrEMBL,250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,"[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
3,ENSG00000005189,Ensembl,H3BPB2,Uniprot-TrEMBL,250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,"[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
4,ENSG00000005189,Ensembl,H3BQ98,Uniprot-TrEMBL,250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,"[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."


In [8]:
# Show the MolMeDB inhibitor annotation stored for the row labelled 72, using a
# single label-based lookup rather than chained indexing.
inhibitor_df.loc[72, "transporter_inhibitor"]

[{'label': nan,
  'InChIKey': nan,
  'SMILES': nan,
  'pubchem_compound_id': nan,
  'molmedb_id': nan,
  'source_doi': nan,
  'source_pmid': nan,
  'chebi_id': nan,
  'drugbank_id': nan,
  'pdb_ligand_id': nan}]

## Combining all the results into a single dataframe

In [9]:
# Merge the DisGeNET and MolMeDB annotation tables into one table; the preview
# of the result carries both the `DisGeNET` and `transporter_inhibitor` columns.
combined_df = combine_sources([disgenet_result, inhibitor_df])

In [10]:
# Preview the first four rows of the combined annotation table.
combined_df.head(4)

Unnamed: 0,identifier,identifier.source,target,target.source,F value_dea,Pr(>F)_dea,FDR(>F)_dea,Log2FC_dea,t.ratio_dea,p.value_dea,DisGeNET,transporter_inhibitor
0,ENSG00000005108,Ensembl,221981,NCBI Gene,169738.0,0.12886,0.18847,0.59097,239599.0,0.20273,"[{'gene_dsi': nan, 'gene_dpi': nan, 'gene_pli'...","[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
1,ENSG00000005189,Ensembl,81691,NCBI Gene,250342.0,0.02632,0.05167,0.70124,335289.0,0.01499,"[{'gene_dsi': nan, 'gene_dpi': nan, 'gene_pli'...","[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
2,ENSG00000008226,Ensembl,9940,NCBI Gene,1751.0,0.11493,0.1717,0.9616,254376.0,0.14795,"[{'gene_dsi': nan, 'gene_dpi': nan, 'gene_pli'...","[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
3,ENSG00000008517,Ensembl,9235,NCBI Gene,240722.0,0.03233,0.06086,-0.77406,-308262.0,0.03358,"[{'gene_dsi': nan, 'gene_dpi': nan, 'gene_pli'...","[{'label': nan, 'InChIKey': nan, 'SMILES': nan..."
