# Enrichment Analysis on LCI Predictions

Analyzes the GO term enrichment of the proteins predicted by LCI and compares it with the enrichment of the known disease proteins.

In [14]:
%load_ext autoreload
%autoreload 2

import sys, os
sys.path.append(os.path.abspath(os.path.join('..')))

import numpy as np
import matplotlib.pyplot as plt 
import torch
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm_notebook as tqdm
import goatools
from goatools.base import download_go_basic_obo, download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.associations import read_ncbi_gene2go
from goatools.go_enrichment import GOEnrichmentStudy


from dpp.methods.lci.lci_method import LCIModule
from dpp.data.network import PPINetwork
from dpp.util import Params
from dpp.data.associations import load_diseases

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loading Data
Load disease associations and protein-protein interaction network.

In [9]:
# Read the disease-protein associations; the result is indexed by disease id below.
diseases_dict = load_diseases(
    "../data/associations/disgenet-associations.csv",
    exclude_splits=["none"],
)

In [11]:
# Build the protein-protein interaction network and keep its length in `n`.
# NOTE(review): this path climbs two directories ("../../data") while the other
# inputs of this notebook are read from "../data" and "../experiments" --
# confirm that the network file really lives one level higher.
network = PPINetwork(
    "../../data/networks/bio-pathways-network.txt"
)
n = len(network)

## Load Predictions
Load predictions from a disease protein prediction method.

In [15]:
# Read the predictions CSV written by the LCI experiment on the bio-pathways network.
df = pd.read_csv(
    "../experiments/bio-pathways/dpp_predict/lci/predictions.csv"
)

In [16]:
# Preview a bounded number of rows rather than rendering the whole frame.
df.head(10)

Unnamed: 0,1,2,9,10,12,13,14,15,16,18,...,100862847,100885850,100996747,101059918,101060200,101060521,101290500,101929876,101930400,102288414
0,-0.784300,0.013253,-0.677851,-0.993564,-0.464602,-0.901915,-0.750326,-0.779077,-1.028024,-0.656335,...,-0.856752,-0.929528,-0.958575,-0.430411,-1.251774,-1.028024,-1.035517,-1.028024,-1.258981,-0.429775
1,-1.028024,-0.997050,-1.028024,-1.040959,-1.028024,-1.062652,-1.028024,-1.033659,-1.028024,-0.957697,...,-0.859228,-1.028024,-1.028024,-0.657097,-1.024402,-1.028024,-1.023026,-1.028024,-1.052357,-1.049147
2,-0.701300,-0.785917,-1.028024,-1.045091,-1.028024,-0.900795,-0.126222,-1.028024,-1.028024,-0.665178,...,-1.124004,-1.028024,-1.028024,-1.028024,-1.066144,-1.028024,-1.028024,-1.028024,-1.028024,-1.055895
3,-0.863765,-0.678050,-0.992089,-1.079400,-0.464602,-0.995097,-0.912291,-1.014449,-1.028024,-0.796113,...,-1.030918,-0.965783,-0.907104,-1.028024,-0.949311,-1.028024,-1.039803,-1.028024,-1.316651,-0.488121
4,-0.024629,0.265325,-0.573819,-1.206218,-1.028024,-0.715231,-0.889896,-0.722942,-1.028024,0.046602,...,-0.763842,-0.910892,-1.049837,-0.161650,-0.965766,-1.028024,-0.957423,-1.028024,-1.096272,-0.593325
5,-0.955593,-0.994876,-1.028024,-1.121276,-1.028024,-0.832372,-0.917265,-0.904241,-1.028024,-0.926156,...,-0.805592,-1.028024,-0.991032,-1.028024,-1.131723,-1.028024,-1.039943,-1.028024,-1.028024,-1.043352
6,-1.020473,-0.465358,-0.900508,-1.107121,-1.028024,-0.813381,-0.667157,-0.209395,-1.028024,-0.364860,...,-0.857115,-0.955728,-1.126455,-0.717074,-1.119198,-1.028024,-0.543749,-1.555316,-0.930554,0.870967
7,-1.000481,-0.744736,-1.028024,-1.061516,-1.028024,-1.028024,-1.055847,-1.028024,-1.028024,-1.021858,...,-0.959316,-1.028024,-0.967417,-0.978151,-1.028024,-1.028024,-0.768000,-1.028024,-1.028024,-0.868075
8,-0.794633,0.003106,1.403869,-1.047753,-1.028024,-0.117284,-1.574167,-0.611062,-1.028024,0.549958,...,-0.075405,-0.938407,-0.891669,-0.574223,-1.167206,-1.110456,-0.730002,-1.382221,-1.099813,-0.931372
9,-1.149860,-0.807657,-0.864696,-1.039589,-1.028024,-1.028024,-1.016576,-1.051126,-1.028024,-0.558861,...,-0.716561,-0.924961,-1.051131,-0.870540,-1.089541,-1.028024,-0.975888,-1.028024,-1.054323,-1.046910


## Load Enrichment Analysis
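Prepare a GO enrichment study: download the ontology, load the gene-to-GO associations, and build the study object over the background population.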

In [21]:
# Download the Gene Ontology (GO) basic OBO file and keep the returned file name.
obo_fname = download_go_basic_obo()

  EXISTS: go-basic.obo


In [22]:
# Parse the gene ontology DAG from the file fetched by the download cell.
# Using `obo_fname` (instead of repeating the literal file name) keeps the
# download and the load pointing at the same file.
obodag = GODag(obo_fname)

go-basic.obo: fmt(1.2) rel(2019-01-16) 47,377 GO Terms


In [12]:
# Make sure the NCBI gene2go association file is available before reading it,
# so that the notebook also runs on a fresh checkout, then load the
# gene id -> GO term associations for human genes (NCBI taxonomy id 9606).
gene2go_fname = download_ncbi_associations()
geneid2go = read_ncbi_gene2go(gene2go_fname, taxids=[9606])

  20,385 items READ: gene2go


In [23]:
# Enrichment study over the background population of network proteins.
# NOTE(review): `protein_to_node` is not created by any cell shown above --
# presumably it is the network's protein -> node mapping; confirm and define it
# explicitly so the notebook survives Restart & Run All.
goeaobj = GOEnrichmentStudy(
    protein_to_node.keys(),  # population ids (human: associations were read for taxid 9606)
    geneid2go,               # gene id -> GO term associations
    obodag,                  # gene ontology DAG
    propagate_counts=True,
    alpha=0.05,              # significance cut-off
    methods=['fdr_bh'],      # multiple-testing correction method
)

fisher module not installed.  Falling back on scipy.stats.fisher_exact


Propagating term counts to parents ..


 76% 16,420 of 21,557 population items found in association


## Perform Enrichment Analysis
Perform an enrichment analysis on one disease: once on its known proteins and once on an equally sized set of predicted proteins.

In [94]:
# Enrichment for a single disease: known proteins vs. an equally sized set of
# predicted proteins.
# NOTE(review): `disease_to_ranks` and `node_to_protein` are not created by any
# cell shown above -- define them explicitly before this cell.
disease_id = "C1862314"
disease_proteins = diseases_dict[disease_id].proteins
ranked_nodes = disease_to_ranks[disease_id]

# Keep as many nodes from the tail of the ranking as the disease has proteins
# (presumably the ranking is ascending, so the tail holds the best predictions
# -- confirm). The start index is computed explicitly because
# `ranked_nodes[-0:]` would select *every* node for a disease without proteins.
num_proteins = len(disease_proteins)
top_nodes = ranked_nodes[max(len(ranked_nodes) - num_proteins, 0):]

# Materialise as a list: on Python 3 `map` returns a one-shot iterator, which
# would be empty after the first pass the enrichment study makes over it.
pred_proteins = [node_to_protein.get(node) for node in top_nodes]

disease_results = goeaobj.run_study(disease_proteins)

pred_results = goeaobj.run_study(pred_proteins)

100%     23 of     23 study items found in association
100%     23 of     23 study items found in population(21557)
Calculating 21,968 uncorrected p-values using fisher_scipy_stats
  21,968 GO terms are associated with 16,420 of 21,557 population items
   1,564 GO terms are associated with     23 of     23 study items
     334 GO terms found significant (< 0.05=alpha) after multitest correction: statsmodels fdr_bh
 70%     16 of     23 study items found in association
100%     23 of     23 study items found in population(21557)
Calculating 21,968 uncorrected p-values using fisher_scipy_stats
  21,968 GO terms are associated with 16,420 of 21,557 population items
     521 GO terms are associated with     16 of     23 study items
      14 GO terms found significant (< 0.05=alpha) after multitest correction: statsmodels fdr_bh


In [85]:
k = 10

# The k GO terms with the smallest FDR-corrected p-value in each study.
disease_top_k, pred_top_k = [
    sorted(results, key=lambda res: res.p_fdr_bh)[:k]
    for results in (disease_results, pred_results)
]

# All GO terms whose FDR-corrected p-value is below the cut-off.
# NOTE(review): 0.02 differs from the alpha=0.05 given to GOEnrichmentStudy --
# confirm the stricter threshold is intentional.
disease_significant, pred_significant = [
    [res for res in results if res.p_fdr_bh < 0.02]
    for results in (disease_results, pred_results)
]

In [86]:
# Names of the top-k enriched GO terms of each study (built once, reused below).
disease_top_terms = {result.goterm.name for result in disease_top_k}
pred_top_terms = {result.goterm.name for result in pred_top_k}

intersection = disease_top_terms & pred_top_terms
union = disease_top_terms | pred_top_terms

# Jaccard similarity = |intersection| / |union|. It is undefined when both sets
# are empty; report NaN in that case instead of raising ZeroDivisionError.
jaccard = float(len(intersection)) / len(union) if union else float("nan")

print("Jaccard Similarity: {}".format(jaccard))
print(intersection)

Jaccard Similarity: 0.25
set(['cellular response to DNA damage stimulus', 'DNA repair', 'damaged DNA binding', 'DNA metabolic process'])


In [87]:
# Names of the significantly enriched GO terms of each study (built once, reused below).
pred_sig_terms = {result.goterm.name for result in pred_significant}
disease_sig_terms = {result.goterm.name for result in disease_significant}

intersection = pred_sig_terms & disease_sig_terms
union = pred_sig_terms | disease_sig_terms

# Jaccard similarity = |intersection| / |union|. Neither study may have any
# significant term, leaving the union empty; report NaN in that case instead
# of raising ZeroDivisionError.
jaccard = float(len(intersection)) / len(union) if union else float("nan")

print("Jaccard Similarity: {}".format(jaccard))
print(intersection)

Jaccard Similarity: 0.290243902439
set(['cell cycle process', 'oxidized DNA binding', 'negative regulation of DNA metabolic process', 'protein-DNA complex subunit organization', 'organic substance metabolic process', 'mitotic cell cycle checkpoint', 'regulation of DNA replication', 'nucleobase-containing compound metabolic process', 'regulation of nucleobase-containing compound metabolic process', 'intracellular organelle part', 'DNA repair complex', 'regulation of smoothened signaling pathway', 'UV protection', 'cellular response to abiotic stimulus', 'DNA repair', 'intrinsic apoptotic signaling pathway in response to DNA damage', 'regulation of helicase activity', 'nucleoside-triphosphatase activity', 'cellular aromatic compound metabolic process', 'organic cyclic compound metabolic process', 'nucleic acid metabolic process', 'pyrophosphatase activity', 'apoptotic signaling pathway', 'ATPase activity, coupled', 'mismatch repair', 'organic cyclic compound binding', 'hydrolase activity

In [None]:
# TODO: compute the Jaccard similarity for all diseases
# TODO: find ten diseases (selection criterion still to be specified)