# ComPath - Gene-based comparison


##### Author: Daniel Domingo-Fernandez

The goal of this notebook is to calculate the overlap between KEGG and Reactome pathways based on the similarity of their gene sets (statistic: hypergeometric test).

### Notebook imports

In [4]:
import time
import sys
import os

import pandas as pd
from collections import defaultdict
from difflib import SequenceMatcher

### Notebook configuration

In [5]:
# Show the current local time as the cell output, recording when the notebook was run.
time.asctime()

'Mon Jan 29 10:52:17 2018'

In [6]:
# Print the version string of the Python interpreter running this notebook.
print(sys.version)

3.4.5 (default, Dec 11 2017, 14:22:24) 
[GCC 4.8.5 20150623 (Red Hat 4.8.5-16)]


In [8]:
# Base directory read from the COMPATH environment variable; the resource files
# loaded below are resolved relative to it. Raises KeyError if COMPATH is not set.
COMPATH_PATH = os.environ['COMPATH']

### Load resources

In [9]:
def _collect_gene_sets(dataframe):
    """Build a mapping from pathway name to its set of genes.

    Each column of ``dataframe`` is treated as one pathway and the cells of that
    column as its genes. Columns have different lengths, so shorter ones are
    padded with NaN by pandas; every non-string value is therefore skipped.

    :param pandas.DataFrame dataframe: table with one column per pathway
    :return: pathway name -> set of gene identifiers; a pathway whose column
        contains no string value gets no entry
    :rtype: collections.defaultdict[str, set[str]]
    """
    gene_sets = defaultdict(set)

    for pathway_name in dataframe:
        genes = {
            gene
            for gene in dataframe[pathway_name].unique()
            if isinstance(gene, str)  # NaN padding is a float, not a str
        }

        # Only touch the defaultdict when there is something to store, so that
        # empty columns do not create empty entries.
        if genes:
            gene_sets[pathway_name].update(genes)

    return gene_sets


# Paths to the gene set tables. The variables are named ``*_excel`` after the
# directory they live in, but the files themselves are CSV.
resources_dir = os.path.join(COMPATH_PATH, 'src', 'compath', 'static', 'resources', 'excel')
kegg_excel = os.path.join(resources_dir, 'kegg_gene_sets.csv')
reactome_excel = os.path.join(resources_dir, 'reactome_gene_sets.csv')

# dtype=object keeps every cell as read from the file instead of letting pandas
# infer numeric types.
kegg_dataframe = pd.read_csv(kegg_excel, dtype=object)
reactome_dataframe = pd.read_csv(reactome_excel, dtype=object)

# Remove the 'Homo sapiens' suffix from the KEGG pathway names
kegg_dataframe.columns = [
    kegg_pathway.replace(' - Homo sapiens (human)', '')
    for kegg_pathway in kegg_dataframe
]

############################ KEGG ###################################
kegg_pathways = _collect_gene_sets(kegg_dataframe)  # 323 pathways in KEGG (original author's count)

############################ Reactome ###################################
reactome_pathways = _collect_gene_sets(reactome_dataframe)  # 2162 pathways in Reactome (original author's count)