# ComPath - Gene-based comparison


The goal of this notebook is to calculate the overlap between the ComPath resources based on the similarity of their gene sets. The pathway similarity measure used in this notebook is described in [Chen et al.](http://journals.plos.org/plosone/article/authors?id=10.1371/journal.pone.0099030).

#### Author: [Daniel Domingo-Fernández](https://github.com/ddomingof) 

### Import packages needed

In [None]:
import time
import sys
import os

import pandas as pd
import itertools as itt
from collections import defaultdict

from bio2bel_kegg.manager import Manager as KeggManager
from bio2bel_reactome.manager import Manager as ReactomeManager
from bio2bel_wikipathways.manager import Manager as WikiPathwaysManager

### Configuration

In [None]:
# Show the current local time as the cell output (timestamp of this run)
time.asctime()

In [None]:
# Print the version of the Python interpreter running this notebook
print(sys.version)

In [None]:
# Base directory read from the COMPATH environment variable (KeyError if it is not set)
COMPATH_PATH = os.environ['COMPATH']

# Directory containing the gene set Excel files loaded below
EXCELS_PATH = os.path.join(
    COMPATH_PATH,
    'src',
    'compath',
    'static',
    'resources',
    'excel',
)

### Load resources

In [None]:
# One gene set Excel file per pathway database, all located in EXCELS_PATH
kegg_excel, reactome_excel, wikipathways_excel = (
    os.path.join(EXCELS_PATH, excel_file_name)
    for excel_file_name in (
        'kegg_gene_sets.xlsx',
        'reactome_gene_sets.xlsx',
        'wikipathways_gene_sets.xlsx',
    )
)

In [None]:
def create_pathway_gene_set_dict(dataframe):
    """Build a mapping from each pathway (column name) to its set of genes.

    Only string cells are kept; any other value (such as the NaN cells present
    in the columns) is skipped. A column without a single string cell gets no
    entry in the returned dictionary.

    :param pandas.DataFrame dataframe: gene sets data frame, one pathway per column
    :rtype: collections.defaultdict
    :returns: dictionary of pathway gene sets
    """
    gene_sets = defaultdict(set)

    for column_name in dataframe.columns:
        string_cells = [
            value
            for value in dataframe[column_name].unique()
            if isinstance(value, str)
        ]

        # Touch the defaultdict only when there is something to store, so that
        # columns without genes do not show up as empty pathways
        if string_cells:
            gene_sets[column_name].update(string_cells)

    return gene_sets

Load KEGG

In [None]:
KEGG_SPECIES_SUFFIX = ' - Homo sapiens (human)'
KEGG_EXPECTED_PATHWAYS = 325  # pathway count the checks below were written against


def _remove_kegg_id_prefixes(identifier):
    """Remove a leading 'path:' and then a leading 'hsa' from a KEGG identifier.

    ``str.strip('path:hsa')`` treats its argument as a set of characters and
    removes them from *both* ends of the string, so it can eat more than the
    intended prefix. Prefixes are removed explicitly here instead.

    :param str identifier: KEGG pathway identifier
    :rtype: str
    :returns: identifier without the 'path:' / 'hsa' prefixes
    """
    for prefix in ('path:', 'hsa'):
        if identifier.startswith(prefix):
            identifier = identifier[len(prefix):]

    return identifier


kegg_dataframe = pd.read_excel(kegg_excel, dtype=object)

# Remove the 'Homo sapiens' out of the KEGG pathways
kegg_dataframe.columns = [
    kegg_pathway.replace(KEGG_SPECIES_SUFFIX, '')
    for kegg_pathway in kegg_dataframe
]

kegg_pathways = create_pathway_gene_set_dict(kegg_dataframe)

kegg_manager = KeggManager()

# Same clean-up for the names coming from the database, plus prefix-free identifiers
kegg_names_to_ids = {
    key.replace(KEGG_SPECIES_SUFFIX, ''): _remove_kegg_id_prefixes(value)
    for key, value in kegg_manager.get_pathway_names_to_ids().items()
}

# Plain comparisons instead of try/assert/except: assert statements are dropped
# when Python runs with -O, which would silently disable these sanity checks
if len(kegg_names_to_ids) != KEGG_EXPECTED_PATHWAYS:
    print('KEGG Database contains {} pathways'.format(len(kegg_names_to_ids)))

if len(kegg_pathways) != KEGG_EXPECTED_PATHWAYS:
    print('DataFrame does not contain 325 pathways, contains: {}'.format(len(kegg_pathways)))

Load Reactome

In [None]:
reactome_dataframe = pd.read_excel(reactome_excel, dtype=object)

reactome_pathways = create_pathway_gene_set_dict(reactome_dataframe)

reactome_manager = ReactomeManager()

reactome_names_to_ids = reactome_manager.get_pathway_names_to_ids()

# Plain comparisons instead of try/assert/except: assert statements are dropped
# when Python runs with -O, which would silently disable these sanity checks.
# Original note: "Total of 2662 of those: 2132 are not empty".
# NOTE(review): that note says 2662 while the check below uses 2162 - confirm
# which figure is the intended one.
if len(reactome_names_to_ids) != 2162:
    print('Reactome Database contains {} pathways'.format(len(reactome_names_to_ids)))

if len(reactome_pathways) != 2132:
    print('DataFrame does not contain 2132 pathways, contains: {}'.format(len(reactome_pathways)))


Load WikiPathways

In [None]:
wikipathways_dataframe = pd.read_excel(wikipathways_excel, dtype=object)

wikipathways_pathways = create_pathway_gene_set_dict(wikipathways_dataframe)

wikipathways_manager = WikiPathwaysManager()

wikipathways_names_to_ids = wikipathways_manager.get_pathway_names_to_ids()

# Update on 3rd April 2018
# Plain comparisons instead of try/assert/except: assert statements are dropped
# when Python runs with -O, which would silently disable these sanity checks
if len(wikipathways_names_to_ids) != 420:
    print('WikiPathways Database contains {} pathways'.format(len(wikipathways_names_to_ids)))

if len(wikipathways_pathways) != 420:
    print('DataFrame does not contain 420 pathways, contains: {}'.format(len(wikipathways_pathways)))

### Pairwise comparison of pathway databases based on content (gene sets)

In [None]:
pathway_databases = ['KEGG', 'Reactome', 'WikiPathways']

# List every unordered pair of databases that is compared below
for first_database, second_database in itt.combinations(pathway_databases, 2):
    print("{} vs {}".format(first_database, second_database))

In [None]:
def calculate_jaccard(set_1, set_2):
    """Calculate the overlap between two sets relative to the smaller one.

    Despite the function name, this is not the Jaccard index: the size of the
    intersection is divided by the size of the *smaller* set (the overlap
    coefficient), not by the size of the union.

    :param set set_1: set 1
    :param set set_2: set 2
    :returns: similarity between 0.0 and 1.0; 0.0 when either set is empty
    :rtype: float
    """
    smaller_set_size = min(len(set_1), len(set_2))

    # An empty set shares nothing with any other set; returning 0.0 here also
    # avoids the ZeroDivisionError the division below would raise
    if smaller_set_size == 0:
        return 0.0

    return len(set_1.intersection(set_2)) / smaller_set_size

def create_similarity_matrix(pathway_database_1, pathway_database_2):
    """Create the pairwise similarity matrix between two pathway gene set dictionaries.

    Rows are the (sorted) pathways of the first dictionary and columns are the
    (sorted) pathways of the second one. Each cell holds the value returned by
    ``calculate_jaccard`` for the two corresponding gene sets.

    :param dict pathway_database_1: pathway gene set dictionary (rows)
    :param dict pathway_database_2: pathway gene set dictionary (columns)
    :rtype: pandas.DataFrame
    :returns: similarity matrix
    """
    rows = sorted(pathway_database_1.keys())
    columns = sorted(pathway_database_2.keys())

    similarity_dataframe = pd.DataFrame(0.0, index=rows, columns=columns)

    pathway_pairs = itt.product(pathway_database_1.items(), pathway_database_2.items())

    for (pathway_1, gene_set_1), (pathway_2, gene_set_2) in pathway_pairs:
        # Single label-based assignment. The chained form frame[column][row] = value
        # may write to a temporary copy and is a no-op under pandas copy-on-write.
        similarity_dataframe.at[pathway_1, pathway_2] = calculate_jaccard(gene_set_1, gene_set_2)

    return similarity_dataframe

def get_top_matches(names, top=10):
    """Return the best scoring tuples, highest score first.

    :param list[tuple[str,float]] names: (name, score) tuples
    :param int top: maximum number of tuples to return
    :rtype: list[tuple[str,float]]
    :returns: at most ``top`` tuples ordered by descending score
    """
    # Work on a copy so the list given by the caller is left untouched
    ranked = list(names)
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    return ranked[:top]

def _check_empty_cell(cell):
    """Checks if there are not overlapping pathways"""
        
    if not cell:
        return ''
    
    cell = [
        '{}|{}'.format(pathway_name, round(similarity, 2))
        for pathway_name, similarity in cell
    ]
    
    return '\n'.join(cell)

def create_curation_template_column(similarity_matrix):
    """Create the 'Overlapping pathways (%)' column of the curation template.

    For every row (reference pathway) of the similarity matrix, the pathways in
    the columns with a non-zero similarity are collected, ranked with
    ``get_top_matches`` (default top 10) and rendered into a single text cell
    with ``_check_empty_cell``.

    :param pandas.DataFrame similarity_matrix: similarity matrix
    :rtype: pandas.DataFrame
    :returns: one-column data frame indexed like the similarity matrix
    """
    curation_template = []

    for _reference_pathway, similarities in similarity_matrix.iterrows():

        # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
        overlapping_pathways = [
            (pathway_name, similarity)
            for pathway_name, similarity in similarities.items()
            if similarity != 0.0
        ]

        # Order pathways by descending similarity and keep only the top ones
        overlapping_ordered_pathways = get_top_matches(overlapping_pathways)

        # Future row in the exported file
        curation_template.append([_check_empty_cell(overlapping_ordered_pathways)])

    return pd.DataFrame(
        curation_template,
        index=similarity_matrix.index.tolist(),
        columns=['Overlapping pathways (%)'],
    )

KEGG vs Reactome

In [None]:
%%time
# Similarity matrix with KEGG pathways as rows and Reactome pathways as columns
kegg_vs_reactome_overlap = create_similarity_matrix(
    pathway_database_1=kegg_pathways,
    pathway_database_2=reactome_pathways,
)

# One text cell per KEGG pathway listing its top overlapping Reactome pathways
kegg_vs_reactome = create_curation_template_column(
    similarity_matrix=kegg_vs_reactome_overlap,
)

kegg_vs_reactome.to_csv(path_or_buf='kegg_vs_reactome_content_similarity.csv')

KEGG vs WikiPathways

In [None]:
%%time
# Similarity matrix with KEGG pathways as rows and WikiPathways pathways as columns
kegg_vs_wikipathways_overlap = create_similarity_matrix(
    pathway_database_1=kegg_pathways,
    pathway_database_2=wikipathways_pathways,
)

# One text cell per KEGG pathway listing its top overlapping WikiPathways pathways
kegg_vs_wikipathways = create_curation_template_column(
    similarity_matrix=kegg_vs_wikipathways_overlap,
)

kegg_vs_wikipathways.to_csv(path_or_buf='kegg_vs_wikipathways_content_similarity.csv')

WikiPathways vs Reactome

In [None]:
%%time
# Similarity matrix with WikiPathways pathways as rows and Reactome pathways as columns
wikipathways_pathways_vs_reactome_overlap = create_similarity_matrix(
    pathway_database_1=wikipathways_pathways,
    pathway_database_2=reactome_pathways,
)

# One text cell per WikiPathways pathway listing its top overlapping Reactome pathways
wikipathways_vs_reactome = create_curation_template_column(
    similarity_matrix=wikipathways_pathways_vs_reactome_overlap,
)

wikipathways_vs_reactome.to_csv(path_or_buf='wikipathways_vs_reactome_content_similarity.csv')