# ComPath - Pathway name similarity comparison

The goal of this notebook is to find the pathway pairs with the highest similarity based on their names. This information is used together with the content similarity to manually map the pathway pairs.

#### Author: [Daniel Domingo-Fernández](https://github.com/ddomingof) 

### Import packages needed

In [None]:
import time
import sys
import os

import pandas as pd
from difflib import SequenceMatcher
import itertools as itt

from bio2bel_kegg.manager import Manager as KeggManager
from bio2bel_reactome.manager import Manager as ReactomeManager
from bio2bel_wikipathways.manager import Manager as WikiPathwaysManager

### Configuration

In [None]:
# Timestamp of this run (local time); shown as the cell's output
time.asctime()

In [None]:
# Version string of the Python interpreter executing this notebook
print(sys.version)

In [None]:
# Read the COMPATH environment variable; raises KeyError if it is not set.
# NOTE(review): presumably the path to the local ComPath resources — confirm where it is used.
COMPATH_PATH = os.environ['COMPATH']

### Load resources

In [None]:
"""KEGG"""

kegg_manager = KeggManager()  # Initialize the KEGG Manager

# Remove ' - Homo sapiens (human)' from the KEGG pathway names in order to facilitate the string matching
kegg_pathways = [
    kegg_pathway.name.replace(' - Homo sapiens (human)', '')
    for kegg_pathway in kegg_manager.get_all_pathways()
]

# Sanity check on the database content: warn (without stopping the notebook) when the
# number of pathways differs from the expected 325. A plain `if` is used instead of
# `assert`, because assertions are skipped when Python runs with -O.
if len(kegg_pathways) != 325:
    print('The number of pathways in the database {} does not match 325'.format(len(kegg_pathways)))
    
"""Reactome"""
reactome_manager = ReactomeManager()  # Initialize the Reactome Manager

# Keep only the names of the pathways whose species is 'Homo sapiens'
reactome_pathways = [
    reactome_pathway.name
    for reactome_pathway in reactome_manager.get_all_pathways()
    if reactome_pathway.species.name == 'Homo sapiens'  # Filter to get only Human Pathways
]

# Sanity check on the database content: warn (without stopping the notebook) when the
# number of pathways differs from the expected 2171. A plain `if` is used instead of
# `assert`, because assertions are skipped when Python runs with -O.
if len(reactome_pathways) != 2171:
    print('The number of pathways in the database {} does not match 2171'.format(len(reactome_pathways)))

"""WikiPathways"""
wikipathways_manager = WikiPathwaysManager()  # Initialize the WikiPathways Manager

# Names of all the pathways returned by the WikiPathways manager
wikipathways_pathways = [
    wikipathways_pathway.name
    for wikipathways_pathway in wikipathways_manager.get_all_pathways()
]

# Sanity check on the database content: warn (without stopping the notebook) when the
# number of pathways differs from the expected 423. A plain `if` is used instead of
# `assert`, because assertions are skipped when Python runs with -O.
if len(wikipathways_pathways) != 423:
    print('The number of pathways in the database {} does not match 423'.format(len(wikipathways_pathways)))
    

### Pathway mapping based on name similarity

In [None]:
# Names of the pathway databases whose pathway names are compared
pathway_databases = ['KEGG', 'Reactome','WikiPathways']

# Print every unordered pair of databases (each pair appears exactly once)
for pathway_database_1, pathway_database2 in itt.combinations(pathway_databases, 2):
    print("{} vs {}".format(pathway_database_1, pathway_database2))

In [None]:
def calculate_similarity(name_1, name_2):
    """Score how alike two names are, as a float between 0 and 1.

    The score is ``difflib.SequenceMatcher.ratio()`` computed without a junk
    function (``isjunk=None``): 1.0 means the two strings are identical and
    0.0 means they have nothing in common. Note that this is difflib's
    matching-blocks ratio, not a Levenshtein edit distance.

    :param str name_1: first name
    :param str name_2: second name
    :rtype: float
    :return: similarity ratio in the range [0, 1]
    """
    matcher = SequenceMatcher(isjunk=None, a=name_1, b=name_2)
    return matcher.ratio()

def get_top_matches(names, top=5):
    """Rank (name, score) pairs by score and return the best ones first.

    Pairs are ordered by their second element, highest first; pairs with equal
    scores keep their relative input order. The input list is left unmodified.

    :param list[tuple[str,float]] names: (name, score) pairs
    :param int top: size of the leading slice of the ranking to return
    :rtype: list[tuple[str,float]]
    :return: the first ``top`` pairs of the ranking
    """
    ranked = list(names)
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:top]

def filter_results(results, threshold=0.6):
    """Keep only the (name, score) pairs whose score is strictly above a cutoff.

    :param list[tuple[str,float]] results: (name, score) pairs
    :param float threshold: exclusive lower bound on the score
    :rtype: list[tuple[str,float]]
    :return: the pairs scoring above ``threshold``, in their input order
    """
    kept = []
    for name, score in results:
        if score > threshold:
            kept.append((name, score))
    return kept
    
def create_template(pathways_1, pathways_2, top=5):
    """Create the column of name similarity in the curation template.

    Every pathway name of resource 1 is compared against every pathway name of
    resource 2. For each name of resource 1, the ``top`` most similar names of
    resource 2 are selected, those that do not pass ``filter_results`` (default
    threshold) are dropped, and the remaining names are joined with newlines
    into a single cell, most similar first.

    :param list pathways_1: pathway list from resource 1 (becomes the index, sorted)
    :param list pathways_2: pathway list from resource 2 (candidate matches)
    :param int top: number of most similar pathways added to the column
    :rtype: pandas.DataFrame
    :return: one-column data frame indexed by the sorted names of ``pathways_1``
    """
    # Order alphabetically to keep the same order as other comparisons
    pathways_1 = sorted(pathways_1)
    pathways_2 = sorted(pathways_2)

    curation_template = []

    for pathway_1 in pathways_1:
        similarities_with_pathways_2 = [
            (pathway_2, calculate_similarity(pathway_1, pathway_2))
            for pathway_2 in pathways_2
        ]

        # Order pathways by descending similarity and keep the `top` best ones.
        # `top` must be forwarded here so the content agrees with the column header below.
        top_pathways = get_top_matches(similarities_with_pathways_2, top=top)

        # Future row in the exported file. Names whose similarity does not exceed
        # the default threshold of filter_results are left out.
        curation_template.append(
            ['\n'.join(name for name, _ in filter_results(top_pathways))]
        )

    return pd.DataFrame(
        curation_template,
        index=pathways_1,
        columns=['Top {} pathways according to string similarity'.format(top)],
    )


KEGG vs Reactome

In [None]:
%%time
# For every KEGG pathway name, collect the most similar Reactome pathway names
kegg_vs_reactome = create_template(kegg_pathways, reactome_pathways)

# Export the column as CSV (relative path, so it is written to the current working directory)
kegg_vs_reactome.to_csv('kegg_vs_reactome_string_similarity.csv')

KEGG vs WikiPathways

In [None]:
%%time
# For every KEGG pathway name, collect the most similar WikiPathways pathway names
kegg_vs_wikipathways = create_template(kegg_pathways, wikipathways_pathways)

# Export the column as CSV (relative path, so it is written to the current working directory)
kegg_vs_wikipathways.to_csv('kegg_vs_wikipathways_string_similarity.csv')

WikiPathways vs Reactome

In [None]:
%%time
# For every WikiPathways pathway name, collect the most similar Reactome pathway names
wikipathways_vs_reactome = create_template(wikipathways_pathways, reactome_pathways)

# Export the column as CSV (relative path, so it is written to the current working directory)
wikipathways_vs_reactome.to_csv('wikipathways_vs_reactome_string_similarity.csv')