# Clustergrammer heatmap

#### This notebook generates the JSON files needed to plot the clustergrammer heatmap

Author: [Daniel Domingo-Fernández](https://github.com/ddomingof) 

In [49]:
import time
import sys
import os

from clustergrammer import Network
import pandas as pd
from collections import defaultdict
import itertools as itt

In [50]:
# Record when this notebook was executed (local time, shown as the cell output).
time.asctime()

'Fri Feb 16 13:30:26 2018'

In [51]:
# Print the version of the Python interpreter used for this run.
print(sys.version)

3.4.5 (default, Dec 11 2017, 14:22:24) 
[GCC 4.8.5 20150623 (Red Hat 4.8.5-16)]


In [52]:
%matplotlib inline

Define location of gene set files

In [53]:
# Base directory under which the gene set files are looked up, read from the
# COMPATH environment variable. Raises KeyError if the variable is not set.
# NOTE(review): presumably the root of a ComPath source checkout - confirm.
BASE_PATH = os.environ['COMPATH']

In [54]:
# Paths of the three gene set files. Despite the 'excel' directory and the
# `_excel` variable suffix, the files themselves are CSV.
kegg_excel, reactome_excel, wikipathways_excel = (
    os.path.join(BASE_PATH, 'src', 'compath', 'static', 'resources', 'excel', file_name)
    for file_name in (
        'kegg_gene_sets.csv',
        'reactome_gene_sets.csv',
        'wikipathways_gene_sets.csv',
    )
)

In [55]:
def create_pathway_gene_set_dict(dataframe):
    """Build a mapping from pathway name to the set of its genes.

    Every column of the data frame is treated as one pathway: the column label
    is the pathway name and the string cells of that column are its genes.
    Cells that are not strings (e.g. NaN for missing values) are ignored, and a
    pathway without any string cell gets no entry in the result.

    :param pandas.DataFrame dataframe: gene sets, one pathway per column
    :rtype: collections.defaultdict
    :returns: dictionary mapping each pathway name to its set of genes
    """
    gene_sets_by_pathway = defaultdict(set)

    for column_label in dataframe:  # iterating a DataFrame yields its column labels
        genes = {
            cell
            for cell in dataframe[column_label].unique()
            if isinstance(cell, str)
        }

        if genes:  # only pathways with at least one gene become keys
            gene_sets_by_pathway[column_label].update(genes)

    return gene_sets_by_pathway

Load KEGG

In [56]:
# dtype=object disables pandas' type inference for the cell values
kegg_dataframe = pd.read_csv(kegg_excel, dtype=object)

# Strip the organism suffix from the KEGG pathway names (the column labels)
kegg_dataframe.columns = kegg_dataframe.columns.map(
    lambda pathway_name: pathway_name.replace(' - Homo sapiens (human)', '')
)

kegg_pathways = create_pathway_gene_set_dict(kegg_dataframe)

# Sanity check on the expected number of KEGG pathways
assert len(kegg_pathways) == 323

Load Reactome

In [57]:
# dtype=object disables pandas' type inference for the cell values
reactome_dataframe = pd.read_csv(reactome_excel, dtype=object)

reactome_pathways = create_pathway_gene_set_dict(reactome_dataframe)

# Sanity check: 2636 pathways in total, of which 2132 are not empty
assert len(reactome_pathways) == 2132

Load WikiPathways

In [58]:
# dtype=object disables pandas' type inference for the cell values
wikipathways_dataframe = pd.read_csv(wikipathways_excel, dtype=object)

wikipathways_pathways = create_pathway_gene_set_dict(wikipathways_dataframe)

# Sanity check on the expected number of WikiPathways pathways
assert len(wikipathways_pathways) == 408

In [59]:
def create_similarity_matrix(dataset):
    """Create a pairwise similarity matrix for a pathway/gene set dataset.

    The similarity of two pathways is the size of the intersection of their
    gene sets divided by the size of the smaller of the two sets, so every
    value lies between 0.0 and 1.0 and the matrix is symmetric. A pair in
    which at least one gene set is empty gets a similarity of 0.0 (this
    includes an empty pathway compared with itself).

    :param dict dataset: pathway gene set dictionary (pathway name -> set of genes)
    :rtype: pandas.DataFrame
    :returns: similarity matrix with the sorted pathway names as both index and columns
    """

    index = sorted(dataset.keys())
    similarity_dataframe = pd.DataFrame(0.0, index=index, columns=index)

    # The measure is symmetric, so each unordered pair (including the diagonal)
    # is computed once and written to both mirrored cells.
    for pathway_1, pathway_2 in itt.combinations_with_replacement(index, 2):
        genes_1 = dataset[pathway_1]
        genes_2 = dataset[pathway_2]

        smaller_set = min(len(genes_1), len(genes_2))

        if smaller_set == 0:
            # Nothing can be shared with an empty gene set; keep the initial 0.0
            # instead of dividing by zero.
            continue

        similarity = len(genes_1.intersection(genes_2)) / smaller_set

        # Label-based scalar assignment. The chained form `df[col][row] = value`
        # may write to a temporary copy and leave the matrix unchanged.
        similarity_dataframe.at[pathway_1, pathway_2] = similarity
        similarity_dataframe.at[pathway_2, pathway_1] = similarity

    return similarity_dataframe

Create similarity matrix

In [46]:
# Build the pairwise similarity matrix for KEGG only; the equivalent Reactome
# and WikiPathways calls below are left disabled.
# NOTE: the 'matirx' spelling is kept because later cells refer to this name.
kegg_similarity_matirx = create_similarity_matrix(kegg_pathways)
# reactome_similarity_matirx = create_similarity_matrix(reactome_pathways)
# wikipathways_similarity_matirx = create_similarity_matrix(wikipathways_pathways)

In [66]:
# Display the KEGG similarity matrix as the cell output.
kegg_similarity_matirx

Unnamed: 0,2-Oxocarboxylic acid metabolism,ABC transporters,AGE-RAGE signaling pathway in diabetic complications,AMPK signaling pathway,Acute myeloid leukemia,Adherens junction,Adipocytokine signaling pathway,Adrenergic signaling in cardiomyocytes,African trypanosomiasis,"Alanine, aspartate and glutamate metabolism",...,Vitamin B6 metabolism,Vitamin digestion and absorption,Wnt signaling pathway,alpha-Linolenic acid metabolism,beta-Alanine metabolism,cAMP signaling pathway,cGMP-PKG signaling pathway,mRNA surveillance pathway,mTOR signaling pathway,p53 signaling pathway
2-Oxocarboxylic acid metabolism,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.222222,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ABC transporters,0.000000,1.000000,0.000000,0.022727,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.041667,0.000000,0.000000,0.000000,0.045455,0.000000,0.000000,0.000000,0.000000
AGE-RAGE signaling pathway in diabetic complications,0.000000,0.000000,1.000000,0.111111,0.318182,0.125000,0.159420,0.161616,0.352941,0.000000,...,0.000000,0.000000,0.151515,0.000000,0.000000,0.202020,0.131313,0.000000,0.171717,0.073529
AMPK signaling pathway,0.000000,0.022727,0.111111,1.000000,0.227273,0.041667,0.478261,0.225000,0.000000,0.000000,...,0.000000,0.000000,0.016667,0.000000,0.032258,0.150000,0.133333,0.175824,0.266667,0.044118
Acute myeloid leukemia,0.000000,0.000000,0.318182,0.227273,1.000000,0.090909,0.151515,0.075758,0.000000,0.000000,...,0.000000,0.000000,0.106061,0.000000,0.000000,0.272727,0.136364,0.000000,0.409091,0.015152
Adherens junction,0.000000,0.000000,0.125000,0.041667,0.090909,1.000000,0.000000,0.027778,0.000000,0.000000,...,0.000000,0.000000,0.263889,0.000000,0.000000,0.125000,0.055556,0.000000,0.069444,0.000000
Adipocytokine signaling pathway,0.000000,0.000000,0.159420,0.478261,0.151515,0.000000,1.000000,0.043478,0.029412,0.000000,...,0.000000,0.000000,0.043478,0.000000,0.000000,0.159420,0.086957,0.000000,0.173913,0.000000
Adrenergic signaling in cardiomyocytes,0.000000,0.000000,0.161616,0.225000,0.075758,0.027778,0.043478,1.000000,0.176471,0.000000,...,0.000000,0.000000,0.084507,0.000000,0.000000,0.472222,0.500000,0.208791,0.041667,0.000000
African trypanosomiasis,0.000000,0.000000,0.352941,0.000000,0.000000,0.000000,0.029412,0.176471,1.000000,0.000000,...,0.000000,0.041667,0.205882,0.000000,0.000000,0.000000,0.147059,0.000000,0.117647,0.029412
"Alanine, aspartate and glutamate metabolism",0.222222,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,...,0.000000,0.041667,0.000000,0.000000,0.096774,0.000000,0.000000,0.000000,0.000000,0.000000


In [60]:
# Load the KEGG similarity matrix into a clustergrammer Network, then
# normalize, filter and cluster it. The calls below modify `net` in order.
net = Network()
net.load_df(kegg_similarity_matirx)

# Z-score normalize the rows
# NOTE(review): keep_orig=True presumably retains the un-normalized values as
# well - confirm against the clustergrammer documentation.
net.normalize(axis='row', norm_type='zscore', keep_orig=True)

# Filter for the top 100 columns based on their absolute value sum
net.filter_N_top('col', 100, 'sum')

# Cluster with the library's default arguments
net.cluster()

In [64]:
# Save the visualization JSON to file for use by the front end.
# The file name is relative; presumably it is resolved against the current
# working directory - confirm before relying on the output location.
net.write_json_to_file('viz', 'kegg_clustergrammer.json')