In [1]:
from itertools import product
from collections import Counter, defaultdict

import re
import os
import sys
import graco
import imgkit
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
class InputParameters():
    """Parameter bundle identifying one clustering/enrichment configuration.

    Class attributes are settings shared by every instance; instance
    attributes name the network, feature, metric, clustering method and
    annotation aspect of a single configuration.
    """
    # First command-line argument when one was passed, otherwise None.
    # Reading sys.argv[1] unconditionally raises IndexError at class-definition
    # time when the code runs without arguments.
    RUN   = sys.argv[1] if len(sys.argv) > 1 else None
    RANGE = 10

    # NOTE(review): of the settings below only CORRECTION is read by the
    # visible code (it becomes a path component in Paths); the others look
    # like enrichment-analysis thresholds -- confirm against their consumers.
    ALPHA = 0.05
    MIN_GO = 5
    MAX_GO = 500
    MIN_LVL = 0
    MAX_LVL = np.inf
    CORRECTION = 'BY'

    def __init__(self, network_name, feature, metric, method, aspect):
        """Store the five identifiers of one configuration unchanged."""
        self.network_name = network_name
        self.feature = feature
        self.metric  = metric
        self.method  = method
        self.aspect  = aspect

class Paths():
    """File and directory locations for one parameter combination.

    The class attributes describe the fixed directory layout below
    DATA_DIRECTORY. Instance attributes are derived from the parameter
    object passed to the constructor.

    Side effect: constructing an instance creates ENRICHMENT_DIRECTORY
    (including missing parents) on disk.
    """
    # NOTE(review): absolute, machine-specific location; everything else is
    # derived from it.
    DATA_DIRECTORY = "/Users/markusyoussef/Desktop/git/supplements/data"
    RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw_data"
    YEAST_DIRECTORY = f"{DATA_DIRECTORY}/processed_data/yeast"
    NETWORK_DIRECTORY = f"{YEAST_DIRECTORY}/networks"
    ANNOTATION_DIRECTORY = f"{YEAST_DIRECTORY}/annotations"

    def __init__(self, in_parms):
        """Derive all per-configuration paths from `in_parms`.

        `in_parms` must provide the attributes network_name, feature,
        metric, method, aspect and CORRECTION (e.g. an InputParameters
        instance).
        """
        self.NETWORK_FILE    = f"{self.NETWORK_DIRECTORY}/{in_parms.network_name}.txt"
        self.ANNOTATION_FILE = f"{self.ANNOTATION_DIRECTORY}/GO_{in_parms.aspect}_systematic_SGD.csv"

        # Shared sub-path: <network>/<feature>/<metric>/<method>
        network_to_method = f"{in_parms.network_name}/{in_parms.feature}/{in_parms.metric}/{in_parms.method}"
        self.CLUSTER_DIRECTORY    = f"{self.YEAST_DIRECTORY}/clusterings/"   \
                                    f"{network_to_method}"
        self.PVALUE_DIRECTORY     = f"{self.YEAST_DIRECTORY}/pvalues/"       \
                                    f"{network_to_method}/{in_parms.aspect}"
        self.ENRICHMENT_DIRECTORY = f"{self.YEAST_DIRECTORY}/enrichments/"   \
                                    f"{network_to_method}/{in_parms.aspect}/{in_parms.CORRECTION}"

        # exist_ok=True replaces the separate existence check, so a directory
        # created between check and creation can no longer make this fail.
        os.makedirs(self.ENRICHMENT_DIRECTORY, exist_ok=True)

In [3]:
# =============================================================================
#  ----------------------------------- INIT -----------------------------------
# =============================================================================
# Parameter grid for the single-feature tables: every combination of the
# lists below is evaluated.

network_names = [
    'systematic_PPI_BioGRID',
    'GI_Constanzo2016',
    'systematic_CoEx_COEXPRESdb',
]
features = ['GCV-DG-3']
# 'mahalanobis' and 'GDV_similarity' are deliberately left out of this grid.
metrics = [
    'cityblock',
    'euclidean',
    'chebyshev',
    'canberra',
    'cosine',
    'correlation',
    'braycurtis',
    'sqeuclidean',
    'hellinger',
    'js_divergence',
    'seuclidean',
]
methods = ['kmedoid']
aspects = ['BP', 'MF', 'CC']

In [25]:
# One (initially empty) metric x network table per annotation aspect.
GDV_dfs = {aspect: pd.DataFrame() for aspect in aspects}

loop_product = product(network_names, features, metrics, methods, aspects)
for network_name, feature, metric, method, aspect in loop_product:
    in_parms = InputParameters(network_name, feature, metric, method, aspect)
    ENRICHMENT_DIRECTORY = Paths(in_parms).ENRICHMENT_DIRECTORY

    # Mean of every '*genes.csv' file, then the mean over those files.
    AUCs = []
    for file in os.listdir(ENRICHMENT_DIRECTORY):
        if file.endswith('genes.csv'):
            AUCs.append(np.mean(np.loadtxt(f"{ENRICHMENT_DIRECTORY}/{file}")))
    GDV_dfs[aspect].loc[metric, network_name] = np.mean(AUCs)

    # Sanity check on the directory content.
    # NOTE(review): presumably three output files per run and 10 runs
    # (cf. InputParameters.RANGE) -- confirm.
    files_per_three = len(os.listdir(ENRICHMENT_DIRECTORY)) / 3
    assert files_per_three == 10, files_per_three

In [8]:
# Metrics of the PPI network ranked by mean enrichment: one
# (metric, enrichment) column pair per annotation aspect.
network = 'systematic_PPI_BioGRID'
df = pd.concat([GDV_dfs[aspect].nlargest(15, network)[[network]].reset_index() for aspect in aspects], axis=1)
df.columns = pd.MultiIndex.from_product([aspects, ('metric','enrichment')])
# 1-based rank labels sized from the table itself; a hard-coded length raises
# a length-mismatch error as soon as the number of ranked metrics changes.
df.index = range(1, len(df) + 1)

In [9]:
# Top five metrics of the co-expression network ranked by mean enrichment:
# one (metric, enrichment) column pair per annotation aspect.
network = 'systematic_CoEx_COEXPRESdb'
df = pd.concat([GDV_dfs[aspect].nlargest(5, network)[[network]].reset_index() for aspect in aspects], axis=1)
df.columns = pd.MultiIndex.from_product([aspects, ('metric','enrichment')])
# 1-based rank labels sized from the table itself; a hard-coded length raises
# a length-mismatch error when fewer than five rows are available.
df.index = range(1, len(df) + 1)

In [10]:
# Metrics of the genetic-interaction network ranked by mean enrichment: one
# (metric, enrichment) column pair per annotation aspect.
network = 'GI_Constanzo2016'
df = pd.concat([GDV_dfs[aspect].nlargest(11, network)[[network]].reset_index() for aspect in aspects], axis=1)
df.columns = pd.MultiIndex.from_product([aspects, ('metric','enrichment')])
# 1-based rank labels sized from the table itself; a hard-coded length raises
# a length-mismatch error as soon as the number of ranked metrics changes.
df.index = range(1, len(df) + 1)

### Combined tables

In [122]:
import re
import pdfkit
import imgkit

In [26]:
# Build one ranking table per network. Each table has a three-level column
# index (network label, annotation aspect, metric/enrichment).
# No row index is assigned here: the tables keep their default 0-based index
# so that they line up when concatenated column-wise. The former
# `df.index = ...` lines wrote to an unrelated, previously defined `df`
# instead of the table just built and have been dropped.

network = 'systematic_PPI_BioGRID'
df_PPI = pd.concat([GDV_dfs[aspect].nlargest(15, network)[[network]].reset_index() for aspect in aspects], axis=1)
df_PPI.columns = pd.MultiIndex.from_product([('PPI',), aspects, ('metric','enrichment')])
df_PPI.columns.names = ['network', 'annotation', ' ']

network = 'systematic_CoEx_COEXPRESdb'
df_CoEx = pd.concat([GDV_dfs[aspect].nlargest(15, network)[[network]].reset_index() for aspect in aspects], axis=1)
df_CoEx.columns = pd.MultiIndex.from_product([('CoEx',), aspects, ('metric','enrichment')])
df_CoEx.columns.names = ['network', 'annotation', ' ']

network = 'GI_Constanzo2016'
df_GI = pd.concat([GDV_dfs[aspect].nlargest(15, network)[[network]].reset_index() for aspect in aspects], axis=1)
df_GI.columns = pd.MultiIndex.from_product([('GI',), aspects, ('metric','enrichment')])
df_GI.columns.names = ['network', 'annotation', ' ']

In [30]:
# Place the three per-network tables side by side and label the rows with
# (feature name, 1-based rank).
network_tables = [df_PPI, df_CoEx, df_GI]
df = pd.concat(network_tables, axis=1)
rank_labels = range(1, len(df_PPI) + 1)
df.index = pd.MultiIndex.from_product([(features[0],), rank_labels])

In [31]:
# Render the combined table as HTML and highlight every cell naming `metric`
# together with the enrichment cell right next to it.
metric = 'js_divergence'
highlight_open = "<th bgcolor='FFAAAA'>"

# A metric cell immediately followed by its value cell. Pairing the two
# structurally replaces the former fixed-offset slice plus replace-by-value,
# which also coloured unrelated cells that happened to hold the same number
# and relied on every value being exactly eight characters wide.
metric_cell_pair = re.compile(rf"<td>{re.escape(metric)}</td>(\s*)<td>([^<]*)</td>")

html = metric_cell_pair.sub(
    lambda m: f"{highlight_open}{metric}</th>{m.group(1)}{highlight_open}{m.group(2)}</th>",
    df.to_html(),
)

In [32]:
# Write the highlighted HTML table to disk and convert that file to a PDF.
file_in  = '/Users/markusyoussef/Desktop/jupyter_output/test.html'
file_out = '/Users/markusyoussef/Desktop/jupyter_output/test.pdf'

with open(file_in, 'w') as f:
    f.write(html)

# The module is imported as `pdfkit`; the previously used name `pdf` is not
# defined anywhere in the notebook and fails on a fresh kernel.
pdfkit.from_file(file_in, file_out)

Loading pages (1/6)
Printing pages (6/6)


True

In [33]:
# Display the combined ranking table.
df

Unnamed: 0_level_0,network,PPI,PPI,PPI,PPI,PPI,PPI,CoEx,CoEx,CoEx,CoEx,CoEx,CoEx,GI,GI,GI,GI,GI,GI
Unnamed: 0_level_1,annotation,BP,BP,MF,MF,CC,CC,BP,BP,MF,MF,CC,CC,BP,BP,MF,MF,CC,CC
Unnamed: 0_level_2,Unnamed: 1_level_2,metric,enrichment,metric,enrichment,metric,enrichment,metric,enrichment,metric,enrichment,metric,enrichment,metric,enrichment,metric,enrichment,metric,enrichment
GCV-DG-3,1,canberra,0.235491,canberra,0.129287,canberra,0.206356,js_divergence,0.145262,canberra,0.077627,js_divergence,0.146301,js_divergence,0.044375,correlation,0.278515,sqeuclidean,0.040648
GCV-DG-3,2,js_divergence,0.234719,hellinger,0.121927,hellinger,0.199499,canberra,0.142002,js_divergence,0.076456,canberra,0.143133,cosine,0.043731,seuclidean,0.269973,js_divergence,0.039524
GCV-DG-3,3,hellinger,0.232222,js_divergence,0.118225,js_divergence,0.197564,hellinger,0.137996,hellinger,0.074144,hellinger,0.140915,hellinger,0.04254,braycurtis,0.269075,cosine,0.038001
GCV-DG-3,4,seuclidean,0.155,seuclidean,0.089756,seuclidean,0.173336,sqeuclidean,0.129546,correlation,0.071774,sqeuclidean,0.137294,sqeuclidean,0.042442,euclidean,0.262469,canberra,0.03775
GCV-DG-3,5,cityblock,0.15273,cityblock,0.070266,cityblock,0.139984,cosine,0.128143,braycurtis,0.07013,cityblock,0.135548,canberra,0.040387,chebyshev,0.259513,hellinger,0.034958
GCV-DG-3,6,braycurtis,0.151658,euclidean,0.070155,braycurtis,0.137036,seuclidean,0.126906,cityblock,0.069363,cosine,0.135148,chebyshev,0.039819,cityblock,0.25672,correlation,0.034718
GCV-DG-3,7,euclidean,0.143119,braycurtis,0.069335,euclidean,0.132217,cityblock,0.12661,sqeuclidean,0.069149,braycurtis,0.134477,euclidean,0.038439,cosine,0.253029,euclidean,0.034371
GCV-DG-3,8,chebyshev,0.136381,chebyshev,0.06547,chebyshev,0.127896,braycurtis,0.125288,seuclidean,0.068745,euclidean,0.132555,cityblock,0.038311,sqeuclidean,0.23662,braycurtis,0.032066
GCV-DG-3,9,sqeuclidean,0.129519,sqeuclidean,0.064114,sqeuclidean,0.116837,chebyshev,0.123126,cosine,0.06786,chebyshev,0.132369,braycurtis,0.038194,hellinger,0.217942,cityblock,0.031641
GCV-DG-3,10,cosine,0.101001,cosine,0.052773,cosine,0.091332,euclidean,0.121863,euclidean,0.067545,seuclidean,0.131831,correlation,0.032096,canberra,0.203655,chebyshev,0.030508


---

## All together

In [8]:
# =============================================================================
#  ----------------------------------- INIT -----------------------------------
# =============================================================================
# Parameter lists for the "all together" report.

network_names = ['systematic_PPI_BioGRID', 'GI_Constanzo2016',
                 'systematic_CoEx_COEXPRESdb']
# Every feature is listed exactly once. 'GCV-DG' used to appear twice, which
# made the per-feature loop render the same table twice; the first occurrence
# is kept.
features = ['GDV', 'GCV-3', 'GCV-DG', 'GCV-O', 'GCV-O+',
            'GCV-A', 'GCV-G', 'GCV-DA', 'GCV-DAG', 'GCV-DAG-reduced',
            'GCV-DG-2', 'GCV-DG-3', 'triangle', 'single_descriptor']
metrics  = ['mahalanobis', 'GDV_similarity', 'triangle',
            'cityblock', 'euclidean', 'chebyshev', 'canberra',
            'cosine', 'correlation', 'braycurtis', 'sqeuclidean',
            'hellinger', 'js_divergence', 'seuclidean'
]
methods  = ['kmedoid']
aspects  = ['BP', 'MF', 'CC']

In [9]:
# Each line of the file becomes one list of whitespace-separated tokens.
# Judging by the later lookup, a line holds [network_name, feature, metric].
with open("/Users/markusyoussef/Desktop/git/supplements/data/processed_data/yeast/clusterings/output.txt", 'r') as f:
    unbalanced_clusters = list(map(str.split, f))

In [10]:
def edit_unbalanced_cells(html, color='FFC0C0'):
    """Restyle the first '-1' sentinel cell of an HTML table and its label cell.

    The first ``<td>`` whose text starts with ``-1`` is replaced by a centred
    `` - `` cell with a background colour, and the ``<td>`` cell directly in
    front of it receives the same background colour. Every identical
    (label cell, sentinel cell) pair in `html` is replaced in the same call.

    Parameters
    ----------
    html : str
        HTML text, e.g. the output of ``DataFrame.to_html()``.
    color : str, optional
        Background colour as a hex string without the leading ``#``.

    Returns
    -------
    str
        The restyled HTML. `html` is returned unchanged when it contains no
        ``<td>-1`` cell.
    """
    before, marker, after = html.partition('<td>-1')
    if not marker:
        # No sentinel cell: nothing to restyle. Without this guard the last
        # ordinary cell of the document would be rewritten.
        return html

    label_start = before.rfind('<td>')
    if label_start == -1:
        # The sentinel is not preceded by a cell; restyle only the sentinel.
        label_cell = ''
        styled_label = ''
    else:
        # Includes the label's closing tag and the whitespace up to the sentinel.
        label_cell = before[label_start:]
        styled_label = f'<td style="background-color:#{color}">' + label_cell[4:]

    value_end = after.find('</td>')
    find_str = label_cell + marker + after[:value_end + 5]
    repl_str = styled_label + \
               f'<td style="background-color:#{color}; text-align: center"> - </td>'

    return html.replace(find_str, repl_str)

In [11]:
def _ranked_metric_table(aspect_tables, network, label):
    """Return the metrics of `network` ranked by enrichment, per aspect side by side.

    `aspect_tables` maps every entry of `aspects` to a DataFrame indexed by
    metric with one column per network name. The result has a three-level
    column index (`label`, aspect, 'metric'/'enrichment') and the default
    0-based row index.
    """
    ranked = [aspect_tables[aspect].nlargest(15, network)[[network]].reset_index()
              for aspect in aspects]
    table = pd.concat(ranked, axis=1)
    table.columns = pd.MultiIndex.from_product([(label,), aspects, ('metric','enrichment')])
    table.columns.names = ['network', 'annotation', ' ']
    return table


def _highlight_metric(html, metric, color):
    """Colour every plain `metric` cell and the value cell directly after it.

    Cells are paired structurally (metric cell followed by the next cell), so
    an unrelated cell holding the same number is left alone and the width of
    the value text does not matter. Cells that already carry a style
    attribute are not touched.
    """
    styled_open = f'<td style="background-color:#{color}">'
    cell_pair = re.compile(rf'<td>{re.escape(metric)}</td>(\s*)<td>([^<]*)</td>')
    return cell_pair.sub(
        lambda m: f'{styled_open}{metric}</td>{m.group(1)}{styled_open}{m.group(2)}</td>',
        html,
    )


html_tables = []

for feature in features:

    # One (initially empty) metric x network table per annotation aspect,
    # refilled for every feature.
    GDV_dfs = {aspect: pd.DataFrame() for aspect in aspects}

    for network_name, aspect in product(network_names, aspects):
        path = f"{Paths.DATA_DIRECTORY}/processed_data/yeast/enrichments/{network_name}/{feature}"
        # The metrics are whatever sub-directories exist for this network/feature.
        for metric in os.listdir(path):
            if metric.startswith('current') and network_name == 'systematic_CoEx_COEXPRESdb':
                continue
            if metric.startswith('.'):
                continue
            in_parms = InputParameters(network_name, feature, metric, 'kmedoid', aspect)
            ENRICHMENT_DIRECTORY = Paths(in_parms).ENRICHMENT_DIRECTORY
            enrichment_files = os.listdir(ENRICHMENT_DIRECTORY)
            AUCs = [np.mean(np.loadtxt(f"{ENRICHMENT_DIRECTORY}/{file}"))
                    for file in enrichment_files if file.endswith('genes.csv')]

            if [network_name, feature, metric] in unbalanced_clusters:
                # Sentinel for combinations listed as unbalanced; rendered as
                # ' - ' by edit_unbalanced_cells further down.
                GDV_dfs[aspect].loc[metric, network_name] = -1
            else:
                GDV_dfs[aspect].loc[metric, network_name] = np.mean(AUCs)

            # NOTE(review): presumably three output files per run and 10 runs
            # (cf. InputParameters.RANGE) -- confirm.
            assert len(enrichment_files)/3 == 10, len(enrichment_files)/3

    df_PPI  = _ranked_metric_table(GDV_dfs, 'systematic_PPI_BioGRID', 'PPI')
    df_CoEx = _ranked_metric_table(GDV_dfs, 'systematic_CoEx_COEXPRESdb', 'CoEx')
    df_GI   = _ranked_metric_table(GDV_dfs, 'GI_Constanzo2016', 'GI')

    df = pd.concat([df_PPI, df_CoEx, df_GI], axis=1)
    # Size the rank labels from the combined table: the per-network tables can
    # differ in length because each network has its own set of metric
    # directories, and the concatenation is as long as the longest of them.
    df.index = pd.MultiIndex.from_product([(feature,), range(1, len(df) + 1)])

    # Metrics to highlight, in the order of the colours below.
    if feature == 'GDV':
        top_metrics = ('mahalanobis', 'canberra', 'GDV_similarity')
    else:
        top_metrics = ('js_divergence', 'hellinger', 'canberra')

    html = df.to_html()

    # Restyle sentinel cells first so that they are no longer plain <td> cells
    # when the top metrics are highlighted.
    while '<td>-1' in html:
        html = edit_unbalanced_cells(html)

    for top_metric, color in zip(top_metrics, ['FFDDC0', 'B8F4B8', 'B0E9E9']):
        html = _highlight_metric(html, top_metric, color)

    html_tables.append(html)

# Every table, including the last one, is followed by a horizontal rule.
final_html = ''.join(table_html + '\n\n<hr>\n\n' for table_html in html_tables)
final_html = final_html.replace('GDV_similarity','GDV_distance')

In [12]:
# Write the concatenated per-feature tables to an HTML file and have imgkit
# render that file to test.jpg.
file_in  = '/Users/markusyoussef/Desktop/jupyter_output/test.html'
file_out = '/Users/markusyoussef/Desktop/jupyter_output/test.jpg'

with open(file_in, mode='w') as html_file:
    html_file.write(final_html)

imgkit.from_file(file_in, file_out)

Loading page (1/2)


True

In [45]:
# Scratch arithmetic (evaluates to 1170).
# NOTE(review): the meaning of the factors is not stated anywhere in the
# notebook -- possibly an expected file count; confirm or remove.
13*3*3*10

1170

In [75]:
# Display the most recently built PPI ranking table.
df_PPI

network,PPI,PPI,PPI,PPI,PPI,PPI
annotation,BP,BP,MF,MF,CC,CC
Unnamed: 0_level_2,metric,enrichment,metric,enrichment,metric,enrichment
0,js_divergence,0.249667,canberra,0.132995,canberra,0.214683
1,hellinger,0.246867,hellinger,0.130024,hellinger,0.211099
2,canberra,0.240663,js_divergence,0.122042,js_divergence,0.207351
3,braycurtis,0.164551,seuclidean,0.094326,seuclidean,0.175526
4,cityblock,0.164492,euclidean,0.075728,cityblock,0.146285
5,seuclidean,0.16413,cityblock,0.075208,braycurtis,0.142897
6,euclidean,0.1547,braycurtis,0.073555,euclidean,0.139854
7,chebyshev,0.146603,chebyshev,0.068815,chebyshev,0.135523
8,sqeuclidean,0.134808,sqeuclidean,0.064346,sqeuclidean,0.123332
9,cosine,0.104346,cosine,0.051704,cosine,0.093944
