In [1]:
from pathlib import Path
from itertools import product
from collections import Counter, defaultdict

import re
import os
import sys
import graco
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
simple_name = {
    'systematic_PPI_BioGRID'     : 'PPI', 
    'GI_Constanzo2016'           : 'GI',
    'systematic_CoEx_COEXPRESdb' : 'CoEx'
}

In [3]:
class InputParameters():
    RUN   = sys.argv[1]
    RANGE = 10

    ALPHA = 0.05
    MIN_GO = 5
    MAX_GO = 500
    MIN_LVL = 0
    MAX_LVL = np.inf
    CORRECTION = 'BY'

    def __init__(self, network_name, feature, metric, method, aspect):
        self.network_name = network_name
        self.feature = feature
        self.metric  = metric
        self.method  = method
        self.aspect  = aspect

class Paths():
    DATA_DIRECTORY = "/Users/markusyoussef/Desktop/git/supplements/data"
    RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw_data"
    YEAST_DIRECTORY = f"{DATA_DIRECTORY}/processed_data/yeast"
    NETWORK_DIRECTORY = f"{YEAST_DIRECTORY}/networks"
    ANNOTATION_DIRECTORY = f"{YEAST_DIRECTORY}/annotations"

    def __init__(self, in_parms):
        self.NETWORK_FILE    = f"{self.NETWORK_DIRECTORY}/{in_parms.network_name}.txt"
        self.ANNOTATION_FILE = f"{self.ANNOTATION_DIRECTORY}/GO_{in_parms.aspect}_systematic_SGD.csv"

        network_to_method = f"{in_parms.network_name}/{in_parms.feature}/{in_parms.metric}/{in_parms.method}"
        self.CLUSTER_DIRECTORY    = f"{self.YEAST_DIRECTORY}/clusterings/"   \
                                    f"{network_to_method}"
        self.PVALUE_DIRECTORY     = f"{self.YEAST_DIRECTORY}/pvalues/"       \
                                    f"{network_to_method}/{in_parms.aspect}"
        self.ENRICHMENT_DIRECTORY = f"{self.YEAST_DIRECTORY}/enrichments/"   \
                                    f"{network_to_method}/{in_parms.aspect}/{in_parms.CORRECTION}"

        if not os.path.exists(self.ENRICHMENT_DIRECTORY):
            os.makedirs(self.ENRICHMENT_DIRECTORY)

In [4]:
# =============================================================================
#  ----------------------------------- INIT -----------------------------------
# =============================================================================

NETWORK_NAMES = ['systematic_PPI_BioGRID', 'systematic_CoEx_COEXPRESdb','GI_Constanzo2016']
FEATURES = ['GDV', 
            'GCV-A' , 'GCV-G' , 'GCV-O', 
            'GCV-DA', 'GCV-DG', 'GCV-DO',
            'GCV-O+', 'GCV-all', 'GCV-3',
            'triangle']
METRICS  = ['mahalanobis', 'GDV_similarity', 'triangle',
            'cityblock', 'euclidean', 'chebyshev', 'canberra', 
            'cosine', 'correlation', 'braycurtis', 'sqeuclidean', 
            'hellinger', 'js_divergence', 'seuclidean']
METHODS  = ['kmedoid']
ASPECTS  = ['BP', 'MF', 'CC']

In [5]:
with open("/Users/markusyoussef/Desktop/git/supplements/data/processed_data/yeast/clusterings/output.txt", 'r') as f:
    unbalanced_clusters = pd.DataFrame(map(str.split,f), 
                                       columns=['network_name', 'feature', 'metric'])

In [6]:
def id_metric_cells(html, metric, id_name):
    """ 
    Adds id=id_name to all td tags containing ‘metric' and the one immediately after.
    """
    old_strings = re.findall(f'<td>{metric}</td>\s*<td>\s*.*</td>', html)
    for old in old_strings:
        new = old.replace('<td>',f'<td id={id_name}>')
        html = html.replace(old,new)
    return html

In [7]:
def edit_unbalanced_cells(html):
    color = 'FFC0C0'
    before, substr, after = html.partition('<td>-1')
    
    before_idx = before.rfind('<td>')
    after_idx  = after.find( '</td>')
    find_str = before[before_idx:] + substr + after[:after_idx+5]
    repl_str = f'<td style="background-color:#{color}">' + before[before_idx+4:] + \
               f'<td style="background-color:#{color}; text-align: center"> - </td>'
    
    return html.replace(find_str, repl_str)

In [8]:
final_html = ''

for feature in features:
    GDV_dfs = {}
    for aspect in aspects:
        GDV_dfs[aspect] = pd.DataFrame()

    for network_name, aspect in product(network_names, aspects):
        path = Path(Paths.DATA_DIRECTORY)/'processed_data'/'yeast'/'enrichments'/network_name/feature
        for metric_dir in path.glob('*'):
            metric = metric_dir.stem
            in_parms = InputParameters(network_name, feature, metric, 'kmedoid', aspect)
            enrichment_dir = Path(Paths(in_parms).ENRICHMENT_DIRECTORY)
            AUCs = [np.mean(np.loadtxt(enrichment_dir/file)) for file in enrichment_dir.glob('*genes.csv')]

            #if [network_name, feature, metric] in unbalanced_clusters.values:
            #    GDV_dfs[aspect].loc[metric,network_name] = -1
            GDV_dfs[aspect].loc[metric,network_name] = np.mean(AUCs)  
            
            n_files = len(list(enrichment_dir.glob('*')))/3
            assert n_files == 10, n_files
    
    network_df_list = []
    for network in network_names:
        network_df = pd.concat([GDV_dfs[aspect].nlargest(15, network)[[network]].reset_index() for aspect in aspects], axis=1)
        network_df.columns = pd.MultiIndex.from_product([(simple_name[network],), aspects, ('metric','enrichment')])
        network_df.columns.names = ['network', 'annotation', ' ']
        network_df_list.append(network_df)
    
    feature_df = pd.concat(network_df_list, axis=1)
    feature_df.index = pd.MultiIndex.from_product([(feature,), range(1,len(network_df)+1)])
    
    if feature == 'GDV':
        top_metrics = ['mahalanobis', 'canberra', 'GDV_similarity']
    else:
        top_metrics = ['js_divergence', 'hellinger', 'canberra']
        
    html = feature_df.to_html()
    
    for nr, top_metric in enumerate(top_metrics,1):
        html = id_metric_cells(html, top_metric, f'metric{nr}')
        
    final_html += html + '\n\n<hr>\n\n'
    
final_html = final_html.replace('GDV_similarity','GDV_distance')

NameError: name 'features' is not defined

In [9]:
header = """
<!DOCTYPE html>
<html>
<style>
#metric1 {background-color:#FFDDC0}
#metric2 {background-color:#B8F4B8}
#metric3 {background-color:#B0E9E9}
</style>

"""

footer = '</html>'

In [10]:
output_path = '/Users/markusyoussef/Desktop/jupyter_output/test.html'

with open(output_path, 'w') as f:
    f.write(header + final_html + footer)

### New

In [9]:
def id_unbalanced_cells(html, feature_unbalanced_clusters):
    """ 
    Adds id=id_name to all td tags containing ‘metric' and the one immediately after.
    """
    
    for idx, (network_name, feature, metric) in feature_unbalanced_clusters.iterrows():
        idx = NETWORK_NAMES.index(network_name)
        select = slice(3*idx,3*(idx+1))

        for block_str in re.findall(f"(<tr>\s*(?:<th\s.*?)?<th>\d+.*?</tr>)", html, flags=re.DOTALL):
            for old_str in re.findall(2*'<td.*</td>\s*', block_str)[select]:
                if metric in old_str:
                    new_str = old_str.replace('<td>',f'<td id=unbalanced>')
                    html = html.replace(old_str,new_str)
    return html

In [10]:
final_html = ''

for feature in FEATURES:
    GDV_dfs = {}
    for aspect in ASPECTS:
        GDV_dfs[aspect] = pd.DataFrame()

    for network_name, aspect in product(NETWORK_NAMES, ASPECTS):
        path = Path(Paths.DATA_DIRECTORY)/'processed_data'/'yeast'/'enrichments'/network_name/feature
        for metric_dir in path.glob('*'):
            metric = metric_dir.stem
            in_parms = InputParameters(network_name, feature, metric, 'kmedoid', aspect)
            enrichment_dir = Path(Paths(in_parms).ENRICHMENT_DIRECTORY)
            AUCs = [np.mean(np.loadtxt(enrichment_dir/file)) for file in enrichment_dir.glob('*genes.csv')]

            GDV_dfs[aspect].loc[metric,network_name] = np.mean(AUCs)  
            
            n_files = len(list(enrichment_dir.glob('*')))/3
            assert n_files == 10, n_files
    
    network_df_list = []
    for network in NETWORK_NAMES:
        network_df = pd.concat([GDV_dfs[aspect].nlargest(15, network)[[network]].reset_index() for aspect in ASPECTS], axis=1)
        network_df.columns = pd.MultiIndex.from_product([(simple_name[network],), ASPECTS, ('metric','enrichment')])
        network_df.columns.names = ['network', 'annotation', ' ']
        network_df_list.append(network_df)
    
    feature_df = pd.concat(network_df_list, axis=1)
    feature_df.index = pd.MultiIndex.from_product([(feature,), range(1,len(network_df)+1)])
    
    if feature == 'GDV':
        top_metrics = ['mahalanobis', 'canberra', 'GDV_similarity']
    else:
        top_metrics = ['js_divergence', 'hellinger', 'canberra']
        
    html = feature_df.to_html()
    
    feature_unbalanced_clusters = unbalanced_clusters[unbalanced_clusters.feature==feature]
    html = id_unbalanced_cells(html, feature_unbalanced_clusters)
    
    for nr, top_metric in enumerate(top_metrics,1):
        html = id_metric_cells(html, top_metric, f'metric{nr}')
        
    final_html += html + '\n\n<hr>\n\n'
    
final_html = final_html.replace('GDV_similarity','GDV_distance')

In [11]:
header = """
<!DOCTYPE html>
<html>
<head>
<style>
#metric1 {background-color:#FFDDC0}
#metric2 {background-color:#B8F4B8}
#metric3 {background-color:#B0E9E9}
#unbalanced {background-color:#FFC0C0}
</style>
"""

In [12]:
output_path = '/Users/markusyoussef/Desktop/jupyter_output/test.html'

with open(output_path, 'w') as f:
    f.write(header + final_html)

In [20]:
unbalanced_clusters[unbalanced_clusters.feature=='GCV-O']

Unnamed: 0,network_name,feature,metric
2,systematic_CoEx_COEXPRESdb,GCV-O,correlation
13,systematic_PPI_BioGRID,GCV-O,correlation
14,systematic_PPI_BioGRID,GCV-O,cosine
47,GI_Constanzo2016,GCV-O,sqeuclidean
48,GI_Constanzo2016,GCV-O,chebyshev
49,GI_Constanzo2016,GCV-O,correlation
50,GI_Constanzo2016,GCV-O,seuclidean
51,GI_Constanzo2016,GCV-O,braycurtis


In [23]:
[0,1,2,3,4,5][slice(0,3)]

[0, 1, 2]