In [1]:
from itertools import product
from collections import Counter, defaultdict

import os
import sys
import graco
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
class InputParameters():
    """Settings identifying one clustering/enrichment configuration.

    Class attributes hold values shared by every configuration; the
    constructor stores the five identifiers that select one particular
    network / feature / metric / method / aspect combination.

    Parameters
    ----------
    network_name : name of the network (used by `Paths` to build file names)
    feature      : feature identifier (path component)
    metric       : distance-metric identifier (path component)
    method       : clustering-method identifier (path component)
    aspect       : annotation aspect, e.g. 'BP', 'MF' or 'CC' (path component)
    """

    # First command-line argument, or None when none was supplied.
    # Guarded so that merely defining the class cannot raise IndexError
    # (e.g. when the code is imported or run without arguments).
    RUN   = sys.argv[1] if len(sys.argv) > 1 else None
    RANGE = 10

    # Analysis settings. Only CORRECTION is read by the code visible in this
    # notebook (as a directory name in `Paths`); the others are presumably
    # consumed by the enrichment pipeline -- TODO confirm.
    ALPHA = 0.05
    MIN_GO = 5
    MAX_GO = 500
    MIN_LVL = 0
    MAX_LVL = np.inf
    CORRECTION = 'BY'

    def __init__(self, network_name, feature, metric, method, aspect):
        self.network_name = network_name
        self.feature = feature
        self.metric  = metric
        self.method  = method
        self.aspect  = aspect

class Paths():
    """File-system locations derived from an `InputParameters`-like object.

    Class attributes describe the fixed directory layout; the constructor
    derives the per-configuration file and directory names from
    `in_parms.network_name`, `.feature`, `.metric`, `.method`, `.aspect`
    and `.CORRECTION`.

    Side effect: constructing an instance creates ENRICHMENT_DIRECTORY
    (including missing parents) if it does not exist yet.
    """

    # NOTE(review): absolute, machine-specific root; the notebook only runs
    # unchanged on the original author's machine.
    DATA_DIRECTORY = "/Users/markusyoussef/Desktop/git/supplements/data"
    RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw_data"
    YEAST_DIRECTORY = f"{DATA_DIRECTORY}/processed_data/yeast"
    NETWORK_DIRECTORY = f"{YEAST_DIRECTORY}/networks"
    ANNOTATION_DIRECTORY = f"{YEAST_DIRECTORY}/annotations"

    def __init__(self, in_parms):
        self.NETWORK_FILE    = f"{self.NETWORK_DIRECTORY}/{in_parms.network_name}.txt"
        self.ANNOTATION_FILE = f"{self.ANNOTATION_DIRECTORY}/GO_{in_parms.aspect}_systematic_SGD.csv"

        # Sub-path shared by the clustering, p-value and enrichment trees.
        network_to_method = f"{in_parms.network_name}/{in_parms.feature}/{in_parms.metric}/{in_parms.method}"
        self.CLUSTER_DIRECTORY    = f"{self.YEAST_DIRECTORY}/clusterings/"   \
                                    f"{network_to_method}"
        self.PVALUE_DIRECTORY     = f"{self.YEAST_DIRECTORY}/pvalues/"       \
                                    f"{network_to_method}/{in_parms.aspect}"
        self.ENRICHMENT_DIRECTORY = f"{self.YEAST_DIRECTORY}/enrichments/"   \
                                    f"{network_to_method}/{in_parms.aspect}/{in_parms.CORRECTION}"

        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test when several processes start at once.
        os.makedirs(self.ENRICHMENT_DIRECTORY, exist_ok=True)

In [39]:
# =============================================================================
#  ----------------------------------- INIT -----------------------------------
# =============================================================================

# Every combination of the entries below is visited by the aggregation loop.
network_names = [
    'systematic_PPI_BioGRID',
    'GI_Constanzo2016',
    'systematic_CoEx_COEXPRESdb',
]

features = ['GCV-DG-3']

# Currently disabled metrics: 'mahalanobis', 'GDV_similarity'
metrics = [
    'cityblock',
    'euclidean',
    'chebyshev',
    'canberra',
    'cosine',
    'correlation',
    'braycurtis',
    'sqeuclidean',
    'hellinger',
    'js_divergence',
    'seuclidean',
]

methods = ['kmedoid']

# Annotation aspects; each one selects a GO_<aspect>_systematic_SGD.csv file.
aspects = ['BP', 'MF', 'CC']

In [44]:
# One table per annotation aspect: rows are metrics, columns are networks.
# Pre-allocating the full float grid avoids growing empty frames cell by cell
# and makes re-running this cell start from a clean state.
GDV_dfs = {aspect: pd.DataFrame(index=metrics, columns=network_names, dtype=float)
           for aspect in aspects}

# The original check was `len(listdir)/3 == 10`, i.e. exactly 30 entries per
# enrichment directory (presumably 3 files for each of 10 runs -- TODO confirm).
EXPECTED_N_FILES = 3 * 10

loop_product = product(network_names, features, metrics, methods, aspects)
for network_name, feature, metric, method, aspect in loop_product:
    in_parms = InputParameters(network_name, feature, metric, method, aspect)
    # NOTE: building Paths creates the enrichment directory if it is missing.
    ENRICHMENT_DIRECTORY = Paths(in_parms).ENRICHMENT_DIRECTORY

    # List the directory once so the sanity check and the loading below
    # operate on the same snapshot, and validate BEFORE storing anything.
    files = os.listdir(ENRICHMENT_DIRECTORY)
    assert len(files) == EXPECTED_N_FILES, (ENRICHMENT_DIRECTORY, len(files))

    # Mean of each '*genes.csv' file, then the mean over those files.
    AUCs = [np.mean(np.loadtxt(f"{ENRICHMENT_DIRECTORY}/{file}"))
            for file in files if file.endswith('genes.csv')]
    GDV_dfs[aspect].loc[metric, network_name] = np.mean(AUCs)

In [45]:
# Best-scoring metrics on the PPI network, one (metric, enrichment) column
# pair per annotation aspect.
network = 'systematic_PPI_BioGRID'
# Top 5: the previous nlargest(15, ...) returned more rows than the five index
# labels assigned below, which raises a length-mismatch ValueError.
df = pd.concat([GDV_dfs[aspect].nlargest(5, network)[[network]].reset_index() for aspect in aspects], axis=1)
df.columns = pd.MultiIndex.from_product([aspects, ('metric','enrichment')])
# Rank labels 1..n derived from the actual row count, so they can never
# disagree with the number of rows selected above.
df.index = range(1, len(df) + 1)
df

Unnamed: 0_level_0,BP,BP,MF,MF,CC,CC
Unnamed: 0_level_1,metric,enrichment,metric,enrichment,metric,enrichment
1,canberra,0.235491,canberra,0.129287,canberra,0.206356
2,js_divergence,0.234719,hellinger,0.121927,hellinger,0.199499
3,hellinger,0.232222,js_divergence,0.118225,js_divergence,0.197564
4,seuclidean,0.155,seuclidean,0.089756,seuclidean,0.173336
5,cityblock,0.15273,cityblock,0.070266,cityblock,0.139984


In [46]:
# Five best-scoring metrics on the co-expression network, laid out as one
# (metric, enrichment) column pair per annotation aspect and ranked 1..5.
network = 'systematic_CoEx_COEXPRESdb'
df = (
    pd.concat(
        [GDV_dfs[go_aspect].nlargest(5, network)[[network]].reset_index()
         for go_aspect in aspects],
        axis=1,
    )
    .set_axis(pd.MultiIndex.from_product([aspects, ('metric','enrichment')]), axis=1)
    .set_axis(range(1,6), axis=0)
)

Unnamed: 0_level_0,BP,BP,MF,MF,CC,CC
Unnamed: 0_level_1,metric,enrichment,metric,enrichment,metric,enrichment
1,js_divergence,0.145262,canberra,0.077627,js_divergence,0.146301
2,canberra,0.142002,js_divergence,0.076456,canberra,0.143133
3,hellinger,0.137996,hellinger,0.074144,hellinger,0.140915
4,sqeuclidean,0.129546,correlation,0.071774,sqeuclidean,0.137294
5,cosine,0.128143,braycurtis,0.07013,cityblock,0.135548


In [47]:
# Full ranking of the metrics on the GI network, one (metric, enrichment)
# column pair per annotation aspect.
network = 'GI_Constanzo2016'
# Rank every row of each aspect table instead of a hard-coded 11, so the cell
# keeps working when the metric list changes.
df = pd.concat([GDV_dfs[aspect].nlargest(len(GDV_dfs[aspect]), network)[[network]].reset_index()
                for aspect in aspects], axis=1)
df.columns = pd.MultiIndex.from_product([aspects, ('metric','enrichment')])
# Rank labels 1..n follow the actual row count (previously fixed to 1..11).
df.index = range(1, len(df) + 1)
df

Unnamed: 0_level_0,BP,BP,MF,MF,CC,CC
Unnamed: 0_level_1,metric,enrichment,metric,enrichment,metric,enrichment
1,js_divergence,0.044375,correlation,0.278515,sqeuclidean,0.040648
2,cosine,0.043731,seuclidean,0.269973,js_divergence,0.039524
3,hellinger,0.04254,braycurtis,0.269075,cosine,0.038001
4,sqeuclidean,0.042442,euclidean,0.262469,canberra,0.03775
5,canberra,0.040387,chebyshev,0.259513,hellinger,0.034958
6,chebyshev,0.039819,cityblock,0.25672,correlation,0.034718
7,euclidean,0.038439,cosine,0.253029,euclidean,0.034371
8,cityblock,0.038311,sqeuclidean,0.23662,braycurtis,0.032066
9,braycurtis,0.038194,hellinger,0.217942,cityblock,0.031641
10,correlation,0.032096,canberra,0.203655,chebyshev,0.030508


### Combined tables

In [118]:
import re
import pdfkit as pdf

In [119]:
def _ranked_metric_table(network, label, n=15):
    """Build the per-network ranking table used in the combined view.

    For every annotation aspect the `n` largest entries of column `network`
    in `GDV_dfs[aspect]` are taken and placed side by side as
    (metric, enrichment) column pairs.

    Parameters
    ----------
    network : column name in the `GDV_dfs` tables
    label   : short network label used as the top column level
    n       : maximum number of rows to keep per aspect

    Returns
    -------
    DataFrame with 3-level columns (network, annotation, ' ') and a
    1-based rank index.
    """
    table = pd.concat([GDV_dfs[aspect].nlargest(n, network)[[network]].reset_index()
                       for aspect in aspects], axis=1)
    table.columns = pd.MultiIndex.from_product([(label,), aspects, ('metric','enrichment')])
    table.columns.names = ['network', 'annotation', ' ']
    # The network label already lives in the column index, so the rows only
    # carry the rank. A shared 1..n index also keeps the three tables
    # row-aligned when they are concatenated along axis=1.
    # (The previous lines were a syntax error -- missing comma in
    # from_product -- and assigned to `df`, not to the table being built.)
    table.index = range(1, len(table) + 1)
    return table


df_PPI  = _ranked_metric_table('systematic_PPI_BioGRID',     'PPI')
df_CoEx = _ranked_metric_table('systematic_CoEx_COEXPRESdb', 'CoEx')
df_GI   = _ranked_metric_table('GI_Constanzo2016',           'GI')

In [163]:
# Place the three per-network tables next to each other (aligned on their row index).
df = pd.concat([df_PPI, df_CoEx, df_GI], axis='columns')

In [187]:
# Render the combined table as HTML and highlight every occurrence of one
# metric together with the enrichment value in the cell right after it.
metric = 'js_divergence'

# Each 'metric' column is immediately followed by its 'enrichment' column, so
# a metric cell and the next <td> form one pair. Matching the pair with a
# regex replaces the former fixed character offsets (+16/+24), which only
# worked for 8-character values and one exact indentation, and avoids
# highlighting unrelated cells that happen to contain the same number.
_metric_pair = re.compile(rf"<td>{re.escape(metric)}</td>(\s*)<td>([^<]*)</td>")


def _highlight_pair(match):
    """Return the matched metric/value cell pair rewritten as highlighted <th> cells."""
    gap, value = match.group(1), match.group(2)
    return (f"<th bgcolor='FFAAAA'>{metric}</th>"
            f"{gap}"
            f"<th bgcolor='FFAAAA'>{value}</th>")


html = _metric_pair.sub(_highlight_pair, df.to_html())

In [188]:
# Write the highlighted table to an HTML file, then convert that file to PDF.
# NOTE(review): both locations are absolute paths on the author's machine.
file_in  = '/Users/markusyoussef/Desktop/jupyter_output/test.html'
file_out = '/Users/markusyoussef/Desktop/jupyter_output/test.pdf'

with open(file_in, mode='w') as f:
    f.write(html)

# Last expression of the cell: its return value is shown as the cell output.
pdf.from_file(file_in, file_out)

Loading pages (1/6)
Printing pages (6/6)


True

In [189]:
# Leftover interactive help lookup (IPython `?` syntax, not plain Python);
# development aid only -- NOTE(review): consider removing from the final notebook.
? pdf.from_file

---

# Testing

This is the testing place... *test* - *test*