In [1]:
from itertools import product
from collections import Counter, defaultdict

import os
import sys
import numpy as np
import pandas as pd

In [2]:
# =============================================================================
#  --------------------- INPUT PARAMETER AND PATH CLASSES ---------------------
# =============================================================================

class InputParameters:
    """Settings that identify a single run.

    Class-level constants, shared by all instances:
        RUN        -- 0 (an inline note hints it once came from ``sys.argv[1]``)
        RANGE      -- 10
        CORRECTION -- 'BY'

    Instance attributes mirror the constructor arguments one-to-one:
    ``network_name``, ``feature``, ``metric``, ``method`` and ``aspect``.
    """

    RUN = 0  # formerly: sys.argv[1]
    RANGE = 10
    CORRECTION = 'BY'

    def __init__(self, network_name, feature, metric, method, aspect):
        # Store every argument unchanged under the attribute of the same name.
        (self.network_name,
         self.feature,
         self.metric,
         self.method,
         self.aspect) = (network_name, feature, metric, method, aspect)
            
class Paths():
    """File-system locations used by one run.

    The class-level constants describe the default directory layout below
    ``DATA_DIRECTORY``. Each instance adds the file and directory names that
    depend on the run settings.

    Parameters
    ----------
    in_parms : object
        Must expose ``network_name``, ``feature``, ``metric``, ``method``,
        ``aspect`` and ``CORRECTION`` (for example an ``InputParameters``
        instance).
    data_directory : str or path-like, optional
        Root directory to use instead of the hard-coded class default, so the
        notebook can run on another machine. When given, the derived
        ``*_DIRECTORY`` attributes are recomputed on the instance only; the
        class-level defaults are left untouched.

    Attributes set on the instance
    ------------------------------
    NETWORK_FILE, ANNOTATION_FILE, CLUSTER_DIRECTORY, PVALUE_DIRECTORY,
    ENRICHMENT_DIRECTORY
    """
    DATA_DIRECTORY = "/Users/markusyoussef/Desktop/git/supplements/data"
    RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw_data"
    YEAST_DIRECTORY = f"{DATA_DIRECTORY}/processed_data/yeast"
    NETWORK_DIRECTORY = f"{YEAST_DIRECTORY}/networks"
    ANNOTATION_DIRECTORY = f"{YEAST_DIRECTORY}/annotations"

    def __init__(self, in_parms, data_directory=None):
        if data_directory is not None:
            # Shadow the class-level layout on this instance with the same
            # sub-directory structure under the caller-supplied root.
            self.DATA_DIRECTORY = str(data_directory)
            self.RAW_DATA_DIRECTORY = f"{self.DATA_DIRECTORY}/raw_data"
            self.YEAST_DIRECTORY = f"{self.DATA_DIRECTORY}/processed_data/yeast"
            self.NETWORK_DIRECTORY = f"{self.YEAST_DIRECTORY}/networks"
            self.ANNOTATION_DIRECTORY = f"{self.YEAST_DIRECTORY}/annotations"

        self.NETWORK_FILE    = f"{self.NETWORK_DIRECTORY}/{in_parms.network_name}.txt"
        self.ANNOTATION_FILE = f"{self.ANNOTATION_DIRECTORY}/GO_{in_parms.aspect}_systematic_SGD.csv"

        # Shared "<network>/<feature>/<metric>/<method>" tail of all result directories.
        network_to_method = f"{in_parms.network_name}/{in_parms.feature}/{in_parms.metric}/{in_parms.method}"

        self.CLUSTER_DIRECTORY = f"{self.YEAST_DIRECTORY}/clusterings/"  \
                                 f"{network_to_method}"
        self.PVALUE_DIRECTORY  = f"{self.YEAST_DIRECTORY}/pvalues/"      \
                                 f"{network_to_method}/{in_parms.aspect}"
        self.ENRICHMENT_DIRECTORY = f"{self.YEAST_DIRECTORY}/enrichments/"   \
                                    f"{network_to_method}/{in_parms.aspect}/{in_parms.CORRECTION}"

In [3]:
# =============================================================================
#  ----------------------------------- INIT -----------------------------------
# =============================================================================

# Value sets whose Cartesian product defines the parameter combinations to scan.
# They are plain sets, so iteration order is not guaranteed.
network_names = {
    'GI_Constanzo2016',
    'systematic_CoEx_COEXPRESdb',
    'systematic_PPI_BioGRID',
}
features = {'GDV'}
metrics = {
    'GDV_similarity',
    'braycurtis',
    'canberra',
    'chebyshev',
    'cityblock',
    'correlation',
    'cosine',
    'euclidean',
    'hellinger',
    'mahalanobis',
    'seuclidean',
    'sqeuclidean',
}
methods = {'kmedoid'}
aspects = {'CC'}

In [16]:
# Mean enrichment value per (metric, network).
# For every parameter combination, each '*GOterms.csv' file in that run's
# enrichment directory is averaged, and the per-file means are averaged again.
#
# The values are collected in a nested dict and the frame is built once at the
# end, instead of enlarging an empty DataFrame one cell at a time through `.loc`.
# NOTE(review): the loop variables (`method`, `aspect`, ...) and `in_parms`
# stay in the notebook namespace after this cell; a later cell builds its path
# from `method`, so they are deliberately not renamed here.
mean_enrichment = defaultdict(dict)  # {network_name: {metric: mean value}}

loop_product = product(network_names, features, metrics, methods, aspects)
for network_name, feature, metric, method, aspect in loop_product:
    in_parms = InputParameters(network_name, feature, metric, method, aspect)
    ENRICHMENT_DIRECTORY = Paths(in_parms).ENRICHMENT_DIRECTORY
    AUCs = [np.mean(np.loadtxt(f"{ENRICHMENT_DIRECTORY}/{file}"))
            for file in os.listdir(ENRICHMENT_DIRECTORY)
            if file.endswith('GOterms.csv')]
    # np.mean([]) only emits a RuntimeWarning and returns NaN; state the
    # "no matching files" case explicitly instead.
    mean_enrichment[network_name][metric] = np.mean(AUCs) if AUCs else np.nan

# Outer keys become columns (networks), inner keys become rows (metrics).
GDV_df = pd.DataFrame(dict(mean_enrichment), dtype=float)

In [17]:
# Ten largest mean enrichment values for this network, largest first.
network = 'systematic_PPI_BioGRID'
GDV_df[[network]].nlargest(n=10, columns=network)

Unnamed: 0,systematic_PPI_BioGRID
mahalanobis,0.216226
GDV_similarity,0.148632
canberra,0.136514
seuclidean,0.096847
hellinger,0.085392
correlation,0.083594
cosine,0.077629
braycurtis,0.058601
cityblock,0.051974
euclidean,0.049323


In [18]:
# Ten largest mean enrichment values for this network, largest first.
network = 'systematic_CoEx_COEXPRESdb'
GDV_df[[network]].nlargest(n=10, columns=network)

Unnamed: 0,systematic_CoEx_COEXPRESdb
mahalanobis,0.207551
seuclidean,0.161165
canberra,0.107001
hellinger,0.103301
braycurtis,0.101973
sqeuclidean,0.098463
cityblock,0.098084
GDV_similarity,0.096661
euclidean,0.095143
cosine,0.08708


In [19]:
# Ten largest mean enrichment values for this network, largest first.
network = 'GI_Constanzo2016'
GDV_df[[network]].nlargest(n=10, columns=network)

Unnamed: 0,GI_Constanzo2016
mahalanobis,0.196268
seuclidean,0.176105
cityblock,0.148541
braycurtis,0.148541
sqeuclidean,0.143072
euclidean,0.142214
canberra,0.13846
chebyshev,0.133097
GDV_similarity,0.122694
cosine,0.064136


In [34]:
# Enrichment directory of one specific parameter combination.
in_parms = InputParameters(
    network_name='systematic_CoEx_COEXPRESdb',
    feature='GDV',
    metric='mahalanobis',
    method='kmedoid',
    aspect='BP',
)
path = Paths(in_parms).ENRICHMENT_DIRECTORY

In [40]:
# Inspect one result file; it has no header row, so its single column is
# labelled 'DUMMY'.
pd.read_csv(
    f"{path}/0_43-63_GOterms.csv",
    header=None,
    names=['DUMMY'],
)

Unnamed: 0,DUMMY
0,0.135865
1,0.128692
2,0.142194
3,0.151899
4,0.127426
5,0.106751
6,0.127426
7,0.137553
8,0.118987
9,0.131646


In [4]:
# Tally which (feature, metric) combinations give the largest mean enrichment.
#
# For every (network, aspect) pair a feature-by-metric table of mean values is
# built and stored in `enrichments[network][aspect]`. Its (up to) five largest
# entries are then counted in `counter` (keyed by (feature, metric)) and in
# `feature_counter` (keyed by feature).
#
# NOTE(review): `networks`, `method` and `correction` are not assigned in any
# visible cell (the init cell defines `network_names` and `methods`, `method`
# is only left over from an earlier loop, and 'BY' lives in
# `InputParameters.CORRECTION`). Define them explicitly before running this on
# a fresh kernel.
# NOTE(review): the directory root below is hard-coded and differs from
# `Paths.DATA_DIRECTORY`; the recorded run of this cell ended in
# FileNotFoundError.
# NOTE(review): files are matched on 'GO-terms.csv' here but on 'GOterms.csv'
# in the other cells -- confirm which spelling the result files use.
feature_counter = Counter()
counter = Counter()

enrichments = defaultdict(dict)

# Loop-invariant selections, hoisted out of the loop. They rebind the
# notebook-level `features` / `metrics`.
# Features previously listed (commented out): 'GCV-DG-sym', 'GCV-G', 'GCV-A',
#   'GCV-3', 'GCV-G-sym', 'GCV-O', 'GCV-O+', 'GCV-DAG', 'GCV-DA', 'GCV-DG'.
# Metrics previously listed (commented out): 'cityblock', 'hellinger',
#   'seuclidean', 'sqeuclidean', 'GDV_similarity', 'euclidean', 'chebyshev',
#   'canberra', 'cosine', 'correlation', 'braycurtis'.
features = {'GDV'}
metrics = {'mahalanobis'}

for network, aspect in product(networks, aspects):
    df = pd.DataFrame()

    for feature, metric in product(features, metrics):
        # These three metrics are only evaluated for the 'GDV' feature.
        if feature != 'GDV' and metric in {'GDV_similarity', 'mahalanobis', 'seuclidean'}:
            continue
        ENRICHMENT_DIRECTORY = "/media/clusterduck123/joe/new_MareNostrum/enrichments/" \
                        f"{network}/{feature}/{metric}/{method}/{aspect}/{correction}"

        df.loc[feature, metric] = np.mean(
            [np.mean(np.loadtxt(f"{ENRICHMENT_DIRECTORY}/{file}"))
                for file in os.listdir(ENRICHMENT_DIRECTORY) if file.endswith('GO-terms.csv')])

    # Keep an untouched copy: the ranking below zeroes entries of `df` in place.
    enrichments[network][aspect] = df.copy()

    # A stray `break` used to sit here. It made the ranking below unreachable
    # and ended the outer loop after the first (network, aspect) pair.
    for i in range(5):
        arr = np.nan_to_num(df.values)  # NaN counts as 0
        # Stop once nothing positive is left. Otherwise argmax keeps returning
        # position (0, 0) and the same pair is counted repeatedly (and an
        # empty table would make argmax raise).
        if arr.size == 0 or arr.max() <= 0:
            break
        idx, col = np.unravel_index(arr.argmax(), arr.shape)
        counter.update([(df.index[idx], df.columns[col])])
        df.iloc[idx, col] = 0  # remove this entry from the next pass

        feature_counter.update([df.index[idx]])

FileNotFoundError: [Errno 2] No such file or directory: '/media/clusterduck123/joe/new_MareNostrum/enrichments/systematic_PPI_BioGRID/GDV/mahalanobis/kmedoid/BP/BY'

In [15]:
# Show the feature-by-metric table currently bound to `df`.
# NOTE(review): `df` is rebound on each pass of the enrichment loop cell, so
# what is displayed depends on which cells ran last.
df

Unnamed: 0,hellinger,cityblock
GDV,0.105993,0.198774
GCV-DG-sym,2.103019,2.103019


In [13]:
# Show the value currently bound to `network`.
# NOTE(review): several cells assign this name (string literals and the
# enrichment loop variable), so the value depends on execution order.
network

'GI_Constanzo2016'

In [10]:
# Show how often each feature was tallied by the enrichment loop cell.
feature_counter

Counter({'GCV-DG-sym': 8, 'GDV': 12})

In [22]:
# The five most frequently tallied features, most frequent first.
feature_counter.most_common(5)

[('GCV-O', 3), ('GCV-G', 3), ('GCV-O+', 3), ('GCV-DG-sym', 2), ('GCV-DG', 2)]

In [13]:
# Largest value in each row of `df` (one result per row label).
df.max(axis=1)

GDV           0.129322
GCV-DG        0.079581
GCV-G         0.082358
GCV-O         0.087043
GCV-DA        0.066541
GCV-O+        0.082366
GCV-A         0.059231
GCV-DG-sym    0.079245
GCV-DAG       0.077285
GCV-G-sym     0.085842
GCV-3         0.046147
dtype: float64

In [26]:
# Lazy iterator over the squares of 0..9.
m = map(lambda value: pow(value, 2), range(10))

In [27]:
# NOTE(review): `asdf` is not assigned in any visible cell, so this raises
# NameError on a fresh kernel. It looks like leftover scratch input -- confirm
# and remove the cell.
asdf 

map

In [4]:
# The five most frequently tallied (feature, metric) pairs, most frequent first.
counter.most_common(5)

[(('GDV', 'mahalanobis'), 3),
 (('GDV', 'seuclidean'), 3),
 (('GCV-G-sym', 'hellinger'), 3),
 (('GCV-G-sym', 'canberra'), 2),
 (('GCV-G', 'hellinger'), 2)]

In [159]:
# Rows 'GCV-O' and 'GCV-O+' of the table stored by the enrichment loop cell for
# network 'systematic_CoEx_COEXPRESdb' and aspect 'BP'.
enrichments['systematic_CoEx_COEXPRESdb']['BP'].loc[['GCV-O', 'GCV-O+']]

Unnamed: 0,braycurtis,sqeuclidean,euclidean,correlation,cosine,cityblock,hellinger,chebyshev,canberra,mahalanobis,seuclidean,GDV_similarity
GCV-O,0.250993,0.260695,0.258136,0.221505,0.254354,0.268831,0.265584,0.224714,0.25974,,,
GCV-O+,0.249503,0.24725,0.263331,0.215279,0.247212,0.240985,0.252979,0.231513,0.247785,,,


In [112]:
# Show the table currently bound to `df`.
# NOTE(review): the top-5 ranking code overwrites ranked entries of `df` with
# 0, so zeros shown here may be ranking markers rather than measured values.
df

Unnamed: 0,braycurtis,sqeuclidean,euclidean,correlation,cosine,cityblock,hellinger,chebyshev,canberra,mahalanobis,seuclidean,GDV_similarity
GCV-A,0.260915,0.257986,0.250961,0.250841,0.253922,0.261725,0.304488,0.25254,0.277967,,,
GDV,0.098104,0.057275,0.077568,0.198949,0.198503,0.09765,0.237046,0.059331,0.287556,0.293964,0.208229,0.287072
GCV-3,0.231821,0.211896,0.256946,0.061799,0.193551,0.203442,0.27445,0.313038,0.302917,,,
GCV-G,0.0,0.0,0.0,0.278181,0.0,0.32616,0.0,0.252755,0.329502,,,


In [73]:
# Print the five largest entries of `df` (NaN treated as 0), largest first:
# "<row label> <column label>", then the value, then a blank line.
# Each reported entry is overwritten with 0 in `df` so the next pass finds the
# following maximum. Re-running the cell therefore reports the next five.
for i in range(5):
    arr = np.nan_to_num(df.values)
    idx, col = np.unravel_index(np.argmax(arr), arr.shape)
    row_label, column_label = df.index[idx], df.columns[col]
    print(row_label, column_label)
    print(df.iloc[idx, col], end="\n\n")
    df.iloc[idx, col] = 0

GCV-G braycurtis
0.6525728266687054

GCV-G euclidean
0.6286795626576955

GCV-G sqeuclidean
0.4768522058261335

GCV-G cosine
0.4669890664423886

GCV-G cityblock
0.4302316690878508



In [74]:
# Show `df` after the ranking loop; entries that loop reported are now 0.
df

Unnamed: 0,braycurtis,sqeuclidean,euclidean,correlation,cosine,cityblock,hellinger,chebyshev,canberra,mahalanobis,seuclidean,GDV_similarity
GCV-A,0.278901,0.285668,0.285209,0.283584,0.293581,0.292702,0.364783,0.281635,0.307114,,,
GDV,0.105184,0.064084,0.08131,0.189047,0.190729,0.104419,0.234857,0.062788,0.321504,0.406709,0.22418,0.329563
GCV-3,0.337182,0.259462,0.414749,0.065544,0.239812,0.257053,0.323075,0.332403,0.379119,,,
GCV-G,0.0,0.0,0.0,0.288248,0.0,0.0,0.295894,0.300157,0.390339,,,


In [57]:
# (row, column) position of the largest element of `arr`.
idx, col = np.unravel_index(np.argmax(arr), arr.shape)

In [58]:
# Row and column labels of `df` at the positions held in `idx` and `col`.
df.index[idx], df.columns[col]

('GDV', 'mahalanobis')

In [54]:
# Show the table currently bound to `df` (depends on which cells ran last).
df

Unnamed: 0,braycurtis,sqeuclidean,euclidean,correlation,cosine,cityblock,hellinger,chebyshev,canberra,mahalanobis,seuclidean,GDV_similarity
GCV-A,0.151184,0.172307,0.164133,0.192666,0.171218,0.157086,0.174408,0.168583,0.16484,,,
GDV,0.180982,0.168919,0.163388,0.146784,0.148323,0.17869,0.179179,0.134228,0.198293,0.330355,0.251505,0.187078
GCV-3,0.122364,0.127063,0.125134,0.067609,0.123988,0.118793,0.124637,0.126623,0.120264,,,
GCV-G,0.243163,0.251986,0.254889,0.2572,0.26207,0.244328,0.269347,0.261077,0.255405,,,


In [18]:
# Select rows of `df` by the label(s) held in `idx`.
# NOTE(review): `.loc` is label-based, but the visible cells bind `idx` to an
# integer position from np.unravel_index. If that still holds here,
# `.iloc[idx]` is probably what is meant. The recorded output repeats row
# labels, so `idx` presumably held a list of labels when this ran -- confirm.
df.loc[idx]

Unnamed: 0,braycurtis,sqeuclidean,euclidean,correlation,cosine,cityblock,hellinger,chebyshev,canberra,mahalanobis,seuclidean,GDV_similarity
GCV-G,0.243163,0.251986,0.254889,0.2572,0.26207,0.244328,0.269347,0.261077,0.255405,,,
GCV-G,0.243163,0.251986,0.254889,0.2572,0.26207,0.244328,0.269347,0.261077,0.255405,,,
GCV-G,0.243163,0.251986,0.254889,0.2572,0.26207,0.244328,0.269347,0.261077,0.255405,,,
GCV-G,0.243163,0.251986,0.254889,0.2572,0.26207,0.244328,0.269347,0.261077,0.255405,,,
GCV-G,0.243163,0.251986,0.254889,0.2572,0.26207,0.244328,0.269347,0.261077,0.255405,,,
GCV-G,0.243163,0.251986,0.254889,0.2572,0.26207,0.244328,0.269347,0.261077,0.255405,,,
GCV-G,0.243163,0.251986,0.254889,0.2572,0.26207,0.244328,0.269347,0.261077,0.255405,,,
GCV-G,0.243163,0.251986,0.254889,0.2572,0.26207,0.244328,0.269347,0.261077,0.255405,,,
GCV-G,0.243163,0.251986,0.254889,0.2572,0.26207,0.244328,0.269347,0.261077,0.255405,,,
GDV,0.180982,0.168919,0.163388,0.146784,0.148323,0.17869,0.179179,0.134228,0.198293,0.330355,0.251505,0.187078


In [9]:
# Show the table currently bound to `df` (depends on which cells ran last).
df

Unnamed: 0,braycurtis,sqeuclidean,euclidean,correlation,cosine,cityblock,hellinger,chebyshev,canberra,mahalanobis,seuclidean,GDV_similarity
GCV-A,0.151184,0.172307,0.164133,0.192666,0.171218,0.157086,0.174408,0.168583,0.16484,,,
GDV,0.180982,0.168919,0.163388,0.146784,0.148323,0.17869,0.179179,0.134228,0.198293,0.330355,0.251505,0.187078
GCV-3,0.122364,0.127063,0.125134,0.067609,0.123988,0.118793,0.124637,0.126623,0.120264,,,
GCV-G,0.243163,0.251986,0.254889,0.2572,0.26207,0.244328,0.269347,0.261077,0.255405,,,


In [24]:
# Metrics as rows; keep the six with the largest value for the 'GDV' feature.
df.transpose().nlargest(n=6, columns='GDV')

Unnamed: 0,GDV
mahalanobis,0.286341
seuclidean,0.23945
canberra,0.192285
braycurtis,0.188662
cityblock,0.184667
GDV_similarity,0.182297
