In [1]:
from itertools import islice, combinations, product
from collections import defaultdict
from goatools import obo_parser
from functools import partial

import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
sns.set()
pd.set_option("display.max_columns", 50)

DATA_DIRECTORY = "/media/clusterduck123/joe/data"
RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw-data"
YEAST_DIRECTORY = f"{DATA_DIRECTORY}/processed-data/yeast"
NETWORK_DIRECTORY = f"{YEAST_DIRECTORY}/networks"
MATRIX_DIRECTORY  = f"{YEAST_DIRECTORY}/distance-matrices"
ANNOTATION_DIRECTORY  = f"{YEAST_DIRECTORY}/annotations"

# Preprocessing

In [3]:
# GO aspect (part of the annotation / p-value file paths) and the
# multiple-testing correction handed to get_enrichments.
aspect, correction = 'BP', 'BH'

# Significance level handed to get_enrichments.
alpha = 0.05

# A GO term is kept only if its number of annotation rows lies in [lb_GO, ub_GO]
# and its level in the GO DAG lies in [min_lvl, max_lvl].
lb_GO, ub_GO = 5, 500
min_lvl, max_lvl = 0, 100

In [4]:
# Interaction network; its nodes form the gene population.
PPI = nx.read_edgelist(f"{NETWORK_DIRECTORY}/PPI_BioGRID.txt")

annotation_df = pd.read_csv(f"{ANNOTATION_DIRECTORY}/GO_{aspect}_BioGRID-SGD.csv")

go_dag = obo_parser.GODag(f"{RAW_DATA_DIRECTORY}/go-basic.obo")

gene_population = set(PPI.nodes())

# Count the annotation rows of every GO term in one pass instead of filtering
# the whole frame once per term.
GO_counts = annotation_df.GO_ID.value_counts()

# Keep GO terms whose annotation count and DAG level fall inside the configured bounds.
GO_population = {go_id for go_id, nb_annotations in GO_counts.items()
                       if (lb_GO <= nb_annotations <= ub_GO and
                           min_lvl <= go_dag[go_id].level <= max_lvl)}

# Restrict the annotations to the retained GO terms.
annotation_df = annotation_df[annotation_df.GO_ID.isin(GO_population)]

/media/clusterduck123/joe/data/raw-data/go-basic.obo: fmt(1.2) rel(2019-10-07) 47,285 GO Terms


In [5]:
# Conversion dictionaries
# GO term -> set of genes annotated with it.
GO2genes = pd.Series({go_id: set(genes.Systematic_ID) for go_id, genes in annotation_df.groupby('GO_ID')},
                     name='nb_genes')

# Gene -> set of its GO terms. This is a plain dict (the former
# `defaultdict(set)` assignment was overwritten immediately and never used),
# so look up possibly unannotated genes with `gene2GO.get(gene, set())`.
gene2GO = {gene: set(go_ids.GO_ID) for gene, go_ids in annotation_df.groupby('Systematic_ID')}

# Number of annotated genes per GO term.
global_GO_counter = GO2genes.apply(len)

In [6]:
def get_number_of_pre_runs(PVALUE_DIRECTORY, n_clusters = 99):
    """Return how many runs have a p-value file for `n_clusters` clusters.

    File names are expected to look like ``<run>_<nb_clusters>_<suffix>``.
    The result is the highest run number found for `n_clusters` plus one,
    or 0 when no such file exists.

    Entries that do not follow the naming scheme (hidden files, checkpoint
    folders, names with a non-numeric run prefix, ...) are ignored instead of
    aborting the scan.

    Parameters
    ----------
    PVALUE_DIRECTORY : str
        Directory to scan; it must exist (`os.listdir` raises otherwise).
    n_clusters : int, optional
        Number of clusters whose files are counted.
    """
    wanted = str(n_clusters)
    pre_runs = []
    for name in os.listdir(PVALUE_DIRECTORY):
        parts = name.split('_')
        if len(parts) != 3:
            # Not a "<run>_<nb_clusters>_<suffix>" file.
            continue
        run, ncluster, _ = parts
        if ncluster != wanted:
            continue
        try:
            pre_runs.append(int(run))
        except ValueError:
            # Run prefix is not a number: unrelated file.
            continue
    return max(pre_runs) + 1 if pre_runs else 0

# Load

In [7]:
def get_enrichments(alpha, p_values, cluster_list, correction):
    """Flag the (GO term, cluster) entries that are significant after FDR correction.

    The tests entering the correction are, for every cluster, the GO terms
    annotated to at least one gene of that cluster (see `cluster2GO`).
    A step-up procedure is applied to the pooled tests: with the p-values
    sorted increasingly, the threshold is the largest P_(k) satisfying
    ``P_(k) <= k / (m * c) * alpha``, where ``m`` is the number of tests and
    ``c`` is 1 for Benjamini-Hochberg or approximately the m-th harmonic
    number for Benjamini-Yekutieli.

    Parameters
    ----------
    alpha : float
        Significance level.
    p_values : pandas.DataFrame
        P-values with GO terms as row labels and one column per cluster,
        labelled with the cluster's position in `cluster_list` as a string
        ('0', '1', ...).
    cluster_list : list of set
        Genes of each cluster.
    correction : {'BH', 'BY'}
        Correction procedure.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with the labels of `p_values`; True where the p-value
        is at most the threshold. The comparison is applied to every entry
        of `p_values`, not only to the pooled tests. All False when no test
        is significant or when there are no tests.

    Raises
    ------
    ValueError
        If `correction` is neither 'BH' nor 'BY'.
    """
    if correction not in ('BH', 'BY'):
        raise ValueError(f"Correction not known: {correction!r}")

    # pandas does not accept a set as an indexer, hence the conversion to a list.
    relevant_p_values = [p_values[str(cluster_idx)][sorted(cluster2GO(cluster))]
                             for cluster_idx, cluster in enumerate(cluster_list)]

    pooled = [p for p_cluster in relevant_p_values for p in p_cluster]
    sorted_p_values = np.sort(np.asarray(pooled, dtype=float))
    m = len(sorted_p_values)

    no_enrichment = pd.DataFrame(False, index=p_values.index, columns=p_values.columns)
    if m == 0:
        return no_enrichment

    if correction == 'BY':
        # Asymptotic approximation of the harmonic number H_m.
        c = np.log(m) + np.euler_gamma + 1/(2*m)
    else:
        c = 1

    # Step-up rule: take the LARGEST rank whose p-value is below its critical
    # value, even if smaller ranks fail.
    critical_values = np.arange(1, m + 1) / (m * c) * alpha
    passing_ranks = np.flatnonzero(sorted_p_values <= critical_values)
    if passing_ranks.size == 0:
        return no_enrichment

    threshold = sorted_p_values[passing_ranks[-1]]
    # `<=` so that the p-value defining the threshold is itself significant.
    return p_values <= threshold


def cluster2GO(cluster):
    """Return the set of GO terms annotated to at least one gene of `cluster`.

    Genes missing from `gene2GO` contribute nothing. An empty cluster yields
    an empty set (calling the unbound `set.union` without arguments raised
    TypeError in that case).
    """
    return set().union(*(gene2GO.get(gene, set()) for gene in cluster))

def is_annotated_in(gene, GO_set):
    """Tell whether `gene` is annotated with at least one GO term of `GO_set`.

    Genes without an entry in `gene2GO` are treated as having no annotation.
    """
    terms_of_gene = gene2GO[gene] if gene in gene2GO else set()
    shares_no_term = terms_of_gene.isdisjoint(GO_set)
    return not shares_no_term

In [8]:
# Coverage results, keyed by feature name: three independent containers.
cluster_coverages, GO_coverages, gene_coverages = {}, {}, {}

In [9]:
# Bounds of the cluster-count sweeps, used as range(MIN_CLUSTERS, MAX_CLUSTERS);
# MAX_CLUSTERS is also the cluster count whose p-value files determine how
# many runs are available.
MIN_CLUSTERS, MAX_CLUSTERS = 2, 99

# Upper limit on the number of runs processed per distance.
MAX_RUNS = 30

## GDV

In [10]:
feature = 'GDV'

# Distance names: every file name in the feature's matrix directory with its
# last '_'-separated token removed, in sorted order.
all_distances = ['_'.join(filename.split('_')[:-1])
                     for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")]
all_distances.sort()

In [11]:
# One defaultdict per coverage type; each creates an empty DataFrame the first
# time a distance is accessed.
cluster_coverages['GDV'], GO_coverages['GDV'], gene_coverages['GDV'] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [12]:
method = 'kmedoid'

for distance in all_distances:

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GDV/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/GDV/{distance}/{method}/{aspect}"
    # Process only runs with precomputed p-values, at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables (rows: number of clusters, columns: runs). Indexing the
    # defaultdict creates the empty table on first access.
    cluster_table = cluster_coverages['GDV'][distance]
    GO_table      = GO_coverages['GDV'][distance]
    gene_table    = gene_coverages['GDV'][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")

        # Pre-allocate this run's column with NaN.
        for coverage_table in (cluster_table, GO_table, gene_table):
            coverage_table[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS))

        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # Each line of the clustering file holds the genes of one cluster.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]
            cluster_df = pd.Series({gene: cluster_idx
                                        for cluster_idx, cluster in enumerate(cluster_list)
                                        for gene in cluster})

            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Enriched GO terms of every cluster, in the column order of `enrichments`.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]

            # Single .loc writes: the former `table[run][nb_clusters] = value`
            # was chained assignment, which pandas does not guarantee to write
            # through to the table (and never does under Copy-on-Write).
            cluster_table.loc[nb_clusters, run] = enrichments.any().sum() / nb_clusters
            GO_table.loc[nb_clusters, run] = enrichments.any(axis=1).sum() / len(GO_population)
            gene_table.loc[nb_clusters, run] = sum(is_annotated_in(gene, enrichmet_list[cluster_idx])
                                                       for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')

    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/GDV/{distance}/{method}/{aspect}/{correction}"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)

    cluster_table.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_table.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_table.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")

    print()

GDV_similarity 0
GDV_similarity 1
GDV_similarity 2
GDV_similarity 3
GDV_similarity 4
GDV_similarity 5
GDV_similarity 6
GDV_similarity 7
GDV_similarity 8
GDV_similarity 9
98: 11.25sec
braycurtis 0
braycurtis 1
braycurtis 2
braycurtis 3
braycurtis 4
braycurtis 5
braycurtis 6
braycurtis 7
braycurtis 8
braycurtis 9
98: 11.47sec
canberra 0
canberra 1ec
canberra 2ec
canberra 3ec
canberra 4ec
canberra 5ec
canberra 6ec
canberra 7ec
canberra 8ec
canberra 9ec
98: 11.57sec
chebyshev 0
chebyshev 1c
chebyshev 2c
chebyshev 3c
chebyshev 4c
chebyshev 5c
chebyshev 6c
chebyshev 7c
chebyshev 8c
chebyshev 9c
98: 16.06sec
cityblock 0
cityblock 1c
cityblock 2c
cityblock 3c
cityblock 4c
cityblock 5c
cityblock 6c
cityblock 7c
cityblock 8c
cityblock 9c
98: 17.34sec
correlation 0
correlation 1
correlation 2
correlation 3
correlation 4
correlation 5
correlation 6
correlation 7
correlation 8
correlation 9
98: 18.78sec
cosine 0
cosine 17sec
cosine 22sec
cosine 36sec
cosine 49sec
cosine 58sec
cosine 60sec
cosine 71

## GCV-A

In [13]:
# One defaultdict per coverage type; each creates an empty DataFrame the first
# time a distance is accessed.
cluster_coverages['GCV-A'], GO_coverages['GCV-A'], gene_coverages['GCV-A'] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [14]:
method = 'kmedoid'

for distance in ['canberra']:

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-A/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/GCV-A/{distance}/{method}/{aspect}"
    # Process only runs with precomputed p-values, at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables (rows: number of clusters, columns: runs). Indexing the
    # defaultdict creates the empty table on first access.
    cluster_table = cluster_coverages['GCV-A'][distance]
    GO_table      = GO_coverages['GCV-A'][distance]
    gene_table    = gene_coverages['GCV-A'][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")

        # Pre-allocate this run's column with NaN.
        # NOTE(review): the index runs two entries past the loop below, so the
        # rows MAX_CLUSTERS and MAX_CLUSTERS+1 are never filled and stay NaN
        # in the CSVs - confirm this is intended.
        for coverage_table in (cluster_table, GO_table, gene_table):
            coverage_table[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))

        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # Each line of the clustering file holds the genes of one cluster.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]
            cluster_df = pd.Series({gene: cluster_idx
                                        for cluster_idx, cluster in enumerate(cluster_list)
                                        for gene in cluster})

            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Enriched GO terms of every cluster, in the column order of `enrichments`.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]

            # Single .loc writes: the former `table[run][nb_clusters] = value`
            # was chained assignment, which pandas does not guarantee to write
            # through to the table (and never does under Copy-on-Write).
            cluster_table.loc[nb_clusters, run] = enrichments.any().sum() / nb_clusters
            GO_table.loc[nb_clusters, run] = enrichments.any(axis=1).sum() / len(GO_population)
            gene_table.loc[nb_clusters, run] = sum(is_annotated_in(gene, enrichmet_list[cluster_idx])
                                                       for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')

    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/GCV-A/{distance}/{method}/{aspect}/{correction}"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)

    cluster_table.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_table.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_table.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")

    print()

canberra 0
canberra 1ec
canberra 2ec
canberra 3ec
canberra 4ec
canberra 5ec
canberra 6ec
canberra 7ec
canberra 8ec
canberra 9ec
canberra 10c
canberra 11c
canberra 12c
canberra 13c
canberra 14c
canberra 15c
canberra 16c
canberra 17c
canberra 18c
canberra 19c
canberra 20c
canberra 21c
canberra 22c
canberra 23c
canberra 24c
canberra 25c
canberra 26c
canberra 27c
canberra 28c
canberra 29c
98: 11.34sec


## GCV-G

In [15]:
# One defaultdict per coverage type; each creates an empty DataFrame the first
# time a distance is accessed.
cluster_coverages['GCV-G'], GO_coverages['GCV-G'], gene_coverages['GCV-G'] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [16]:
method = 'kmedoid'

for distance in ['canberra']:

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-G/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/GCV-G/{distance}/{method}/{aspect}"
    # Process only runs with precomputed p-values, at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables (rows: number of clusters, columns: runs). Indexing the
    # defaultdict creates the empty table on first access.
    cluster_table = cluster_coverages['GCV-G'][distance]
    GO_table      = GO_coverages['GCV-G'][distance]
    gene_table    = gene_coverages['GCV-G'][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")

        # Pre-allocate this run's column with NaN.
        # NOTE(review): the index runs two entries past the loop below, so the
        # rows MAX_CLUSTERS and MAX_CLUSTERS+1 are never filled and stay NaN
        # in the CSVs - confirm this is intended.
        for coverage_table in (cluster_table, GO_table, gene_table):
            coverage_table[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))

        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # Each line of the clustering file holds the genes of one cluster.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]
            cluster_df = pd.Series({gene: cluster_idx
                                        for cluster_idx, cluster in enumerate(cluster_list)
                                        for gene in cluster})

            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Enriched GO terms of every cluster, in the column order of `enrichments`.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]

            # Single .loc writes: the former `table[run][nb_clusters] = value`
            # was chained assignment, which pandas does not guarantee to write
            # through to the table (and never does under Copy-on-Write).
            cluster_table.loc[nb_clusters, run] = enrichments.any().sum() / nb_clusters
            GO_table.loc[nb_clusters, run] = enrichments.any(axis=1).sum() / len(GO_population)
            gene_table.loc[nb_clusters, run] = sum(is_annotated_in(gene, enrichmet_list[cluster_idx])
                                                       for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')

    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/GCV-G/{distance}/{method}/{aspect}/{correction}"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)

    cluster_table.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_table.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_table.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")

    print()

canberra 0
canberra 1ec
canberra 2ec
canberra 3ec
canberra 4ec
canberra 5ec
canberra 6ec
canberra 7ec
canberra 8ec
canberra 9ec
canberra 10c
canberra 11c
canberra 12c
canberra 13c
canberra 14c
canberra 15c
canberra 16c
canberra 17c
canberra 18c
canberra 19c
canberra 20c
canberra 21c
canberra 22c
canberra 23c
canberra 24c
canberra 25c
canberra 26c
canberra 27c
canberra 28c
canberra 29c
98: 11.29sec


## GCV-DG

In [17]:
feature = 'GCV-DG'

# Distance names: every file name in the feature's matrix directory with its
# last '_'-separated token removed, in sorted order.
all_distances = ['_'.join(filename.split('_')[:-1])
                     for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")]
all_distances.sort()

In [18]:
# One defaultdict per coverage type for the current feature; each creates an
# empty DataFrame the first time a distance is accessed.
cluster_coverages[feature], GO_coverages[feature], gene_coverages[feature] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [19]:
method = 'kmedoid'

for distance in ['canberra']:

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"
    # Process only runs with precomputed p-values, at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables (rows: number of clusters, columns: runs). Indexing the
    # defaultdict creates the empty table on first access.
    cluster_table = cluster_coverages[feature][distance]
    GO_table      = GO_coverages[feature][distance]
    gene_table    = gene_coverages[feature][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")

        # Pre-allocate this run's column with NaN.
        # NOTE(review): the index runs two entries past the loop below, so the
        # rows MAX_CLUSTERS and MAX_CLUSTERS+1 are never filled and stay NaN
        # in the CSVs - confirm this is intended.
        for coverage_table in (cluster_table, GO_table, gene_table):
            coverage_table[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))

        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # Each line of the clustering file holds the genes of one cluster.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]
            cluster_df = pd.Series({gene: cluster_idx
                                        for cluster_idx, cluster in enumerate(cluster_list)
                                        for gene in cluster})

            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Enriched GO terms of every cluster, in the column order of `enrichments`.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]

            # Single .loc writes: the former `table[run][nb_clusters] = value`
            # was chained assignment, which pandas does not guarantee to write
            # through to the table (and never does under Copy-on-Write).
            cluster_table.loc[nb_clusters, run] = enrichments.any().sum() / nb_clusters
            GO_table.loc[nb_clusters, run] = enrichments.any(axis=1).sum() / len(GO_population)
            gene_table.loc[nb_clusters, run] = sum(is_annotated_in(gene, enrichmet_list[cluster_idx])
                                                       for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')

    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/{feature}/{distance}/{method}/{aspect}/{correction}"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)

    cluster_table.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_table.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_table.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")

    print()

canberra 0
canberra 1ec
canberra 2ec
canberra 3ec
canberra 4ec
canberra 5ec
canberra 6ec
canberra 7ec
canberra 8ec
canberra 9ec
98: 11.56sec


## GCV-AD

In [20]:
# One defaultdict per coverage type; each creates an empty DataFrame the first
# time a distance is accessed.
cluster_coverages['GCV-AD'], GO_coverages['GCV-AD'], gene_coverages['GCV-AD'] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [21]:
method = 'kmedoid'

for distance in ['canberra']:

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-AD/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/GCV-AD/{distance}/{method}/{aspect}"
    # Process only runs with precomputed p-values, at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables (rows: number of clusters, columns: runs). Indexing the
    # defaultdict creates the empty table on first access.
    cluster_table = cluster_coverages['GCV-AD'][distance]
    GO_table      = GO_coverages['GCV-AD'][distance]
    gene_table    = gene_coverages['GCV-AD'][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")

        # Pre-allocate this run's column with NaN.
        # NOTE(review): the index runs two entries past the loop below, so the
        # rows MAX_CLUSTERS and MAX_CLUSTERS+1 are never filled and stay NaN
        # in the CSVs - confirm this is intended.
        for coverage_table in (cluster_table, GO_table, gene_table):
            coverage_table[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))

        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # Each line of the clustering file holds the genes of one cluster.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]
            cluster_df = pd.Series({gene: cluster_idx
                                        for cluster_idx, cluster in enumerate(cluster_list)
                                        for gene in cluster})

            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Enriched GO terms of every cluster, in the column order of `enrichments`.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]

            # Single .loc writes: the former `table[run][nb_clusters] = value`
            # was chained assignment, which pandas does not guarantee to write
            # through to the table (and never does under Copy-on-Write).
            cluster_table.loc[nb_clusters, run] = enrichments.any().sum() / nb_clusters
            GO_table.loc[nb_clusters, run] = enrichments.any(axis=1).sum() / len(GO_population)
            gene_table.loc[nb_clusters, run] = sum(is_annotated_in(gene, enrichmet_list[cluster_idx])
                                                       for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')

    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/GCV-AD/{distance}/{method}/{aspect}/{correction}"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)

    cluster_table.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_table.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_table.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")

    print()

canberra 0
canberra 1ec
canberra 2ec
canberra 3ec
canberra 4ec
canberra 5ec
canberra 6ec
canberra 7ec
canberra 8ec
canberra 9ec
98: 15.54sec


## GCV-all

In [22]:
feature = 'GCV-all'

# Distance names: every file name in the feature's matrix directory with its
# last '_'-separated token removed, in sorted order.
all_distances = ['_'.join(filename.split('_')[:-1])
                     for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")]
all_distances.sort()

In [23]:
# One defaultdict per coverage type for the current feature; each creates an
# empty DataFrame the first time a distance is accessed.
cluster_coverages[feature], GO_coverages[feature], gene_coverages[feature] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [24]:
method = 'kmedoid'

for distance in ['canberra']:

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"
    # Process only runs with precomputed p-values, at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables (rows: number of clusters, columns: runs). Indexing the
    # defaultdict creates the empty table on first access.
    cluster_table = cluster_coverages[feature][distance]
    GO_table      = GO_coverages[feature][distance]
    gene_table    = gene_coverages[feature][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")

        # Pre-allocate this run's column with NaN.
        # NOTE(review): the index runs two entries past the loop below, so the
        # rows MAX_CLUSTERS and MAX_CLUSTERS+1 are never filled and stay NaN
        # in the CSVs - confirm this is intended.
        for coverage_table in (cluster_table, GO_table, gene_table):
            coverage_table[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))

        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # Each line of the clustering file holds the genes of one cluster.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]
            cluster_df = pd.Series({gene: cluster_idx
                                        for cluster_idx, cluster in enumerate(cluster_list)
                                        for gene in cluster})

            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Enriched GO terms of every cluster, in the column order of `enrichments`.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]

            # Single .loc writes: the former `table[run][nb_clusters] = value`
            # was chained assignment, which pandas does not guarantee to write
            # through to the table (and never does under Copy-on-Write).
            cluster_table.loc[nb_clusters, run] = enrichments.any().sum() / nb_clusters
            GO_table.loc[nb_clusters, run] = enrichments.any(axis=1).sum() / len(GO_population)
            gene_table.loc[nb_clusters, run] = sum(is_annotated_in(gene, enrichmet_list[cluster_idx])
                                                       for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')

    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/{feature}/{distance}/{method}/{aspect}/{correction}"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)

    cluster_table.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_table.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_table.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")

    print()

canberra 0
canberra 1ec
canberra 2ec
canberra 3ec
canberra 4ec
98: 11.62sec


## GCV-nonredundant

In [25]:
feature = 'GCV-nonredundant'

# Distance names: every file name in the feature's matrix directory with its
# last '_'-separated token removed, in sorted order.
all_distances = ['_'.join(filename.split('_')[:-1])
                     for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")]
all_distances.sort()

In [26]:
# One defaultdict per coverage type for the current feature; each creates an
# empty DataFrame the first time a distance is accessed.
cluster_coverages[feature], GO_coverages[feature], gene_coverages[feature] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [27]:
method = 'kmedoid'

for distance in ['canberra']:

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"
    # Process only runs with precomputed p-values, at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables (rows: number of clusters, columns: runs). Indexing the
    # defaultdict creates the empty table on first access.
    cluster_table = cluster_coverages[feature][distance]
    GO_table      = GO_coverages[feature][distance]
    gene_table    = gene_coverages[feature][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")

        # Pre-allocate this run's column with NaN.
        # NOTE(review): the index runs two entries past the loop below, so the
        # rows MAX_CLUSTERS and MAX_CLUSTERS+1 are never filled and stay NaN
        # in the CSVs - confirm this is intended.
        for coverage_table in (cluster_table, GO_table, gene_table):
            coverage_table[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))

        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # Each line of the clustering file holds the genes of one cluster.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]
            cluster_df = pd.Series({gene: cluster_idx
                                        for cluster_idx, cluster in enumerate(cluster_list)
                                        for gene in cluster})

            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Enriched GO terms of every cluster, in the column order of `enrichments`.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]

            # Single .loc writes: the former `table[run][nb_clusters] = value`
            # was chained assignment, which pandas does not guarantee to write
            # through to the table (and never does under Copy-on-Write).
            cluster_table.loc[nb_clusters, run] = enrichments.any().sum() / nb_clusters
            GO_table.loc[nb_clusters, run] = enrichments.any(axis=1).sum() / len(GO_population)
            gene_table.loc[nb_clusters, run] = sum(is_annotated_in(gene, enrichmet_list[cluster_idx])
                                                       for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')

    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/{feature}/{distance}/{method}/{aspect}/{correction}"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)

    cluster_table.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_table.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_table.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")

    print()

canberra 0
canberra 1ec
canberra 2ec
canberra 3ec
canberra 4ec
98: 15.77sec


## GCV-orca

In [34]:
feature = 'GCV-orca'

# Distance names: every file name in the feature's matrix directory with its
# last '_'-separated token removed, in sorted order.
all_distances = ['_'.join(filename.split('_')[:-1])
                     for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")]
all_distances.sort()

In [35]:
# One defaultdict per coverage type for the current feature; each creates an
# empty DataFrame the first time a distance is accessed.
cluster_coverages[feature], GO_coverages[feature], gene_coverages[feature] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [36]:
method = 'kmedoid'

for distance in ['hellinger']:

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"
    # Process only runs with precomputed p-values, at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables (rows: number of clusters, columns: runs). Indexing the
    # defaultdict creates the empty table on first access.
    cluster_table = cluster_coverages[feature][distance]
    GO_table      = GO_coverages[feature][distance]
    gene_table    = gene_coverages[feature][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")

        # Pre-allocate this run's column with NaN.
        # NOTE(review): the index runs two entries past the loop below, so the
        # rows MAX_CLUSTERS and MAX_CLUSTERS+1 are never filled and stay NaN
        # in the CSVs - confirm this is intended.
        for coverage_table in (cluster_table, GO_table, gene_table):
            coverage_table[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))

        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # Each line of the clustering file holds the genes of one cluster.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]
            cluster_df = pd.Series({gene: cluster_idx
                                        for cluster_idx, cluster in enumerate(cluster_list)
                                        for gene in cluster})

            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Enriched GO terms of every cluster, in the column order of `enrichments`.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]

            # Single .loc writes: the former `table[run][nb_clusters] = value`
            # was chained assignment, which pandas does not guarantee to write
            # through to the table (and never does under Copy-on-Write).
            cluster_table.loc[nb_clusters, run] = enrichments.any().sum() / nb_clusters
            GO_table.loc[nb_clusters, run] = enrichments.any(axis=1).sum() / len(GO_population)
            gene_table.loc[nb_clusters, run] = sum(is_annotated_in(gene, enrichmet_list[cluster_idx])
                                                       for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')

    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/{feature}/{distance}/{method}/{aspect}/{correction}"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)

    cluster_table.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_table.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_table.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")

    print()

hellinger 0
hellinger 1c
hellinger 2c
hellinger 3c
hellinger 4c
hellinger 5c
hellinger 6c
hellinger 7c
hellinger 8c
hellinger 9c
98: 10.39sec


## GCV-orca+

In [31]:
feature = 'GCV-orca+'

# Distance names: every file name in the feature's matrix directory with its
# last '_'-separated token removed, in sorted order.
all_distances = ['_'.join(filename.split('_')[:-1])
                     for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")]
all_distances.sort()

In [32]:
# One defaultdict per coverage type for the current feature; each creates an
# empty DataFrame the first time a distance is accessed.
cluster_coverages[feature], GO_coverages[feature], gene_coverages[feature] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [33]:
method = 'kmedoid'

for distance in ['cityblock']:

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"
    # Process only runs with precomputed p-values, at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables (rows: number of clusters, columns: runs). Indexing the
    # defaultdict creates the empty table on first access.
    cluster_table = cluster_coverages[feature][distance]
    GO_table      = GO_coverages[feature][distance]
    gene_table    = gene_coverages[feature][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")

        # Pre-allocate this run's column with NaN.
        # NOTE(review): the index runs two entries past the loop below, so the
        # rows MAX_CLUSTERS and MAX_CLUSTERS+1 are never filled and stay NaN
        # in the CSVs - confirm this is intended.
        for coverage_table in (cluster_table, GO_table, gene_table):
            coverage_table[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))

        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # Each line of the clustering file holds the genes of one cluster.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]
            cluster_df = pd.Series({gene: cluster_idx
                                        for cluster_idx, cluster in enumerate(cluster_list)
                                        for gene in cluster})

            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Enriched GO terms of every cluster, in the column order of `enrichments`.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]

            # Single .loc writes: the former `table[run][nb_clusters] = value`
            # was chained assignment, which pandas does not guarantee to write
            # through to the table (and never does under Copy-on-Write).
            cluster_table.loc[nb_clusters, run] = enrichments.any().sum() / nb_clusters
            GO_table.loc[nb_clusters, run] = enrichments.any(axis=1).sum() / len(GO_population)
            gene_table.loc[nb_clusters, run] = sum(is_annotated_in(gene, enrichmet_list[cluster_idx])
                                                       for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')

    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/{feature}/{distance}/{method}/{aspect}/{correction}"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)

    cluster_table.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_table.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_table.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")

    print()

cityblock 0
cityblock 1c
cityblock 2c
cityblock 3c
cityblock 4c
cityblock 5c
cityblock 6c
cityblock 7c
cityblock 8c
cityblock 9c
98: 10.41sec
