In [5]:
from itertools import islice, combinations, product
from collections import defaultdict
from goatools import obo_parser
from functools import partial

import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt

In [6]:
%matplotlib inline
# Apply seaborn's default plot styling for the whole notebook.
sns.set()
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)

# Root of the data tree; every directory below is derived from it.
# NOTE(review): this is an absolute, machine-specific path - adjust it before
# running the notebook on another machine.
DATA_DIRECTORY = "/media/clusterduck123/joe/data"
# Unprocessed inputs (the GO ontology file is read from here).
RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw-data"
# Processed yeast data; clusterings, p-values and enrichments are addressed
# relative to this directory further down.
YEAST_DIRECTORY = f"{DATA_DIRECTORY}/processed-data/organisms/yeast"
# Location of the PPI edge list.
NETWORK_DIRECTORY = f"{YEAST_DIRECTORY}/networks"
MATRIX_DIRECTORY  = f"{YEAST_DIRECTORY}/distance-matrices"
# Location of the GO annotation tables.
ANNOTATION_DIRECTORY  = f"{YEAST_DIRECTORY}/annotations"

# Preprocessing

In [7]:
# `aspect` selects the annotation file and the p-value sub-directory;
# `correction` names the multiple-testing correction ('BY' or 'BH') applied in
# `get_enrichments` and is also part of the output directory.
aspect, correction = 'BP', 'BY'

# Significance level handed to `get_enrichments`.
alpha = 0.05

# A GO term is kept only if its number of annotation rows lies in
# [lb_GO, ub_GO] and its level in the GO DAG lies in [min_lvl, max_lvl].
lb_GO, ub_GO = 5, 500
min_lvl, max_lvl = 0, 100

In [8]:
# Protein-protein interaction network, read from an edge list.
PPI = nx.read_edgelist(f"{NETWORK_DIRECTORY}/PPI_BioGRID.txt")

# GO annotation table for the chosen aspect; the columns used in this notebook
# are `GO_ID` and `Systematic_ID`.
annotation_df = pd.read_csv(f"{ANNOTATION_DIRECTORY}/GO_{aspect}_BioGRID-SGD.csv")

# Gene Ontology DAG; only the `level` of each term is used below.
go_dag = obo_parser.GODag(f"{RAW_DATA_DIRECTORY}/go-basic.obo")

gene_population = set(PPI.nodes())

# Count the annotation rows of every GO term once, instead of filtering the
# whole table again for each term (which is quadratic in the table size).
GO_counts = annotation_df.GO_ID.value_counts()

# Keep GO terms whose annotation count and DAG level are both within bounds.
GO_population = {go_id for go_id, nb_annotations in GO_counts.items()
                       if (lb_GO <= nb_annotations <= ub_GO and
                           min_lvl <= go_dag[go_id].level <= max_lvl)}

# Restrict the annotation table to the retained GO terms.
annotation_df = annotation_df[annotation_df.GO_ID.isin(GO_population)]

/media/clusterduck123/joe/data/raw-data/go-basic.obo: fmt(1.2) rel(2019-10-07) 47,285 GO Terms


In [9]:
# Conversion dictionaries
# GO term -> set of systematic gene ids annotated with it (a Series of sets).
GO2genes = pd.Series({go_id: set(genes.Systematic_ID) for go_id, genes in annotation_df.groupby('GO_ID')}, 
                     name='nb_genes')
# Gene -> set of GO terms. This is a plain dict (the former `defaultdict(set)`
# placeholder was overwritten immediately and has been dropped), so look-ups
# of unannotated genes must go through `.get(gene, set())`.
gene2GO  = {gene : set(go_ids.GO_ID)        for gene, go_ids in annotation_df.groupby('Systematic_ID')}
# Number of annotated genes per GO term.
global_GO_counter = GO2genes.apply(len)

In [10]:
def get_number_of_pre_runs(PVALUE_DIRECTORY, n_clusters = 99):
    """Return how many runs have a p-value file for `n_clusters` clusters.

    Files are expected to be named ``<run>_<n_clusters>_<suffix>``.  The result
    is the highest run number found for the requested `n_clusters` plus one,
    or 0 when no such file exists.

    Entries that do not follow the naming scheme (hidden files, notebook
    checkpoints, names with a non-numeric run prefix, ...) are ignored instead
    of aborting the scan.

    Parameters
    ----------
    PVALUE_DIRECTORY : str
        Directory whose entries are listed.
    n_clusters : int, optional
        Cluster count that must appear in the second field of the file name.

    Raises
    ------
    OSError
        Propagated from `os.listdir` when the directory cannot be listed.
    """
    wanted = str(n_clusters)
    pre_runs = []
    for name in os.listdir(PVALUE_DIRECTORY):
        fields = name.split('_')
        if len(fields) != 3:
            continue            # not a `<run>_<n_clusters>_<suffix>` file
        run, ncluster, _ = fields
        if ncluster != wanted:
            continue
        try:
            pre_runs.append(int(run))
        except ValueError:
            continue            # run prefix is not a number
    return max(pre_runs) + 1 if pre_runs else 0

# Load

In [11]:
def get_enrichments(alpha, p_values, cluster_list, correction):
    """Flag the entries of `p_values` that are significant after FDR correction.

    The p-values entering the correction are, for every cluster, the entries
    of that cluster's column (named after the cluster's position in
    `cluster_list`) for the GO terms returned by ``cluster2GO(cluster)``.
    They are pooled over all clusters and a step-up procedure is applied:
    with the pooled values sorted increasingly as P_1 <= ... <= P_m, the
    threshold is P_k for the largest k with ``P_k <= k / (m * c) * alpha``.

    Parameters
    ----------
    alpha : float
        Target false discovery rate.
    p_values : pandas.DataFrame
        Raw p-values, indexed by GO term, with one column per cluster named
        '0', '1', ... in the order of `cluster_list`.
    cluster_list : list of iterables of genes
        The clusters; may be empty.
    correction : {'BY', 'BH'}
        'BH' uses c = 1 (Benjamini-Hochberg); 'BY' uses
        c = ln(m) + gamma + 1/(2m), an approximation of the harmonic number
        sum_{i<=m} 1/i (Benjamini-Yekutieli).

    Returns
    -------
    pandas.DataFrame
        Boolean frame shaped like `p_values`, True where the p-value is less
        than or equal to the threshold.  All False when no pooled p-value
        passes (or when there is nothing to test).

    Raises
    ------
    ValueError
        If `correction` is neither 'BY' nor 'BH'.
    """
    if correction not in ('BY', 'BH'):
        raise ValueError(f"Correction not known: {correction!r}")

    # A sorted list (not a set) is used as indexer: recent pandas versions
    # reject sets, and missing GO terms still raise a KeyError.
    relevant_p_values = [p_values[str(cluster_idx)].loc[sorted(cluster2GO(cluster))]
                             for cluster_idx, cluster in enumerate(cluster_list)]

    sorted_p_values = sorted(p for p_cluster in relevant_p_values
                               for p in p_cluster)
    m = len(sorted_p_values)

    # -inf means "nothing is significant"; it also covers m == 0 without
    # evaluating log(0) or 1/0.
    threshold = float('-inf')
    if m > 0:
        c = np.log(m) + np.euler_gamma + 1/(2*m) if correction == 'BY' else 1
        # Step-up rule: keep scanning after a failure, because the threshold
        # is defined by the LARGEST rank that passes, not the first failure.
        for k, P_k in enumerate(sorted_p_values, 1):
            if P_k <= k/(m*c) * alpha:
                threshold = P_k

    # `<=` so that the p-value defining the threshold is itself accepted.
    return p_values <= threshold


def cluster2GO(cluster):
    """Return the union of the GO terms of all genes in `cluster`.

    Genes without an entry in `gene2GO` contribute nothing.  An empty cluster
    yields an empty set (the unbound `set.union(*...)` form raised a
    TypeError when it received no arguments).
    """
    return set().union(*(gene2GO.get(gene, set()) for gene in cluster))

def is_annotated_in(gene, GO_set):
    """Tell whether `gene` carries at least one GO term contained in `GO_set`.

    Genes without an entry in `gene2GO` are treated as having no annotations,
    so the result is False for them.
    """
    annotations = gene2GO.get(gene, set())
    if annotations.isdisjoint(GO_set):
        return False
    return True

In [12]:
# Top-level result containers, one per coverage type. Each is filled further
# down with one entry per feature (e.g. 'GDV'), which in turn maps a distance
# name to a DataFrame.
cluster_coverages, GO_coverages, gene_coverages = {}, {}, {}

In [13]:
# Cluster counts are iterated as range(MIN_CLUSTERS, MAX_CLUSTERS) in the
# loops below; MAX_CLUSTERS is also the cluster count whose p-value files are
# used to detect how many runs exist.
MIN_CLUSTERS, MAX_CLUSTERS = 2, 99
# Upper bound on the number of runs evaluated per distance.
MAX_RUNS = 30

## GDV

In [14]:
# One distance -> DataFrame mapping per coverage type for the GDV feature;
# the frames are created lazily on first access.
cluster_coverages['GDV'], GO_coverages['GDV'], gene_coverages['GDV'] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [15]:
method = 'kmedoid'

# For every distance, run and number of clusters: read the clustering and its
# p-values, apply the multiple-testing correction and record three coverages
# (enriched clusters, enriched GO terms, genes covered by an enriched term of
# their own cluster). One CSV per coverage type is written per distance.
for distance in ['sqeuclidean', 'braycurtis', 'seuclidean', 'cosine', 'correlation',
                 'canberra',
                 'mahalanobis',
                 'GDV-similarity',

                 'cityblock',
                 'euclidean',
                 'chebyshev',

                 'normalized1-l1',
                 'normalized1-l2',
                 'normalized1-linf',
]:
    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GDV/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/GDV/{distance}/{method}/{aspect}"
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result frames of this distance: rows = number of clusters, columns = run.
    cluster_cov = cluster_coverages['GDV'][distance]
    GO_cov      = GO_coverages[     'GDV'][distance]
    gene_cov    = gene_coverages[   'GDV'][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")

        # Pre-fill the column of this run with NaN so that an interrupted run
        # is visible in the output.
        cluster_cov[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS))
        GO_cov[     run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS))
        gene_cov[   run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS))

        # NOTE(review): the range stops before MAX_CLUSTERS, although complete
        # runs are detected through the MAX_CLUSTERS p-value file - confirm
        # that skipping the last cluster count is intended.
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]
            # gene -> index of its cluster
            cluster_df = pd.Series({gene:cluster_idx
                                        for cluster_idx,cluster in enumerate(cluster_list)
                                        for gene in cluster})

            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            # Boolean frame shaped like `p_values`
            # (presumably GO terms x clusters - confirm against the p-value files).
            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Per column: the set of index labels flagged as enriched.
            enrichment_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]

            nb_covered_genes = sum(is_annotated_in(gene, enrichment_list[cluster_idx])
                                       for gene, cluster_idx in cluster_df.items())

            # `.loc[row, column]` writes into the frame itself. The former
            # chained form `frame[run][nb_clusters] = ...` assigns to a
            # temporary under pandas Copy-on-Write and would leave NaN behind.
            cluster_cov.loc[nb_clusters, run] = enrichments.any().sum()       / nb_clusters
            GO_cov.loc[     nb_clusters, run] = enrichments.any(axis=1).sum() / len(GO_population)
            gene_cov.loc[   nb_clusters, run] = nb_covered_genes              / len(PPI)

            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')

    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/GDV/{distance}/{method}/{aspect}/{correction}"
    # exist_ok avoids the check-then-create race of `os.path.exists`.
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)

    cluster_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_cov.to_csv(     f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_cov.to_csv(   f"{ENRICHMENT_DIRECTORY}/genes.csv")

    print()

sqeuclidean 0
sqeuclidean 1
sqeuclidean 2
sqeuclidean 3
sqeuclidean 4
sqeuclidean 5
sqeuclidean 6
sqeuclidean 7
sqeuclidean 8
sqeuclidean 9
sqeuclidean 10
sqeuclidean 11
sqeuclidean 12
sqeuclidean 13
sqeuclidean 14
sqeuclidean 15
sqeuclidean 16
sqeuclidean 17
sqeuclidean 18
sqeuclidean 19
sqeuclidean 20
sqeuclidean 21
sqeuclidean 22
sqeuclidean 23
sqeuclidean 24
sqeuclidean 25
sqeuclidean 26
sqeuclidean 27
sqeuclidean 28
sqeuclidean 29
98: 11.90sec
braycurtis 0
braycurtis 1
braycurtis 2
braycurtis 3
braycurtis 4
braycurtis 5
braycurtis 6
braycurtis 7
braycurtis 8
braycurtis 9
braycurtis 10
braycurtis 11
braycurtis 12
braycurtis 13
braycurtis 14
braycurtis 15
braycurtis 16
braycurtis 17
braycurtis 18
braycurtis 19
braycurtis 20
braycurtis 21
braycurtis 22
braycurtis 23
braycurtis 24
braycurtis 25
braycurtis 26
braycurtis 27
braycurtis 28
braycurtis 29
98: 12.03sec
seuclidean 0
seuclidean 1
seuclidean 2
seuclidean 3
seuclidean 4
seuclidean 5
seuclidean 6
seuclidean 7
seuclidean 8
seuclid

## GCV-A

In [16]:
# One distance -> DataFrame mapping per coverage type for the GCV-A feature;
# the frames are created lazily on first access.
cluster_coverages['GCV-A'], GO_coverages['GCV-A'], gene_coverages['GCV-A'] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [17]:
method = 'kmedoid'

# For every distance, run and number of clusters: read the clustering and its
# p-values, apply the multiple-testing correction and record three coverages
# (enriched clusters, enriched GO terms, genes covered by an enriched term of
# their own cluster). One CSV per coverage type is written per distance.
for distance in [
                 'normalized1-l1',
                 'normalized1-l2',
                 'normalized1-linf'
                ]:

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-A/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/GCV-A/{distance}/{method}/{aspect}"
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result frames of this distance: rows = number of clusters, columns = run.
    cluster_cov = cluster_coverages['GCV-A'][distance]
    GO_cov      = GO_coverages[     'GCV-A'][distance]
    gene_cov    = gene_coverages[   'GCV-A'][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")

        # Pre-fill the column of this run with NaN so that an interrupted run
        # is visible in the output.
        # NOTE(review): the index runs to MAX_CLUSTERS+1 while the loop below
        # stops before MAX_CLUSTERS, so the last rows always stay NaN; kept
        # as-is so the CSV layout does not change - confirm the intent.
        cluster_cov[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        GO_cov[     run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        gene_cov[   run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))

        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]
            # gene -> index of its cluster
            cluster_df = pd.Series({gene:cluster_idx
                                        for cluster_idx,cluster in enumerate(cluster_list)
                                        for gene in cluster})

            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            # Boolean frame shaped like `p_values`
            # (presumably GO terms x clusters - confirm against the p-value files).
            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Per column: the set of index labels flagged as enriched.
            enrichment_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]

            nb_covered_genes = sum(is_annotated_in(gene, enrichment_list[cluster_idx])
                                       for gene, cluster_idx in cluster_df.items())

            # `.loc[row, column]` writes into the frame itself. The former
            # chained form `frame[run][nb_clusters] = ...` assigns to a
            # temporary under pandas Copy-on-Write and would leave NaN behind.
            cluster_cov.loc[nb_clusters, run] = enrichments.any().sum()       / nb_clusters
            GO_cov.loc[     nb_clusters, run] = enrichments.any(axis=1).sum() / len(GO_population)
            gene_cov.loc[   nb_clusters, run] = nb_covered_genes              / len(PPI)

            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')

    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/GCV-A/{distance}/{method}/{aspect}/{correction}"
    # exist_ok avoids the check-then-create race of `os.path.exists`.
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)

    cluster_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_cov.to_csv(     f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_cov.to_csv(   f"{ENRICHMENT_DIRECTORY}/genes.csv")

    print()

normalized1-l1 0
normalized1-l1 1
normalized1-l1 2
normalized1-l1 3
normalized1-l1 4
normalized1-l1 5
normalized1-l1 6
normalized1-l1 7
normalized1-l1 8
normalized1-l1 9
normalized1-l1 10
normalized1-l1 11
normalized1-l1 12
normalized1-l1 13
normalized1-l1 14
normalized1-l1 15
normalized1-l1 16
normalized1-l1 17
normalized1-l1 18
normalized1-l1 19
normalized1-l1 20
normalized1-l1 21
normalized1-l1 22
normalized1-l1 23
normalized1-l1 24
normalized1-l1 25
normalized1-l1 26
normalized1-l1 27
normalized1-l1 28
normalized1-l1 29
98: 11.32sec
normalized1-l2 0
normalized1-l2 1
normalized1-l2 2
normalized1-l2 3
normalized1-l2 4
normalized1-l2 5
normalized1-l2 6
normalized1-l2 7
normalized1-l2 8
normalized1-l2 9
normalized1-l2 10
normalized1-l2 11
normalized1-l2 12
normalized1-l2 13
normalized1-l2 14
normalized1-l2 15
normalized1-l2 16
normalized1-l2 17
normalized1-l2 18
normalized1-l2 19
normalized1-l2 20
normalized1-l2 21
normalized1-l2 22
normalized1-l2 23
normalized1-l2 24
normalized1-l2 25

## GCV-G

In [18]:
# One distance -> DataFrame mapping per coverage type for the GCV-G feature;
# the frames are created lazily on first access.
cluster_coverages['GCV-G'], GO_coverages['GCV-G'], gene_coverages['GCV-G'] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [19]:
method = 'kmedoid'

# For every distance, run and number of clusters: read the clustering and its
# p-values, apply the multiple-testing correction and record three coverages
# (enriched clusters, enriched GO terms, genes covered by an enriched term of
# their own cluster). One CSV per coverage type is written per distance.
for distance in [
                 'normalized1-l1',
                 'normalized1-l2',
                 'normalized1-linf'
                ]:

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-G/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/GCV-G/{distance}/{method}/{aspect}"
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result frames of this distance: rows = number of clusters, columns = run.
    cluster_cov = cluster_coverages['GCV-G'][distance]
    GO_cov      = GO_coverages[     'GCV-G'][distance]
    gene_cov    = gene_coverages[   'GCV-G'][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")

        # Pre-fill the column of this run with NaN so that an interrupted run
        # is visible in the output.
        # NOTE(review): the index runs to MAX_CLUSTERS+1 while the loop below
        # stops before MAX_CLUSTERS, so the last rows always stay NaN; kept
        # as-is so the CSV layout does not change - confirm the intent.
        cluster_cov[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        GO_cov[     run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        gene_cov[   run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))

        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]
            # gene -> index of its cluster
            cluster_df = pd.Series({gene:cluster_idx
                                        for cluster_idx,cluster in enumerate(cluster_list)
                                        for gene in cluster})

            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            # Boolean frame shaped like `p_values`
            # (presumably GO terms x clusters - confirm against the p-value files).
            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Per column: the set of index labels flagged as enriched.
            enrichment_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]

            nb_covered_genes = sum(is_annotated_in(gene, enrichment_list[cluster_idx])
                                       for gene, cluster_idx in cluster_df.items())

            # `.loc[row, column]` writes into the frame itself. The former
            # chained form `frame[run][nb_clusters] = ...` assigns to a
            # temporary under pandas Copy-on-Write and would leave NaN behind.
            cluster_cov.loc[nb_clusters, run] = enrichments.any().sum()       / nb_clusters
            GO_cov.loc[     nb_clusters, run] = enrichments.any(axis=1).sum() / len(GO_population)
            gene_cov.loc[   nb_clusters, run] = nb_covered_genes              / len(PPI)

            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')

    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/GCV-G/{distance}/{method}/{aspect}/{correction}"
    # exist_ok avoids the check-then-create race of `os.path.exists`.
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)

    cluster_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_cov.to_csv(     f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_cov.to_csv(   f"{ENRICHMENT_DIRECTORY}/genes.csv")

    print()

normalized1-l1 0
normalized1-l1 1
normalized1-l1 2
normalized1-l1 3
normalized1-l1 4
normalized1-l1 5
normalized1-l1 6
normalized1-l1 7
normalized1-l1 8
normalized1-l1 9
normalized1-l1 10
normalized1-l1 11
normalized1-l1 12
normalized1-l1 13
normalized1-l1 14
normalized1-l1 15
normalized1-l1 16
normalized1-l1 17
normalized1-l1 18
normalized1-l1 19
normalized1-l1 20
normalized1-l1 21
normalized1-l1 22
normalized1-l1 23
normalized1-l1 24
normalized1-l1 25
normalized1-l1 26
normalized1-l1 27
normalized1-l1 28
normalized1-l1 29
98: 11.63sec
normalized1-l2 0
normalized1-l2 1
normalized1-l2 2
normalized1-l2 3
normalized1-l2 4
normalized1-l2 5
normalized1-l2 6
normalized1-l2 7
normalized1-l2 8
normalized1-l2 9
normalized1-l2 10
normalized1-l2 11
normalized1-l2 12
normalized1-l2 13
normalized1-l2 14
normalized1-l2 15
normalized1-l2 16
normalized1-l2 17
normalized1-l2 18
normalized1-l2 19
normalized1-l2 20
normalized1-l2 21
normalized1-l2 22
normalized1-l2 23
normalized1-l2 24
normalized1-l2 25

## GCV-DG

In [20]:
# One distance -> DataFrame mapping per coverage type for the GCV-DG feature;
# the frames are created lazily on first access.
cluster_coverages['GCV-DG'], GO_coverages['GCV-DG'], gene_coverages['GCV-DG'] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [21]:
method = 'kmedoid'

# For every distance, run and number of clusters: read the clustering and its
# p-values, apply the multiple-testing correction and record three coverages
# (enriched clusters, enriched GO terms, genes covered by an enriched term of
# their own cluster). One CSV per coverage type is written per distance.
for distance in [
                 'normalized1-l1',
                 'normalized1-l2',
                 'normalized1-linf'
                ]:

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-DG/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/GCV-DG/{distance}/{method}/{aspect}"
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result frames of this distance: rows = number of clusters, columns = run.
    cluster_cov = cluster_coverages['GCV-DG'][distance]
    GO_cov      = GO_coverages[     'GCV-DG'][distance]
    gene_cov    = gene_coverages[   'GCV-DG'][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")

        # Pre-fill the column of this run with NaN so that an interrupted run
        # is visible in the output.
        # NOTE(review): the index runs to MAX_CLUSTERS+1 while the loop below
        # stops before MAX_CLUSTERS, so the last rows always stay NaN; kept
        # as-is so the CSV layout does not change - confirm the intent.
        cluster_cov[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        GO_cov[     run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        gene_cov[   run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))

        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]
            # gene -> index of its cluster
            cluster_df = pd.Series({gene:cluster_idx
                                        for cluster_idx,cluster in enumerate(cluster_list)
                                        for gene in cluster})

            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            # Boolean frame shaped like `p_values`
            # (presumably GO terms x clusters - confirm against the p-value files).
            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Per column: the set of index labels flagged as enriched.
            enrichment_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]

            nb_covered_genes = sum(is_annotated_in(gene, enrichment_list[cluster_idx])
                                       for gene, cluster_idx in cluster_df.items())

            # `.loc[row, column]` writes into the frame itself. The former
            # chained form `frame[run][nb_clusters] = ...` assigns to a
            # temporary under pandas Copy-on-Write and would leave NaN behind.
            cluster_cov.loc[nb_clusters, run] = enrichments.any().sum()       / nb_clusters
            GO_cov.loc[     nb_clusters, run] = enrichments.any(axis=1).sum() / len(GO_population)
            gene_cov.loc[   nb_clusters, run] = nb_covered_genes              / len(PPI)

            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')

    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/GCV-DG/{distance}/{method}/{aspect}/{correction}"
    # exist_ok avoids the check-then-create race of `os.path.exists`.
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)

    cluster_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_cov.to_csv(     f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_cov.to_csv(   f"{ENRICHMENT_DIRECTORY}/genes.csv")

    print()

normalized1-l1 0
normalized1-l1 1
normalized1-l1 2
normalized1-l1 3
normalized1-l1 4
normalized1-l1 5
normalized1-l1 6
normalized1-l1 7
normalized1-l1 8
normalized1-l1 9
normalized1-l1 10
normalized1-l1 11
normalized1-l1 12
normalized1-l1 13
normalized1-l1 14
normalized1-l1 15
normalized1-l1 16
normalized1-l1 17
normalized1-l1 18
normalized1-l1 19
normalized1-l1 20
normalized1-l1 21
normalized1-l1 22
normalized1-l1 23
normalized1-l1 24
normalized1-l1 25
normalized1-l1 26
normalized1-l1 27
normalized1-l1 28
normalized1-l1 29
98: 10.85sec
normalized1-l2 0
normalized1-l2 1
normalized1-l2 2
normalized1-l2 3
normalized1-l2 4
normalized1-l2 5
normalized1-l2 6
normalized1-l2 7
normalized1-l2 8
normalized1-l2 9
normalized1-l2 10
normalized1-l2 11
normalized1-l2 12
normalized1-l2 13
normalized1-l2 14
normalized1-l2 15
normalized1-l2 16
normalized1-l2 17
normalized1-l2 18
normalized1-l2 19
normalized1-l2 20
normalized1-l2 21
normalized1-l2 22
normalized1-l2 23
normalized1-l2 24
normalized1-l2 25

## GCV-AD

In [22]:
# One distance -> DataFrame mapping per coverage type for the GCV-AD feature;
# the frames are created lazily on first access.
cluster_coverages['GCV-AD'], GO_coverages['GCV-AD'], gene_coverages['GCV-AD'] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [23]:
method = 'kmedoid'

# For every distance, run and number of clusters: read the clustering and its
# p-values, apply the multiple-testing correction and record three coverages
# (enriched clusters, enriched GO terms, genes covered by an enriched term of
# their own cluster). One CSV per coverage type is written per distance.
for distance in [
                 'normalized1-l1',
                 'normalized1-l2',
                 'normalized1-linf'
                ]:

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-AD/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/GCV-AD/{distance}/{method}/{aspect}"
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result frames of this distance: rows = number of clusters, columns = run.
    cluster_cov = cluster_coverages['GCV-AD'][distance]
    GO_cov      = GO_coverages[     'GCV-AD'][distance]
    gene_cov    = gene_coverages[   'GCV-AD'][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")

        # Pre-fill the column of this run with NaN so that an interrupted run
        # is visible in the output.
        # NOTE(review): the index runs to MAX_CLUSTERS+1 while the loop below
        # stops before MAX_CLUSTERS, so the last rows always stay NaN; kept
        # as-is so the CSV layout does not change - confirm the intent.
        cluster_cov[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        GO_cov[     run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        gene_cov[   run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))

        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]
            # gene -> index of its cluster
            cluster_df = pd.Series({gene:cluster_idx
                                        for cluster_idx,cluster in enumerate(cluster_list)
                                        for gene in cluster})

            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            # Boolean frame shaped like `p_values`
            # (presumably GO terms x clusters - confirm against the p-value files).
            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Per column: the set of index labels flagged as enriched.
            enrichment_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]

            nb_covered_genes = sum(is_annotated_in(gene, enrichment_list[cluster_idx])
                                       for gene, cluster_idx in cluster_df.items())

            # `.loc[row, column]` writes into the frame itself. The former
            # chained form `frame[run][nb_clusters] = ...` assigns to a
            # temporary under pandas Copy-on-Write and would leave NaN behind.
            cluster_cov.loc[nb_clusters, run] = enrichments.any().sum()       / nb_clusters
            GO_cov.loc[     nb_clusters, run] = enrichments.any(axis=1).sum() / len(GO_population)
            gene_cov.loc[   nb_clusters, run] = nb_covered_genes              / len(PPI)

            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')

    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/GCV-AD/{distance}/{method}/{aspect}/{correction}"
    # exist_ok avoids the check-then-create race of `os.path.exists`.
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)

    cluster_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_cov.to_csv(     f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_cov.to_csv(   f"{ENRICHMENT_DIRECTORY}/genes.csv")

    print()

normalized1-l1 0
normalized1-l1 1
normalized1-l1 2
normalized1-l1 3
normalized1-l1 4
normalized1-l1 5
normalized1-l1 6
normalized1-l1 7
normalized1-l1 8
normalized1-l1 9
normalized1-l1 10
normalized1-l1 11
normalized1-l1 12
normalized1-l1 13
normalized1-l1 14
normalized1-l1 15
normalized1-l1 16
normalized1-l1 17
normalized1-l1 18
normalized1-l1 19
normalized1-l1 20
normalized1-l1 21
normalized1-l1 22
normalized1-l1 23
normalized1-l1 24
normalized1-l1 25
normalized1-l1 26
normalized1-l1 27
normalized1-l1 28
normalized1-l1 29
98: 10.90sec
normalized1-l2 0
normalized1-l2 1
normalized1-l2 2
normalized1-l2 3
normalized1-l2 4
normalized1-l2 5
normalized1-l2 6
normalized1-l2 7
normalized1-l2 8
normalized1-l2 9
normalized1-l2 10
normalized1-l2 11
normalized1-l2 12
normalized1-l2 13
normalized1-l2 14
normalized1-l2 15
normalized1-l2 16
normalized1-l2 17
normalized1-l2 18
normalized1-l2 19
normalized1-l2 20
normalized1-l2 21
normalized1-l2 22
normalized1-l2 23
normalized1-l2 24
normalized1-l2 25