In [1]:
from itertools import islice, combinations, product
from collections import defaultdict
from goatools import obo_parser
from functools import partial

import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
sns.set()
pd.set_option("display.max_columns", 50)

DATA_DIRECTORY = "/media/clusterduck123/joe/data"
RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw-data"
YEAST_DIRECTORY = f"{DATA_DIRECTORY}/processed-data/yeast"
NETWORK_DIRECTORY = f"{YEAST_DIRECTORY}/networks"
MATRIX_DIRECTORY  = f"{YEAST_DIRECTORY}/distance-matrices"
ANNOTATION_DIRECTORY  = f"{YEAST_DIRECTORY}/annotations"

# Preprocessing

In [3]:
# GO aspect: selects the annotation file that is read and the p-value /
# enrichment sub-directories that are used.
aspect = 'CC'
# Multiple-testing correction handed to get_enrichments ('BH' or 'BY').
correction = 'BH'

# Significance level handed to get_enrichments.
alpha = 0.05
# Inclusive bounds on the number of annotation rows a GO term may have.
lb_GO, ub_GO = 5, 500
# Inclusive bounds on a GO term's level in the GO DAG.
min_lvl, max_lvl = 0, 100

In [4]:
# Protein-protein interaction network, GO annotations for the chosen aspect,
# and the GO DAG (used below for term levels).
PPI = nx.read_edgelist(f"{NETWORK_DIRECTORY}/PPI_BioGRID.txt")

annotation_df = pd.read_csv(f"{ANNOTATION_DIRECTORY}/GO_{aspect}_BioGRID-SGD.csv")

go_dag = obo_parser.GODag(f"{RAW_DATA_DIRECTORY}/go-basic.obo")

gene_population = set(PPI.nodes())

# Count the annotation rows of every GO term once, instead of building a
# boolean mask over the whole frame for each term.
GO_counts = annotation_df.GO_ID.value_counts()

# Keep GO terms whose annotation count and DAG level are both within bounds.
# The count test comes first so the DAG is only queried for terms that pass it.
GO_population = {go_id for go_id, nb_annotations in GO_counts.items()
                           if (lb_GO <= nb_annotations <= ub_GO and
                               min_lvl <= go_dag[go_id].level <= max_lvl)}

# Restrict the annotations to the retained GO terms.
annotation_df = annotation_df[annotation_df.GO_ID.isin(GO_population)]

/media/clusterduck123/joe/data/raw-data/go-basic.obo: fmt(1.2) rel(2019-10-07) 47,285 GO Terms


In [5]:
# Conversion dictionaries
# GO term -> set of genes (Systematic_ID) annotated with it.
GO2genes = pd.Series({go_id: set(genes.Systematic_ID) for go_id, genes in annotation_df.groupby('GO_ID')}, 
                     name='nb_genes')
# gene -> set of GO terms annotating it. This is a plain dict (the former
# defaultdict initialisation was overwritten immediately and had no effect);
# genes without annotations are simply absent, so look them up with .get().
gene2GO  = {gene : set(go_ids.GO_ID)        for gene, go_ids in annotation_df.groupby('Systematic_ID')}
# GO term -> number of annotated genes.
global_GO_counter = GO2genes.apply(len)

In [6]:
def get_number_of_pre_runs(PVALUE_DIRECTORY, n_clusters = 99):
    """Return one more than the highest run index found for `n_clusters`.

    File names are expected to look like ``<run>_<n_clusters>_<suffix>``
    (exactly three '_'-separated parts). Names that do not follow this
    pattern, or whose run part is not an integer, are ignored, so stray
    entries such as ``.ipynb_checkpoints`` no longer make the call fail.

    Parameters
    ----------
    PVALUE_DIRECTORY : str
        Directory whose entries are scanned.
    n_clusters : int, optional
        Only files whose middle part equals ``str(n_clusters)`` are counted.

    Returns
    -------
    int
        ``max(run) + 1`` over the matching files, or 0 if there is none.
    """
    wanted = str(n_clusters)
    pre_runs = []
    for name in os.listdir(PVALUE_DIRECTORY):
        parts = name.split('_')
        if len(parts) != 3 or parts[1] != wanted:
            continue
        try:
            pre_runs.append(int(parts[0]))
        except ValueError:
            # Run part is not a number: not a result file.
            continue
    return max(pre_runs) + 1 if pre_runs else 0

# Load

In [7]:
def get_enrichments(alpha, p_values, cluster_list, correction):
    """Flag enriched (GO term, cluster) pairs after multiple-testing correction.

    The tested family consists, for every cluster, of the p-values of the GO
    terms that annotate at least one gene of that cluster (see cluster2GO).

    Parameters
    ----------
    alpha : float
        Significance level.
    p_values : pandas.DataFrame
        p-values indexed by GO term; column ``str(i)`` belongs to cluster ``i``.
    cluster_list : list of set
        Clusters (sets of genes); position ``i`` matches column ``str(i)``.
    correction : {'BH', 'BY'}
        'BH' uses c = 1; 'BY' uses c = ln(m) + Euler's gamma + 1/(2m),
        where m is the number of tested p-values.

    Returns
    -------
    pandas.DataFrame
        Boolean frame shaped like `p_values`; True where the p-value is at
        most the largest significant p-value. All False if nothing is tested
        or nothing is significant.

    Raises
    ------
    ValueError
        If `correction` is neither 'BH' nor 'BY'.
    """
    if correction not in ('BH', 'BY'):
        raise ValueError(f"Correction not known: {correction!r}")

    # A list (not a set) is used as indexer: pandas rejects set indexers.
    relevant_p_values = [p_values[str(cluster_idx)][sorted(cluster2GO(cluster))]
                             for cluster_idx,cluster in enumerate(cluster_list)]

    sorted_p_values = sorted(p for p_cluster in relevant_p_values
                               for p in p_cluster)
    m = len(sorted_p_values)

    no_enrichment = pd.DataFrame(False, index=p_values.index, columns=p_values.columns)
    if m == 0:
        # Nothing was tested (also avoids log(0) and 1/0 for 'BY').
        return no_enrichment

    if correction == 'BY':
        c = np.log(m) + np.euler_gamma + 1/(2*m)
    else:
        c = 1

    # Number of leading sorted p-values that stay below their rank threshold.
    # NOTE(review): this stops at the FIRST failing rank, as the original code
    # did; the textbook BH/BY step-up rule uses the LAST passing rank instead.
    # Confirm which is intended before changing it.
    nb_significant = m
    for k,P_k in enumerate(sorted_p_values,1):
        if P_k > k/(m*c) * alpha:
            nb_significant = k-1
            break

    if nb_significant == 0:
        # Previously index -1 wrapped around to the largest p-value.
        return no_enrichment

    # Largest significant p-value; '<=' so that it is itself reported.
    threshold = sorted_p_values[nb_significant-1]
    return p_values <= threshold


def cluster2GO(cluster):
    """Return the set of GO terms annotating at least one gene of `cluster`.

    Genes missing from gene2GO contribute nothing. An empty cluster yields an
    empty set (``set.union`` without arguments would raise a TypeError).
    """
    return set().union(*(gene2GO.get(gene, set()) for gene in cluster))

def is_annotated_in(gene, GO_set):
    """Tell whether `gene` has at least one GO annotation contained in `GO_set`.

    Annotations are looked up in gene2GO; a gene without an entry is treated
    as having none, so the result is False.
    """
    annotations = gene2GO.get(gene, set())
    if annotations.isdisjoint(GO_set):
        return False
    return True

In [8]:
# Result containers, filled per feature further down: fraction of enriched
# clusters, of enriched GO terms, and of genes covered by an enriched term.
cluster_coverages, GO_coverages, gene_coverages = {}, {}, {}

In [9]:
# Bounds of the cluster-count sweep. The loops below use
# range(MIN_CLUSTERS, MAX_CLUSTERS), so MAX_CLUSTERS itself is not processed;
# it is also the cluster count used to detect how many runs exist on disk.
MIN_CLUSTERS, MAX_CLUSTERS = 2, 99
# Upper limit on the number of runs processed per distance.
MAX_RUNS = 30

## GDV

In [10]:
# One independent distance -> DataFrame table per coverage kind for 'GDV';
# a table is created empty the first time a distance is looked up.
cluster_coverages['GDV'], GO_coverages['GDV'], gene_coverages['GDV'] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [11]:
method = 'kmedoid'

for distance in ['canberra', 'normalized1_linf']:
    
    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GDV/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/GDV/{distance}/{method}/{aspect}"
    # Process the runs found on disk, but at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables of this distance: one column per run, one row per cluster
    # count. Looking the distance up creates the empty DataFrame on first use.
    cluster_cov = cluster_coverages['GDV'][distance]
    GO_cov      = GO_coverages[     'GDV'][distance]
    gene_cov    = gene_coverages[   'GDV'][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")
        
        # Every run starts as an all-NaN column.
        cluster_cov[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS)) 
        GO_cov[     run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS))
        gene_cov[   run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS))
        
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                 cluster_list = [set(line.split()) for line in f]
            # gene -> index of the cluster containing it
            cluster_df = pd.Series({gene:cluster_idx 
                                        for cluster_idx,cluster in enumerate(cluster_list) 
                                        for gene in cluster})
            
            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Per cluster column: the set of GO terms flagged as enriched.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]
            
            # Single .loc writes: chained df[run][nb_clusters] = ... assignment is
            # not guaranteed to reach the DataFrame (never under copy-on-write).
            cluster_cov.loc[nb_clusters, run] = sum(enrichments.any())      / nb_clusters
            GO_cov.loc[     nb_clusters, run] = sum(enrichments.any(axis=1))/len(GO_population)
            gene_cov.loc[   nb_clusters, run] = sum(is_annotated_in(gene,enrichmet_list[cluster_idx])
                                                             for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')
            
    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/GDV/{distance}/{method}/{aspect}/{correction}"
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)
        
    cluster_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")
    
    print()

canberra 0
canberra 1c
canberra 2ec
canberra 3ec
canberra 4ec
canberra 5ec
canberra 6ec
98: 11.95sec
normalized1_linf 0
normalized1_linf 1
normalized1_linf 2
normalized1_linf 3
normalized1_linf 4
normalized1_linf 5
98: 9.79sec


## GCV-A

In [14]:
# One independent distance -> DataFrame table per coverage kind for 'GCV-A';
# a table is created empty the first time a distance is looked up.
cluster_coverages['GCV-A'], GO_coverages['GCV-A'], gene_coverages['GCV-A'] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [15]:
method = 'kmedoid'

for distance in [
                 'normalized1-l1',
                 'normalized1-l2',
                 'normalized1-linf'
                ]:
    
    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-A/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/GCV-A/{distance}/{method}/{aspect}"
    # Process the runs found on disk, but at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables of this distance: one column per run, one row per cluster
    # count. Looking the distance up creates the empty DataFrame on first use.
    cluster_cov = cluster_coverages['GCV-A'][distance]
    GO_cov      = GO_coverages[     'GCV-A'][distance]
    gene_cov    = gene_coverages[   'GCV-A'][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")
        
        # Every run starts as an all-NaN column.
        # NOTE(review): the index reaches MAX_CLUSTERS+1 while the loop below
        # stops at MAX_CLUSTERS-1, so the last two rows always stay NaN.
        cluster_cov[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2)) 
        GO_cov[     run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        gene_cov[   run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                 cluster_list = [set(line.split()) for line in f]
            # gene -> index of the cluster containing it
            cluster_df = pd.Series({gene:cluster_idx 
                                        for cluster_idx,cluster in enumerate(cluster_list) 
                                        for gene in cluster})
            
            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Per cluster column: the set of GO terms flagged as enriched.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]
            
            # Single .loc writes: chained df[run][nb_clusters] = ... assignment is
            # not guaranteed to reach the DataFrame (never under copy-on-write).
            cluster_cov.loc[nb_clusters, run] = sum(enrichments.any())      / nb_clusters
            GO_cov.loc[     nb_clusters, run] = sum(enrichments.any(axis=1))/len(GO_population)
            gene_cov.loc[   nb_clusters, run] = sum(is_annotated_in(gene,enrichmet_list[cluster_idx])
                                                             for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')
            
    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/GCV-A/{distance}/{method}/{aspect}/{correction}"
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)
        
    cluster_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")
    
    print()

normalized1-l1 0
normalized1-l1 1
normalized1-l1 2
normalized1-l1 3
normalized1-l1 4
normalized1-l1 5
normalized1-l1 6
normalized1-l1 7
normalized1-l1 8
normalized1-l1 9
normalized1-l1 10
normalized1-l1 11
normalized1-l1 12
normalized1-l1 13
normalized1-l1 14
normalized1-l1 15
normalized1-l1 16
normalized1-l1 17
normalized1-l1 18
normalized1-l1 19
normalized1-l1 20
normalized1-l1 21
normalized1-l1 22
normalized1-l1 23
normalized1-l1 24
normalized1-l1 25
normalized1-l1 26
normalized1-l1 27
normalized1-l1 28
normalized1-l1 29
98: 10.42sec
normalized1-l2 0
normalized1-l2 1
normalized1-l2 2
normalized1-l2 3
normalized1-l2 4
normalized1-l2 5
normalized1-l2 6
normalized1-l2 7
normalized1-l2 8
normalized1-l2 9
normalized1-l2 10
normalized1-l2 11
normalized1-l2 12
normalized1-l2 13
normalized1-l2 14
normalized1-l2 15
normalized1-l2 16
normalized1-l2 17
normalized1-l2 18
normalized1-l2 19
normalized1-l2 20
normalized1-l2 21
normalized1-l2 22
normalized1-l2 23
normalized1-l2 24
normalized1-l2 25

## GCV-G

In [16]:
# One independent distance -> DataFrame table per coverage kind for 'GCV-G';
# a table is created empty the first time a distance is looked up.
cluster_coverages['GCV-G'], GO_coverages['GCV-G'], gene_coverages['GCV-G'] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [17]:
method = 'kmedoid'

for distance in [
                 'normalized1-l1',
                 'normalized1-l2',
                 'normalized1-linf'
                ]:
    
    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-G/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/GCV-G/{distance}/{method}/{aspect}"
    # Process the runs found on disk, but at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables of this distance: one column per run, one row per cluster
    # count. Looking the distance up creates the empty DataFrame on first use.
    cluster_cov = cluster_coverages['GCV-G'][distance]
    GO_cov      = GO_coverages[     'GCV-G'][distance]
    gene_cov    = gene_coverages[   'GCV-G'][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")
        
        # Every run starts as an all-NaN column.
        # NOTE(review): the index reaches MAX_CLUSTERS+1 while the loop below
        # stops at MAX_CLUSTERS-1, so the last two rows always stay NaN.
        cluster_cov[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2)) 
        GO_cov[     run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        gene_cov[   run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                 cluster_list = [set(line.split()) for line in f]
            # gene -> index of the cluster containing it
            cluster_df = pd.Series({gene:cluster_idx 
                                        for cluster_idx,cluster in enumerate(cluster_list) 
                                        for gene in cluster})
            
            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Per cluster column: the set of GO terms flagged as enriched.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]
            
            # Single .loc writes: chained df[run][nb_clusters] = ... assignment is
            # not guaranteed to reach the DataFrame (never under copy-on-write).
            cluster_cov.loc[nb_clusters, run] = sum(enrichments.any())      / nb_clusters
            GO_cov.loc[     nb_clusters, run] = sum(enrichments.any(axis=1))/len(GO_population)
            gene_cov.loc[   nb_clusters, run] = sum(is_annotated_in(gene,enrichmet_list[cluster_idx])
                                                             for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')
            
    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/GCV-G/{distance}/{method}/{aspect}/{correction}"
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)
        
    cluster_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")
    
    print()

normalized1-l1 0
normalized1-l1 1
normalized1-l1 2
normalized1-l1 3
normalized1-l1 4
normalized1-l1 5
normalized1-l1 6
normalized1-l1 7
normalized1-l1 8
normalized1-l1 9
normalized1-l1 10
normalized1-l1 11
normalized1-l1 12
normalized1-l1 13
normalized1-l1 14
normalized1-l1 15
normalized1-l1 16
normalized1-l1 17
normalized1-l1 18
normalized1-l1 19
normalized1-l1 20
normalized1-l1 21
normalized1-l1 22
normalized1-l1 23
normalized1-l1 24
normalized1-l1 25
normalized1-l1 26
normalized1-l1 27
normalized1-l1 28
normalized1-l1 29
98: 10.39sec
normalized1-l2 0
normalized1-l2 1
normalized1-l2 2
normalized1-l2 3
normalized1-l2 4
normalized1-l2 5
normalized1-l2 6
normalized1-l2 7
normalized1-l2 8
normalized1-l2 9
normalized1-l2 10
normalized1-l2 11
normalized1-l2 12
normalized1-l2 13
normalized1-l2 14
normalized1-l2 15
normalized1-l2 16
normalized1-l2 17
normalized1-l2 18
normalized1-l2 19
normalized1-l2 20
normalized1-l2 21
normalized1-l2 22
normalized1-l2 23
normalized1-l2 24
normalized1-l2 25

## GCV-DG

In [10]:
feature = 'GCV-DG'

# Distance names: every entry of the feature's matrix directory with its last
# '_'-separated piece removed, in sorted order.
all_distances = sorted(filename.rpartition('_')[0]
                           for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}"))

In [11]:
# One independent distance -> DataFrame table per coverage kind for the current
# feature; a table is created empty the first time a distance is looked up.
cluster_coverages[feature], GO_coverages[feature], gene_coverages[feature] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [12]:
method = 'kmedoid'

for distance in all_distances:
    
    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"
    # Process the runs found on disk, but at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables of this distance: one column per run, one row per cluster
    # count. Looking the distance up creates the empty DataFrame on first use.
    cluster_cov = cluster_coverages[feature][distance]
    GO_cov      = GO_coverages[     feature][distance]
    gene_cov    = gene_coverages[   feature][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")
        
        # Every run starts as an all-NaN column.
        # NOTE(review): the index reaches MAX_CLUSTERS+1 while the loop below
        # stops at MAX_CLUSTERS-1, so the last two rows always stay NaN.
        cluster_cov[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2)) 
        GO_cov[     run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        gene_cov[   run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                 cluster_list = [set(line.split()) for line in f]
            # gene -> index of the cluster containing it
            cluster_df = pd.Series({gene:cluster_idx 
                                        for cluster_idx,cluster in enumerate(cluster_list) 
                                        for gene in cluster})
            
            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Per cluster column: the set of GO terms flagged as enriched.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]
            
            # Single .loc writes: chained df[run][nb_clusters] = ... assignment is
            # not guaranteed to reach the DataFrame (never under copy-on-write).
            cluster_cov.loc[nb_clusters, run] = sum(enrichments.any())      / nb_clusters
            GO_cov.loc[     nb_clusters, run] = sum(enrichments.any(axis=1))/len(GO_population)
            gene_cov.loc[   nb_clusters, run] = sum(is_annotated_in(gene,enrichmet_list[cluster_idx])
                                                             for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')
            
    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/{feature}/{distance}/{method}/{aspect}/{correction}"
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)
        
    cluster_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")
    
    print()

braycurtis 0
braycurtis 1
braycurtis 2
braycurtis 3
braycurtis 4
braycurtis 5
braycurtis 6
braycurtis 7
braycurtis 8
braycurtis 9
braycurtis 10
braycurtis 11
braycurtis 12
braycurtis 13
braycurtis 14
braycurtis 15
braycurtis 16
braycurtis 17
braycurtis 18
braycurtis 19
braycurtis 20
braycurtis 21
braycurtis 22
braycurtis 23
braycurtis 24
braycurtis 25
braycurtis 26
braycurtis 27
braycurtis 28
98: 8.01sec
canberra 0
canberra 1c
canberra 2c
canberra 3c
canberra 4c
canberra 5c
canberra 6c
canberra 7c
canberra 8c
canberra 9c
canberra 10
canberra 11
canberra 12
canberra 13
canberra 14
canberra 15
canberra 16
canberra 17
canberra 18
canberra 19
canberra 20
canberra 21
canberra 22
canberra 23
canberra 24
canberra 25
canberra 26
canberra 27
canberra 28
98: 8.30sec
chebyshev 0
chebyshev 1
chebyshev 2
chebyshev 3
chebyshev 4
chebyshev 5
chebyshev 6
chebyshev 7
chebyshev 8
chebyshev 9
chebyshev 10
chebyshev 11
chebyshev 12
chebyshev 13
chebyshev 14
chebyshev 15
chebyshev 16
chebyshev 17
chebyshev

## GCV-AD

In [20]:
# One independent distance -> DataFrame table per coverage kind for 'GCV-AD';
# a table is created empty the first time a distance is looked up.
cluster_coverages['GCV-AD'], GO_coverages['GCV-AD'], gene_coverages['GCV-AD'] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [21]:
method = 'kmedoid'

for distance in [
                 'normalized1-l1',
                 'normalized1-l2',
                 'normalized1-linf'
                ]:
    
    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-AD/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/GCV-AD/{distance}/{method}/{aspect}"
    # Process the runs found on disk, but at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables of this distance: one column per run, one row per cluster
    # count. Looking the distance up creates the empty DataFrame on first use.
    cluster_cov = cluster_coverages['GCV-AD'][distance]
    GO_cov      = GO_coverages[     'GCV-AD'][distance]
    gene_cov    = gene_coverages[   'GCV-AD'][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")
        
        # Every run starts as an all-NaN column.
        # NOTE(review): the index reaches MAX_CLUSTERS+1 while the loop below
        # stops at MAX_CLUSTERS-1, so the last two rows always stay NaN.
        cluster_cov[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2)) 
        GO_cov[     run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        gene_cov[   run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                 cluster_list = [set(line.split()) for line in f]
            # gene -> index of the cluster containing it
            cluster_df = pd.Series({gene:cluster_idx 
                                        for cluster_idx,cluster in enumerate(cluster_list) 
                                        for gene in cluster})
            
            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Per cluster column: the set of GO terms flagged as enriched.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]
            
            # Single .loc writes: chained df[run][nb_clusters] = ... assignment is
            # not guaranteed to reach the DataFrame (never under copy-on-write).
            cluster_cov.loc[nb_clusters, run] = sum(enrichments.any())      / nb_clusters
            GO_cov.loc[     nb_clusters, run] = sum(enrichments.any(axis=1))/len(GO_population)
            gene_cov.loc[   nb_clusters, run] = sum(is_annotated_in(gene,enrichmet_list[cluster_idx])
                                                             for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')
            
    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/GCV-AD/{distance}/{method}/{aspect}/{correction}"
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)
        
    cluster_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")
    
    print()

normalized1-l1 0
normalized1-l1 1
normalized1-l1 2
normalized1-l1 3
normalized1-l1 4
normalized1-l1 5
normalized1-l1 6
normalized1-l1 7
normalized1-l1 8
normalized1-l1 9
normalized1-l1 10
normalized1-l1 11
normalized1-l1 12
normalized1-l1 13
normalized1-l1 14
normalized1-l1 15
normalized1-l1 16
normalized1-l1 17
normalized1-l1 18
normalized1-l1 19
normalized1-l1 20
normalized1-l1 21
normalized1-l1 22
normalized1-l1 23
normalized1-l1 24
normalized1-l1 25
normalized1-l1 26
normalized1-l1 27
normalized1-l1 28
normalized1-l1 29
98: 10.21sec
normalized1-l2 0
normalized1-l2 1
normalized1-l2 2
normalized1-l2 3
normalized1-l2 4
normalized1-l2 5
normalized1-l2 6
normalized1-l2 7
normalized1-l2 8
normalized1-l2 9
normalized1-l2 10
normalized1-l2 11
normalized1-l2 12
normalized1-l2 13
normalized1-l2 14
normalized1-l2 15
normalized1-l2 16
normalized1-l2 17
normalized1-l2 18
normalized1-l2 19
normalized1-l2 20
normalized1-l2 21
normalized1-l2 22
normalized1-l2 23
normalized1-l2 24
normalized1-l2 25

## GCV-all1

In [10]:
feature = 'GCV-all1'

# Distance names: every entry of the feature's matrix directory with its last
# '_'-separated piece removed, in sorted order.
all_distances = sorted(filename.rpartition('_')[0]
                           for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}"))

In [11]:
# One independent distance -> DataFrame table per coverage kind for the current
# feature; a table is created empty the first time a distance is looked up.
cluster_coverages[feature], GO_coverages[feature], gene_coverages[feature] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [12]:
method = 'kmedoid'

# NOTE(review): only 'canberra' is processed here, although all_distances is
# computed for this feature in an earlier cell.
for distance in {'canberra'}:
    
    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"
    # Process the runs found on disk, but at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables of this distance: one column per run, one row per cluster
    # count. Looking the distance up creates the empty DataFrame on first use.
    cluster_cov = cluster_coverages[feature][distance]
    GO_cov      = GO_coverages[     feature][distance]
    gene_cov    = gene_coverages[   feature][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")
        
        # Every run starts as an all-NaN column.
        # NOTE(review): the index reaches MAX_CLUSTERS+1 while the loop below
        # stops at MAX_CLUSTERS-1, so the last two rows always stay NaN.
        cluster_cov[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2)) 
        GO_cov[     run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        gene_cov[   run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                 cluster_list = [set(line.split()) for line in f]
            # gene -> index of the cluster containing it
            cluster_df = pd.Series({gene:cluster_idx 
                                        for cluster_idx,cluster in enumerate(cluster_list) 
                                        for gene in cluster})
            
            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Per cluster column: the set of GO terms flagged as enriched.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]
            
            # Single .loc writes: chained df[run][nb_clusters] = ... assignment is
            # not guaranteed to reach the DataFrame (never under copy-on-write).
            cluster_cov.loc[nb_clusters, run] = sum(enrichments.any())      / nb_clusters
            GO_cov.loc[     nb_clusters, run] = sum(enrichments.any(axis=1))/len(GO_population)
            gene_cov.loc[   nb_clusters, run] = sum(is_annotated_in(gene,enrichmet_list[cluster_idx])
                                                             for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')
            
    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/{feature}/{distance}/{method}/{aspect}/{correction}"
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)
        
    cluster_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")
    
    print()

canberra 0
canberra 1ec
canberra 2ec
canberra 3ec
canberra 4ec
98: 10.41sec


## GCV-all2

In [19]:
feature = 'GCV-all2'

# Distance names: every entry of the feature's matrix directory with its last
# '_'-separated piece removed, in sorted order.
all_distances = sorted(filename.rpartition('_')[0]
                           for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}"))

In [20]:
# One independent distance -> DataFrame table per coverage kind for the current
# feature; a table is created empty the first time a distance is looked up.
cluster_coverages[feature], GO_coverages[feature], gene_coverages[feature] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [21]:
method = 'kmedoid'

# NOTE(review): only 'canberra' is processed here, although all_distances is
# computed for this feature in an earlier cell.
for distance in {'canberra'}:
    
    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"
    # Process the runs found on disk, but at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables of this distance: one column per run, one row per cluster
    # count. Looking the distance up creates the empty DataFrame on first use.
    cluster_cov = cluster_coverages[feature][distance]
    GO_cov      = GO_coverages[     feature][distance]
    gene_cov    = gene_coverages[   feature][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")
        
        # Every run starts as an all-NaN column.
        # NOTE(review): the index reaches MAX_CLUSTERS+1 while the loop below
        # stops at MAX_CLUSTERS-1, so the last two rows always stay NaN.
        cluster_cov[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2)) 
        GO_cov[     run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        gene_cov[   run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                 cluster_list = [set(line.split()) for line in f]
            # gene -> index of the cluster containing it
            cluster_df = pd.Series({gene:cluster_idx 
                                        for cluster_idx,cluster in enumerate(cluster_list) 
                                        for gene in cluster})
            
            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Per cluster column: the set of GO terms flagged as enriched.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]
            
            # Single .loc writes: chained df[run][nb_clusters] = ... assignment is
            # not guaranteed to reach the DataFrame (never under copy-on-write).
            cluster_cov.loc[nb_clusters, run] = sum(enrichments.any())      / nb_clusters
            GO_cov.loc[     nb_clusters, run] = sum(enrichments.any(axis=1))/len(GO_population)
            gene_cov.loc[   nb_clusters, run] = sum(is_annotated_in(gene,enrichmet_list[cluster_idx])
                                                             for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')
            
    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/{feature}/{distance}/{method}/{aspect}/{correction}"
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)
        
    cluster_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")
    
    print()

canberra 0
canberra 1ec
canberra 2ec
canberra 3ec
canberra 4ec
98: 10.65sec


## GCV-orca

In [10]:
feature = 'GCV-orca'

# Distance names: every entry of the feature's matrix directory with its last
# '_'-separated piece removed, in sorted order.
all_distances = sorted(filename.rpartition('_')[0]
                           for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}"))

In [11]:
# One independent distance -> DataFrame table per coverage kind for the current
# feature; a table is created empty the first time a distance is looked up.
cluster_coverages[feature], GO_coverages[feature], gene_coverages[feature] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [12]:
method = 'kmedoid'

# NOTE(review): only 'canberra' is processed here, although all_distances is
# computed for this feature in an earlier cell.
for distance in {'canberra'}:
    
    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"
    # Process the runs found on disk, but at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables of this distance: one column per run, one row per cluster
    # count. Looking the distance up creates the empty DataFrame on first use.
    cluster_cov = cluster_coverages[feature][distance]
    GO_cov      = GO_coverages[     feature][distance]
    gene_cov    = gene_coverages[   feature][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")
        
        # Every run starts as an all-NaN column.
        # NOTE(review): the index reaches MAX_CLUSTERS+1 while the loop below
        # stops at MAX_CLUSTERS-1, so the last two rows always stay NaN.
        cluster_cov[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2)) 
        GO_cov[     run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        gene_cov[   run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                 cluster_list = [set(line.split()) for line in f]
            # gene -> index of the cluster containing it
            cluster_df = pd.Series({gene:cluster_idx 
                                        for cluster_idx,cluster in enumerate(cluster_list) 
                                        for gene in cluster})
            
            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Per cluster column: the set of GO terms flagged as enriched.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]
            
            # Single .loc writes: chained df[run][nb_clusters] = ... assignment is
            # not guaranteed to reach the DataFrame (never under copy-on-write).
            cluster_cov.loc[nb_clusters, run] = sum(enrichments.any())      / nb_clusters
            GO_cov.loc[     nb_clusters, run] = sum(enrichments.any(axis=1))/len(GO_population)
            gene_cov.loc[   nb_clusters, run] = sum(is_annotated_in(gene,enrichmet_list[cluster_idx])
                                                             for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')
            
    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/{feature}/{distance}/{method}/{aspect}/{correction}"
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)
        
    cluster_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")
    
    print()

canberra 0
canberra 1c
canberra 2c
canberra 3c
canberra 4c
canberra 5c
canberra 6c
canberra 7c
canberra 8c
canberra 9c
98: 7.18sec


## GCV-orca+

In [10]:
feature = 'GCV-orca+'

# Distance names: every entry of the feature's matrix directory with its last
# '_'-separated piece removed, in sorted order.
all_distances = sorted(filename.rpartition('_')[0]
                           for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}"))

In [11]:
# One independent distance -> DataFrame table per coverage kind for the current
# feature; a table is created empty the first time a distance is looked up.
cluster_coverages[feature], GO_coverages[feature], gene_coverages[feature] = (
    defaultdict(pd.DataFrame) for _ in range(3))

In [12]:
method = 'kmedoid'

# NOTE(review): only 'canberra' is processed here, although all_distances is
# computed for this feature in an earlier cell.
for distance in {'canberra'}:
    
    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"
    # Process the runs found on disk, but at most MAX_RUNS of them.
    runs = min(get_number_of_pre_runs(PVALUE_DIRECTORY, MAX_CLUSTERS), MAX_RUNS)

    # Result tables of this distance: one column per run, one row per cluster
    # count. Looking the distance up creates the empty DataFrame on first use.
    cluster_cov = cluster_coverages[feature][distance]
    GO_cov      = GO_coverages[     feature][distance]
    gene_cov    = gene_coverages[   feature][distance]

    for run in range(runs):
        t1 = time.time()
        print(f"{distance} {run}")
        
        # Every run starts as an all-NaN column.
        # NOTE(review): the index reaches MAX_CLUSTERS+1 while the loop below
        # stops at MAX_CLUSTERS-1, so the last two rows always stay NaN.
        cluster_cov[run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2)) 
        GO_cov[     run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        gene_cov[   run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                 cluster_list = [set(line.split()) for line in f]
            # gene -> index of the cluster containing it
            cluster_df = pd.Series({gene:cluster_idx 
                                        for cluster_idx,cluster in enumerate(cluster_list) 
                                        for gene in cluster})
            
            p_values = pd.read_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", index_col=0)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            # Per cluster column: the set of GO terms flagged as enriched.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]
            
            # Single .loc writes: chained df[run][nb_clusters] = ... assignment is
            # not guaranteed to reach the DataFrame (never under copy-on-write).
            cluster_cov.loc[nb_clusters, run] = sum(enrichments.any())      / nb_clusters
            GO_cov.loc[     nb_clusters, run] = sum(enrichments.any(axis=1))/len(GO_population)
            gene_cov.loc[   nb_clusters, run] = sum(is_annotated_in(gene,enrichmet_list[cluster_idx])
                                                             for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')
            
    ENRICHMENT_DIRECTORY = f"{YEAST_DIRECTORY}/enrichments/{feature}/{distance}/{method}/{aspect}/{correction}"
    os.makedirs(ENRICHMENT_DIRECTORY, exist_ok=True)
        
    cluster_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/clusters.csv")
    GO_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/GO-terms.csv")
    gene_cov.to_csv(f"{ENRICHMENT_DIRECTORY}/genes.csv")
    
    print()

canberra 0
canberra 1c
canberra 2c
canberra 3c
canberra 4c
canberra 5c
canberra 6c
canberra 7c
canberra 8c
canberra 9c
98: 9.45sec
