In [1]:
from itertools import islice, combinations, product
from pyclustering.cluster.kmedoids import kmedoids
from collections import defaultdict
from scipy.stats import hypergeom
from collections import Counter
from goatools import obo_parser
from functools import partial

import os
import time
import random
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
# Root of the data tree; every other directory below is derived from it.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
DATA_DIRECTORY = "/media/clusterduck123/joe/data"
RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw_data"
YEAST_DIRECTORY = f"{DATA_DIRECTORY}/processed_data/yeast"
# Sub-directories of the yeast data: network edge lists, GO annotation tables
# and distance matrices.
NETWORK_DIRECTORY = f"{YEAST_DIRECTORY}/networks"
ANNOTATION_DIRECTORY = f"{YEAST_DIRECTORY}/annotations"
MATRIX_DIRECTORY  = f"{YEAST_DIRECTORY}/distance_matrices"

# Script

In [4]:
import os
import sys
import numpy as np
import pandas as pd
import networkx as nx
import multiprocessing

"""
Takes network, feature, metric and clustering method as input and computes
the GO-term enrichment p-values of the corresponding clusterings.
"""
# =============================================================================
#  ---------------------------- GLOBAL PARAMETERS ----------------------------
# =============================================================================

# Index of the clustering run whose cluster files ("{RUN}_{n_clusters}.txt") are read.
RUN = 0
# Smallest and largest number of clusters that are evaluated (both inclusive).
MIN_CLUSTERS = 40
MAX_CLUSTERS = 50

# =============================================================================
#  -------------------------------- FUNCTIONS --------------------------------
# =============================================================================

def cluster2GO(cluster, gene2GO_map=None):
    """Return the set of GO terms annotated to at least one gene of `cluster`.

    Parameters
    ----------
    cluster : iterable of gene identifiers.
    gene2GO_map : optional mapping gene -> set of GO terms. When omitted, the
        module-level `gene2GO` dictionary is used (original behaviour).

    Returns
    -------
    set : union of the GO-term sets of all genes in `cluster`; genes missing
        from the mapping contribute nothing. An empty cluster gives an empty set.
    """
    if gene2GO_map is None:
        gene2GO_map = gene2GO
    # `set().union(*sets)` also works when there are no sets at all, whereas
    # `set.union(*sets)` raises a TypeError for an empty cluster.
    return set().union(*(gene2GO_map.get(gene, set()) for gene in cluster))

def is_annotated_in(gene, GO_set, gene2GO_map=None):
    """Tell whether `gene` is annotated with at least one GO term of `GO_set`.

    Parameters
    ----------
    gene : gene identifier.
    GO_set : iterable of GO terms to test against.
    gene2GO_map : optional mapping gene -> set of GO terms. When omitted, the
        module-level `gene2GO` dictionary is used (original behaviour).

    Returns
    -------
    bool : True if the gene's GO terms and `GO_set` share an element; False
        otherwise, including when the gene is missing from the mapping.
    """
    if gene2GO_map is None:
        gene2GO_map = gene2GO
    return not gene2GO_map.get(gene, set()).isdisjoint(GO_set)

# =============================================================================
#  ---------------------------------- MAIN -----------------------------------
# =============================================================================

def _enrichment_pvalues(GO2genes, cluster_list, population_size):
    """Hypergeometric enrichment p-value of every (GO term, cluster) pair.

    Parameters
    ----------
    GO2genes : pandas Series mapping each GO term to the set of genes annotated with it.
    cluster_list : list of sets of genes, one set per cluster.
    population_size : total number of genes (nodes of the network).

    Returns
    -------
    pandas DataFrame with one row per GO term (index of `GO2genes`) and one
    column per cluster, holding P(X >= k) under the hypergeometric null model.
    """
    cluster_sizes = np.array([len(cluster) for cluster in cluster_list], dtype=int)
    term_sizes = np.array([len(go_genes) for go_genes in GO2genes], dtype=int)
    # overlap[i, j]: number of genes of cluster j annotated with GO term i
    overlap = np.array([[len(go_genes & cluster) for cluster in cluster_list]
                        for go_genes in GO2genes],
                       dtype=int).reshape(len(term_sizes), len(cluster_sizes))

    # Sanity checks: every annotated gene lies in exactly one cluster, and the
    # clusters together contain as many genes as the network has nodes.
    assert (overlap.sum(axis=1) == term_sizes).all(), \
        "per-cluster annotation counts do not add up to the GO term totals"
    assert cluster_sizes.sum() == population_size, \
        "cluster sizes do not add up to the network size"

    # scipy naming: M = population size, n = number of annotated genes,
    # N = number of drawn genes (cluster size).
    # sf(k-1) equals P(X >= k) and avoids the loss of precision of 1 - cdf(k-1)
    # for very small p-values.
    p_values = hypergeom.sf(overlap - 1,
                            M=population_size,
                            n=term_sizes[:, np.newaxis],
                            N=cluster_sizes[np.newaxis, :])
    return pd.DataFrame(p_values, index=GO2genes.index)


def main(network, feature, metric, method, aspect=None):
    """Compute and store GO enrichment p-values for the clusterings of one setup.

    For every number of clusters in [MIN_CLUSTERS, MAX_CLUSTERS] the cluster
    file "{RUN}_{n_clusters}.txt" is read, the p-values of all (GO term,
    cluster) pairs are computed and written as CSV to the p-value directory
    under the same file name.

    Parameters
    ----------
    network, feature, metric, method : str
        Names used to build the clustering and p-value directory paths.
    aspect : str, optional
        GO aspect used in the annotation file name and the p-value directory.
        When omitted, the module-level `aspect` is used (original behaviour).

    Raises
    ------
    ValueError
        If a cluster file does not contain exactly `n_clusters` lines.
    """
    if aspect is None:
        # Backward compatible: earlier callers rely on a module-level `aspect`.
        aspect = globals()["aspect"]

    print(multiprocessing.current_process().name)
    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{network}/{feature}/{metric}/{method}"
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/pvalues/{network}/{feature}/{metric}/{method}/{aspect}"

    G_nx = nx.read_edgelist(f"{YEAST_DIRECTORY}/networks/{network}.txt")
    annotation_df = pd.read_csv(f"{ANNOTATION_DIRECTORY}/GO_{aspect}_systematic_SGD.csv")
    # Keep only annotations of genes that are nodes of the network.
    annotation_df = annotation_df[annotation_df.Systematic_ID.isin(G_nx)]

    # GO term -> set of genes annotated with it
    GO2genes = pd.Series({go_id: set(genes.Systematic_ID)
                            for go_id, genes in annotation_df.groupby('GO_ID')},
                         name='nb_genes')

    for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
        cluster_file = f"{CLUSTER_DIRECTORY}/{RUN}_{n_clusters}.txt"
        # One cluster per line, genes separated by whitespace.
        with open(cluster_file, 'r') as f:
            cluster_list = [set(line.split()) for line in f]
        if len(cluster_list) != n_clusters:
            raise ValueError(f"{cluster_file} contains {len(cluster_list)} clusters, "
                             f"expected {n_clusters}")

        p_values = _enrichment_pvalues(GO2genes, cluster_list, len(G_nx))
        p_values.to_csv(f"{PVALUE_DIRECTORY}/{RUN}_{n_clusters}.txt")

        
from itertools import product
from multiprocessing import Pool


# Global constants

# No trailing slash, so that the derived paths do not contain "//".
DATA_DIRECTORY = "/media/clusterduck123/joe/data"
RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw_data"
YEAST_DIRECTORY = f"{DATA_DIRECTORY}/processed_data/yeast"
ANNOTATION_DIRECTORY = f"{YEAST_DIRECTORY}/annotations"

# At most 96 worker processes, but never more than the machine has CPUs.
N_WORKERS = min(96, os.cpu_count() or 1)

# Input parameters
# The file is read relative to the current working directory and is expected
# to define `networks`, `features`, `metrics`, `methods` and `aspects`, one
# statement per line.
# SECURITY NOTE(review): exec() runs whatever code the file contains -- only
# use it with a trusted parameter file.
with open("input_parameters.py") as f:
    for line in f:
        exec(line.strip())


# Define necessary directories
for network, feature, metric, method, aspect in product(networks, features, metrics, methods, aspects):
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/pvalues/{network}/{feature}/{metric}/{method}/{aspect}"
    # exist_ok=True replaces the racy exists()-then-makedirs() pair.
    os.makedirs(PVALUE_DIRECTORY, exist_ok=True)

# main() reads the module-level `aspect`. Worker processes see the value it
# has when their pool is created (fork start method), so one pool is started
# per aspect; a single pool created after the loop above would only ever
# process the last aspect.
for aspect in aspects:
    with Pool(N_WORKERS) as p:
        p.starmap(main,product(networks, features, metrics, methods))

FileNotFoundError: [Errno 2] No such file or directory: '/usr/lib/python36.zip/input_parameters.py'

# Enrichment

#### Load and parse annotation data

In [3]:
# Parameters used in the paths and file names below.
aspect  = 'BP'
method  = 'kmedoid'
network = 'systematic_CoEx_COEXPRESdb' 

# NOTE(review): `network` names the CoEx network while the graph loaded here is
# the BioGRID PPI network -- confirm which one is intended.
PPI = nx.read_edgelist(f"{NETWORK_DIRECTORY}/systematic_PPI_BioGRID.txt")
# Annotation table; the cells below use its `GO_ID` and `Systematic_ID` columns.
annotation_df = pd.read_csv(f"{ANNOTATION_DIRECTORY}/GO_{aspect}_systematic_BioGRID-SGD.csv")
go_dag = obo_parser.GODag(f"{RAW_DATA_DIRECTORY}/go-basic.obo")

# All nodes of the PPI network and all GO terms present in the annotation table.
gene_population = set(PPI.nodes())
GO_population = set(annotation_df.GO_ID)

/media/clusterduck123/joe/data/raw_data/go-basic.obo: fmt(1.2) rel(2019-10-07) 47,285 GO Terms


#### Define convenient dictionaries

In [4]:
# Conversion dictionaries
# GO term -> set of genes annotated with it (a pandas Series of sets).
GO2genes = pd.Series({go_id: set(genes.Systematic_ID) for go_id, genes in annotation_df.groupby('GO_ID')}, 
                     name='nb_genes')

# gene -> set of GO terms it is annotated with
gene2GO  = {gene : set(go_ids.GO_ID) for gene, go_ids in annotation_df.groupby('Systematic_ID')}
# GO term -> number of genes annotated with it
global_GO_counter = GO2genes.apply(len)

## Here we GO

### Functions

#### Parser functions

In [5]:
def get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, n_clusters = 99):
    """Return the number of clustering runs available for `n_clusters` clusters.

    Cluster files are named "<run>_<n_clusters>[_<suffix>][.<ext>]". The file
    extension and any further "_"-separated fields are ignored, so both
    "3_99.txt" and "3_99_BioGRID.txt" count as run 3 with 99 clusters.

    Parameters
    ----------
    CLUSTER_DIRECTORY : directory containing the cluster files.
    distance : unused; kept so existing callers keep working.
    n_clusters : number of clusters a file must refer to in order to be counted.

    Returns
    -------
    int : highest run index found plus one, or -1 if no file matches.
    """
    wanted = str(n_clusters)
    pre_runs = []
    for name in os.listdir(CLUSTER_DIRECTORY):
        fields = os.path.splitext(name)[0].split('_')
        if len(fields) < 2 or fields[1] != wanted:
            continue
        try:
            pre_runs.append(int(fields[0]))
        except ValueError:
            # Not a cluster file (run index is not a number); ignore it.
            continue
    if pre_runs:
        return max(pre_runs)+1
    else:
        return -1

#### Loop functions

In [6]:
def cluster2GO(cluster, gene2GO_map=None):
    """Return the set of GO terms annotated to at least one gene of `cluster`.

    Parameters
    ----------
    cluster : iterable of gene identifiers.
    gene2GO_map : optional mapping gene -> set of GO terms. When omitted, the
        notebook-level `gene2GO` dictionary is used (original behaviour).

    Returns
    -------
    set : union of the GO-term sets of all genes in `cluster`; genes missing
        from the mapping contribute nothing. An empty cluster gives an empty set.
    """
    if gene2GO_map is None:
        gene2GO_map = gene2GO
    # `set().union(*sets)` also works when there are no sets at all, whereas
    # `set.union(*sets)` raises a TypeError for an empty cluster.
    return set().union(*(gene2GO_map.get(gene, set()) for gene in cluster))

def is_annotated_in(gene, GO_set, gene2GO_map=None):
    """Tell whether `gene` is annotated with at least one GO term of `GO_set`.

    Parameters
    ----------
    gene : gene identifier.
    GO_set : iterable of GO terms to test against.
    gene2GO_map : optional mapping gene -> set of GO terms. When omitted, the
        notebook-level `gene2GO` dictionary is used (original behaviour).

    Returns
    -------
    bool : True if the gene's GO terms and `GO_set` share an element; False
        otherwise, including when the gene is missing from the mapping.
    """
    if gene2GO_map is None:
        gene2GO_map = gene2GO
    return not gene2GO_map.get(gene, set()).isdisjoint(GO_set)

# Test

In [7]:
# Setup used for the single-clustering test below; these names are inserted
# into the clustering and p-value directory paths.
network = 'systematic_PPI_BioGRID'
feature = 'GCV-O+'
metric  = 'canberra'
method  = 'kmedoid'
aspect  = 'BP'

In [8]:
# Directories of the clusterings under test and of the resulting p-values.
CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{network}/{feature}/{metric}/{method}"
PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/p-values/{network}/{feature}/{metric}/{method}/{aspect}"

# The single clustering (run index, number of clusters) used for this test.
run = 3
nb_clusters = 45

# Each line of the cluster file is read as one cluster: a set of
# whitespace-separated gene names.
with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}.txt", 'r') as f:
     cluster_list = [set(line.split()) for line in f]
# gene -> index of the cluster that contains it
cluster_df = pd.Series({gene:cluster_idx 
                            for cluster_idx,cluster in enumerate(cluster_list) 
                            for gene in cluster})

# Entry (i, j): number of genes of cluster j that are annotated with GO term i.
nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
        [ [len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                                           index   = GO2genes.index,
                                           columns = range(nb_clusters))


k = nb_annotated_genes_in_cluster

# K: number of genes annotated with each GO term, repeated in every cluster column.
K = pd.concat([global_GO_counter[GO2genes.index]]*nb_clusters, axis=1)
K.columns = k.columns

# n: size of each cluster, repeated in every GO term row.
n = pd.concat([pd.DataFrame(map(len, cluster_list)).T]*len(GO2genes))
n.index = k.index

# N: number of nodes of the PPI network, in every cell.
N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)

# Sanity checks: the per-cluster counts of each GO term add up to its total
# count, and the cluster sizes add up to the network size.
assert K.eq(k.sum(axis=1), axis=0).all().all()
assert N.eq(n.sum(axis=1), axis=0).all().all()

In [15]:
# Enrichment p-values P(X >= k) computed twice: once from the underlying numpy
# arrays and once from the DataFrames themselves.
# scipy naming: M = population size, n = annotated genes, N = cluster size.
p_values1 = pd.DataFrame(1-hypergeom.cdf(k=k.values-1, M=N.values, N=n.values, n=K.values), index=GO2genes.index)
p_values2 = pd.DataFrame(1-hypergeom.cdf(k=k-1, M=N, N=n, n=K), index=GO2genes.index)

In [19]:
K

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
GO:0000001,22,22,22,22,22,22,22,22,22,22,...,22,22,22,22,22,22,22,22,22,22
GO:0000002,36,36,36,36,36,36,36,36,36,36,...,36,36,36,36,36,36,36,36,36,36
GO:0000003,41,41,41,41,41,41,41,41,41,41,...,41,41,41,41,41,41,41,41,41,41
GO:0000011,16,16,16,16,16,16,16,16,16,16,...,16,16,16,16,16,16,16,16,16,16
GO:0000018,29,29,29,29,29,29,29,29,29,29,...,29,29,29,29,29,29,29,29,29,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:2001252,36,36,36,36,36,36,36,36,36,36,...,36,36,36,36,36,36,36,36,36,36
GO:2001253,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
GO:2001255,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
GO:2001276,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


## GDV

In [13]:
# Name of the feature analysed in this section; used as a sub-directory name.
feature = 'GDV'

# Sorted names obtained from the files in the feature's matrix directory by
# dropping the last "_"-separated token of each file name.
all_distances = sorted('_'.join(filename.split('_')[:-1]) 
                           for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}"))

FileNotFoundError: [Errno 2] No such file or directory: '/media/clusterduck123/joe/data/processed_data/yeast/distance_matrices/GCV-O+'

In [15]:
# Cluster counts to evaluate (the loop below uses range(MIN_CLUSTERS, MAX_CLUSTERS),
# so MAX_CLUSTERS itself is excluded) and the maximum number of runs to process.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100
MAX_RUNS = 10

In [16]:
# Compute and store the GO enrichment p-values of every available clustering.
# A tuple (rather than a set) keeps the processing order reproducible.
for distance in ('GDV_similarity', 'canberra', 'mahalanobis'):

    t1 = time.time()
    print(distance)

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{network}/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/p-values/{network}/{feature}/{distance}/{method}/{aspect}"

    # exist_ok=True replaces the racy exists()-then-makedirs() pair.
    os.makedirs(PVALUE_DIRECTORY, exist_ok=True)

    runs = min(get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS), MAX_RUNS)

    for run in range(runs):
        # NOTE(review): MAX_CLUSTERS itself is excluded here although the run
        # count above is derived from the MAX_CLUSTERS files -- confirm intent.
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            if os.path.exists(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt"):
                continue  # p-values already on disk
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]

            # k[i, j]: number of genes of cluster j annotated with GO term i
            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [[len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                                                       index   = GO2genes.index,
                                                       columns = range(nb_clusters))
            k = nb_annotated_genes_in_cluster

            # K[i, j]: number of genes annotated with GO term i (same in every column)
            K = pd.DataFrame(np.repeat(global_GO_counter[GO2genes.index].values[:, np.newaxis],
                                       nb_clusters, axis=1),
                             index=k.index, columns=k.columns)

            # n[i, j]: size of cluster j (same in every row); built in one step
            # instead of concatenating one single-row frame per GO term.
            n = pd.DataFrame(np.tile([len(cluster) for cluster in cluster_list],
                                     (len(GO2genes), 1)),
                             index=k.index, columns=k.columns)

            # N[i, j]: number of nodes of the PPI network
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)

            assert K.eq(k.sum(axis=1), axis=0).all().all()
            assert N.eq(n.sum(axis=1), axis=0).all().all()

            # scipy nomenclature: M = population size, n = annotated genes,
            # N = cluster size. sf(k-1) equals P(X >= k) and avoids the loss of
            # precision of 1 - cdf(k-1) for very small p-values.
            p_values = pd.DataFrame(hypergeom.sf(k=k.values-1, M=N.values, N=n.values, n=K.values),
                                    index=GO2genes.index)
            p_values.to_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt")
            t2 = time.time()
            print(f'{run}_{nb_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

canberra

mahalanobis

GDV_similarity



# GCV-A

In [10]:
# Name of the feature analysed in this section; used as a sub-directory name.
feature = 'GCV-A'

# Sorted names obtained from the files in the feature's matrix directory by
# dropping the last "_"-separated token of each file name.
all_distances = sorted('_'.join(filename.split('_')[:-1]) 
                           for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}"))

In [11]:
# Cluster counts to evaluate (the loop below uses range(MIN_CLUSTERS, MAX_CLUSTERS),
# so MAX_CLUSTERS itself is excluded) and the maximum number of runs to process.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100
MAX_RUNS = 30

In [12]:
method = 'kmedoid'

# Compute and store the GO enrichment p-values of every available clustering.
# A tuple (rather than a set) keeps the processing order reproducible.
for distance in ('canberra', 'cityblock'):

    t1 = time.time()
    print(distance)

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"

    # exist_ok=True replaces the racy exists()-then-makedirs() pair.
    os.makedirs(PVALUE_DIRECTORY, exist_ok=True)

    runs = min(get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS), MAX_RUNS)

    for run in range(runs):
        # NOTE(review): MAX_CLUSTERS itself is excluded here although the run
        # count above is derived from the MAX_CLUSTERS files -- confirm intent.
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            if os.path.exists(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt"):
                continue  # p-values already on disk
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]

            # k[i, j]: number of genes of cluster j annotated with GO term i
            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [[len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                                                       index   = GO2genes.index,
                                                       columns = range(nb_clusters))
            k = nb_annotated_genes_in_cluster

            # K[i, j]: number of genes annotated with GO term i (same in every column)
            K = pd.DataFrame(np.repeat(global_GO_counter[GO2genes.index].values[:, np.newaxis],
                                       nb_clusters, axis=1),
                             index=k.index, columns=k.columns)

            # n[i, j]: size of cluster j (same in every row); built in one step
            # instead of concatenating one single-row frame per GO term.
            n = pd.DataFrame(np.tile([len(cluster) for cluster in cluster_list],
                                     (len(GO2genes), 1)),
                             index=k.index, columns=k.columns)

            # N[i, j]: number of nodes of the PPI network
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)

            assert K.eq(k.sum(axis=1), axis=0).all().all()
            assert N.eq(n.sum(axis=1), axis=0).all().all()

            # scipy nomenclature: M = population size, n = annotated genes,
            # N = cluster size. sf(k-1) equals P(X >= k) and avoids the loss of
            # precision of 1 - cdf(k-1) for very small p-values.
            p_values = pd.DataFrame(hypergeom.sf(k=k.values-1, M=N.values, N=n.values, n=K.values),
                                    index=GO2genes.index)
            p_values.to_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt")
            t2 = time.time()
            print(f'{run}_{nb_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

canberra

cityblock



## GCV-G

In [13]:
# Name of the feature analysed in this section; used as a sub-directory name.
feature = 'GCV-G'

# Sorted names obtained from the files in the feature's matrix directory by
# dropping the last "_"-separated token of each file name.
all_distances = sorted('_'.join(filename.split('_')[:-1]) 
                           for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}"))

In [14]:
# Cluster counts to evaluate (the loop below uses range(MIN_CLUSTERS, MAX_CLUSTERS),
# so MAX_CLUSTERS itself is excluded) and the maximum number of runs to process.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100
MAX_RUNS = 30

In [15]:
method = 'kmedoid'

# Compute and store the GO enrichment p-values of every available clustering.
# A tuple (rather than a set) keeps the processing order reproducible.
for distance in ('canberra', 'cityblock'):

    t1 = time.time()
    print(distance)

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"

    # exist_ok=True replaces the racy exists()-then-makedirs() pair.
    os.makedirs(PVALUE_DIRECTORY, exist_ok=True)

    runs = min(get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS), MAX_RUNS)

    for run in range(runs):
        # NOTE(review): MAX_CLUSTERS itself is excluded here although the run
        # count above is derived from the MAX_CLUSTERS files -- confirm intent.
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            if os.path.exists(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt"):
                continue  # p-values already on disk
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]

            # k[i, j]: number of genes of cluster j annotated with GO term i
            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [[len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                                                       index   = GO2genes.index,
                                                       columns = range(nb_clusters))
            k = nb_annotated_genes_in_cluster

            # K[i, j]: number of genes annotated with GO term i (same in every column)
            K = pd.DataFrame(np.repeat(global_GO_counter[GO2genes.index].values[:, np.newaxis],
                                       nb_clusters, axis=1),
                             index=k.index, columns=k.columns)

            # n[i, j]: size of cluster j (same in every row); built in one step
            # instead of concatenating one single-row frame per GO term.
            n = pd.DataFrame(np.tile([len(cluster) for cluster in cluster_list],
                                     (len(GO2genes), 1)),
                             index=k.index, columns=k.columns)

            # N[i, j]: number of nodes of the PPI network
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)

            assert K.eq(k.sum(axis=1), axis=0).all().all()
            assert N.eq(n.sum(axis=1), axis=0).all().all()

            # scipy nomenclature: M = population size, n = annotated genes,
            # N = cluster size. sf(k-1) equals P(X >= k) and avoids the loss of
            # precision of 1 - cdf(k-1) for very small p-values.
            p_values = pd.DataFrame(hypergeom.sf(k=k.values-1, M=N.values, N=n.values, n=K.values),
                                    index=GO2genes.index)
            p_values.to_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt")
            t2 = time.time()
            print(f'{run}_{nb_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

canberra

cityblock



# GCV-DG

In [16]:
# Name of the feature analysed in this section; used as a sub-directory name.
feature = 'GCV-DG'

# Sorted names obtained from the files in the feature's matrix directory by
# dropping the last "_"-separated token of each file name.
all_distances = sorted('_'.join(filename.split('_')[:-1]) 
                           for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}"))

In [17]:
# Cluster counts to evaluate (the loop below uses range(MIN_CLUSTERS, MAX_CLUSTERS),
# so MAX_CLUSTERS itself is excluded) and the maximum number of runs to process.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100
MAX_RUNS = 10

In [18]:
method = 'kmedoid'

# Compute and store the GO enrichment p-values of every available clustering.
# A tuple (rather than a set) keeps the processing order reproducible.
for distance in ('canberra', 'cityblock'):

    t1 = time.time()
    print(distance)

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"

    # exist_ok=True replaces the racy exists()-then-makedirs() pair.
    os.makedirs(PVALUE_DIRECTORY, exist_ok=True)

    runs = min(get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS), MAX_RUNS)

    for run in range(runs):
        # NOTE(review): MAX_CLUSTERS itself is excluded here although the run
        # count above is derived from the MAX_CLUSTERS files -- confirm intent.
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            if os.path.exists(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt"):
                continue  # p-values already on disk
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]

            # k[i, j]: number of genes of cluster j annotated with GO term i
            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [[len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                                                       index   = GO2genes.index,
                                                       columns = range(nb_clusters))
            k = nb_annotated_genes_in_cluster

            # K[i, j]: number of genes annotated with GO term i (same in every column)
            K = pd.DataFrame(np.repeat(global_GO_counter[GO2genes.index].values[:, np.newaxis],
                                       nb_clusters, axis=1),
                             index=k.index, columns=k.columns)

            # n[i, j]: size of cluster j (same in every row); built in one step
            # instead of concatenating one single-row frame per GO term.
            n = pd.DataFrame(np.tile([len(cluster) for cluster in cluster_list],
                                     (len(GO2genes), 1)),
                             index=k.index, columns=k.columns)

            # N[i, j]: number of nodes of the PPI network
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)

            # The consistency checks are skipped for these two distances.
            if distance not in {'correlation', 'mahalanobis'}:
                assert K.eq(k.sum(axis=1), axis=0).all().all()
                assert N.eq(n.sum(axis=1), axis=0).all().all()

            # scipy nomenclature: M = population size, n = annotated genes,
            # N = cluster size. sf(k-1) equals P(X >= k) and avoids the loss of
            # precision of 1 - cdf(k-1) for very small p-values.
            p_values = pd.DataFrame(hypergeom.sf(k=k.values-1, M=N.values, N=n.values, n=K.values),
                                    index=GO2genes.index)
            p_values.to_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt")
            t2 = time.time()
            print(f'{run}_{nb_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

canberra

cityblock



# GCV-DA

In [19]:
# Name of the feature analysed in this section; used as a sub-directory name.
feature = 'GCV-DA'

# Sorted names obtained from the files in the feature's matrix directory by
# dropping the last "_"-separated token of each file name.
all_distances = sorted('_'.join(filename.split('_')[:-1]) 
                           for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}"))

In [20]:
# Cluster counts to evaluate (the loop below uses range(MIN_CLUSTERS, MAX_CLUSTERS),
# so MAX_CLUSTERS itself is excluded) and the maximum number of runs to process.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100
MAX_RUNS = 30

In [21]:
method = 'kmedoid'

# Compute and store the GO enrichment p-values of every available clustering.
# A tuple (rather than a set) keeps the processing order reproducible.
for distance in ('canberra', 'cityblock'):

    t1 = time.time()
    print(distance)

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"

    # exist_ok=True replaces the racy exists()-then-makedirs() pair.
    os.makedirs(PVALUE_DIRECTORY, exist_ok=True)

    runs = min(get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS), MAX_RUNS)

    for run in range(runs):
        # NOTE(review): MAX_CLUSTERS itself is excluded here although the run
        # count above is derived from the MAX_CLUSTERS files -- confirm intent.
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            if os.path.exists(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt"):
                continue  # p-values already on disk
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]

            # k[i, j]: number of genes of cluster j annotated with GO term i
            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [[len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                                                       index   = GO2genes.index,
                                                       columns = range(nb_clusters))
            k = nb_annotated_genes_in_cluster

            # K[i, j]: number of genes annotated with GO term i (same in every column)
            K = pd.DataFrame(np.repeat(global_GO_counter[GO2genes.index].values[:, np.newaxis],
                                       nb_clusters, axis=1),
                             index=k.index, columns=k.columns)

            # n[i, j]: size of cluster j (same in every row); built in one step
            # instead of concatenating one single-row frame per GO term.
            n = pd.DataFrame(np.tile([len(cluster) for cluster in cluster_list],
                                     (len(GO2genes), 1)),
                             index=k.index, columns=k.columns)

            # N[i, j]: number of nodes of the PPI network
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)

            assert K.eq(k.sum(axis=1), axis=0).all().all()
            assert N.eq(n.sum(axis=1), axis=0).all().all()

            # scipy nomenclature: M = population size, n = annotated genes,
            # N = cluster size. sf(k-1) equals P(X >= k) and avoids the loss of
            # precision of 1 - cdf(k-1) for very small p-values.
            p_values = pd.DataFrame(hypergeom.sf(k=k.values-1, M=N.values, N=n.values, n=K.values),
                                    index=GO2genes.index)
            p_values.to_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt")
            t2 = time.time()
            print(f'{run}_{nb_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

canberra


  op=op_str, alt_op=unsupported[op_str]


26_99: 148.99sec
cityblock
26_99: 142.00sec


## GCV-all

In [22]:
# Name of the feature analysed in this section; used as a sub-directory name.
feature = 'GCV-all'

# Sorted names obtained from the files in the feature's matrix directory by
# dropping the last "_"-separated token of each file name.
all_distances = sorted('_'.join(filename.split('_')[:-1]) 
                           for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}"))

In [23]:
# Cluster counts to evaluate (the loop below uses range(MIN_CLUSTERS, MAX_CLUSTERS),
# so MAX_CLUSTERS itself is excluded) and the maximum number of runs to process.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100
MAX_RUNS = 5

In [24]:
method = 'kmedoid'

# Compute and store the GO enrichment p-values of every available clustering.
# A tuple (rather than a set) keeps the processing order reproducible.
for distance in ('canberra', 'cityblock'):

    t1 = time.time()
    print(distance)

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"

    # exist_ok=True replaces the racy exists()-then-makedirs() pair.
    os.makedirs(PVALUE_DIRECTORY, exist_ok=True)

    runs = min(get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS), MAX_RUNS)

    for run in range(runs):
        # NOTE(review): MAX_CLUSTERS itself is excluded here although the run
        # count above is derived from the MAX_CLUSTERS files -- confirm intent.
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            if os.path.exists(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt"):
                continue  # p-values already on disk
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]

            # k[i, j]: number of genes of cluster j annotated with GO term i
            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [[len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                                                       index   = GO2genes.index,
                                                       columns = range(nb_clusters))
            k = nb_annotated_genes_in_cluster

            # K[i, j]: number of genes annotated with GO term i (same in every column)
            K = pd.DataFrame(np.repeat(global_GO_counter[GO2genes.index].values[:, np.newaxis],
                                       nb_clusters, axis=1),
                             index=k.index, columns=k.columns)

            # n[i, j]: size of cluster j (same in every row); built in one step
            # instead of concatenating one single-row frame per GO term.
            n = pd.DataFrame(np.tile([len(cluster) for cluster in cluster_list],
                                     (len(GO2genes), 1)),
                             index=k.index, columns=k.columns)

            # N[i, j]: number of nodes of the PPI network
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)

            # The consistency checks are skipped for these two distances.
            if distance not in {'correlation', 'mahalanobis'}:
                assert K.eq(k.sum(axis=1), axis=0).all().all()
                assert N.eq(n.sum(axis=1), axis=0).all().all()

            # scipy nomenclature: M = population size, n = annotated genes,
            # N = cluster size. sf(k-1) equals P(X >= k) and avoids the loss of
            # precision of 1 - cdf(k-1) for very small p-values.
            p_values = pd.DataFrame(hypergeom.sf(k=k.values-1, M=N.values, N=n.values, n=K.values),
                                    index=GO2genes.index)
            p_values.to_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt")
            t2 = time.time()
            print(f'{run}_{nb_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

canberra

cityblock



## GCV-nonredundant

In [25]:
# Name of the feature analysed in this section; used as a sub-directory name.
feature = 'GCV-nonredundant'

# Sorted names obtained from the files in the feature's matrix directory by
# dropping the last "_"-separated token of each file name.
all_distances = sorted('_'.join(filename.split('_')[:-1]) 
                           for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}"))

In [26]:
# Cluster counts to evaluate (the loop below uses range(MIN_CLUSTERS, MAX_CLUSTERS),
# so MAX_CLUSTERS itself is excluded) and the maximum number of runs to process.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100
MAX_RUNS = 5

In [27]:
method = 'kmedoid'

# Compute and store the GO enrichment p-values of every available clustering.
# A tuple (rather than a set) keeps the processing order reproducible.
for distance in ('canberra', 'cityblock'):

    t1 = time.time()
    print(distance)

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"

    # exist_ok=True replaces the racy exists()-then-makedirs() pair.
    os.makedirs(PVALUE_DIRECTORY, exist_ok=True)

    runs = min(get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS), MAX_RUNS)

    for run in range(runs):
        # NOTE(review): MAX_CLUSTERS itself is excluded here although the run
        # count above is derived from the MAX_CLUSTERS files -- confirm intent.
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            if os.path.exists(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt"):
                continue  # p-values already on disk
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]

            # k[i, j]: number of genes of cluster j annotated with GO term i
            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [[len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                                                       index   = GO2genes.index,
                                                       columns = range(nb_clusters))
            k = nb_annotated_genes_in_cluster

            # K[i, j]: number of genes annotated with GO term i (same in every column)
            K = pd.DataFrame(np.repeat(global_GO_counter[GO2genes.index].values[:, np.newaxis],
                                       nb_clusters, axis=1),
                             index=k.index, columns=k.columns)

            # n[i, j]: size of cluster j (same in every row); built in one step
            # instead of concatenating one single-row frame per GO term.
            n = pd.DataFrame(np.tile([len(cluster) for cluster in cluster_list],
                                     (len(GO2genes), 1)),
                             index=k.index, columns=k.columns)

            # N[i, j]: number of nodes of the PPI network
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)

            # The consistency checks are skipped for these two distances.
            if distance not in {'correlation', 'mahalanobis'}:
                assert K.eq(k.sum(axis=1), axis=0).all().all()
                assert N.eq(n.sum(axis=1), axis=0).all().all()

            # scipy nomenclature: M = population size, n = annotated genes,
            # N = cluster size. sf(k-1) equals P(X >= k) and avoids the loss of
            # precision of 1 - cdf(k-1) for very small p-values.
            p_values = pd.DataFrame(hypergeom.sf(k=k.values-1, M=N.values, N=n.values, n=K.values),
                                    index=GO2genes.index)
            p_values.to_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt")
            t2 = time.time()
            print(f'{run}_{nb_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

canberra

cityblock



## GCV-orca

In [28]:
# Name of the feature analysed in this section; used as a sub-directory name.
feature = 'GCV-orca'

# Sorted names obtained from the files in the feature's matrix directory by
# dropping the last "_"-separated token of each file name.
all_distances = sorted('_'.join(filename.split('_')[:-1]) 
                           for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}"))

In [29]:
# Cluster counts to evaluate (the loop below uses range(MIN_CLUSTERS, MAX_CLUSTERS),
# so MAX_CLUSTERS itself is excluded) and the maximum number of runs to process.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100
MAX_RUNS = 10

In [30]:
method = 'kmedoid'

# Compute and store the GO enrichment p-values of every available clustering.
for distance in ('hellinger',):

    t1 = time.time()
    print(distance)

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"

    # exist_ok=True replaces the racy exists()-then-makedirs() pair.
    os.makedirs(PVALUE_DIRECTORY, exist_ok=True)

    runs = min(get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS), MAX_RUNS)

    for run in range(runs):
        # NOTE(review): MAX_CLUSTERS itself is excluded here although the run
        # count above is derived from the MAX_CLUSTERS files -- confirm intent.
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            if os.path.exists(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt"):
                continue  # p-values already on disk
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]

            # k[i, j]: number of genes of cluster j annotated with GO term i
            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [[len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                                                       index   = GO2genes.index,
                                                       columns = range(nb_clusters))
            k = nb_annotated_genes_in_cluster

            # K[i, j]: number of genes annotated with GO term i (same in every column)
            K = pd.DataFrame(np.repeat(global_GO_counter[GO2genes.index].values[:, np.newaxis],
                                       nb_clusters, axis=1),
                             index=k.index, columns=k.columns)

            # n[i, j]: size of cluster j (same in every row); built in one step
            # instead of concatenating one single-row frame per GO term.
            n = pd.DataFrame(np.tile([len(cluster) for cluster in cluster_list],
                                     (len(GO2genes), 1)),
                             index=k.index, columns=k.columns)

            # N[i, j]: number of nodes of the PPI network
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)

            # The consistency checks are skipped for these two distances.
            if distance not in {'correlation', 'mahalanobis'}:
                assert K.eq(k.sum(axis=1), axis=0).all().all()
                assert N.eq(n.sum(axis=1), axis=0).all().all()

            # scipy nomenclature: M = population size, n = annotated genes,
            # N = cluster size. sf(k-1) equals P(X >= k) and avoids the loss of
            # precision of 1 - cdf(k-1) for very small p-values.
            p_values = pd.DataFrame(hypergeom.sf(k=k.values-1, M=N.values, N=n.values, n=K.values),
                                    index=GO2genes.index)
            p_values.to_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt")
            t2 = time.time()
            print(f'{run}_{nb_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

hellinger



## GCV-orca+

In [31]:
# Name of the feature analysed in this section; used as a sub-directory name.
feature = 'GCV-orca+'

# Sorted names obtained from the files in the feature's matrix directory by
# dropping the last "_"-separated token of each file name.
all_distances = sorted('_'.join(filename.split('_')[:-1]) 
                           for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}"))

In [32]:
# Cluster counts to evaluate (the loop below uses range(MIN_CLUSTERS, MAX_CLUSTERS),
# so MAX_CLUSTERS itself is excluded) and the maximum number of runs to process.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100
MAX_RUNS = 10

In [33]:
method = 'kmedoid'

# Compute and store the GO enrichment p-values of every available clustering.
# A tuple (rather than a set) keeps the processing order reproducible.
for distance in ('canberra', 'cityblock'):

    t1 = time.time()
    print(distance)

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/p-values/{feature}/{distance}/{method}/{aspect}"

    # exist_ok=True replaces the racy exists()-then-makedirs() pair.
    os.makedirs(PVALUE_DIRECTORY, exist_ok=True)

    runs = min(get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS), MAX_RUNS)

    for run in range(runs):
        # NOTE(review): MAX_CLUSTERS itself is excluded here although the run
        # count above is derived from the MAX_CLUSTERS files -- confirm intent.
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            if os.path.exists(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt"):
                continue  # p-values already on disk
            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]

            # k[i, j]: number of genes of cluster j annotated with GO term i
            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [[len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                                                       index   = GO2genes.index,
                                                       columns = range(nb_clusters))
            k = nb_annotated_genes_in_cluster

            # K[i, j]: number of genes annotated with GO term i (same in every column)
            K = pd.DataFrame(np.repeat(global_GO_counter[GO2genes.index].values[:, np.newaxis],
                                       nb_clusters, axis=1),
                             index=k.index, columns=k.columns)

            # n[i, j]: size of cluster j (same in every row); built in one step
            # instead of concatenating one single-row frame per GO term.
            n = pd.DataFrame(np.tile([len(cluster) for cluster in cluster_list],
                                     (len(GO2genes), 1)),
                             index=k.index, columns=k.columns)

            # N[i, j]: number of nodes of the PPI network
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)

            # The consistency checks are skipped for these two distances.
            if distance not in {'correlation', 'mahalanobis'}:
                assert K.eq(k.sum(axis=1), axis=0).all().all()
                assert N.eq(n.sum(axis=1), axis=0).all().all()

            # scipy nomenclature: M = population size, n = annotated genes,
            # N = cluster size. sf(k-1) equals P(X >= k) and avoids the loss of
            # precision of 1 - cdf(k-1) for very small p-values.
            p_values = pd.DataFrame(hypergeom.sf(k=k.values-1, M=N.values, N=n.values, n=K.values),
                                    index=GO2genes.index)
            p_values.to_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt")
            t2 = time.time()
            print(f'{run}_{nb_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

canberra

cityblock

