In [1]:
from itertools import islice, combinations, product
from pyclustering.cluster.kmedoids import kmedoids
from collections import defaultdict
from scipy.stats import hypergeom
from collections import Counter
from goatools import obo_parser
from functools import partial

import os
import time
import random
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
# Root of the data tree. The historical absolute path stays the default; set the
# DATA_DIRECTORY environment variable to run the notebook on another machine
# without editing this cell.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY", "/media/clusterduck123/joe/data")

# Everything below is derived from DATA_DIRECTORY.
RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw-data"
YEAST_DIRECTORY = f"{DATA_DIRECTORY}/processed-data/organisms/yeast"
NETWORK_DIRECTORY = f"{YEAST_DIRECTORY}/networks"
ANNOTATION_DIRECTORY = f"{YEAST_DIRECTORY}/annotations"

# Enrichment

#### Load and parse annotation data

In [3]:
# GO aspect under study; it selects which annotation file is read below.
aspect = 'BP'

# Protein-protein interaction network, read from an edge list.
PPI = nx.read_edgelist(
    f"{NETWORK_DIRECTORY}/PPI_BioGRID.txt"
)
# Annotation table; the columns GO_ID and Systematic_ID are used in this notebook.
annotation_df = pd.read_csv(
    f"{ANNOTATION_DIRECTORY}/GO_{aspect}_BioGRID-SGD.csv"
)
# Gene Ontology graph parsed from the OBO file.
go_dag = obo_parser.GODag(
    f"{RAW_DATA_DIRECTORY}/go-basic.obo"
)

# Background sets: every node of the network, every GO term of the table.
gene_population = set(PPI)
GO_population = set(annotation_df['GO_ID'])

/media/clusterduck123/joe/data/raw-data/go-basic.obo: fmt(1.2) rel(2019-10-07) 47,285 GO Terms


#### Define convenient dictionaries

In [4]:
# Conversion dictionaries
GO2genes = pd.Series({go_id: set(genes.Systematic_ID) for go_id, genes in annotation_df.groupby('GO_ID')}, 
                     name='nb_genes')

gene2GO  = {gene : set(go_ids.GO_ID) for gene, go_ids in annotation_df.groupby('Systematic_ID')}
global_GO_counter = GO2genes.apply(len)

## Here we GO

### Functions

#### Parser functions

In [5]:
def get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, n_clusters = 99):
    """Count the clustering runs available for a given number of clusters.

    Entries of ``CLUSTER_DIRECTORY`` are expected to be named
    ``<run>_<n_clusters>_<suffix>`` (e.g. ``3_99_BioGRID.txt``). Entries that
    do not follow this pattern, such as the ``.ipynb_checkpoints`` folder
    Jupyter creates, are ignored instead of raising ``ValueError``.

    Parameters
    ----------
    CLUSTER_DIRECTORY : str
        Directory holding the clustering files.
    distance : object
        Unused; kept so existing callers keep working.
    n_clusters : int, default 99
        Only files whose second name component equals ``str(n_clusters)``
        are taken into account.

    Returns
    -------
    int
        Highest run index found plus one, or ``-1`` when no matching file
        exists (``range(-1)`` is empty, so callers simply do nothing).
    """
    wanted = str(n_clusters)
    pre_runs = []
    for name in os.listdir(CLUSTER_DIRECTORY):
        parts = name.split('_')
        # Anything that is not <run>_<n_clusters>_<suffix> is not a clustering file.
        if len(parts) != 3 or parts[1] != wanted:
            continue
        try:
            pre_runs.append(int(parts[0]))
        except ValueError:
            # Run component is not an integer: not one of our files either.
            continue
    if pre_runs:
        return max(pre_runs)+1
    return -1

#### Loop functions

In [6]:
def cluster2GO(cluster):
    """Return the set of all GO terms annotating at least one gene of ``cluster``.

    Genes without an entry in ``gene2GO`` contribute nothing. An empty
    cluster yields an empty set (the unbound ``set.union(*())`` form would
    raise ``TypeError`` in that case).
    """
    return set().union(*(gene2GO.get(gene, set()) for gene in cluster))

def is_annotated_in(gene, GO_set):
    """Tell whether ``gene`` carries at least one GO term contained in ``GO_set``.

    A gene that has no entry in ``gene2GO`` is treated as having no annotations.
    """
    annotations = gene2GO.get(gene, set())
    shares_no_term = annotations.isdisjoint(GO_set)
    return not shares_no_term

## GDV

In [7]:
# Cluster counts to scan (upper bound is exclusive in the loop below) and the
# maximum number of runs to process.
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100
MAX_RUNS = 30

# Distance names: the part of each distance-matrix file name before the first '_'.
all_distances = [
    filename.partition('_')[0]
    for filename in os.listdir(f"{YEAST_DIRECTORY}/distance-matrices/GDV")
]

In [8]:
method = 'kmedoid'

# For every distance, run and number of clusters: compute, for each
# (GO term, cluster) pair, the hypergeometric enrichment p-value and store the
# resulting table as CSV.
for distance in ['normalized2-linf',
                 'normalized2-l2',
                 'normalized2-l1']:

    t1 = time.time()
    print(distance)

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GDV/{distance}/{method}"
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/p-values/GDV/{distance}/{method}/{aspect}"

    # exist_ok replaces the separate existence check (no check-then-create race).
    os.makedirs(PVALUE_DIRECTORY, exist_ok=True)

    # get_number_of_pre_runs returns -1 when nothing is found -> empty range below.
    runs = min(get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS), MAX_RUNS)

    for run in range(runs):
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            pvalue_file = f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt"
            # Results are cached on disk: existing files are never recomputed
            # (delete them to refresh p-values written by an older version).
            if os.path.exists(pvalue_file):
                continue

            # One cluster per line, gene names separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]

            # Rows: GO terms, columns: clusters, values: number of genes of the
            # cluster annotated with the GO term.
            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [[len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                index   = GO2genes.index,
                columns = range(nb_clusters))

            # k: annotated genes in the cluster (observed successes)
            k = nb_annotated_genes_in_cluster

            # K: genes annotated with the GO term overall (repeated per cluster)
            K = pd.concat([global_GO_counter[GO2genes.index]]*nb_clusters, axis=1)
            K.columns = k.columns

            # n: cluster sizes (repeated per GO term)
            n = pd.concat([pd.DataFrame(map(len, cluster_list)).T]*len(GO2genes))
            n.index = k.index

            # N: size of the whole network
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)

            # Sanity checks: the per-cluster counts of each GO term add up to its
            # global count, and the cluster sizes add up to the network size.
            assert K.eq(k.sum(axis=1), axis=0).all().all()
            assert N.eq(n.sum(axis=1), axis=0).all().all()

            # scipy's nomenclature differs from ours: M = population size,
            # n = number of successes in the population, N = number of draws.
            # P(X >= k) = sf(k-1); sf avoids the cancellation error of 1 - cdf
            # for very small p-values.
            p_values = pd.DataFrame(hypergeom.sf(k=k-1, M=N, N=n, n=K), index=GO2genes.index)
            p_values.to_csv(pvalue_file)
            t2 = time.time()
            print(f'{run}_{nb_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

normalized2-linf


  op=op_str, alt_op=unsupported[op_str]


1_99: 317.60sec
normalized2-l2
1_99: 292.64sec
normalized2-l1
1_99: 276.22sec


## GCV-A

In [7]:
# Cluster counts to scan (upper bound is exclusive in the loop below) and the
# maximum number of runs to process.
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100
MAX_RUNS = 30

# Distance names: the part of each distance-matrix file name before the first '_'.
all_distances = [
    filename.partition('_')[0]
    for filename in os.listdir(f"{YEAST_DIRECTORY}/distance-matrices/GCV-A")
]

In [8]:
method = 'kmedoid'

# For every distance, run and number of clusters: compute, for each
# (GO term, cluster) pair, the hypergeometric enrichment p-value and store the
# resulting table as CSV.
for distance in ['normalized1-l1',
                 'normalized1-l2',
                 'normalized1-linf']:

    t1 = time.time()
    print(distance)

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-A/{distance}/{method}"
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/p-values/GCV-A/{distance}/{method}/{aspect}"

    # exist_ok replaces the separate existence check (no check-then-create race).
    os.makedirs(PVALUE_DIRECTORY, exist_ok=True)

    # get_number_of_pre_runs returns -1 when nothing is found -> empty range below.
    runs = min(get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS), MAX_RUNS)

    for run in range(runs):
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            pvalue_file = f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt"
            # Results are cached on disk: existing files are never recomputed
            # (delete them to refresh p-values written by an older version).
            if os.path.exists(pvalue_file):
                continue

            # One cluster per line, gene names separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]

            # Rows: GO terms, columns: clusters, values: number of genes of the
            # cluster annotated with the GO term.
            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [[len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                index   = GO2genes.index,
                columns = range(nb_clusters))

            # k: annotated genes in the cluster (observed successes)
            k = nb_annotated_genes_in_cluster

            # K: genes annotated with the GO term overall (repeated per cluster)
            K = pd.concat([global_GO_counter[GO2genes.index]]*nb_clusters, axis=1)
            K.columns = k.columns

            # n: cluster sizes (repeated per GO term)
            n = pd.concat([pd.DataFrame(map(len, cluster_list)).T]*len(GO2genes))
            n.index = k.index

            # N: size of the whole network
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)

            # Sanity checks: the per-cluster counts of each GO term add up to its
            # global count, and the cluster sizes add up to the network size.
            assert K.eq(k.sum(axis=1), axis=0).all().all()
            assert N.eq(n.sum(axis=1), axis=0).all().all()

            # scipy's nomenclature differs from ours: M = population size,
            # n = number of successes in the population, N = number of draws.
            # P(X >= k) = sf(k-1); sf avoids the cancellation error of 1 - cdf
            # for very small p-values.
            p_values = pd.DataFrame(hypergeom.sf(k=k-1, M=N, N=n, n=K), index=GO2genes.index)
            p_values.to_csv(pvalue_file)
            t2 = time.time()
            print(f'{run}_{nb_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

normalized1-l1


  op=op_str, alt_op=unsupported[op_str]


29_99: 2896.61sec
normalized1-l2
29_99: 2867.86sec
normalized1-linf
29_99: 2882.94sec


## GCV-G

In [7]:
# Cluster counts to scan (upper bound is exclusive in the loop below) and the
# maximum number of runs to process.
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100
MAX_RUNS = 30

# Distance names: the part of each distance-matrix file name before the first '_'.
all_distances = [
    filename.partition('_')[0]
    for filename in os.listdir(f"{YEAST_DIRECTORY}/distance-matrices/GCV-G")
]

In [8]:
method = 'kmedoid'

# For every distance, run and number of clusters: compute, for each
# (GO term, cluster) pair, the hypergeometric enrichment p-value and store the
# resulting table as CSV.
for distance in ['normalized1-l1',
                 'normalized1-l2',
                 'normalized1-linf']:

    t1 = time.time()
    print(distance)

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-G/{distance}/{method}"
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/p-values/GCV-G/{distance}/{method}/{aspect}"

    # exist_ok replaces the separate existence check (no check-then-create race).
    os.makedirs(PVALUE_DIRECTORY, exist_ok=True)

    # get_number_of_pre_runs returns -1 when nothing is found -> empty range below.
    runs = min(get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS), MAX_RUNS)

    for run in range(runs):
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            pvalue_file = f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt"
            # Results are cached on disk: existing files are never recomputed
            # (delete them to refresh p-values written by an older version).
            if os.path.exists(pvalue_file):
                continue

            # One cluster per line, gene names separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]

            # Rows: GO terms, columns: clusters, values: number of genes of the
            # cluster annotated with the GO term.
            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [[len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                index   = GO2genes.index,
                columns = range(nb_clusters))

            # k: annotated genes in the cluster (observed successes)
            k = nb_annotated_genes_in_cluster

            # K: genes annotated with the GO term overall (repeated per cluster)
            K = pd.concat([global_GO_counter[GO2genes.index]]*nb_clusters, axis=1)
            K.columns = k.columns

            # n: cluster sizes (repeated per GO term)
            n = pd.concat([pd.DataFrame(map(len, cluster_list)).T]*len(GO2genes))
            n.index = k.index

            # N: size of the whole network
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)

            # Sanity checks: the per-cluster counts of each GO term add up to its
            # global count, and the cluster sizes add up to the network size.
            assert K.eq(k.sum(axis=1), axis=0).all().all()
            assert N.eq(n.sum(axis=1), axis=0).all().all()

            # scipy's nomenclature differs from ours: M = population size,
            # n = number of successes in the population, N = number of draws.
            # P(X >= k) = sf(k-1); sf avoids the cancellation error of 1 - cdf
            # for very small p-values.
            p_values = pd.DataFrame(hypergeom.sf(k=k-1, M=N, N=n, n=K), index=GO2genes.index)
            p_values.to_csv(pvalue_file)
            t2 = time.time()
            print(f'{run}_{nb_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

normalized1-l1
10_2: 0.11sec

  op=op_str, alt_op=unsupported[op_str]


29_99: 657.01sec
normalized1-l2
29_99: 669.60sec
normalized1-linf
29_99: 677.82sec


## GCV-DG

In [7]:
# Cluster counts to scan (upper bound is exclusive in the loop below) and the
# maximum number of runs to process.
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100
MAX_RUNS = 30

# Distance names: the part of each distance-matrix file name before the first '_'.
all_distances = [
    filename.partition('_')[0]
    for filename in os.listdir(f"{YEAST_DIRECTORY}/distance-matrices/GCV-DG")
]

In [8]:
method = 'kmedoid'

# For every distance, run and number of clusters: compute, for each
# (GO term, cluster) pair, the hypergeometric enrichment p-value and store the
# resulting table as CSV.
for distance in ['normalized1-l1',
                 'normalized1-l2',
                 'normalized1-linf']:

    t1 = time.time()
    print(distance)

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-DG/{distance}/{method}"
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/p-values/GCV-DG/{distance}/{method}/{aspect}"

    # exist_ok replaces the separate existence check (no check-then-create race).
    os.makedirs(PVALUE_DIRECTORY, exist_ok=True)

    # get_number_of_pre_runs returns -1 when nothing is found -> empty range below.
    runs = min(get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS), MAX_RUNS)

    for run in range(runs):
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            pvalue_file = f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt"
            # Results are cached on disk: existing files are never recomputed
            # (delete them to refresh p-values written by an older version).
            if os.path.exists(pvalue_file):
                continue

            # One cluster per line, gene names separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]

            # Rows: GO terms, columns: clusters, values: number of genes of the
            # cluster annotated with the GO term.
            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [[len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                index   = GO2genes.index,
                columns = range(nb_clusters))

            # k: annotated genes in the cluster (observed successes)
            k = nb_annotated_genes_in_cluster

            # K: genes annotated with the GO term overall (repeated per cluster)
            K = pd.concat([global_GO_counter[GO2genes.index]]*nb_clusters, axis=1)
            K.columns = k.columns

            # n: cluster sizes (repeated per GO term)
            n = pd.concat([pd.DataFrame(map(len, cluster_list)).T]*len(GO2genes))
            n.index = k.index

            # N: size of the whole network
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)

            # Sanity checks: the per-cluster counts of each GO term add up to its
            # global count, and the cluster sizes add up to the network size.
            assert K.eq(k.sum(axis=1), axis=0).all().all()
            assert N.eq(n.sum(axis=1), axis=0).all().all()

            # scipy's nomenclature differs from ours: M = population size,
            # n = number of successes in the population, N = number of draws.
            # P(X >= k) = sf(k-1); sf avoids the cancellation error of 1 - cdf
            # for very small p-values.
            p_values = pd.DataFrame(hypergeom.sf(k=k-1, M=N, N=n, n=K), index=GO2genes.index)
            p_values.to_csv(pvalue_file)
            t2 = time.time()
            print(f'{run}_{nb_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

normalized1-l1


  op=op_str, alt_op=unsupported[op_str]


29_99: 4236.06sec
normalized1-l2
29_99: 4358.73sec
normalized1-linf
29_99: 4366.46sec


## GCV-AD

In [7]:
# Cluster counts to scan (upper bound is exclusive in the loop below) and the
# maximum number of runs to process.
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100
MAX_RUNS = 15

# Distance names: the part of each distance-matrix file name before the first '_'.
all_distances = [
    filename.partition('_')[0]
    for filename in os.listdir(f"{YEAST_DIRECTORY}/distance-matrices/GCV-AD")
]

In [8]:
method = 'kmedoid'

# For every distance, run and number of clusters: compute, for each
# (GO term, cluster) pair, the hypergeometric enrichment p-value and store the
# resulting table as CSV.
for distance in ['normalized1-l1',
                 'normalized1-l2',
                 'normalized1-linf']:

    t1 = time.time()
    print(distance)

    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-AD/{distance}/{method}"
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/p-values/GCV-AD/{distance}/{method}/{aspect}"

    # exist_ok replaces the separate existence check (no check-then-create race).
    os.makedirs(PVALUE_DIRECTORY, exist_ok=True)

    # get_number_of_pre_runs returns -1 when nothing is found -> empty range below.
    runs = min(get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS), MAX_RUNS)

    for run in range(runs):
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            pvalue_file = f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt"
            # Results are cached on disk: existing files are never recomputed
            # (delete them to refresh p-values written by an older version).
            if os.path.exists(pvalue_file):
                continue

            # One cluster per line, gene names separated by whitespace.
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]

            # Rows: GO terms, columns: clusters, values: number of genes of the
            # cluster annotated with the GO term.
            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [[len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                index   = GO2genes.index,
                columns = range(nb_clusters))

            # k: annotated genes in the cluster (observed successes)
            k = nb_annotated_genes_in_cluster

            # K: genes annotated with the GO term overall (repeated per cluster)
            K = pd.concat([global_GO_counter[GO2genes.index]]*nb_clusters, axis=1)
            K.columns = k.columns

            # n: cluster sizes (repeated per GO term)
            n = pd.concat([pd.DataFrame(map(len, cluster_list)).T]*len(GO2genes))
            n.index = k.index

            # N: size of the whole network
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)

            # Sanity checks: the per-cluster counts of each GO term add up to its
            # global count, and the cluster sizes add up to the network size.
            assert K.eq(k.sum(axis=1), axis=0).all().all()
            assert N.eq(n.sum(axis=1), axis=0).all().all()

            # scipy's nomenclature differs from ours: M = population size,
            # n = number of successes in the population, N = number of draws.
            # P(X >= k) = sf(k-1); sf avoids the cancellation error of 1 - cdf
            # for very small p-values.
            p_values = pd.DataFrame(hypergeom.sf(k=k-1, M=N, N=n, n=K), index=GO2genes.index)
            p_values.to_csv(pvalue_file)
            t2 = time.time()
            print(f'{run}_{nb_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

normalized1-l1
0_2: 0.10sec

  op=op_str, alt_op=unsupported[op_str]


14_99: 524.05sec
normalized1-l2
2_95: 101.81sec

KeyboardInterrupt: 