In [1]:
from itertools import islice, combinations, product
from pyclustering.cluster.kmedoids import kmedoids
from collections import defaultdict
from scipy.stats import hypergeom
from collections import Counter
from goatools import obo_parser
from functools import partial

import os
import time
import random
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
DATA_DIRECTORY = "/home/clusterduck123/Desktop/git/supplements/data"
RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw-data"
HUMAN_DIRECTORY = f"{DATA_DIRECTORY}/processed-data/organisms/human"
NETWORK_DIRECTORY = f"{HUMAN_DIRECTORY}/networks"
ANNOTATION_DIRECTORY = f"{HUMAN_DIRECTORY}/annotations"

# Enrichement

#### Load and parse annotation data

In [4]:
namespace = 'BP'
PPI = nx.read_edgelist(f"{NETWORK_DIRECTORY}/PPI_BioGRID.txt")
annotation_df = pd.read_csv(f"{ANNOTATION_DIRECTORY}/GO_{namespace}_BioGRID-EBI.csv")
go_dag = obo_parser.GODag(f"{RAW_DATA_DIRECTORY}/go-basic.obo")

gene_population = set(PPI.nodes())
GO_population = set(annotation_df.GO_ID)

/home/clusterduck123/Desktop/git/supplements/data/raw-data/go-basic.obo: fmt(1.2) rel(2019-10-07) 47,285 GO Terms


#### Define convenient dictionaries

In [8]:
# Conversion dictionaries
GO2genes = pd.Series({go_id: set(genes.DB_Object_Symbol) for go_id, genes in annotation_df.groupby('GO_ID')}, 
                     name='nb_genes')

gene2GO  = {gene : set(go_ids.GO_ID) for gene, go_ids in annotation_df.groupby('DB_Object_Symbol')}
global_GO_counter = GO2genes.apply(len)

## Here we GO

### Functions

#### Parser fuctions

In [9]:
def get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, n_clusters = 99):
    splitted_file_names = [name.split('_') for name in os.listdir(CLUSTER_DIRECTORY)]
    pre_runs = [int(run) for run, ncluster, db_txt in splitted_file_names if ncluster == str(n_clusters)]
    if pre_runs:
        return max(pre_runs)+1
    else:
        return -1

#### Loop functions

In [10]:
def cluster2GO(cluster):
    return set.union(*(gene2GO.get(gene, set()) for gene in cluster))

def is_annotated_in(gene, GO_set):
    return not gene2GO.get(gene,set()).isdisjoint(GO_set)

### GDV

In [15]:
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100
MAX_RUNS = 9

all_distances = [filename.split('_')[0] for filename in os.listdir(f"{HUMAN_DIRECTORY}/distance-matrices/GDV")]

In [None]:
method = 'kmedoid'

for distance in ['mahalanobis', 'GDV-similarity', 'normalized1-l2', 'normalized1-l1', 'normalized1-linf']:
    
    t1 = time.time()
    print(distance)
    
    CLUSTER_DIRECTORY = f"{HUMAN_DIRECTORY}/clusterings/GDV/{distance}/{method}"
    PVALUE_DIRECTORY  = f"{HUMAN_DIRECTORY}/enrichments/GDV/{distance}/{method}/{namespace}"
    
    if not os.path.exists(PVALUE_DIRECTORY):
        os.makedirs(PVALUE_DIRECTORY)
    
    runs = min(get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS), MAX_RUNS)

    for run in range(runs):
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                 cluster_list = [set(line.split()) for line in f]
            cluster_df = pd.Series({gene:cluster_idx 
                                        for cluster_idx,cluster in enumerate(cluster_list) 
                                        for gene in cluster})

            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [ [len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                                                       index   = GO2genes.index,
                                                       columns = range(nb_clusters))

            
            k = nb_annotated_genes_in_cluster
            
            K = pd.concat([global_GO_counter[GO2genes.index]]*nb_clusters, axis=1)
            K.columns = k.columns
            
            n = pd.concat([pd.DataFrame(map(len, cluster_list)).T]*len(GO2genes))
            n.index = k.index
            
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)
            
            assert K.eq(k.sum(axis=1), axis=0).all().all()
            assert N.eq(n.sum(axis=1), axis=0).all().all()
            
            # scipy has a really messed up nomeclature... 
            p_values = pd.DataFrame(1-hypergeom.cdf(k=k-1, M=N, N=n, n=K), index=GO2genes.index)
            p_values.to_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt")
            t2 = time.time()
            print(f'{run}_{nb_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

mahalanobis
1_66: 592.38sec

### GCV-A

In [24]:
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100
MAX_RUNS = 1

all_distances = [filename.split('_')[0] for filename in os.listdir(f"{YEAST_DIRECTORY}/distance-matrices/GCV-A")]

In [25]:
method = 'kmedoid'

for distance in ['all2_normalized1-l1']:
    
    t1 = time.time()
    print(distance)
    
    CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-A/{distance}/{method}"
    PVALUE_DIRECTORY  = f"{YEAST_DIRECTORY}/enrichments/GCV-A/{distance}/{method}/{namespace}"
    
    if not os.path.exists(PVALUE_DIRECTORY):
        os.makedirs(PVALUE_DIRECTORY)
    
    runs = min(get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS), MAX_RUNS)

    for run in range(runs):
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            with open(f"{CLUSTER_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt", 'r') as f:
                 cluster_list = [set(line.split()) for line in f]
            cluster_df = pd.Series({gene:cluster_idx 
                                        for cluster_idx,cluster in enumerate(cluster_list) 
                                        for gene in cluster})

            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [ [len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                                                       index   = GO2genes.index,
                                                       columns = range(nb_clusters))

            
            k = nb_annotated_genes_in_cluster
            
            K = pd.concat([global_GO_counter[GO2genes.index]]*nb_clusters, axis=1)
            K.columns = k.columns
            
            n = pd.concat([pd.DataFrame(map(len, cluster_list)).T]*len(GO2genes))
            n.index = k.index
            
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)
            
            assert K.eq(k.sum(axis=1), axis=0).all().all()
            assert N.eq(n.sum(axis=1), axis=0).all().all()
            
            # scipy has a really messed up nomeclature... 
            p_values = pd.DataFrame(1-hypergeom.cdf(k=k-1, M=N, N=n, n=K), index=GO2genes.index)
            p_values.to_csv(f"{PVALUE_DIRECTORY}/{run}_{nb_clusters}_BioGRID.txt")
            t2 = time.time()
            print(f'{run}_{nb_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

all2_normalized1-l1
0_99: 33.73sec
