In [1]:
from itertools import islice, combinations, product
from pyclustering.cluster.kmedoids import kmedoids
from collections import defaultdict
from scipy.stats import hypergeom
from collections import Counter
from goatools import obo_parser
from functools import partial

import os
import time
import graco
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
pd.set_option("display.max_columns", 50)

# Root folders. The defaults are the original author's local checkout; set the
# environment variables below to run the notebook on another machine without
# editing this cell.
DATA_DIRECTORY = os.environ.get("SUPPLEMENTS_DATA_DIRECTORY",
                                "/home/clusterduck123/Desktop/git/supplements/data")
CPP_DIRECTORY = os.environ.get("GRACO_CPP_DIRECTORY",
                               "/home/clusterduck123/Desktop/git/graco/graco/cpp")

# Sub-folders of the data root used throughout the notebook.
RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw_data"
PPI_DIRECTORY = f"{DATA_DIRECTORY}/PPI"
ANNOTATIONS_DIRECTORY = f"{DATA_DIRECTORY}/annotations"
MATRIX_DIRECTORY = f"{DATA_DIRECTORY}/matrix"
CLUSTERS_DIRECTORY = f"{DATA_DIRECTORY}/clusters"

# Enrichment

#### Set parameters

In [3]:
# Annotation namespace; it selects the annotation file and the output folder.
namespace = 'CC'

# Bounds on the number of annotation rows a GO term may have to be kept.
lb_GO, ub_GO = 5, 500

# Bounds on the level of a GO term in the GO DAG to be kept.
min_lvl, max_lvl = 0, 100

#### Load and parse annotation data

In [4]:
# Protein-protein interaction network; its nodes form the gene population.
PPI = nx.read_edgelist(f"{PPI_DIRECTORY}/BioGRID_sc.txt")

# Gene <-> GO term annotation table for the selected namespace.
annotation_df = pd.read_csv(f"{ANNOTATIONS_DIRECTORY}/BioGRID-SGD_{namespace}_sc.csv")

# GO DAG, used below to look up the level of each GO term.
go_dag = obo_parser.GODag(f"{RAW_DATA_DIRECTORY}/go-basic.obo")

gene_population = set(PPI.nodes())

# Count the annotation rows of every GO term in one pass instead of
# re-filtering the whole frame once per term.
GO_annotation_counts = annotation_df.GO_ID.value_counts()
GO_population = {go_id for go_id, n_annotations in GO_annotation_counts.items()
                           if (lb_GO <= n_annotations <= ub_GO and
                               min_lvl <= go_dag[go_id].level <= max_lvl)}

# Keep only the annotations of the GO terms that passed the filters.
annotation_df = annotation_df[annotation_df.GO_ID.isin(GO_population)]

/home/clusterduck123/Desktop/git/supplements/data/raw_data/go-basic.obo: fmt(1.2) rel(2019-10-07) 47,285 GO Terms


#### Define convenient dictionaries

In [5]:
# Conversion dictionaries
# Conversion dictionaries
# GO term -> set of genes annotated with it (entries follow the groupby order).
GO2genes = pd.Series({go_id: set(genes.Systematic_ID) for go_id, genes in annotation_df.groupby('GO_ID')}, 
                     name='gene_sets')
# Positional list of GO terms. It is derived from GO2genes so that position i
# refers to the same GO term in GO_index, GO2genes and global_GO_counter
# (iterating the GO_population set gives an unrelated order).
GO_index = pd.Series(GO2genes.index, name='GO-terms')
# Gene -> set of GO terms it is annotated with.
gene2GO  = {gene : set(go_ids.GO_ID)        for gene, go_ids in annotation_df.groupby('Systematic_ID')}
# Number of annotated genes per GO term, in GO2genes order.
global_GO_counter = GO2genes.apply(len)

## Here we GO

### Functions

#### Parser functions

In [6]:
def get_number_of_max_runs(GV, distance, n_clusters):
    """Return the highest run id that has a cluster file for `n_clusters`.

    Cluster files live in ``{CLUSTERS_DIRECTORY}/{GV}/{distance}`` and are
    named ``<run>_<species>_<db>_<n_clusters>.<ext>``. Directory entries that
    do not follow this pattern (hidden files, checkpoints, ...) are skipped.

    Parameters
    ----------
    GV : str
        Name of the sub-folder below CLUSTERS_DIRECTORY.
    distance : str
        Name of the sub-folder below `GV`.
    n_clusters : int
        Number of clusters the file name has to carry.

    Returns
    -------
    int
        Largest run id among the matching files.

    Raises
    ------
    ValueError
        If no file for `n_clusters` exists in the directory.
    """
    directory = f"{CLUSTERS_DIRECTORY}/{GV}/{distance}"
    run_ids = []
    for filename in os.listdir(directory):
        parts = filename.split('_')
        if len(parts) != 4:
            continue                      # not a cluster file
        run_txt, _species, _db, ncluster_txt = parts
        try:
            run = int(run_txt)
            file_n_clusters = int(ncluster_txt.split('.')[0])
        except ValueError:
            continue                      # non-numeric run id or cluster count
        if file_n_clusters == n_clusters:
            run_ids.append(run)

    if not run_ids:
        raise ValueError(f"no cluster file for {n_clusters} clusters in {directory}")
    return max(run_ids)

#### Loop functions

In [7]:
def gene_enriched_in_cluster(gene, cluster, enrichment):
    """Tell whether `gene` carries at least one GO term selected for `cluster`.

    Parameters
    ----------
    gene : hashable
        Key into `gene2GO`; genes without an entry count as unannotated.
    cluster : hashable
        Key into `enrichment`.
    enrichment : mapping
        ``enrichment[cluster]`` is used to index `GO_index`
        (presumably a boolean mask over the GO terms - TODO confirm with callers).

    Returns
    -------
    bool
        True if the gene's GO terms and the selected GO terms intersect.
    """
    # The lookup dictionary defined in this notebook is `gene2GO`; the former
    # `gene2GOs` does not exist and raised a NameError on every call.
    enriched_terms = set(GO_index[enrichment[cluster]])
    return bool(gene2GO.get(gene, set()) & enriched_terms)

def cluster2GO(cluster):
    """Return the union of the GO terms of all genes in `cluster`.

    Genes without an entry in `gene2GO` contribute nothing. An empty cluster
    yields an empty set (``set.union()`` without arguments would raise a
    TypeError, hence the explicit ``set()`` receiver).
    """
    return set().union(*(gene2GO.get(gene, set()) for gene in cluster))

def is_annotated_in(gene, GO_set):
    """Tell whether `gene` is annotated with at least one GO term of `GO_set`.

    Genes without an entry in `gene2GO` are treated as having no annotation.
    """
    annotated_terms = gene2GO.get(gene, set())
    if annotated_terms.isdisjoint(GO_set):
        return False
    return True

#### Rest

In [8]:
def get_enrichments(alpha, p_values, cluster_list):
    """Flag the (GO term, cluster) pairs that stay significant after correction.

    Only the p-values of GO terms returned by ``cluster2GO`` for the respective
    cluster enter the multiple-testing correction. The k-th smallest of those
    m p-values is compared with the critical value ``k / (m * c) * alpha``,
    where ``c = ln(m) + gamma + 1/(2m)`` approximates the m-th harmonic number
    (the Benjamini-Yekutieli critical values).

    The sorted p-values are scanned upwards and the scan stops at the first
    one that exceeds its critical value, as the previous implementation did.
    NOTE(review): the textbook step-up rule would use the *last* p-value below
    its critical value instead; the scan used here is never more permissive.

    Fixes over the previous implementation:
    * no p-value passing -> nothing is flagged (the old index ``k-2`` wrapped
      around to the largest p-value and flagged nearly everything);
    * all p-values passing -> all of them are flagged;
    * no relevant p-value at all -> nothing is flagged instead of dividing by 0;
    * the last passing p-value itself is flagged (``<=`` instead of ``<``);
    * labels are passed to pandas as a list, not as a set.

    Parameters
    ----------
    alpha : float
        Significance level.
    p_values : pandas.DataFrame
        One row per GO term (labelled by GO id), one column per cluster index.
    cluster_list : list of set
        Genes of each cluster; position i corresponds to column i.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with the index and columns of `p_values`.
    """
    nothing_enriched = pd.DataFrame(False, index=p_values.index, columns=p_values.columns)

    relevant_p_values = [p_values[cluster_idx].loc[list(cluster2GO(cluster))].values
                             for cluster_idx, cluster in enumerate(cluster_list)]
    if not relevant_p_values:
        return nothing_enriched

    sorted_p_values = np.sort(np.concatenate(relevant_p_values))
    m = len(sorted_p_values)
    if m == 0:
        return nothing_enriched

    c = np.log(m) + np.euler_gamma + 1/(2*m)
    critical_values = np.arange(1, m+1)/(m*c) * alpha
    passes = sorted_p_values <= critical_values

    # Length of the leading run of passing p-values.
    n_significant = m if passes.all() else int(np.argmin(passes))
    if n_significant == 0:
        return nothing_enriched

    threshold = sorted_p_values[n_significant-1]
    return p_values <= threshold

In [9]:
# Significance level handed to get_enrichments.
alpha = 0.05

# Cluster counts MIN_CLUSTERS .. MAX_CLUSTERS-1 are evaluated,
# for at most MAX_RUNS runs per method.
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100
MAX_RUNS = 10

# One coverage table per method, created empty on first access.
cluster_coverages, GO_coverages, gene_coverages = (defaultdict(pd.DataFrame) for _ in range(3))

In [None]:
# For every method, run and number of clusters: test each (GO term, cluster)
# pair for enrichment and record which fraction of clusters, GO terms and
# genes is covered by at least one enriched annotation.
for method in ['gGCV_normalizedl1', 'gGCV_normalizedl2', 'gGCV_normalizedlinf', 
                'GDV_similarity', 'GCV_tvd', ]:
    
    # exist_ok replaces the former check-then-create sequence.
    enrichment_directory = f"{DATA_DIRECTORY}/enrichments/{namespace}/{method}"
    os.makedirs(enrichment_directory, exist_ok=True)
    
    GV, distance = method.split('_')
    runs = min(get_number_of_max_runs(GV, distance, MAX_CLUSTERS-1), MAX_RUNS-1)

    for run in range(runs+1):
        
        t1 = time.time()
        print(f"{GV}-{distance} {run}")
        
        # One column per run, one row per number of clusters; NaN until computed.
        cluster_coverages[method][run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS))
        GO_coverages[     method][run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS))
        gene_coverages[   method][run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS))
        
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):

            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTERS_DIRECTORY}/{GV}/{distance}/{run}_sc_BioGRID_{n_clusters}.txt", 'r') as f:
                 cluster_list = [set(line.split()) for line in f]
            cluster_df = pd.Series({gene:cluster_idx 
                                        for cluster_idx,cluster in enumerate(cluster_list) 
                                        for gene in cluster})

            # For each GO term and cluster we get an experiment.
            # The rows are produced by iterating GO2genes, so they have to be
            # labelled with GO2genes.index; labelling them with GO_index or
            # GO_population (set order) attaches the counts to the wrong terms.
            enriched_GO_terms_in_cluster = pd.DataFrame(np.array(
                    [ [len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                                                       index   = GO2genes.index,
                                                       columns = range(n_clusters))

            K = global_GO_counter.values.reshape(-1,1)   # genes annotated per GO term (GO2genes order)
            n = list(map(len, cluster_list))             # cluster sizes
            k = enriched_GO_terms_in_cluster             # annotated genes per (GO term, cluster)
            N = sum(n)                               # PPI size, i.e. number of all genes that appear in a cluster

            # scipy's nomenclature differs from ours: its M is the population size,
            # its n the number of annotated genes and its N the sample (cluster) size.
            # sf(k-1) equals 1-cdf(k-1) = P(X >= k) but keeps precision for tiny p-values.
            p_values = pd.DataFrame(hypergeom.sf(k=k.values-1, M=N, N=n, n=K), index=GO2genes.index)
            

            enrichments = get_enrichments(alpha,p_values, cluster_list)
            enrichment_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]
            
            # .loc writes into the stored frame; chained `frame[run][n_clusters] = ...`
            # may assign to a temporary copy instead.
            cluster_coverages[method].loc[n_clusters, run] = enrichments.any().sum()      /n_clusters
            GO_coverages[     method].loc[n_clusters, run] = enrichments.any(axis=1).sum()/len(GO_population)
            gene_coverages[   method].loc[n_clusters, run] = sum(is_annotated_in(gene,enrichment_list[cluster_idx])
                                                                 for gene, cluster_idx in cluster_df.items())/N
            t2 = time.time()
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        
    cluster_coverages[method].to_csv(f"{enrichment_directory}/cluster_coverage.txt")
    GO_coverages[method].to_csv(f"{enrichment_directory}/GO_coverage.txt")
    gene_coverages[method].to_csv(f"{enrichment_directory}/gene_coverage.txt")
    print()

gGCV-normalizedl1 0
gGCV-normalizedl1 1
gGCV-normalizedl1 2
gGCV-normalizedl1 3
gGCV-normalizedl1 4
gGCV-normalizedl1 5
gGCV-normalizedl1 6
gGCV-normalizedl1 7
gGCV-normalizedl1 8
gGCV-normalizedl1 9
99: 23.64sec
gGCV-normalizedl2 0
gGCV-normalizedl2 1
gGCV-normalizedl2 2
gGCV-normalizedl2 3
gGCV-normalizedl2 4
gGCV-normalizedl2 5
gGCV-normalizedl2 6
gGCV-normalizedl2 7
gGCV-normalizedl2 8
gGCV-normalizedl2 9
99: 23.08sec
gGCV-normalizedlinf 0
gGCV-normalizedlinf 1
gGCV-normalizedlinf 2
gGCV-normalizedlinf 3
gGCV-normalizedlinf 4
gGCV-normalizedlinf 5
gGCV-normalizedlinf 6
72: 14.31sec

In [None]:
# Display the cluster-coverage table of the method the loop above handled last
# (`method` is the loop variable left over from that cell).
cluster_coverages[method]