In [1]:
from itertools import islice, combinations, product
from pyclustering.cluster.kmedoids import kmedoids
from collections import defaultdict
from scipy.stats import hypergeom
from goatools import obo_parser
from functools import partial

import os
import time
import graco
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
pd.set_option("display.max_columns", 50)

# Root folders used by the notebook. The defaults are the machine-specific
# locations the notebook was written with; set the environment variables
# GRACO_DATA_DIRECTORY / GRACO_CPP_DIRECTORY to run it elsewhere without
# editing this cell.
DATA_DIRECTORY = os.environ.get("GRACO_DATA_DIRECTORY",
                                "/home/clusterduck123/Desktop/git/supplements/data")
CPP_DIRECTORY = os.environ.get("GRACO_CPP_DIRECTORY",
                               "/home/clusterduck123/Desktop/git/graco/graco/cpp")

# Sub-folders of DATA_DIRECTORY, one per kind of input/output file.
RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw_data"
PPI_DIRECTORY = f"{DATA_DIRECTORY}/PPI"
ANNOTATIONS_DIRECTORY = f"{DATA_DIRECTORY}/annotations"
MATRIX_DIRECTORY = f"{DATA_DIRECTORY}/matrix"
CLUSTERS_DIRECTORY = f"{DATA_DIRECTORY}/clusters"

# Enrichment

#### Set parameters

In [3]:
# Annotation filter settings.
namespace = 'BP'             # selects the annotation file and the output sub-folder
lb_GO, ub_GO = 5, 500        # inclusive bounds on the number of annotation rows per GO term
min_lvl, max_lvl = 0, 100    # inclusive bounds on a GO term's level in the GO DAG

#### Load and parse annotation data

In [4]:
PPI = nx.read_edgelist(f"{PPI_DIRECTORY}/BioGRID_sc.txt")

# Unfiltered annotation table. `all_CC_annotations_df` keeps pointing at this
# full table after `annotation_df` is narrowed down below (the name says "CC"
# although the file that is read depends on `namespace`).
all_CC_annotations_df = pd.read_csv(f"{ANNOTATIONS_DIRECTORY}/BioGRID-SGD_{namespace}_sc.csv")

go_dag = obo_parser.GODag(f"{RAW_DATA_DIRECTORY}/go-basic.obo")

# Number of annotation rows per GO term, counted in one pass instead of
# re-filtering the whole table once per GO term.
annotation_counts = all_CC_annotations_df.GO_ID.value_counts()

# GO terms that are neither too rare nor too common and whose DAG level lies
# within [min_lvl, max_lvl]. The level is only looked up for terms that already
# passed the size filter.
GO_population = {go_id for go_id, n_annotations in annotation_counts.items()
                       if (lb_GO <= n_annotations <= ub_GO and
                           min_lvl <= go_dag[go_id].level <= max_lvl)}

# Working table: only the rows whose GO term survived the filter.
annotation_df = all_CC_annotations_df[all_CC_annotations_df.GO_ID.isin(GO_population)]

/home/clusterduck123/Desktop/git/supplements/data/raw_data/go-basic.obo: fmt(1.2) rel(2019-10-07) 47,285 GO Terms


#### Define convenient dictionaries

In [5]:
# Conversion dictionaries
int2GO = dict(enumerate(GO_population))
GO2int = {go_id: index for index, go_id in int2GO.items()}

# Collect both directions of the gene <-> GO term relation in a single pass
# over the annotation table, instead of building one boolean mask over the
# whole table per GO term and per gene.
genes_by_GO = defaultdict(set)
GOs_by_gene = defaultdict(set)
for gene, go_id in zip(annotation_df.Systematic_ID, annotation_df.GO_ID):
    genes_by_GO[go_id].add(gene)
    GOs_by_gene[gene].add(go_id)

# Keys follow the iteration order of GO_population.
GO2genes = {go_id: genes_by_GO[go_id] for go_id in GO_population}

# Every node of the PPI network gets an entry; nodes without any annotation
# row map to an empty set.
gene2GOs = {gene: GOs_by_gene.get(gene, set()) for gene in PPI}

## Here we GO

#### Functions

In [6]:
def gene_enriched_in_cluster(gene, cluster, enrichment):
    """Tell whether `gene` carries at least one GO term enriched in `cluster`.

    Parameters
    ----------
    gene : key of the module-level dictionary `gene2GOs`
    cluster : column label of `enrichment`
    enrichment : boolean DataFrame whose index holds GO ids and whose column
        `cluster` marks the GO terms enriched in that cluster

    Returns
    -------
    bool
        True if the GO terms of `gene` and the enriched GO terms of `cluster`
        share at least one element.

    Raises
    ------
    KeyError
        If `gene` is not a key of `gene2GOs` or `cluster` is not a column of
        `enrichment`.
    """
    # Take the GO ids from the frame that was passed in rather than from a
    # global index variable, so the function only depends on its arguments
    # (and on gene2GOs).
    enriched_GOs = enrichment.index[enrichment[cluster]]
    return not gene2GOs[gene].isdisjoint(enriched_GOs)

def get_enrichment_df(alpha, p_values):
    """Mark the p-values that are significant after multiple-testing correction.

    The p-values are sorted in ascending order and the k-th smallest one is
    compared with the bound ``k / (m*c) * alpha``, where ``m`` is the number of
    p-values and ``c = ln(m) + euler_gamma + 1/(2m)`` (an approximation of the
    m-th harmonic number). The walk stops at the first p-value that exceeds its
    bound; every p-value up to and including the last one that passed is
    reported as significant.

    Parameters
    ----------
    alpha : float
        Significance level.
    p_values : pandas.DataFrame
        Table of p-values.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with the index and columns of `p_values`; True marks a
        significant entry.
    """
    m = p_values.size
    if m == 0:
        # Nothing to test; also avoids log(0) and a division by zero below.
        return pd.DataFrame(False, index=p_values.index, columns=p_values.columns)

    c = np.log(m) + np.euler_gamma + 1/(2*m)
    sorted_p_values = np.sort(p_values.values.flatten())
    bounds = np.arange(1, m+1) / (m*c) * alpha

    # Number of leading sorted p-values that stay within their bound.
    within_bound = sorted_p_values <= bounds
    n_passed = m if within_bound.all() else int(np.argmin(within_bound))

    if n_passed == 0:
        # Even the smallest p-value fails. Indexing with n_passed-1 == -1 would
        # wrap around to the largest p-value and flag almost everything.
        return pd.DataFrame(False, index=p_values.index, columns=p_values.columns)

    # Largest p-value that still passed; the comparison is inclusive so that
    # this entry itself is reported as significant.
    threshold = sorted_p_values[n_passed-1]
    return p_values <= threshold

def get_number_of_max_runs(GV, distance, n_clusters):
    """Return the largest run number stored for a given number of clusters.

    Every entry of ``{CLUSTERS_DIRECTORY}/{GV}/{distance}`` is split at '_'
    into exactly four parts; the first part is the run number and the text of
    the last part before its first '.' is the number of clusters.

    Raises ValueError if an entry does not split into four parts, if a numeric
    part cannot be parsed, or if no entry matches `n_clusters`.
    """
    matching_runs = []
    for file_name in os.listdir(f"{CLUSTERS_DIRECTORY}/{GV}/{distance}"):
        run, species, db, ncluster_txt = file_name.split('_')
        stored_n_clusters = int(ncluster_txt.split('.')[0])
        if stored_n_clusters == n_clusters:
            matching_runs.append(int(run))
    return max(matching_runs)

#### Cluster independent arrays

In [7]:
# One set of annotated genes per GO term, in the key order of GO2genes.
list_of_success_states = [*GO2genes.values()]

# Column vector (one row per GO term) holding the size of each gene set.
success_counts = [len(annotated_genes) for annotated_genes in list_of_success_states]
array_of_total_successes = np.array(success_counts).reshape(-1,1)

In [8]:
# Significance level handed to the enrichment test.
alpha = 0.05

# Cluster counts are iterated as range(MIN_CLUSTERS, MAX_CLUSTERS).
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100
MAX_RUNS = 20

# Result containers: method name -> DataFrame (created empty on first access).
cluster_coverages, GO_coverages, gene_coverages = (
    defaultdict(pd.DataFrame) for _ in range(3)
)

In [10]:
# For every method, run and number of clusters: test each (GO term, cluster)
# pair for enrichment with a hypergeometric test and record three coverages
#   - cluster coverage: fraction of clusters with at least one enriched GO term
#   - GO coverage:      fraction of GO terms enriched in at least one cluster
#   - gene coverage:    fraction of clustered genes annotated with a GO term
#                       that is enriched in their own cluster
for method in ['gGCV_normalizedlinf']:

    # exist_ok makes a separate existence check unnecessary.
    output_directory = f"{DATA_DIRECTORY}/enrichments/{namespace}/{method}"
    os.makedirs(output_directory, exist_ok=True)

    GV, distance = method.split('_')
    runs = min(get_number_of_max_runs(GV, distance, MAX_CLUSTERS-1), MAX_RUNS)

    # Row labels of the p-value table, taken from GO2genes so that they are
    # in the same order as the rows built from list_of_success_states.
    GO_labels = list(GO2genes)
    cluster_range = range(MIN_CLUSTERS, MAX_CLUSTERS)

    for run in range(runs+1):

        t1 = time.time()
        print(f"{GV}-{distance} {run}")

        # One NaN-initialised column per run, filled in below.
        cluster_coverages[method][run] = pd.Series(np.nan, index=cluster_range)
        GO_coverages[method][run] = pd.Series(np.nan, index=cluster_range)
        gene_coverages[method][run] = pd.Series(np.nan, index=cluster_range)

        for n_clusters in cluster_range:

            # One cluster per line, genes separated by whitespace.
            with open(f"{CLUSTERS_DIRECTORY}/{GV}/{distance}/{run}_sc_BioGRID_{n_clusters}.txt", 'r') as f:
                list_of_experiments = [set(line.split()) for line in f]

            clusters = dict(enumerate(list_of_experiments))

            # For each GO term and cluster we get an experiment
            array_of_observed_successes = np.array([[len(draws & success_states) for draws in list_of_experiments]
                                                for success_states in list_of_success_states])

            K = array_of_total_successes             # defined in section 'Cluster independent arrays'
            n = list(map(len, list_of_experiments))  # cluster lengths
            k = array_of_observed_successes          # number of annotated genes found in cluster
            N = sum(n)                               # PPI size, i.e. number of all genes that appear in a cluster

            # P(X >= k). scipy's parameter names differ from the ones used here:
            # its M is the population size, its N the number of draws and its n
            # the number of success states. The survival function is used
            # instead of 1-cdf because it avoids cancellation for tiny p-values.
            p_values_array = hypergeom.sf(k=k-1, M=N, N=n, n=K)
            p_values_df    = pd.DataFrame(p_values_array, index=GO_labels)
            # Kept as a module-level name for code that looks it up globally.
            GO_index = p_values_df.index

            enrichment_df = get_enrichment_df(alpha, p_values_df)

            # Build the set of enriched GO terms once per cluster instead of
            # once per gene, then count the genes that share a term with it.
            n_enriched_genes = 0
            for cluster, genes in clusters.items():
                enriched_GOs = set(enrichment_df.index[enrichment_df[cluster]])
                n_enriched_genes += sum(1 for gene in genes if gene2GOs[gene] & enriched_GOs)

            # .loc writes into the stored frames directly; chained indexing
            # (frame[run][n_clusters] = ...) may assign to a temporary copy.
            cluster_coverages[method].loc[n_clusters, run] = sum(enrichment_df.any())      /n_clusters
            GO_coverages[method].loc[n_clusters, run] = sum(enrichment_df.any(axis=1))/len(GO_population)
            gene_coverages[method].loc[n_clusters, run] = n_enriched_genes/N

            t2 = time.time()
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')

    cluster_coverages[method].to_csv(f"{output_directory}/cluster_coverage.txt")
    GO_coverages[method].to_csv(f"{output_directory}/GO_coverage.txt")
    gene_coverages[method].to_csv(f"{output_directory}/gene_coverage.txt")
    print()

gGCV-normalizedlinf 0
99: 125.62sec
