In [1]:
from itertools import islice, combinations, product
from pyclustering.cluster.kmedoids import kmedoids
from collections import defaultdict
from scipy.stats import hypergeom
from collections import Counter
from goatools import obo_parser
from functools import partial

import os
import time
import graco
import random
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
pd.set_option("display.max_columns", 50)

# Root folders. The defaults are the original machine-specific locations; set the
# environment variables SUPPLEMENTS_DATA_DIRECTORY / GRACO_CPP_DIRECTORY to run
# the notebook elsewhere without editing this cell.
DATA_DIRECTORY = os.environ.get("SUPPLEMENTS_DATA_DIRECTORY",
                                "/home/clusterduck123/Desktop/git/supplements/data")
CPP_DIRECTORY = os.environ.get("GRACO_CPP_DIRECTORY",
                               "/home/clusterduck123/Desktop/git/graco/graco/cpp")

# Sub-folders of the data root used by the cells below.
RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw_data"
PPI_DIRECTORY = f"{DATA_DIRECTORY}/PPI"
ANNOTATIONS_DIRECTORY = f"{DATA_DIRECTORY}/annotations"
MATRIX_DIRECTORY = f"{DATA_DIRECTORY}/matrix"
CLUSTERS_DIRECTORY = f"{DATA_DIRECTORY}/clusters"

# Enrichment

#### Set parameters

In [3]:
# Namespace tag; it is substituted into the annotation file name and the
# output folder name further down.
namespace = 'CC'

# A GO term is kept only if its number of annotation rows lies in
# [lb_GO, ub_GO] and its level in the GO DAG lies in [min_lvl, max_lvl].
lb_GO, ub_GO = 5, 500
min_lvl, max_lvl = 0, 100

#### Load and parse annotation data

In [4]:
PPI = nx.read_edgelist(f"{PPI_DIRECTORY}/BioGRID_sc.txt")

annotation_df = pd.read_csv(f"{ANNOTATIONS_DIRECTORY}/BioGRID-SGD_{namespace}_sc.csv")

go_dag = obo_parser.GODag(f"{RAW_DATA_DIRECTORY}/go-basic.obo")

gene_population = set(PPI.nodes())

# Count the annotation rows of every GO term in one pass; re-filtering the whole
# frame once per GO term (as before) is quadratic in the size of the table.
GO_term_sizes = annotation_df.GO_ID.value_counts()

# Keep the terms whose size and DAG level both fall inside the configured bounds.
# The size test comes first so the DAG is only consulted for terms that pass it.
GO_population = {go_id for go_id, nb_annotations in GO_term_sizes.items()
                       if (lb_GO <= nb_annotations <= ub_GO and
                           min_lvl <= go_dag[go_id].level <= max_lvl)}

# From here on annotation_df only contains rows of the retained GO terms.
annotation_df = annotation_df[annotation_df.GO_ID.isin(GO_population)]

/home/clusterduck123/Desktop/git/supplements/data/raw_data/go-basic.obo: fmt(1.2) rel(2019-10-07) 47,285 GO Terms


#### Define convenient dictionaries

In [5]:
# Conversion dictionaries
# Conversion dictionaries

# GO term -> set of systematic gene IDs annotated with that term
# (stored as a Series so that it carries an index of GO terms).
GO2genes = pd.Series(
    {go_id: set(systematic_ids)
         for go_id, systematic_ids in annotation_df.groupby('GO_ID')['Systematic_ID']},
    name='gene_sets',
)

# gene -> set of GO terms annotating that gene
gene2GO = {
    gene: set(go_ids)
        for gene, go_ids in annotation_df.groupby('Systematic_ID')['GO_ID']
}

# GO term -> number of genes annotated with it
global_GO_counter = GO2genes.apply(len)

## Here we GO

### Functions

#### Parser functions

In [6]:
def get_number_of_max_runs(GV, distance, n_clusters):
    """Return the highest run index stored for a given number of clusters.

    Cluster files are looked up in ``{CLUSTERS_DIRECTORY}/{GV}/{distance}`` and
    are expected to be named ``<run>_<species>_<db>_<n_clusters>.<ext>``.
    Directory entries that do not follow this pattern are ignored instead of
    aborting the scan.

    Parameters
    ----------
    GV, distance : str
        Sub-folder names below ``CLUSTERS_DIRECTORY``.
    n_clusters : int
        Only files whose last name field equals this number are considered.

    Returns
    -------
    int
        Largest run index among the matching files.

    Raises
    ------
    ValueError
        If no file in the folder matches ``n_clusters``.
    """
    directory = f"{CLUSTERS_DIRECTORY}/{GV}/{distance}"

    matching_runs = []
    for filename in os.listdir(directory):
        fields = filename.split('_')
        if len(fields) != 4:
            continue  # not a cluster file (e.g. hidden or auxiliary file)
        run_txt, _species, _db, ncluster_txt = fields
        try:
            run = int(run_txt)
            file_n_clusters = int(ncluster_txt.split('.')[0])
        except ValueError:
            continue  # four fields, but not the numeric ones we expect
        if file_n_clusters == n_clusters:
            matching_runs.append(run)

    if not matching_runs:
        raise ValueError(f"no cluster file with {n_clusters} clusters found in {directory}")
    return max(matching_runs)

#### Loop functions

In [7]:
def cluster2GO(cluster):
    """Return the set of GO terms annotating at least one gene of ``cluster``.

    Genes without an entry in ``gene2GO`` contribute nothing. An empty cluster
    yields an empty set.
    """
    # set().union(...) rather than set.union(...): the unbound form needs at
    # least one argument and raises TypeError when the cluster is empty.
    return set().union(*(gene2GO.get(gene, set()) for gene in cluster))

def is_annotated_in(gene, GO_set):
    """Tell whether ``gene`` carries at least one GO term contained in ``GO_set``.

    A gene that has no entry in ``gene2GO`` is treated as having no GO terms.
    """
    terms_of_gene = gene2GO.get(gene, set())
    shares_no_term = terms_of_gene.isdisjoint(GO_set)
    return not shares_no_term

#### Rest

In [8]:
def get_enrichments(alpha, p_values, cluster_list):
    """Flag enriched (GO term, cluster) pairs with a Benjamini-Yekutieli correction.

    Parameters
    ----------
    alpha : float
        Target false discovery rate.
    p_values : pandas.DataFrame
        Raw p-values; callers in this notebook pass a frame indexed by GO term
        with one column per cluster, labelled by the cluster's position in
        ``cluster_list``.
    cluster_list : list of set
        Gene sets, one per cluster.

    Returns
    -------
    pandas.DataFrame
        Boolean frame shaped like ``p_values``; ``True`` where the p-value is at
        or below the rejection threshold.

    Notes
    -----
    Only pairs in which the GO term annotates at least one gene of the cluster
    count as tests. With ``m`` such tests and the sorted p-values
    ``P_1 <= ... <= P_m``, the threshold is ``P_k`` for the largest ``k`` with
    ``P_k <= k / (m * c) * alpha`` (step-up rule), where
    ``c = ln(m) + gamma + 1/(2m)`` approximates the m-th harmonic number.
    If no ``k`` qualifies, or if there are no tests at all, nothing is flagged.
    """
    # list(...) because pandas no longer accepts a set as an indexer.
    relevant_p_values = [p_values[cluster_idx][list(cluster2GO(cluster))]
                             for cluster_idx, cluster in enumerate(cluster_list)]

    sorted_p_values = np.array(sorted(p for p_cluster in relevant_p_values
                                        for p in p_cluster), dtype=float)
    m = len(sorted_p_values)
    nothing_enriched = pd.DataFrame(False, index=p_values.index, columns=p_values.columns)
    if m == 0:
        # No test at all: avoid log(0) and the division by zero below.
        return nothing_enriched

    c = np.log(m) + np.euler_gamma + 1/(2*m)
    critical_values = np.arange(1, m + 1) / (m * c) * alpha

    passing_ranks = np.flatnonzero(sorted_p_values <= critical_values)
    if passing_ranks.size == 0:
        # Even the smallest p-value misses its critical value.
        return nothing_enriched

    # Largest p-value that still meets its critical value; everything at or
    # below it is rejected, hence the inclusive comparison.
    threshold = sorted_p_values[passing_ranks[-1]]
    return p_values <= threshold

### Parameters

In [12]:
# Significance level handed to get_enrichments.
alpha = 0.05

# The loop below scans range(MIN_CLUSTERS, MAX_CLUSTERS) cluster counts and
# caps the number of processed runs per method via MAX_RUNS.
MIN_CLUSTERS, MAX_CLUSTERS = 2, 20
MAX_RUNS = 1

# Result containers: method name -> DataFrame (created empty on first access).
cluster_coverages, GO_coverages, gene_coverages = (
    defaultdict(pd.DataFrame) for _ in range(3)
)

In [21]:
for method in ['gGCV_normalizedl1', 'GDV_similarity']:

    # One output folder per method; exist_ok makes re-runs safe without a
    # separate existence check.
    output_directory = f"{DATA_DIRECTORY}/enrichments/{namespace}/{method}"
    os.makedirs(output_directory, exist_ok=True)

    GV, distance = method.split('_')
    # Highest run index to process: what is on disk, capped by MAX_RUNS.
    runs = min(get_number_of_max_runs(GV, distance, MAX_CLUSTERS-1), MAX_RUNS-1)

    for run in range(runs+1):

        t1 = time.time()
        print(f"{GV}-{distance} {run}")

        # One column per run, one row per number of clusters.
        cluster_coverages[method][run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS))
        GO_coverages[     method][run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS))
        gene_coverages[   method][run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS))

        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # Each line of the cluster file lists the genes of one cluster.
            with open(f"{CLUSTERS_DIRECTORY}/{GV}/{distance}/{run}_sc_BioGRID_{n_clusters}.txt", 'r') as f:
                cluster_list = [set(line.split()) for line in f]
            cluster_df = pd.Series({gene:cluster_idx
                                        for cluster_idx,cluster in enumerate(cluster_list)
                                        for gene in cluster})

            # Rows: GO terms, columns: clusters, entries: size of the overlap.
            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [ [len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                                                       index   = GO2genes.index,
                                                       columns = range(n_clusters))

            K = global_GO_counter.values.reshape(-1,1)  # genes per GO term (column vector)
            n = list(map(len, cluster_list))            # cluster sizes
            k = nb_annotated_genes_in_cluster           # observed overlaps
            N = sum(n)                                  # total number of clustered genes

            assert (k <= K).all().all()

            # scipy's naming differs from ours: its M is the population size,
            # its n the number of "success" items and its N the sample size.
            # sf(k-1) = P(X >= k); using sf instead of 1-cdf keeps precision for
            # very small p-values.
            p_values = pd.DataFrame(hypergeom.sf(k=k-1, M=N, N=n, n=K), index=GO2genes.index)

            enrichments = get_enrichments(alpha,p_values, cluster_list)
            # For every cluster, the set of GO terms flagged as enriched in it.
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]

            # Single .loc assignments instead of chained df[run][n_clusters] = ...,
            # which may write to a temporary copy.
            cluster_coverages[method].loc[n_clusters, run] = sum(enrichments.any())      /n_clusters
            GO_coverages[     method].loc[n_clusters, run] = sum(enrichments.any(axis=1))/len(GO_population)
            gene_coverages[   method].loc[n_clusters, run] = sum(is_annotated_in(gene,enrichmet_list[cluster_idx])
                                                                 for gene, cluster_idx in cluster_df.items())/N
            t2 = time.time()
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')

    cluster_coverages[method].to_csv(f"{output_directory}/cluster_coverage.txt")
    GO_coverages[method].to_csv(f"{output_directory}/GO_coverage.txt")
    gene_coverages[method].to_csv(f"{output_directory}/gene_coverage.txt")
    print()

gGCV-normalizedl1 0
19: 1.86sec
GDV-similarity 0
19: 2.02sec


# Testing

In [20]:
# Overlap counts (GO terms x clusters) left over from the last iteration of the
# enrichment loop. The loop stores them as nb_annotated_genes_in_cluster; the
# shorter name used here before is not defined anywhere in this notebook.
nb_annotated_genes_in_cluster

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
GO:0000109,3,2,0,0,1,0,0,3,2,0,0,2,0,2,0,0,1,0,0
GO:0000112,2,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0
GO:0000118,2,6,2,0,6,5,0,0,1,0,0,1,0,6,5,1,7,0,0
GO:0000120,1,2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0
GO:0000123,1,5,0,0,8,8,0,1,1,0,0,0,0,4,13,0,4,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:1990467,0,2,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0
GO:1990468,0,2,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0
GO:1990726,0,0,0,0,4,2,0,0,0,0,0,0,0,0,2,0,0,0,0
GO:1990816,0,0,2,2,1,1,0,1,1,0,0,0,0,0,2,1,0,0,2


In [9]:
# Settings for the single-configuration walkthrough in the next cells.
method = 'GDV_similarity'
n_clusters = 10
alpha = 0.05

In [13]:
GV, distance = method.split('_')

# Each line of the cluster file lists the genes of one cluster (run 0 only).
with open(f"{CLUSTERS_DIRECTORY}/{GV}/{distance}/0_sc_BioGRID_{n_clusters}.txt", 'r') as f:
    cluster_list = [set(line.split()) for line in f]
cluster_df = pd.Series({gene:cluster_idx
                            for cluster_idx,cluster in enumerate(cluster_list)
                            for gene in cluster})

# Rows: GO terms, columns: clusters, entries: size of the overlap.
nb_annotated_GO_terms_in_cluster = pd.DataFrame(np.array(
        [ [len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                                           index   = GO2genes.index,
                                           columns = range(n_clusters))

K = global_GO_counter.values.reshape(-1,1)  # genes per GO term (column vector)
n = list(map(len, cluster_list))            # cluster sizes
k = nb_annotated_GO_terms_in_cluster        # observed overlaps
N = sum(n)                                  # total number of clustered genes

assert (k <= K).all().all()

# scipy's naming differs from ours: its M is the population size, its n the
# number of "success" items and its N the sample size.
# sf(k-1) = P(X >= k); sf keeps precision for very small p-values.
# The index must be the GO-term labels (GO2genes.index), not the Series of
# gene sets itself.
p_values = pd.DataFrame(hypergeom.sf(k=k-1, M=N, N=n, n=K), index=GO2genes.index)


enrichments = get_enrichments(alpha,p_values, cluster_list)
# For every cluster, the set of GO terms flagged as enriched in it.
enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]

# NOTE(review): these three names are the per-method result containers filled
# by the enrichment loop; this cell rebinds them to single numbers, so the loop
# cell cannot be re-run afterwards without re-running the parameter cell first.
cluster_coverages = sum(enrichments.any())      /n_clusters
GO_coverages      = sum(enrichments.any(axis=1))/len(GO_population)
gene_coverages    = sum(is_annotated_in(gene,enrichmet_list[cluster_idx])
                                                 for gene, cluster_idx in cluster_df.items())/N


In [14]:
# Display the overlap-count table built in the previous cell
# (rows: GO terms, columns: clusters).
nb_annotated_GO_terms_in_cluster

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
GO:0000109,0,0,0,7,4,4,1,0,0,0
GO:0000112,0,0,0,4,2,1,0,0,0,0
GO:0000118,5,1,0,16,10,2,6,2,0,0
GO:0000120,0,0,0,2,0,3,0,2,0,0
GO:0000123,13,0,0,9,11,3,10,0,0,0
...,...,...,...,...,...,...,...,...,...,...
GO:1990467,1,0,0,2,1,1,0,0,0,0
GO:1990468,1,0,0,1,2,1,0,0,0,0
GO:1990726,2,0,0,0,2,0,4,0,0,0
GO:1990816,2,0,0,0,2,2,1,4,2,0


In [19]:
# p-values of the first cluster, restricted to the GO terms that annotate at
# least one of its genes. The label set is turned into a list because pandas
# no longer accepts a set as an indexer.
cluster = cluster_list[0]
p_values[0].loc[list(cluster2GO(cluster))]

GO:0043232    1.000000
GO:0005777    1.000000
GO:0005832    1.000000
GO:0032040    1.000000
GO:0022627    1.000000
                ...   
GO:1903293    0.243005
GO:0038201    0.737787
GO:0031261    0.065882
GO:0005665    0.153949
GO:0000243    0.750071
Name: 0, Length: 300, dtype: float64

In [20]:
# Number of genes of cluster 0 annotated with GO:0043232 (row label first,
# column label second).
nb_annotated_GO_terms_in_cluster.loc['GO:0043232', 0]

100

In [14]:
# Pick one cluster position at random. random.sample returns a list, so
# cluster_idx is a one-element list rather than a bare integer.
cluster_idx = random.sample(range(n_clusters), 1)

In [25]:
# GO terms annotating at least one gene of the first cluster.
cluster2GO(cluster_list[0])

{'GO:0000131',
 'GO:0000137',
 'GO:0000322',
 'GO:0000323',
 'GO:0000324',
 'GO:0000785',
 'GO:0000790',
 'GO:0005618',
 'GO:0005621',
 'GO:0005682',
 'GO:0005737',
 'GO:0005739',
 'GO:0005768',
 'GO:0005773',
 'GO:0005777',
 'GO:0005782',
 'GO:0005783',
 'GO:0005788',
 'GO:0005789',
 'GO:0005794',
 'GO:0005797',
 'GO:0005802',
 'GO:0005811',
 'GO:0005829',
 'GO:0005886',
 'GO:0005934',
 'GO:0005935',
 'GO:0008287',
 'GO:0009277',
 'GO:0016021',
 'GO:0019897',
 'GO:0019898',
 'GO:0030173',
 'GO:0030312',
 'GO:0030427',
 'GO:0030532',
 'GO:0031090',
 'GO:0031224',
 'GO:0031228',
 'GO:0031300',
 'GO:0031301',
 'GO:0031410',
 'GO:0031907',
 'GO:0031974',
 'GO:0031982',
 'GO:0031984',
 'GO:0031985',
 'GO:0034399',
 'GO:0034708',
 'GO:0042579',
 'GO:0042721',
 'GO:0043228',
 'GO:0043232',
 'GO:0043233',
 'GO:0045121',
 'GO:0046540',
 'GO:0070013',
 'GO:0071944',
 'GO:0097525',
 'GO:0097526',
 'GO:0097708',
 'GO:0098589',
 'GO:0098791',
 'GO:0098796',
 'GO:0098798',
 'GO:0098800',
 'GO:00988

In [29]:
# Position (in gene2GO's iteration order) of the gene with the most GO terms.
# Iterating the dict itself yields the gene names, so len() would measure the
# name strings; measure the GO-term sets instead.
np.argmax([len(go_ids) for go_ids in gene2GO.values()])

27

In [30]:
# Walk gene2GO until position 27, leaving the gene at that position in `gene`.
# enumerate supplies the position; iterating the dict alone yields only the
# gene names, which cannot be unpacked into (i, gene).
for i, gene in enumerate(gene2GO):
    if i == 27:
        break

{'YAL001C': {'GO:0000127', 'GO:0005667', 'GO:0044798', 'GO:0090576'},
 'YAL002W': {'GO:0005768',
  'GO:0005770',
  'GO:0031410',
  'GO:0031982',
  'GO:0033263',
  'GO:0097708',
  'GO:0099023'},
 'YAL005C': {'GO:0000329',
  'GO:0005618',
  'GO:0005737',
  'GO:0005774',
  'GO:0005844',
  'GO:0009277',
  'GO:0030312',
  'GO:0031090',
  'GO:0098588',
  'GO:0098805',
  'GO:0098852',
  'GO:1990904'},
 'YAL007C': {'GO:0005798',
  'GO:0030134',
  'GO:0030135',
  'GO:0031410',
  'GO:0031982',
  'GO:0097708'},
 'YAL008W': {'GO:0016021',
  'GO:0031224',
  'GO:0031300',
  'GO:0031301',
  'GO:0031306',
  'GO:0031307',
  'GO:0032592',
  'GO:0098573'},
 'YAL009W': {'GO:0008287',
  'GO:0016021',
  'GO:0031224',
  'GO:0098796',
  'GO:1903293'},
 'YAL010C': {'GO:0032865', 'GO:0098796', 'GO:0098798', 'GO:0098799'},
 'YAL011W': {'GO:0000118',
  'GO:0000812',
  'GO:0070603',
  'GO:0097346',
  'GO:1904949'},
 'YAL013W': {'GO:0000118', 'GO:0033698', 'GO:0070822'},
 'YAL014C': {'GO:0005768', 'GO:0031410', 'GO