In [31]:
from pyclustering.cluster.kmedoids import kmedoids
from collections import defaultdict
from sklearn.cluster import KMeans
from scipy.stats import hypergeom

import time
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
sns.set()

DATA_DIRECTORY = "/home/clusterduck123/Desktop/git/supplements/data"
ANNOTATIONS_DIRECTORY = f"{DATA_DIRECTORY}/annotations"
MATRIX_DIRECTORY = f"{DATA_DIRECTORY}/matrix"
PPI_DIRECTORY = f"{DATA_DIRECTORY}/PPI"

### Functions

In [3]:
def normalized_laplacian(D):
    # can be optimized more
    diag = ((len(D)-1) - np.sum(D, axis=0)).reshape(-1,1)
    L  = D-1
    np.fill_diagonal(L, diag)
    L0 = np.multiply(np.multiply(diag**(-1/2),L).T,diag**(-1/2))
    return L0


# def p_value(n_GO_terms, 
#             n_cluster, 
#             list_of_success_states,
#             list_of_draws,
#             list_of_observed_successes):
#     
#     success_states = list_of_success_states[n_GO_terms]
#     draws = list_of_draws[n_cluster]
#     observed_successes = list_of_observed_successes[n_GO_terms][n_clusters]
# 
#     K = len(success_states)
#     n = len(draws)
#     k = len(observed_successes)
#     
#     return 1-hypergeom.cdf(k-1, N, n, K)

### Cluster independent variables

In [4]:
annotation_df = pd.read_csv(f"{ANNOTATIONS_DIRECTORY}/BioGRID-SGD_BP_sc.csv")
PPI_nx = nx.read_edgelist(f"{PPI_DIRECTORY}/BioGRID_sc.txt")
D = np.genfromtxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_tvd0.txt")

GO_population = set(annotation_df.GO_ID)
PPI_population = set(PPI_nx.nodes())

### Name-to-Integer dictionaries

In [7]:
int2gene = dict(enumerate(PPI_population))
gene2int = {gene:n for n,gene in int2gene.items()}

int2GO = dict(enumerate(GO_population))
GO2int = {go_id:n for n,go_id in int2GO.items()}

### Preparation

Let $N$ be the number of genes in the PPI.   
Each GO-term defines a 'state' in which $K$ proteins are annotated with this term; these are seen a _successes_.    
A given cluster defines an 'experiment', in which the number of draws, $n$, corresponds to the length of the cluster.    
The number of _successful draws_ $k$ corresponds to the number of annotated genes in the given cluster.

In [47]:
# Number of drawable elements is fixed throughout the whole analysis
N = len(PPI_population)

list_of_success_states = [set(map(gene2int.get, annotation_df.Systematic_ID[annotation_df.GO_ID == go_id]))
                              for go_id in GO_population]

# This will be our K, see below
array_of_total_successes = np.array(list(map(len,list_of_success_states))).reshape(-1,1)

### Here we GO

In [48]:
MAX_CLUSTERS = 10

In [51]:
tvd0_high_coverage = []
tvd0_middle_coverage = []
tvd0_low_coverage = []

t1 = time.time()

for n_clusters in range(2, MAX_CLUSTERS):
    t2 = time.time()
    print(f'{n_clusters}: {t2-t1:.2f}sec')
    
    # Perform clustering
    initial_medoids = [1, 500]
    kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    
    list_of_experiments = [set(cluster) for cluster in clusters]
    
    # For each GO term and cluster we get an experiment 
    array_of_observed_successes = np.array([[len(draws & success_states) for draws in list_of_experiments]
                                        for success_states in list_of_success_states])
    
    
    N    = N                                    # defined in section 'Preparation'
    K    = array_of_total_successes             # defined in section 'Preparation'
    _, n = array_of_observed_successes.shape    # #rows = #GO-terms
    k    = array_of_observed_successes 
    
    # scipy has a really messed up nomeclature... 
    p_values_array = 1-hypergeom.cdf(k=k-1, M=N, N=n, n=K)
    p_values_df    = pd.DataFrame(p_values_array, index=GO_population)
    
    m2 = cluster_nr
    m = m1*m2
    
    hc_enrichment_df = GO_enrichment_df < alpha[0]/m
    mc_enrichment_df = GO_enrichment_df < alpha[1]/m
    lc_enrichment_df = GO_enrichment_df < alpha[2]/m
    
    tvd0_high_coverage.append(  sum(spectral_high_significant_enrichment_df.any())  /cluster_nr)
    tvd0_middle_coverage.append(sum(spectral_middle_significant_enrichment_df.any())/cluster_nr)
    tvd0_low_coverage.append(   sum(spectral_low_significant_enrichment_df.any())   /cluster_nr)

2: 0.00sec


NameError: name 'cluster_nr' is not defined

Unnamed: 0,0,1
count,4658.0,4658.0
mean,0.1764241,0.13539
std,0.3810011,0.341974
min,0.0,0.0
25%,0.0,0.0
50%,6.099941e-08,0.0
75%,0.0006985069,0.000349
max,1.0,1.0


# Test

In [9]:
# Perform clustering
initial_medoids = [1, 500]
kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
kmedoids_instance.process()
clusters = kmedoids_instance.get_clusters()

list_of_experiments = [set(cluster) for cluster in clusters]

# For each GO term and cluster we get an experiment 
array_of_observed_successes = np.array([[len(draws & success_states) for draws in list_of_experiments]
                                    for success_states in list_of_success_states])

In [28]:
_, n = array_of_observed_successes.shape
K = np.array(list(map(len,list_of_success_states)))

In [29]:
array_of_observed_successes

array([[36, 42],
       [ 0,  3],
       [ 1,  0],
       ...,
       [ 2,  3],
       [ 1,  0],
       [ 8,  7]])

In [38]:
test = 1-hypergeom.cdf(array_of_observed_successes-1, N, n, K.reshape(-1,1))

In [39]:
test.shape

(4658, 2)

In [41]:
array_of_observed_successes.sum(axis=1)

array([78,  3,  1, ...,  5,  1, 15])

In [42]:
K

array([78,  3,  1, ...,  5,  1, 15])

In [43]:
? hypergeom.cdf