In [1]:
from itertools import islice, combinations, product
from pyclustering.cluster.kmedoids import kmedoids
from collections import defaultdict
from scipy.stats import hypergeom
from collections import Counter
from goatools import obo_parser
from functools import partial

import os
import time
import graco
import random
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
pd.set_option("display.max_columns", 50)

DATA_DIRECTORY = "/home/clusterduck123/Desktop/git/supplements/data"
CPP_DIRECTORY = "/home/clusterduck123/Desktop/git/graco/graco/cpp"

RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw_data"
PPI_DIRECTORY = f"{DATA_DIRECTORY}/PPI"
ANNOTATIONS_DIRECTORY = f"{DATA_DIRECTORY}/annotations"
MATRIX_DIRECTORY = f"{DATA_DIRECTORY}/matrix"
CLUSTERS_DIRECTORY = f"{DATA_DIRECTORY}/clusters"

# Enrichement

#### Set parameters

In [3]:
namespace = 'MF'
correction = 'BH'
lb_GO = 5
ub_GO = 500
min_lvl = 0
max_lvl = 100

#### Load and parse annotation data

In [4]:
PPI = nx.read_edgelist(f"{PPI_DIRECTORY}/BioGRID_sc.txt")

annotation_df = pd.read_csv(f"{ANNOTATIONS_DIRECTORY}/BioGRID-SGD_{namespace}_sc.csv")

go_dag = obo_parser.GODag(f"{RAW_DATA_DIRECTORY}/go-basic.obo")

gene_population = set(PPI.nodes())
GO_population = {go_id for go_id in set(annotation_df.GO_ID) 
                           if (lb_GO <= len(annotation_df[annotation_df.GO_ID == go_id]) <= ub_GO and
                               min_lvl <= go_dag[go_id].level <= max_lvl)}

annotation_df = annotation_df[annotation_df.GO_ID.isin(GO_population)]

/home/clusterduck123/Desktop/git/supplements/data/raw_data/go-basic.obo: fmt(1.2) rel(2019-10-07) 47,285 GO Terms


#### Define convenient dictionaries

In [5]:
# Conversion dictionaries
GO2genes = pd.Series({go_id: set(genes.Systematic_ID) for go_id, genes in annotation_df.groupby('GO_ID')}, 
                     name='nb_genes')
gene2GO = defaultdict(set)
gene2GO  = {gene : set(go_ids.GO_ID)        for gene, go_ids in annotation_df.groupby('Systematic_ID')}
global_GO_counter = GO2genes.apply(len)

## Here we GO

### Functions

#### Parser fuctions

In [6]:
def get_number_of_max_runs(GV, distance, nb_clusters):
    runs = max(int(run) for run,species,db,ncluster_txt in 
             map(partial(str.split, sep='_'), os.listdir(f"{CLUSTERS_DIRECTORY}/{GV}/{distance}"))
                if int(ncluster_txt.split('.')[0]) == nb_clusters)
    return runs

#### Loop functions

In [7]:
def cluster2GO(cluster):
    return set.union(*(gene2GO.get(gene, set()) for gene in cluster))

def is_annotated_in(gene, GO_set):
    return not gene2GO.get(gene,set()).isdisjoint(GO_set)

#### Rest

In [8]:
def get_enrichments(alpha, p_values, cluster_list, correction):
    relevant_p_values = [p_values[cluster_idx][cluster2GO(cluster)] 
                             for cluster_idx,cluster in enumerate(cluster_list)] 
    
    sorted_p_values = sorted(p for p_cluster in relevant_p_values
                               for p in p_cluster)
    m = len(sorted_p_values)
    if   correction == 'BY':
        c = np.log(m) + np.euler_gamma + 1/(2*m)
    elif correction == 'BH':
        c = 1
    else:
        print("Correction not known!")
        raise Exception
    for k,P_k in enumerate(sorted_p_values,1):
        if P_k > k/(m*c) * alpha:
            break
    threshold = sorted_p_values[k-2]
    return p_values < threshold

### Parameters

In [9]:
alpha = 0.05
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100
MAX_RUNS = 50

cluster_coverages = defaultdict(pd.DataFrame)
GO_coverages      = defaultdict(pd.DataFrame)
gene_coverages    = defaultdict(pd.DataFrame)

In [10]:
ENRICHMENT_PATH = f"{DATA_DIRECTORY}/enrichments/{namespace}/{correction}"

for method in ['gGCV_normalizedl1', 'gGCV_normalizedl2', 'gGCV_normalizedlinf', 'GDV_similarity', 'GDV_mahalanobis']:
    
    if not os.path.exists(f"{ENRICHMENT_PATH}/{method}"):
        os.makedirs(f"{ENRICHMENT_PATH}/{method}")
    
    GV, distance = method.split('_')
    runs = min(get_number_of_max_runs(GV, distance, MAX_CLUSTERS-1), MAX_RUNS-1)

    for run in range(runs+1):
        
        t1 = time.time()
        print(f"{GV}-{distance} {run}")
        
        cluster_coverages[method][run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2)) 
        GO_coverages[     method][run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        gene_coverages[   method][run] = pd.Series(np.nan, index=range(MIN_CLUSTERS, MAX_CLUSTERS+2))
        
        for nb_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            with open(f"{CLUSTERS_DIRECTORY}/{GV}/{distance}/{run}_sc_BioGRID_{nb_clusters}.txt", 'r') as f:
                 cluster_list = [set(line.split()) for line in f]
            cluster_df = pd.Series({gene:cluster_idx 
                                        for cluster_idx,cluster in enumerate(cluster_list) 
                                        for gene in cluster})

            nb_annotated_genes_in_cluster = pd.DataFrame(np.array(
                    [ [len(go_genes & cluster) for cluster in cluster_list] for go_genes in GO2genes]),
                                                       index   = GO2genes.index,
                                                       columns = range(nb_clusters))

            
            k = nb_annotated_genes_in_cluster
            
            K = pd.concat([global_GO_counter[GO2genes.index]]*nb_clusters, axis=1)
            K.columns = k.columns
            
            n = pd.concat([pd.DataFrame(map(len, cluster_list)).T]*len(GO2genes))
            n.index = k.index
            
            N = pd.DataFrame(len(PPI), columns=k.columns, index=k.index)
            
            #assert K.eq(k.sum(axis=1), axis=0).all().all()
            #assert N.eq(n.sum(axis=1), axis=0).all().all()
            
            # scipy has a really messed up nomeclature... 
            p_values = pd.DataFrame(1-hypergeom.cdf(k=k-1, M=N, N=n, n=K), index=GO2genes.index)

            enrichments = get_enrichments(alpha, p_values, cluster_list, correction)
            enrichmet_list = [set(enrichments[i][enrichments[i]].index) for i in enrichments.columns]
            
            cluster_coverages[method][run][nb_clusters] = sum(enrichments.any())      / nb_clusters
            GO_coverages[     method][run][nb_clusters] = sum(enrichments.any(axis=1))/len(GO_population)
            gene_coverages[   method][run][nb_clusters] = sum(is_annotated_in(gene,enrichmet_list[cluster_idx])
                                                             for gene, cluster_idx in cluster_df.items()) / len(PPI)
            t2 = time.time()
            print(f'{nb_clusters}: {t2-t1:.2f}sec', end='\r')
        
    cluster_coverages[method].to_csv(f"{ENRICHMENT_PATH}/{method}/cluster_coverage.txt")
    GO_coverages[method].to_csv(f"{ENRICHMENT_PATH}/{method}/GO_coverage.txt")
    gene_coverages[method].to_csv(f"{ENRICHMENT_PATH}/{method}/gene_coverage.txt")
    print()

gGCV-normalizedl1 0
2: 0.16sec

  op=op_str, alt_op=unsupported[op_str]


gGCV-normalizedl1 1
gGCV-normalizedl1 2
gGCV-normalizedl1 3
gGCV-normalizedl1 4
gGCV-normalizedl1 5
gGCV-normalizedl1 6
gGCV-normalizedl1 7
gGCV-normalizedl1 8
gGCV-normalizedl1 9
gGCV-normalizedl1 10
gGCV-normalizedl1 11
gGCV-normalizedl1 12
gGCV-normalizedl1 13
gGCV-normalizedl1 14
gGCV-normalizedl1 15
gGCV-normalizedl1 16
gGCV-normalizedl1 17
gGCV-normalizedl1 18
gGCV-normalizedl1 19
gGCV-normalizedl1 20
gGCV-normalizedl1 21
gGCV-normalizedl1 22
gGCV-normalizedl1 23
gGCV-normalizedl1 24
gGCV-normalizedl1 25
gGCV-normalizedl1 26
gGCV-normalizedl1 27
gGCV-normalizedl1 28
gGCV-normalizedl1 29
gGCV-normalizedl1 30
gGCV-normalizedl1 31
gGCV-normalizedl1 32
gGCV-normalizedl1 33
gGCV-normalizedl1 34
gGCV-normalizedl1 35
gGCV-normalizedl1 36
gGCV-normalizedl1 37
gGCV-normalizedl1 38
gGCV-normalizedl1 39
gGCV-normalizedl1 40
gGCV-normalizedl1 41
gGCV-normalizedl1 42
gGCV-normalizedl1 43
gGCV-normalizedl1 44
gGCV-normalizedl1 45
gGCV-normalizedl1 46
gGCV-normalizedl1 47
gGCV-normalizedl1 48
g

# Testing enrichment

In [86]:
relevant_p_values = [p_values[cluster_idx][cluster2GO(cluster)] 
                         for cluster_idx,cluster in enumerate(cluster_list)] 

sorted_p_values = sorted(p for p_cluster in relevant_p_values
                           for p in p_cluster)
m = len(sorted_p_values)
#c = np.log(m) + np.euler_gamma + 1/(2*m)
c = 1
for k,P_k in enumerate(sorted_p_values,1):
    if P_k > k/(m*c) * alpha:
        break
threshold = sorted_p_values[k-2]
df =  p_values < threshold

In [74]:
for cluster_idx in range(nb_clusters):
    for p_value in relevant_p_values[cluster_idx]:
        if p_value == 0.:
            print(cluster_idx)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
3


In [76]:
p_values[0]['GO:1990904']

0.0

In [16]:
list(map(len,relevant_p_values))

[401, 309, 120, 451, 399, 311, 173, 193, 181]

In [17]:
n

Unnamed: 0,0,1,2,3,4,5,6,7,8
GO:0000109,825,595,542,932,749,538,509,403,633
GO:0000112,825,595,542,932,749,538,509,403,633
GO:0000118,825,595,542,932,749,538,509,403,633
GO:0000120,825,595,542,932,749,538,509,403,633
GO:0000123,825,595,542,932,749,538,509,403,633
...,...,...,...,...,...,...,...,...,...
GO:1990467,825,595,542,932,749,538,509,403,633
GO:1990468,825,595,542,932,749,538,509,403,633
GO:1990726,825,595,542,932,749,538,509,403,633
GO:1990816,825,595,542,932,749,538,509,403,633


# Testing p_values

In [66]:
go_term,     = random.sample(list(nb_annotated_genes_in_cluster.index)  , 1)
cluster_idx, = random.sample(list(nb_annotated_genes_in_cluster.columns), 1)

In [77]:
go_term = 'GO:1990904'
cluster_idx = 0

In [81]:
res1 = p_values[cluster_idx][go_term]
res2 = 1-hypergeom.cdf(
            k=k[cluster_idx][go_term]-1, 
            M=N[cluster_idx][go_term], 
            N=n[cluster_idx][go_term], 
            n=K[cluster_idx][go_term])

print(res1, res2)

0.0 0.0


In [82]:
k[cluster_idx][go_term]

236

In [83]:
K[cluster_idx][go_term]

469

In [84]:
n[cluster_idx][go_term]

825

In [85]:
go_dag['GO:1990904']

GOTerm('GO:1990904'):
  id:GO:1990904
  item_id:GO:1990904
  name:ribonucleoprotein complex
  namespace:cellular_component
  _parents: 1 items
    GO:0032991
  parents: 1 items
    GO:0032991	level-01	depth-01	protein-containing complex [cellular_component]
  children: 21 items
  level:2
  depth:2
  is_obsolete:False
  alt_ids: 2 items
    GO:1990903
    GO:0030529

# Testing nb_annotated_genes_in_cluster

In [51]:
go_term,     = random.sample(list(nb_annotated_genes_in_cluster.index)  , 1)
cluster_idx, = random.sample(list(nb_annotated_genes_in_cluster.columns), 1)
cluster = cluster_list[cluster_idx]

In [52]:
nb1 = len(GO2genes[go_term] & cluster_list[cluster_idx])
nb2 = nb_annotated_genes_in_cluster[cluster_idx][go_term]

print(nb1, nb2)

0 0


In [53]:
cluster2GO(cluster) == set(annotation_df[annotation_df.Systematic_ID.isin(cluster_df[cluster_df == cluster_idx].index)].GO_ID)

True

In [55]:
nb_annotated_genes_in_cluster

Unnamed: 0,0,1,2,3,4,5,6,7,8
GO:0000109,1,0,0,5,7,3,0,0,0
GO:0000112,0,0,0,3,4,0,0,0,0
GO:0000118,11,2,0,14,13,1,0,1,0
GO:0000120,0,1,0,0,3,1,2,0,0
GO:0000123,23,1,0,14,7,1,0,0,0
...,...,...,...,...,...,...,...,...,...
GO:1990467,1,0,0,1,3,0,0,0,0
GO:1990468,1,0,0,2,2,0,0,0,0
GO:1990726,6,0,0,2,0,0,0,0,0
GO:1990816,3,2,0,2,0,1,3,0,2


In [43]:
cluster_df[cluster_df == cluster_idx].index

Index(['YDR284C', 'YLR440C', 'YLR015W', 'YER111C', 'YPL075W', 'YKR028W',
       'YFR030W', 'YML038C', 'YFL047W', 'YOR076C',
       ...
       'YDR118W', 'YHR105W', 'YLR354C', 'YER132C', 'YBR155W', 'YNL106C',
       'YDR135C', 'YLR019W', 'YFR053C', 'YLR095C'],
      dtype='object', length=749)

In [33]:
for i in range(500*19):
    go_term,     = random.sample(list(nb_annotated_genes_in_cluster.index)  , 1)
    cluster_idx, = random.sample(list(nb_annotated_genes_in_cluster.columns), 1)
    assert len(GO2genes[go_term] & cluster_list[cluster_idx]) == nb_annotated_genes_in_cluster[cluster_idx][go_term]

In [34]:
len(GO2genes[go_term] & cluster_list[cluster_idx]), nb_annotated_genes_in_cluster[cluster_idx][go_term]

(38, 38)

In [77]:
go_term,     = random.sample(list(nb_annotated_genes_in_cluster.index)  , 1)
cluster_idx, = random.sample(list(nb_annotated_genes_in_cluster.columns), 1)

In [78]:
N = len(PPI)
K = len(GO2genes[go_term])
n = len(cluster_list[cluster_idx])
k = nb_annotated_genes_in_cluster[cluster_idx][go_term]

In [79]:
K

5

In [80]:
global_GO_counter[go_term]

5

In [81]:
p_values[cluster_idx][go_term], 1-hypergeom.cdf(k=k-1, M=N, N=n, n=K)

(0.17192191115264888, 0.17192191115264888)

In [82]:
1-hypergeom.cdf(k=k-1, M=N, N=n, n=K), hypergeom.sf(k=k-1, M=N, N=n, n=K)

(0.17192191115264888, 0.171921911162144)

In [21]:
? hypergeom.sf

In [102]:
(global_GO_counter[GO2genes.index] == global_GO_counter.values).shape

(500,)

In [96]:
global_GO_counter.values.reshape(-1,1)

array([[ 16],
       [  7],
       [ 42],
       [  7],
       [ 46],
       [ 20],
       [  6],
       [ 66],
       [  6],
       [  6],
       [  6],
       [  5],
       [ 23],
       [ 12],
       [ 14],
       [  8],
       [ 72],
       [ 23],
       [ 12],
       [  5],
       [  7],
       [ 10],
       [ 11],
       [ 12],
       [ 13],
       [ 38],
       [ 11],
       [  6],
       [ 10],
       [ 18],
       [  6],
       [ 35],
       [ 44],
       [ 57],
       [ 57],
       [ 57],
       [ 11],
       [117],
       [ 19],
       [ 30],
       [  5],
       [ 38],
       [  7],
       [ 27],
       [ 28],
       [ 48],
       [ 34],
       [ 34],
       [ 12],
       [ 12],
       [ 29],
       [  6],
       [  6],
       [ 18],
       [ 91],
       [  5],
       [ 63],
       [ 16],
       [ 28],
       [ 28],
       [  5],
       [  5],
       [  5],
       [  6],
       [ 14],
       [ 12],
       [ 11],
       [  7],
       [ 11],
       [  5],
       [ 41],
      

In [103]:
global_GO_counter[GO2genes.index]

GO:0000109     16
GO:0000112      7
GO:0000118     42
GO:0000120      7
GO:0000123     46
             ... 
GO:1990467      5
GO:1990468      5
GO:1990726      8
GO:1990816     13
GO:1990904    469
Name: gene_sets, Length: 500, dtype: int64

In [58]:
K

Unnamed: 0,0,1,2,3,4,5,6,7,8
GO:0000109,16,16,16,16,16,16,16,16,16
GO:0000112,7,7,7,7,7,7,7,7,7
GO:0000118,42,42,42,42,42,42,42,42,42
GO:0000120,7,7,7,7,7,7,7,7,7
GO:0000123,46,46,46,46,46,46,46,46,46
...,...,...,...,...,...,...,...,...,...
GO:1990467,5,5,5,5,5,5,5,5,5
GO:1990468,5,5,5,5,5,5,5,5,5
GO:1990726,8,8,8,8,8,8,8,8,8
GO:1990816,13,13,13,13,13,13,13,13,13


In [64]:
k.sum(axis=1)

GO:0000109     16
GO:0000112      7
GO:0000118     42
GO:0000120      7
GO:0000123     46
             ... 
GO:1990467      5
GO:1990468      5
GO:1990726      8
GO:1990816     13
GO:1990904    469
Length: 500, dtype: int64

In [61]:
k.sum(axis=1)

GO:0000109     16
GO:0000112      7
GO:0000118     42
GO:0000120      7
GO:0000123     46
             ... 
GO:1990467      5
GO:1990468      5
GO:1990726      8
GO:1990816     13
GO:1990904    469
Length: 500, dtype: int64

In [121]:
k.le(K, axis=0).all().all()

True

In [128]:
pd.DataFrame(K)

Unnamed: 0,gene_sets
GO:0000109,16
GO:0000112,7
GO:0000118,42
GO:0000120,7
GO:0000123,46
...,...
GO:1990467,5
GO:1990468,5
GO:1990726,8
GO:1990816,13


In [11]:
? pd.concat

In [17]:
pd.concat([global_GO_counter[GO2genes.index]]*nb_clusters, axis=1, keys=range(nb_clusters))

Unnamed: 0,0,1
GO:0000109,16,16
GO:0000112,7,7
GO:0000118,42,42
GO:0000120,7,7
GO:0000123,46,46
...,...,...
GO:1990467,5,5
GO:1990468,5,5
GO:1990726,8,8
GO:1990816,13,13


In [16]:
? pd.concat

In [20]:
n

[2587, 3139]

In [55]:
N = pd.concat([pd.DataFrame(map(len, cluster_list)).T]*len(GO2genes))
N.index = GO2genes.index
N

Unnamed: 0,0,1
GO:0000109,2587,3139
GO:0000112,2587,3139
GO:0000118,2587,3139
GO:0000120,2587,3139
GO:0000123,2587,3139
...,...,...
GO:1990467,2587,3139
GO:1990468,2587,3139
GO:1990726,2587,3139
GO:1990816,2587,3139


In [50]:
? pd.concat

In [11]:
k

NameError: name 'k' is not defined

In [13]:
k

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
GO:0000109,0,0,2,2,0,0,0,0,3,0,1,6,0,0,2
GO:0000112,0,0,0,1,0,0,0,0,2,0,0,2,0,0,2
GO:0000118,3,10,1,3,0,0,0,5,11,0,5,0,0,0,4
GO:0000120,0,0,0,1,0,0,2,0,3,0,0,0,0,0,1
GO:0000123,4,10,1,0,0,0,0,8,6,0,13,1,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:1990467,1,0,0,0,0,0,0,0,3,0,1,0,0,0,0
GO:1990468,2,0,0,0,0,0,0,0,2,0,1,0,0,0,0
GO:1990726,0,2,0,0,0,0,0,3,0,0,3,0,0,0,0
GO:1990816,0,0,1,1,0,0,4,2,0,0,2,1,1,0,1


In [19]:
k.sum(axis=1)

GO:0000109     16
GO:0000112      7
GO:0000118     42
GO:0000120      7
GO:0000123     46
             ... 
GO:1990467      5
GO:1990468      5
GO:1990726      8
GO:1990816     13
GO:1990904    469
Length: 500, dtype: int64

In [27]:
K.eq(k.sum(axis=1), axis=0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
GO:0000109,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
GO:0000112,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
GO:0000118,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
GO:0000120,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
GO:0000123,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:1990467,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
GO:1990468,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
GO:1990726,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
GO:1990816,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [11]:
k

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
GO:0000109,0,0,2,2,0,0,0,0,3,0,1,6,0,0,2
GO:0000112,0,0,0,1,0,0,0,0,2,0,0,2,0,0,2
GO:0000118,3,10,1,3,0,0,0,5,11,0,5,0,0,0,4
GO:0000120,0,0,0,1,0,0,2,0,3,0,0,0,0,0,1
GO:0000123,4,10,1,0,0,0,0,8,6,0,13,1,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:1990467,1,0,0,0,0,0,0,0,3,0,1,0,0,0,0
GO:1990468,2,0,0,0,0,0,0,0,2,0,1,0,0,0,0
GO:1990726,0,2,0,0,0,0,0,3,0,0,3,0,0,0,0
GO:1990816,0,0,1,1,0,0,4,2,0,0,2,1,1,0,1


In [14]:
K

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
GO:0000109,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
GO:0000112,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
GO:0000118,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42
GO:0000120,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
GO:0000123,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:1990467,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
GO:1990468,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
GO:1990726,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
GO:1990816,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13


In [22]:
n.sum(axis=1) == N

GO:0000109    True
GO:0000112    True
GO:0000118    True
GO:0000120    True
GO:0000123    True
              ... 
GO:1990467    True
GO:1990468    True
GO:1990726    True
GO:1990816    True
GO:1990904    True
Length: 500, dtype: bool

In [21]:
N

5726

In [25]:
pd.DataFrame(N, columns=k.columns, index=k.index)

Unnamed: 0,0,1
GO:0000109,5726,5726
GO:0000112,5726,5726
GO:0000118,5726,5726
GO:0000120,5726,5726
GO:0000123,5726,5726
...,...,...
GO:1990467,5726,5726
GO:1990468,5726,5726
GO:1990726,5726,5726
GO:1990816,5726,5726


In [30]:
n

Unnamed: 0,0,1
GO:0000109,2587,3139
GO:0000112,2587,3139
GO:0000118,2587,3139
GO:0000120,2587,3139
GO:0000123,2587,3139
...,...,...
GO:1990467,2587,3139
GO:1990468,2587,3139
GO:1990726,2587,3139
GO:1990816,2587,3139


In [39]:
N.eq(n.sum(axis=1), axis=0)

Unnamed: 0,0,1
GO:0000109,True,True
GO:0000112,True,True
GO:0000118,True,True
GO:0000120,True,True
GO:0000123,True,True
...,...,...
GO:1990467,True,True
GO:1990468,True,True
GO:1990726,True,True
GO:1990816,True,True


In [37]:
n.sum(axis=1)

GO:0000109    5726
GO:0000112    5726
GO:0000118    5726
GO:0000120    5726
GO:0000123    5726
              ... 
GO:1990467    5726
GO:1990468    5726
GO:1990726    5726
GO:1990816    5726
GO:1990904    5726
Length: 500, dtype: int64

In [12]:
pd.concat([pd.DataFrame(map(len, cluster_list)).T]*len(GO2genes))

Unnamed: 0,0,1
0,2587,3139
0,2587,3139
0,2587,3139
0,2587,3139
0,2587,3139
...,...,...
0,2587,3139
0,2587,3139
0,2587,3139
0,2587,3139
