# Bicluster statistics

In [1]:
import os
import ast

import metrics
import validation

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# scikit-learn 0.23 removed the vendored `sklearn.externals.joblib`. Prefer the
# standalone `joblib` package and fall back to the vendored copy so the
# notebook still imports on older scikit-learn installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

plt.rcParams['font.size'] = 16
plt.rcParams['axes.facecolor'] = 'white'

%matplotlib inline

In [2]:
def fetch_model_dumps(path_to_models, labels):
    """Load serialized models from disk and key them by label.

    The i-th path in `path_to_models` is read with `joblib.load` and stored
    under the i-th entry of `labels`. Returns the resulting dict.
    """

    loaded = {}
    for position, dump_path in enumerate(path_to_models):
        # Load first, then look up the label, matching the original order.
        model = joblib.load(dump_path)
        loaded[labels[position]] = model

    return loaded

In [3]:
def biclusters(models, ref_data, n_clusters):
    """Fit every model to its reference data and wrap the detected clusters.

    For each label in `ref_data`, the model stored under the same label in
    `models` gets its `n_clusters` attribute set to `n_clusters`, is fitted on
    the data's `.values`, and its `rows_` / `columns_` results are wrapped in
    a `validation.Biclusters` instance.

    Note that the entries of `models` are modified in place: `n_clusters` is
    overwritten and `fit` is called on each of them.

    Returns a dict mapping each label to its `validation.Biclusters` instance.
    """

    detected = {}
    for label, frame in ref_data.items():
        model = models[label]

        # Configure the number of clusters, then fit on the raw values.
        model.n_clusters = n_clusters
        model.fit(frame.values)

        row_ind, col_ind = model.rows_, model.columns_
        # Row and column results must describe the same number of clusters.
        assert np.shape(row_ind)[0] == np.shape(col_ind)[0]

        detected[label] = validation.Biclusters(
            rows=row_ind, cols=col_ind, data=frame
        )

    return detected

In [4]:
ref_labels = ['orig_pvalues', 'sel_pvalues', 'orig_pcc', 'sel_pcc']

# Training CSV files, listed in the same order as `ref_labels`.
ref_paths = [
    './../data/train/orig_pvalues_prep.csv',
    './../data/train/sel_pvalues_prep.csv',
    './../data/train/orig_pcc_prep.csv',
    './../data/train/sel_pcc_prep.csv',
]

# Each file is read with its first column as the index and then transposed.
ref_data = {
    label: pd.read_csv(csv_path, sep=',', index_col=0).T
    for label, csv_path in zip(ref_labels, ref_paths)
}

In [5]:
# Paths to the pickled model dumps. Every list holds four entries in the
# order: orig p-values, sel p-values, orig pcc, sel pcc.
#
# NB: Swapped sk bic with sk checker -- the list named `path_to_skcheck`
# points at the `sk_bic_*` dumps. It is loaded in the "Spectral coclustering"
# step of the statistics loop, with the integer cluster counts in `n_bics`.
path_to_skcheck = [
    './../_model_dumps/sk_bic_orig_prep_pvalues.pkl',
    './../_model_dumps/sk_bic_sel_prep_pvalues.pkl',
    './../_model_dumps/sk_bic_orig_prep_pcc.pkl',
    './../_model_dumps/sk_bic_sel_prep_pcc.pkl',
]
# NB: Swapped sk bic with sk checker -- the list named `path_to_skbic`
# points at the `sk_checker_*` dumps. It is loaded in the "Spectral
# biclustering" step of the statistics loop, with the tuples in `n_checkers`.
path_to_skbic = [
    './../_model_dumps/sk_checker_orig_prep_pvalues.pkl',
    './../_model_dumps/sk_checker_sel_prep_pvalues.pkl',
    './../_model_dumps/sk_checker_orig_prep_pcc.pkl',
    './../_model_dumps/sk_checker_sel_prep_pcc.pkl',
]
# Dumps of the `r_bic_*` models (names and files are NOT swapped here).
path_to_rbic = [
    './../_model_dumps/r_bic_orig_prep_pvalues.pkl',
    './../_model_dumps/r_bic_sel_prep_pvalues.pkl',
    './../_model_dumps/r_bic_orig_prep_pcc.pkl',
    './../_model_dumps/r_bic_sel_prep_pcc.pkl',
]
# Dumps of the `r_checker_*` models (names and files are NOT swapped here).
path_to_rcheck = [
    './../_model_dumps/r_checker_orig_prep_pvalues.pkl',
    './../_model_dumps/r_checker_sel_prep_pvalues.pkl',
    './../_model_dumps/r_checker_orig_prep_pcc.pkl',
    './../_model_dumps/r_checker_sel_prep_pcc.pkl',
]

In [6]:
# Input spectral biclustering: one tuple per run, passed as `n_clusters` to
# the models loaded from `path_to_skbic`.
# Presumably (row clusters, column clusters) -- TODO confirm against the
# model class. Entry i is used together with entry i of `n_bics` (the loop
# indexes this list with the enumerate counter over `n_bics`), so this list
# must be at least as long as `n_bics`.
n_checkers = [(2, 1), (2, 2), (3, 1), (2, 3), (3, 2)]
# Input spectral coclustering: one integer cluster count per run.
n_bics = [2, 3, 4, 5, 6]

In [7]:
# One pass per entry of n_bics; n_checkers is read at the same position.
for num, n_clusters in enumerate(n_bics):

    print('Bic num {}'.format(num))

    # Spectral biclustering: reload the dumps so every run starts from the
    # stored models, then fit them with the current n_checkers entry.
    sk_bic_models = fetch_model_dumps(path_to_skbic, ref_labels)
    sk_bic_clusters = biclusters(sk_bic_models, ref_data, n_checkers[num])
    for label in ref_labels:
        print('{0}\n{1}'.format(label, '-' * 50))
        # Stats table followed by one blank line.
        print(sk_bic_clusters[label].stats, end='\n\n')

    # Separator between the two model families.
    print('#' * 50)

    # Spectral coclustering: same procedure with the integer cluster count.
    sk_check_models = fetch_model_dumps(path_to_skcheck, ref_labels)
    sk_check_clusters = biclusters(sk_check_models, ref_data, n_clusters)
    for label in ref_labels:
        print('{0}\n{1}'.format(label, '-' * 50))
        print(sk_check_clusters[label].stats, end='\n\n')

Bic num 0
orig_pvalues
--------------------------------------------------
           max           min   ncols    nrows  rel_size        size       std  \
num                                                                             
0    72.617983  7.079401e-09  2664.0  15891.0  0.576576  42333624.0  6.043910   
1    73.435290  4.915739e-08  2664.0  11670.0  0.423424  31088880.0  5.672154   

     zeros  
num         
0      0.0  
1      0.0  

sel_pvalues
--------------------------------------------------
           max  min   ncols    nrows  rel_size        size       std  \
num                                                                    
0    73.435290  0.0  2664.0  24867.0  0.902253  66245688.0  2.310285   
1    57.268668  0.0  2664.0   2694.0  0.097747   7176816.0  5.970771   

          zeros  
num              
0    65807494.0  
1     6875402.0  

orig_pcc
--------------------------------------------------
          max       min   ncols    nrows  rel_size        size 

           max  min   ncols    nrows  rel_size        size       std  \
num                                                                    
0    73.435290  0.0  2664.0  24493.0  0.888683  65249352.0  2.498271   
1    68.856615  0.0  2664.0   2334.0  0.084685   6217776.0  4.971649   
2    72.617983  0.0  2664.0    734.0  0.026632   1955376.0  5.182767   

          zeros  
num              
0    64754950.0  
1     6032714.0  
2     1895232.0  

orig_pcc
--------------------------------------------------
          max       min   ncols    nrows  rel_size        size       std  \
num                                                                        
0    0.827146 -0.858061  2664.0   5163.0  0.187330  13754232.0  0.280324   
1    0.822428 -0.866965  2664.0  12097.0  0.438917  32226408.0  0.324488   
2    0.852443 -0.813883  2664.0  10301.0  0.373753  27441864.0  0.324338   

     zeros  
num         
0      0.0  
1      0.0  
2      0.0  

sel_pcc
---------------------------------

          max       min  ncols   nrows  rel_size       size       std  zeros
num                                                                         
0    0.852443 -0.511519  484.0  5696.0  0.037548  2756864.0  0.195167    0.0
1    0.822428 -0.736304  889.0  7513.0  0.090967  6679057.0  0.138316    0.0
2    0.757663 -0.657026  326.0  6063.0  0.026920  1976538.0  0.164359    0.0
3    0.756486 -0.719795  417.0  3746.0  0.021275  1562082.0  0.200283    0.0
4    0.812042 -0.755542  548.0  4543.0  0.033907  2489564.0  0.123777    0.0

sel_pcc
--------------------------------------------------
          max       min   ncols    nrows  rel_size        size       std  \
num                                                                        
0    0.852443 -0.724472   188.0   2973.0  0.007612    558924.0  0.221857   
1    0.827146 -0.866965  2364.0  20103.0  0.647261  47523492.0  0.022610   
2    0.804678  0.000000    65.0   1515.0  0.001341     98475.0  0.296057   
4    0.776917 -0.6909

In [None]:
# One pass per entry of n_bics for the models loaded from the r_* dumps.
# NOTE(review): unlike the scikit-learn loop, both model families receive the
# same integer n_clusters here and n_checkers is not used -- confirm that
# this is intended for the r_checker models.
for num, n_clusters in enumerate(n_bics):

    print('Bic num {}'.format(num))

    # Models from path_to_rbic: reload the dumps, fit, and print the stats.
    r_bic_models = fetch_model_dumps(path_to_rbic, ref_labels)
    r_bic_clusters = biclusters(r_bic_models, ref_data, n_clusters)
    for label in ref_labels:
        print('{0}\n{1}'.format(label, '-' * 50))
        # Stats table followed by one blank line.
        print(r_bic_clusters[label].stats, end='\n\n')

    # Separator between the two model families.
    print('#' * 50)

    # Models from path_to_rcheck: same procedure.
    r_check_models = fetch_model_dumps(path_to_rcheck, ref_labels)
    r_check_clusters = biclusters(r_check_models, ref_data, n_clusters)
    for label in ref_labels:
        print('{0}\n{1}'.format(label, '-' * 50))
        print(r_check_clusters[label].stats, end='\n\n')

Bic num 0
orig_pvalues
--------------------------------------------------
           max           min  ncols   nrows  rel_size       size       std  \
num                                                                          
0    57.268668  2.593338e+00  332.0  6006.0  0.027158  1993992.0  6.123798   
1    33.007152  8.290854e-08  439.0  5134.0  0.030697  2253826.0  2.477966   

     zeros  
num         
0      0.0  
1      0.0  

sel_pvalues
--------------------------------------------------
           max  min   ncols    nrows  rel_size        size       std  \
num                                                                    
0    39.615577  0.0  1883.0  17518.0  0.449268  32986394.0  0.227273   
1    27.338115  0.0  1557.0   5094.0  0.108024   7931358.0  0.138371   

          zeros  
num              
0    32984075.0  
1     7931130.0  

orig_pcc
--------------------------------------------------
          max       min  ncols   nrows  rel_size     size       std  zeros
