# Bicluster patterns

Shift, scale and shift-scale patterns in detected biclusters.

In [1]:
import os
import ast

import metrics
import validation

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.externals import joblib

plt.rcParams['font.size'] = 16
plt.rcParams['axes.facecolor'] = 'white'

%matplotlib inline

In [2]:
def fetch_model_dumps(path_to_models, labels):
    """Load serialized models from disk and key them by label.

    Args:
        path_to_models: Iterable of paths, each passed to ``joblib.load``.
        labels: Indexable collection of keys; the label at position ``i``
            is used for the model loaded from the ``i``-th path.

    Returns:
        dict mapping each label to the object loaded from its path.

    Raises:
        IndexError: If a sequence of labels has fewer entries than there
            are paths.
    """

    loaded = {}
    for position, dump_path in enumerate(path_to_models):
        # Load first, then resolve the label belonging to this position.
        estimator = joblib.load(dump_path)
        loaded[labels[position]] = estimator

    return loaded

In [3]:
def biclusters(models, ref_data, n_clusters):
    """Refit every model on its data set and wrap the detected biclusters.

    For each label in ``ref_data`` the model stored under the same label is
    given ``n_clusters``, fitted on the raw values of the data set, and its
    ``rows_`` / ``columns_`` results are packed into a
    ``validation.Biclusters`` instance.

    Args:
        models: Mapping from label to a model exposing ``n_clusters``,
            ``fit``, ``rows_`` and ``columns_``. The models are modified
            in place (attribute assignment and fitting).
        ref_data: Mapping from label to a data set exposing ``.values``.
        n_clusters: Value assigned to the ``n_clusters`` attribute of
            every model before fitting.

    Returns:
        dict mapping each label of ``ref_data`` to its
        ``validation.Biclusters`` instance.
    """

    detected = {}
    for label, frame in ref_data.items():
        model = models[label]

        # Configure the number of clusters, then fit on the raw values.
        model.n_clusters = n_clusters
        model.fit(frame.values)

        # Row and column indicators must describe the same number of
        # biclusters.
        rows, cols = model.rows_, model.columns_
        assert np.shape(rows)[0] == np.shape(cols)[0]

        detected[label] = validation.Biclusters(
            rows=rows, cols=cols, data=frame
        )

    return detected

In [4]:
# Paths to the reference cluster indication files (one for genes, one for
# CpGs); both are handed to validation.References.from_files further down.
# NOTE(review): the file stems differ ("Cluster_genes" vs "Clusters_CpGs")
# - confirm that both match the file names on disk.
path_target_genes = './../data/test/emQTL_Cluster_genes.txt'
path_target_cpgs = './../data/test/emQTL_Clusters_CpGs.txt'

In [5]:
# Indicator labels for the classes of reference data.
# The order matters: the model dump paths are matched to these labels by
# position when the dumps are loaded.
ref_labels = [
    'orig_pvalues',
    'sel_pvalues',
    'orig_pcc',
    'sel_pcc',
]

In [6]:
# NOTE: Clustering with num samples (genes) < num features (CpGs) gives
# worse recovery/relevance results than clustering the transposed data.

# NOTE: Transpose to (cpgs x genes) ensuring num samples > num features.

# Every label doubles as the stem of its file ('<label>_prep.csv'), so the
# path is derived from the label itself instead of pairing hand-written
# paths with ref_labels by index; reordering ref_labels can therefore not
# store a data set under the wrong key.
ref_data = {
    label: pd.read_csv(
        './../data/train/{0}_prep.csv'.format(label), sep=',', index_col=0
    ).T
    for label in ref_labels
}

In [7]:
# Build the reference clusters from the indication files; the CpG path is
# passed first, the gene path second.
refs = validation.References.from_files(path_target_cpgs, path_target_genes)

## Reference biclusters

In [8]:
# Print the metrics table of the reference clusters for every data set,
# each under a header made of the label and a 50-character rule.
for label, data in ref_data.items():
    print(label)
    print('-' * 50)
    print(refs.external_metrics(data))
    print()

orig_pvalues
--------------------------------------------------
                    msr      smsr       tve
ref_cluster1  50.235300  0.000151  1.126992
ref_cluster2  69.661867  0.000834  1.123449

sel_pvalues
--------------------------------------------------
                     msr         smsr       tve
ref_cluster1  325.219403     0.227421  1.085904
ref_cluster2  189.560209  4302.611436  0.999837

orig_pcc
--------------------------------------------------
                   msr         smsr       tve
ref_cluster1  0.002750     0.234705  1.116280
ref_cluster2  0.295604  2234.157860  1.012908

sel_pcc
--------------------------------------------------
                   msr          smsr       tve
ref_cluster1  0.092744  1.603310e+05  1.154635
ref_cluster2  0.076028  1.743069e+11  1.043211



## Detected biclusters

In [9]:
# Locations of the pickled models, one list per model family. Every list
# follows the order of ref_labels (orig_pvalues, sel_pvalues, orig_pcc,
# sel_pcc) because fetch_model_dumps pairs paths and labels by position.

# NB: Swapped sk bic with sk checker, i.e. this "skcheck" list points at
# the sk_bic_* dump files.
# NOTE(review): presumably the dumps were written under crossed names -
# confirm before renaming anything.
path_to_skcheck = [
    './../_model_dumps/sk_bic_orig_prep_pvalues.pkl',
    './../_model_dumps/sk_bic_sel_prep_pvalues.pkl',
    './../_model_dumps/sk_bic_orig_prep_pcc.pkl',
    './../_model_dumps/sk_bic_sel_prep_pcc.pkl',
]
# NB: Swapped sk bic with sk checker, i.e. this "skbic" list points at the
# sk_checker_* dump files.
path_to_skbic = [
    './../_model_dumps/sk_checker_orig_prep_pvalues.pkl',
    './../_model_dumps/sk_checker_sel_prep_pvalues.pkl',
    './../_model_dumps/sk_checker_orig_prep_pcc.pkl',
    './../_model_dumps/sk_checker_sel_prep_pcc.pkl',
]
# The R model lists are not crossed: each name matches its file prefix.
path_to_rbic = [
    './../_model_dumps/r_bic_orig_prep_pvalues.pkl',
    './../_model_dumps/r_bic_sel_prep_pvalues.pkl',
    './../_model_dumps/r_bic_orig_prep_pcc.pkl',
    './../_model_dumps/r_bic_sel_prep_pcc.pkl',
]
path_to_rcheck = [
    './../_model_dumps/r_checker_orig_prep_pvalues.pkl',
    './../_model_dumps/r_checker_sel_prep_pvalues.pkl',
    './../_model_dumps/r_checker_orig_prep_pcc.pkl',
    './../_model_dumps/r_checker_sel_prep_pcc.pkl',
]

In [10]:
# Input spectral biclustering: n_clusters values for the models loaded
# from path_to_skbic, one tuple per run.
# NOTE(review): presumably (row clusters, column clusters) - confirm.
n_checkers = [(2, 1), (2, 2), (3, 1), (2, 3), (3, 2)]
# Input spectral coclustering: n_clusters values, one integer per run.
# The sk loop below iterates over this list and reads n_checkers at the
# same position, so n_checkers needs at least as many entries as n_bics.
n_bics = [2, 3, 4, 5, 6]

In [11]:
# Refit the scikit-learn models once per cluster setting and print the
# external metrics of the biclusters detected in every reference data set.
# The dumps are read from disk again on every pass; biclusters() modifies
# the loaded models in place (n_clusters assignment and fit).
for num, n_clusters in enumerate(n_bics):

    print('Bic num {}'.format(num))

    # Spectral biclustering: takes the n_checkers entry at this position.
    sk_bic_models = fetch_model_dumps(path_to_skbic, ref_labels)
    sk_bic_clusters = biclusters(sk_bic_models, ref_data, n_checkers[num])
    for label in ref_labels:
        print(label)
        print('-' * 50)
        print(sk_bic_clusters[label].external_metrics)
        print()

    # Separator between the two model families.
    print('#' * 50)

    # Spectral coclustering: takes the current n_bics value.
    sk_check_models = fetch_model_dumps(path_to_skcheck, ref_labels)
    sk_check_clusters = biclusters(sk_check_models, ref_data, n_clusters)
    for label in ref_labels:
        print(label)
        print('-' * 50)
        print(sk_check_clusters[label].external_metrics)
        print()

Bic num 0
orig_pvalues
--------------------------------------------------
           msr      smsr       tve
num                               
0    77.738506  0.181922  1.108055
1    62.652484  0.145121  1.113739

sel_pvalues
--------------------------------------------------


  smsr_values = msr_values / (avg_rows ** 2 * avg_cols ** 2)


           msr  smsr       tve
num                           
0     7.823659   inf  0.686477
1    55.364734   inf  0.914050

orig_pcc
--------------------------------------------------
          msr          smsr       tve
num                                  
0    0.112000  6.048038e+07  1.097870
1    0.117593  7.857069e+09  1.112029

sel_pcc
--------------------------------------------------
          msr  smsr       tve
num                          
0    0.003110   inf  0.583048
1    0.046216   inf  1.004793

##################################################
orig_pvalues
--------------------------------------------------
            msr      smsr       tve
num                                
0     70.952899  0.136974  1.113825
1    124.615917  0.044901  1.129334

sel_pvalues
--------------------------------------------------
           msr          smsr       tve
num                                   
0    10.275905  6.232533e+07  0.767349
1    11.939986  1.034958e+03  0.693548

or

orig_pvalues
--------------------------------------------------
           msr      smsr       tve
num                               
0    33.480955  0.192724  1.102735
1    10.574725  0.119539  1.030937
2    87.085447  0.213421  1.118615
3    29.292777  0.039381  1.113723
4    53.199460  0.367594  1.119479
5    74.858267  0.001723  1.133817

sel_pvalues
--------------------------------------------------
            msr      smsr       tve
num                                
0      2.643406       inf  0.724096
1     18.257402       inf  0.660638
2     10.355453       inf  0.847936
3    286.907791  0.004012  1.100161
4     61.912769       inf  0.907840
5     24.653587       inf  0.629811

orig_pcc
--------------------------------------------------
          msr          smsr       tve
num                                  
0    0.054140  1.449171e+03  1.109632
1    0.081229  2.363349e+04  1.115492
2    0.054218  1.957496e+02  1.095400
3    0.069749  6.399938e+05  1.097500
4    0.053796  

  smsr_values = msr_values / (avg_rows ** 2 * avg_cols ** 2)


In [None]:
# Refit the R-based models once per cluster setting and print the external
# metrics of the biclusters detected in every reference data set. The
# dumps are read from disk again on every pass; biclusters() modifies the
# loaded models in place (n_clusters assignment and fit).
# NOTE(review): both model families receive the integer from n_bics here;
# n_checkers is not used in this cell - confirm that this is intended.
for num, n_clusters in enumerate(n_bics):

    print('Bic num {}'.format(num))

    # Models from the r_bic_* dumps.
    r_bic_models = fetch_model_dumps(path_to_rbic, ref_labels)
    r_bic_clusters = biclusters(r_bic_models, ref_data, n_clusters)
    for label in ref_labels:
        print(label)
        print('-' * 50)
        print(r_bic_clusters[label].external_metrics)
        print()

    # Separator between the two model families.
    print('#' * 50)

    # Models from the r_checker_* dumps.
    r_check_models = fetch_model_dumps(path_to_rcheck, ref_labels)
    r_check_clusters = biclusters(r_check_models, ref_data, n_clusters)
    for label in ref_labels:
        print(label)
        print('-' * 50)
        print(r_check_clusters[label].external_metrics)
        print()

Bic num 0
orig_pvalues
--------------------------------------------------
            msr      smsr       tve
num                                
0     80.217093  0.000720  1.137178
1      8.334898  0.294056  1.090539
2     21.034913  1.011289  1.128215
3     82.406561  0.085028  1.116917
4    107.978945  0.001262  1.131134

sel_pvalues
--------------------------------------------------


  smsr_values = msr_values / (avg_rows ** 2 * avg_cols ** 2)


          msr  smsr       tve
num                          
0    0.051643   inf  0.647230
1    0.019144   inf  0.415399

orig_pcc
--------------------------------------------------
          msr        smsr       tve
num                                
0    0.014772  189.978341  1.117893
1    0.008990   17.820997  1.121513

sel_pcc
--------------------------------------------------
          msr          smsr       tve
num                                  
0    0.004087  9.534707e+20  0.608865

##################################################
orig_pvalues
--------------------------------------------------
            msr      smsr       tve
num                                
0     80.217093  0.000720  1.137178
1      9.911217  0.148109  1.098073
2     20.209698  0.976898  1.128765
3     60.187932  1.148342  1.082856
4    103.328099  0.001391  1.134008

sel_pvalues
--------------------------------------------------
            msr      smsr       tve
num                              