# Bicluster reconstruction 

In [1]:
import os
import ast

import metrics
import validation

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.externals import joblib

# Notebook-wide plot defaults: larger text on a plain white axes background.
plt.rcParams.update({
    'font.size': 16,
    'axes.facecolor': 'white',
})

%matplotlib inline

In [2]:
def fetch_model_dumps(path_to_models, labels):
    """Load stored model dumps and key them by label.

    The dump at position ``i`` of ``path_to_models`` is stored under
    ``labels[i]``, so both sequences must be listed in the same order.

    Args:
        path_to_models: Iterable of paths handed to ``joblib.load``.
        labels: Indexable collection of dictionary keys, one per path.

    Returns:
        dict mapping each label to the object loaded from its path.
    """

    loaded = {}
    for position, dump_path in enumerate(path_to_models):
        # Load first, then look up the key: same order as a plain
        # ``loaded[labels[i]] = joblib.load(path)`` assignment.
        model = joblib.load(dump_path)
        loaded[labels[position]] = model

    return loaded

In [3]:
def biclusters(models, ref_data, n_clusters):
    """Refit each model on its reference data and wrap the detected clusters.

    For every ``label`` in ``ref_data`` the model stored under the same
    label is mutated in place: its ``n_clusters`` attribute is overwritten
    and ``fit`` is called on the underlying array of the data.

    Args:
        models: dict mapping label -> model exposing ``n_clusters``,
            ``fit``, ``rows_`` and ``columns_``.
        ref_data: dict mapping label -> object with a ``.values`` array
            (the notebook passes pandas data frames).
        n_clusters: Value assigned to ``n_clusters`` of every model.

    Returns:
        dict mapping label -> ``validation.Biclusters`` instance.
    """

    detected = {}
    for label, frame in ref_data.items():
        model = models[label]

        # Configure the number of clusters, then fit on the raw array.
        model.n_clusters = n_clusters
        model.fit(frame.values)

        # Row and column indicators must describe the same number of
        # biclusters.
        row_ind, col_ind = model.rows_, model.columns_
        assert np.shape(row_ind)[0] == np.shape(col_ind)[0]

        detected[label] = validation.Biclusters(
            rows=row_ind, cols=col_ind, data=frame
        )

    return detected

In [4]:
# Files holding the reference cluster indications, one for CpGs and one
# for genes.
path_target_cpgs = './../data/test/emQTL_Clusters_CpGs.txt'
path_target_genes = './../data/test/emQTL_Cluster_genes.txt'

In [5]:
# Labels identifying each class of reference data. The order matters:
# model dump paths are paired with these labels by position.
ref_labels = [
    'orig_pvalues',
    'sel_pvalues',
    'orig_pcc',
    'sel_pcc',
]

In [6]:
# NOTE: Clustering with fewer samples (genes) than features (CpGs) gave
# poorer recovery/relevance scores than clustering the transposed data.

# NOTE: Every matrix is therefore transposed to (cpgs x genes), ensuring
# num samples > num features.

# Each label maps to './../data/train/<label>_prep.csv'. All files are read
# with identical options, so the read is written once instead of being
# repeated per label.
ref_data = {
    label: pd.read_csv(
        './../data/train/{}_prep.csv'.format(label), sep=',', index_col=0
    ).T
    for label in ref_labels
}

In [7]:
# Build the reference clusters from the CpG and gene indication files.
refs = validation.References.from_files(path_target_cpgs, path_target_genes)

In [8]:
# Location of the pickled model dumps and the data variant each dump was
# prepared for. The variants are listed in the same order as `ref_labels`,
# because dumps are paired with labels by position when they are loaded.
_MODEL_DUMP_DIR = './../_model_dumps'
_MODEL_DUMP_VARIANTS = [
    'orig_prep_pvalues',
    'sel_prep_pvalues',
    'orig_prep_pcc',
    'sel_prep_pcc',
]


def _model_dump_paths(model_tag):
    """Return the dump path of `model_tag` for every data variant, in order."""
    return [
        '{0}/{1}_{2}.pkl'.format(_MODEL_DUMP_DIR, model_tag, variant)
        for variant in _MODEL_DUMP_VARIANTS
    ]


# NB: Swapped sk bic with sk checker. `path_to_skcheck` lists the 'sk_bic'
# dumps and `path_to_skbic` lists the 'sk_checker' dumps.
path_to_skcheck = _model_dump_paths('sk_bic')
path_to_skbic = _model_dump_paths('sk_checker')

# The R model dumps are not swapped.
path_to_rbic = _model_dump_paths('r_bic')
path_to_rcheck = _model_dump_paths('r_checker')

In [9]:
# Cluster settings tried per run; entry i of both lists belongs to run i.

# Input spectral biclustering: one tuple per run.
n_checkers = [
    (2, 1),
    (2, 2),
    (3, 1),
    (2, 3),
    (3, 2),
]
# Input spectral coclustering: one integer per run.
n_bics = list(range(2, 7))

## Reference matching

In [10]:
for num, n_clusters in enumerate(n_bics):

    print('Bic num {}'.format(num))

    # Spectral biclustering: refit the dumps listed in `path_to_skbic` with
    # the setting at the same position in `n_checkers`. The dumps are
    # re-read on every pass, so each fit starts from the stored model
    # (`biclusters` mutates the models it is given).
    sk_bic_models = fetch_model_dumps(path_to_skbic, ref_labels)
    sk_bic_clusters = biclusters(sk_bic_models, ref_data, n_checkers[num])
    for label in ref_labels:
        print('{0}\n{1}'.format(label, '-' * 50))
        print(validation.compare_clusters(sk_bic_clusters[label], refs))
        print()

    # Visual divider between the two model families.
    print('#' * 50)

    # Spectral coclustering: refit the dumps listed in `path_to_skcheck`
    # with the current entry of `n_bics`.
    sk_check_models = fetch_model_dumps(path_to_skcheck, ref_labels)
    sk_check_clusters = biclusters(sk_check_models, ref_data, n_clusters)
    for label in ref_labels:
        print('{0}\n{1}'.format(label, '-' * 50))
        print(validation.compare_clusters(sk_check_clusters[label], refs))
        print()

Bic num 0
orig_pvalues
--------------------------------------------------
num                   1            2       
kind               cpgs  genes  cpgs  genes
cluster score                              
ref1    recovery   51.2  100.0  48.8  100.0
        relevance  14.9    6.0  10.5    6.0
ref2    recovery    2.4  100.0  97.6  100.0
        relevance   0.8   10.2  22.1   10.2

sel_pvalues
--------------------------------------------------
num                   1            2       
kind               cpgs  genes  cpgs  genes
cluster score                              
ref1    recovery   63.6  100.0  36.4  100.0
        relevance   8.9    6.0  38.2    6.0
ref2    recovery   93.0  100.0   7.0  100.0
        relevance  13.8   10.2   7.8   10.2

orig_pcc
--------------------------------------------------
num                   1            2       
kind               cpgs  genes  cpgs  genes
cluster score                              
ref1    recovery   65.1  100.0  34.9  100.0
        r

num                   1            2            3       
kind               cpgs  genes  cpgs  genes  cpgs  genes
cluster score                                           
ref1    recovery   48.3  100.0  20.0  100.0  31.8  100.0
        relevance  98.0    6.0   2.9    6.0  42.6    6.0
ref2    recovery    0.0  100.0  68.1  100.0  31.9  100.0
        relevance   0.1   10.2  10.5   10.2  45.2   10.2

##################################################
orig_pvalues
--------------------------------------------------
num                   1            2           3          4      
kind               cpgs genes   cpgs  genes cpgs genes cpgs genes
cluster score                                                    
ref1    recovery    0.0   0.0  100.0  100.0  0.0   0.0  0.0   0.0
        relevance   0.0   0.0   33.1   24.6  0.0   0.0  0.0   0.0
ref2    recovery   98.3  91.9    1.6    8.1  0.0   0.0  0.2   0.0
        relevance  24.8  16.1    0.6    3.4  0.0   0.0  0.3   0.0

sel_pvalues
----------

orig_pvalues
--------------------------------------------------
num                    1            2            3           4            5  \
kind                cpgs genes   cpgs  genes  cpgs genes  cpgs  genes  cpgs   
cluster score                                                                 
ref1    recovery   100.0   0.0  100.0  100.0   0.0   0.0   0.0  100.0   0.0   
        relevance   34.6   0.0   34.6   33.5   0.0   0.0   0.0   33.5   0.0   
ref2    recovery     1.6  93.4    1.6    6.6  11.3  93.4  11.3    6.6  87.1   
        relevance    0.6  11.6    0.6    3.7   5.6  11.6   5.6    3.7  29.8   

num                         6         
kind              genes  cpgs  genes  
cluster score                         
ref1    recovery    0.0   0.0  100.0  
        relevance   0.0   0.0   33.5  
ref2    recovery   93.4  87.1    6.6  
        relevance  11.6  29.8    3.7  

sel_pvalues
--------------------------------------------------
num                   1           2          

In [None]:
for num, n_clusters in enumerate(n_bics):

    print('Bic num {}'.format(num))

    # Refit the dumps listed in `path_to_rbic` with the current entry of
    # `n_bics`. The dumps are re-read on every pass, so each fit starts
    # from the stored model (`biclusters` mutates the models it is given).
    r_bic_models = fetch_model_dumps(path_to_rbic, ref_labels)
    r_bic_clusters = biclusters(r_bic_models, ref_data, n_clusters)
    for label in ref_labels:
        print('{0}\n{1}'.format(label, '-' * 50))
        print(validation.compare_clusters(r_bic_clusters[label], refs))
        print()

    # Visual divider between the two model families.
    print('#' * 50)

    # Same procedure for the dumps listed in `path_to_rcheck`; note that
    # these also receive the integer setting from `n_bics`.
    r_check_models = fetch_model_dumps(path_to_rcheck, ref_labels)
    r_check_clusters = biclusters(r_check_models, ref_data, n_clusters)
    for label in ref_labels:
        print('{0}\n{1}'.format(label, '-' * 50))
        print(validation.compare_clusters(r_check_clusters[label], refs))
        print()

Bic num 0
orig_pvalues
--------------------------------------------------
num                   1            2            3           4           5  \
kind               cpgs  genes  cpgs  genes  cpgs genes  cpgs genes  cpgs   
cluster score                                                               
ref1    recovery   97.7  100.0   0.0  100.0  52.1   0.0  17.3   0.0   0.0   
        relevance  55.3   48.5   0.0   36.7  65.4   0.0  75.9   0.0   0.0   
ref2    recovery    0.1    1.1  16.2    3.3   0.0  53.9   0.0  23.6   0.1   
        relevance   0.0    0.9  11.3    2.1   0.0  17.8   0.0   7.5  22.2   

num                         6          7        
kind              genes  cpgs genes cpgs genes  
cluster score                                   
ref1    recovery    0.0   0.0   0.0  0.1   1.2  
        relevance   0.0   0.0   0.0  4.3   0.3  
ref2    recovery   40.6   0.2  17.0  0.0  67.9  
        relevance  18.4  10.0   9.0  0.0  23.9  

sel_pvalues
------------------------------