# Predictions


In [None]:
import os
import ast
import subprocess

import validation

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.externals import joblib

plt.rcParams['font.size'] = 16
plt.rcParams['axes.facecolor'] = 'white'

%matplotlib inline

In [None]:
def fetch_model_dumps(path_to_models, labels):
    """Load model dumps from disk and key them by label.

    The dump at position ``i`` of `path_to_models` is stored under
    ``labels[i]``, so both sequences must be given in the same order.
    Returns a dict mapping each label to the object returned by
    ``joblib.load``.
    """

    loaded = {}
    for position, dump_path in enumerate(path_to_models):
        # NOTE: joblib.load unpickles the file; only load trusted dumps.
        estimator = joblib.load(dump_path)
        loaded[labels[position]] = estimator

    return loaded

In [None]:
def biclusters(models, ref_labels, data=None):
    """Create Bicluster instances tracking detected clusters.

    Args:
        models: Mapping from label to a fitted model exposing the
            attributes ``rows_`` and ``columns_``.
        ref_labels: Iterable of labels selecting which models to wrap.
        data: Optional mapping from label to the data the model was
            fitted on. When omitted, the notebook-level ``ref_data``
            is used, which keeps existing two-argument calls working.

    Returns:
        A dict mapping each label to a ``validation.Biclusters``
        instance built from the model's row and column indicators.

    Raises:
        AssertionError: If a model holds a different number of row and
            column indicators.
    """

    detected = {}
    for label in ref_labels:
        model = models[label]
        rows, cols = model.rows_, model.columns_

        # Row and column indicators are compared along their first axis
        # and must have the same length there.
        assert np.shape(rows)[0] == np.shape(cols)[0], (
            'Mismatching number of row and column indicators '
            'for {0}'.format(label)
        )

        # Resolve the data source lazily so that the global is only
        # touched when the caller did not pass `data` explicitly.
        source = ref_data if data is None else data

        detected[label] = validation.Biclusters(
            rows=rows, cols=cols, data=source[label]
        )

    return detected

In [None]:
def sklearn_graphics(data, title, out_path):
    """Render `data` as a heatmap and write the figure to `out_path`.

    A new 10 x 10 figure titled `title` is created, the heatmap is drawn
    with the colour scale pinned to the minimum and maximum of `data`,
    the axes are hidden and the result is saved. Returns None.
    """

    plt.figure(figsize=(10, 10))
    plt.title(title)

    # NOTE(review): `robust` presumably has no effect here since both
    # vmin and vmax are given explicitly -- confirm against seaborn docs.
    heatmap_options = dict(
        robust=True,
        cmap=plt.cm.RdBu_r,
        fmt='f',
        vmin=np.min(data),
        vmax=np.max(data),
    )
    sns.heatmap(data, **heatmap_options)

    plt.axis('off')
    plt.tight_layout()
    plt.savefig(out_path)

In [None]:
def r_graphics():
    """Placeholder for the R graphics step; does nothing.

    NOTE: It is necessary to execute biclust_graphics.R with model,
    hparams and dataset to produce the "hm" (presumably the heatmap).
    """

In [None]:
# Globals
# Number of clusters assigned to `n_clusters` of every loaded model
# before the models are fitted.
N_CLUSTERS = 2

In [None]:
# Paths to ref cluster indications.
# Both files are read by `validation.References.from_files` to build the
# reference clusters.
path_target_genes = './../data/test/emQTL_Cluster_genes.txt'
path_target_cpgs = './../data/test/emQTL_Clusters_CpGs.txt'

In [None]:
# Labels identifying each class of reference data. The order matters:
# data sets and model dumps are matched to these labels by position.
ref_labels = [
    'orig_pvalues',
    'sel_pvalues',
    'orig_pcc',
    'sel_pcc',
]

In [None]:
# Load the preprocessed experimental data, one table per reference label.
# The first CSV column is used as the index of each table.
ref_data = {
    ref_labels[0]: pd.read_csv(
        './../data/train/orig_pvalues_prep.csv', index_col=0, sep=','
    ),
    # NOTE: The selected tables are transposed to (genes x cpgs).
    ref_labels[1]: pd.read_csv(
        './../data/train/sel_pvalues_prep.csv', index_col=0, sep=','
    ).transpose(),
    ref_labels[2]: pd.read_csv(
        './../data/train/orig_pcc_prep.csv', index_col=0, sep=','
    ),
    # NOTE: The selected tables are transposed to (genes x cpgs).
    ref_labels[3]: pd.read_csv(
        './../data/train/sel_pcc_prep.csv', index_col=0, sep=','
    ).transpose(),
}

In [None]:
# Build the reference clusters from the target files.
# NOTE: The CpG path is passed first and the gene path second.
ref_clusters = validation.References.from_files(
    path_target_cpgs, path_target_genes
)

## Models: sklearn

In [None]:
# Model dumps for the sklearn section, listed in the same order as
# `ref_labels` because `fetch_model_dumps` pairs paths and labels by
# position.
path_to_skmodels = [
    './../model_dumps/sk_orig_prep_pvalues.pkl',
    './../model_dumps/sk_sel_prep_pvalues.pkl',
    './../model_dumps/sk_orig_prep_pcc.pkl',
    './../model_dumps/sk_sel_prep_pcc.pkl',
]
sk_models = fetch_model_dumps(path_to_skmodels, ref_labels)

In [None]:
# Set number of clusters to detect.
# The same value (N_CLUSTERS) is assigned to every loaded sklearn model.
for model in sk_models.values():
    model.n_clusters = N_CLUSTERS

In [None]:
# Fit selected models to data.
# Each model is fitted on the table stored under its own label in
# `ref_data`.
for name, model in sk_models.items():
    model.fit(ref_data[name])

In [None]:
# Create Bicluster instances tracking detected clusters, one per label
# in `ref_labels`.
sk_biclusters = biclusters(sk_models, ref_labels)

### Bicluster statistics

In [None]:
# Print the `stats` attribute of each sklearn bicluster set beneath a
# header holding the label and a 50-character dashed rule.
for label in ref_labels:
    print('{0}\n{1}'.format(label, '-' * 50))
    print(sk_biclusters[label].stats)
    print()

### Reference comparison

In [None]:
# Print the outcome of comparing each sklearn bicluster set with the
# reference clusters, beneath a header holding the label.
for label in ref_labels:
    print('{0}\n{1}'.format(label, '-' * 50))
    print(validation.compare_clusters(sk_biclusters[label], ref_clusters))
    print()

### Visualization

In [None]:
# Reconstruct data matrices by sorting data according to
# predicted biclusters.
reconstr_data = {}
for label in ref_labels:
    
    # Extract fitted model
    model = sk_models[label]
    
    # Sort reference data
    # Ordering the rows by their row label places rows that share a
    # label next to each other.
    data = ref_data[label].values
    row_sorted_data = data[np.argsort(model.row_labels_), :]
    # The columns of the row-sorted matrix are ordered by column label
    # in the same way.
    sorted_col_idx = np.argsort(model.column_labels_)
    reconstr_data[label] = row_sorted_data[:, sorted_col_idx]

In [None]:
"""sklearn_graphics(
    reconstr_data[ref_labels[0]],
    'Biclustering results of preprocessed\n'
    'Bonferroni corrected p-values', 
    './../predictions/imgs/org_prep_pvalues.png'
)"""

In [None]:
"""sklearn_graphics(
    reconstr_data[ref_labels[1]],
    'Biclustering results of selected preprocessed\n'
    'Bonferroni corrected p-values', 
    './../predictions/imgs/sel_prep_pvalues.png'
)"""

In [None]:
"""sklearn_graphics(
    reconstr_data[ref_labels[2]],
    'Biclustering results of preprocessed\n'
    'Pearson`s correlation coefficients', 
    './../predictions/imgs/org_prep_pcc.png'
)"""

In [None]:
"""sklearn_graphics(
    reconstr_data[ref_labels[3]],
    'Biclustering results of selected preprocessed\n'
    'Pearson`s correlation coefficients', 
    './../predictions/imgs/sel_prep_pcc.png'
)"""

## Models: R

In [None]:
# Collect pickled wrapped R models.
# NOTE: The paths are listed in the same order as `ref_labels` because
# `fetch_model_dumps` pairs paths and labels by position. The first
# entry must therefore be the p-value model for the original data; it
# previously pointed at the PCC dump, which was loaded twice.
path_to_rmodels = [
    './../model_dumps/r_orig_prep_pvalues.pkl',
    './../model_dumps/r_sel_prep_pvalues.pkl',
    './../model_dumps/r_orig_prep_pcc.pkl',
    './../model_dumps/r_sel_prep_pcc.pkl',
]
r_models = fetch_model_dumps(path_to_rmodels, ref_labels)

In [None]:
# Set number of clusters to detect.
# The same value (N_CLUSTERS) is assigned to every loaded R model.
for model in r_models.values():
    model.n_clusters = N_CLUSTERS 

In [None]:
# Fit each R model on the data stored under its own label in `ref_data`.
for name, model in r_models.items():
    # NOTE: Convert from <pandas.DataFrame> to <numpy.ndarray>
    model.fit(ref_data[name].values)

In [None]:
# Create Bicluster instances tracking detected clusters, one per label
# in `ref_labels`.
r_biclusters = biclusters(r_models, ref_labels)

### Bicluster statistics

In [None]:
# Print the `stats` attribute of each R bicluster set beneath a header
# holding the label and a 50-character dashed rule.
for label in ref_labels:
    print('{0}\n{1}'.format(label, '-' * 50))
    print(r_biclusters[label].stats)
    print()

### Reference comparison

In [None]:
# Print the outcome of comparing each R bicluster set with the reference
# clusters, beneath a header holding the label.
# NOTE: `compare_clusters` lives in the `validation` module and is not
# imported by name, so it has to be accessed through the module.
for label in ref_labels:
    print('{0}\n{1}'.format(label, '-' * 50))
    print(validation.compare_clusters(r_biclusters[label], ref_clusters))
    print()

### Visualization

In [None]:
# Graphics generated with biclust_graphics.R.

In [None]:
# TEMP
def preds_to_disk(models, parent='./../predictions/'):
    """Generate txt files containing row and column indicators for
    detected biclusters associated with different datasets.

    NOTE: Work in progress. The function currently only iterates over
    `models.items()` and writes nothing. The intended file-writing
    logic is parked in a string literal inside the loop and is never
    executed, so `parent` is unused and the function returns None.
    """
    
    for label, model in models.items():
    
        pass
    
    # Store bicluster data (row and col labels) for each set of ref data.
    # NOTE: The string below is disabled code, not a docstring. It refers
    # to a name `clusters` that is not defined in this function.
        """stem = '{0}_biclusters_{1}.txt'.format(model, label)
        with open(os.path.join(parent, stem), 'w') as outfile:   
            outfile.write('biclusters_{0}\n'.format(label))

            for cluster_num, coords in clusters[label].items():
                outfile.write('cluster_num_{0}\n'.format(cluster_num))
                outfile.write('{0}\n'.format(coords['cpgs']))
                outfile.write('{0}\n'.format(coords['genes']))
    
    return None"""