# Second chances

**Goal**

Verify that the model selection procedure selects the most suitable algorithms.

In [None]:
import os
import ast

import algorithms
import validation

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Used to build the R formula passed to the Plaid model.
import rpy2.robjects as robjects

from sklearn.cluster import SpectralBiclustering

# Plot styling: font size 16 and a white axes background.
plt.rcParams.update({
    'font.size': 16,
    'axes.facecolor': 'white',
})

%matplotlib inline

In [None]:
def biclusters(models, ref_labels, data=None):
    """Wrap the clusters detected by each fitted model in `validation.Biclusters`.

    Args:
        models: Mapping from label to a fitted model exposing the `rows_`
            and `columns_` attributes.
        ref_labels: Iterable of labels selecting which models to process.
        data: Optional mapping from label to the data set the model was
            fitted to. Defaults to the notebook-level `ref_data`, which
            preserves the original two-argument call.

    Returns:
        dict mapping each label to a `validation.Biclusters` instance.

    Raises:
        ValueError: If a model reports a different number of row and
            column indicators.
    """
    if data is None:
        # Backward compatible fallback to the notebook-level data sets.
        data = ref_data

    detected = {}
    for label in ref_labels:
        rows = models[label].rows_
        cols = models[label].columns_

        # Each bicluster needs one row indicator and one column indicator.
        num_row_sets = np.shape(rows)[0]
        num_col_sets = np.shape(cols)[0]
        if num_row_sets != num_col_sets:
            raise ValueError(
                'Model {!r} has {} row indicators but {} column '
                'indicators.'.format(label, num_row_sets, num_col_sets)
            )

        detected[label] = validation.Biclusters(
            rows=rows, cols=cols, data=data[label]
        )

    return detected

In [None]:
# Locations of the reference cluster indication files (genes and CpGs).
path_target_genes, path_target_cpgs = (
    './../data/test/emQTL_Cluster_genes.txt',
    './../data/test/emQTL_Clusters_CpGs.txt',
)

In [None]:
# Labels identifying each class of reference data; used as dictionary keys
# throughout the notebook.
ref_labels = [
    'orig_pvalues',
    'sel_pvalues',
    'orig_pcc',
    'sel_pcc',
]

In [None]:
# Read experimental data.
# Each entry holds (label, CSV path, transpose after loading).
# NOTE: The `orig_*` matrices are transposed to (CpGs x genes).
_prep_sources = (
    (ref_labels[0], './../data/train/orig_pvalues_prep.csv', True),
    (ref_labels[1], './../data/train/sel_pvalues_prep.csv', False),
    (ref_labels[2], './../data/train/orig_pcc_prep.csv', True),
    (ref_labels[3], './../data/train/sel_pcc_prep.csv', False),
)

ref_data = {}
for _label, _csv_path, _transpose in _prep_sources:
    # The first CSV column holds the row index.
    _frame = pd.read_csv(_csv_path, sep=',', index_col=0)
    ref_data[_label] = _frame.T if _transpose else _frame

In [None]:
# Load the reference clusters from the CpG and gene indication files.
ref_clusters = validation.References.from_files(path_target_cpgs, path_target_genes)

## Model: Spectral Biclustering

In [None]:
# Spectral biclustering estimator configuration.
spectral_params = {'n_clusters': 2, 'method': 'log'}
biclust = SpectralBiclustering(**spectral_params)

In [None]:
# Fit models to data.
# NOTE: scikit-learn's `fit` returns the estimator itself. Fitting the single
# `biclust` instance repeatedly would make every label point to the same
# object, holding only the result of the last fit. Fit an unfitted clone
# (same parameters) per data set instead.
from sklearn.base import clone

sk_models = {}
for label in ref_labels:
    sk_models[label] = clone(biclust).fit(ref_data[label])

In [None]:
# Wrap the clusters detected by the spectral biclustering models.
sk_biclusters = biclusters(models=sk_models, ref_labels=ref_labels)

### Bicluster statistics

In [None]:
# Print the bicluster statistics of the spectral models, one section per label.
rule = '-' * 50
for label in ref_labels:
    header = '{0}\n{1}'.format(label, rule)
    print(header)
    print(sk_biclusters[label].stats)
    print()

### Reference comparison

In [None]:
# Compare the spectral biclusters to the reference clusters, per label.
rule = '-' * 50
for label in ref_labels:
    print('{0}\n{1}'.format(label, rule))
    comparison = validation.compare_clusters(sk_biclusters[label], ref_clusters)
    print(comparison)
    print()

## Model: XMotifs

In [None]:
# XMotifs model configuration.
xmot_params = {
    'number': 2,
    'ns': 200,
    'nd': 100,
    'sd': 5,
    'alpha': 0.05,
}
xmot = algorithms.XMotifs(**xmot_params)

In [None]:
# Fit the XMotifs model to the raw values of every data set.
# NOTE(review): if `fit` returns the model itself (scikit-learn convention),
# all entries alias one object reflecting only the last fit. Confirm against
# the `algorithms` module.
xmot_models = {
    label: xmot.fit(ref_data[label].values) for label in ref_labels
}

In [None]:
# Wrap the clusters detected by the XMotifs models.
xmot_biclusters = biclusters(models=xmot_models, ref_labels=ref_labels)

### Bicluster statistics

In [None]:
# Print the bicluster statistics of the XMotifs models, one section per label.
# NOTE: This section reports on `xmot_biclusters`; it previously printed the
# spectral biclustering statistics by mistake.
for label in ref_labels:
    print('{0}\n{1}'.format(label, '-' * 50))
    print(xmot_biclusters[label].stats)
    print()

### Reference comparison

In [None]:
# Compare the XMotifs biclusters to the reference clusters, per label.
rule = '-' * 50
for label in ref_labels:
    print('{0}\n{1}'.format(label, rule))
    comparison = validation.compare_clusters(xmot_biclusters[label], ref_clusters)
    print(comparison)
    print()

## Model: Plaid

In [None]:
# Plaid model configuration.
# NOTE: `back_fit` was previously passed twice, which Python rejects with
# "SyntaxError: keyword argument repeated"; both occurrences were 0.
plaid = algorithms.Plaid(
    cluster='b',
    # Model formula evaluated in R through rpy2.
    fit_model=robjects.r('y ~ m + a + b'),
    background=True,
    row_release=0.7,
    col_release=0.7,
    shuffle=3,
    back_fit=0,
    max_layers=20,
    iter_startup=5,
    iter_layer=10,
    verbose=False,
)

In [None]:
# Fit the Plaid model to the raw values of every data set.
# NOTE(review): if `fit` returns the model itself (scikit-learn convention),
# all entries alias one object reflecting only the last fit. Confirm against
# the `algorithms` module.
plaid_models = {
    label: plaid.fit(ref_data[label].values) for label in ref_labels
}

In [None]:
# Wrap the clusters detected by the Plaid models.
plaid_biclusters = biclusters(models=plaid_models, ref_labels=ref_labels)

### Bicluster statistics

In [None]:
# Print the bicluster statistics of the Plaid models, one section per label.
rule = '-' * 50
for label in ref_labels:
    header = '{0}\n{1}'.format(label, rule)
    print(header)
    print(plaid_biclusters[label].stats)
    print()

### Reference comparison

In [None]:
# Compare the Plaid biclusters to the reference clusters, per label.
rule = '-' * 50
for label in ref_labels:
    print('{0}\n{1}'.format(label, rule))
    comparison = validation.compare_clusters(plaid_biclusters[label], ref_clusters)
    print(comparison)
    print()

## Model: Cheng Church

In [None]:
# Cheng and Church model configuration.
cc_params = {'delta': 0.1, 'alpha': 1.5, 'number': 2}
cc = algorithms.ChengChurch(**cc_params)

In [None]:
# Fit the Cheng and Church model to the raw values of every data set.
# NOTE(review): if `fit` returns the model itself (scikit-learn convention),
# all entries alias one object reflecting only the last fit. Confirm against
# the `algorithms` module.
cc_models = {
    label: cc.fit(ref_data[label].values) for label in ref_labels
}

In [None]:
# Create Bicluster instances tracking the clusters detected by the
# Cheng and Church models.
# NOTE: Built from `cc_models`; it previously wrapped `plaid_models`, so the
# Cheng and Church sections reported the Plaid results.
cc_biclusters = biclusters(cc_models, ref_labels)

### Bicluster statistics

In [None]:
# Print the statistics of the biclusters in `cc_biclusters`, per label.
rule = '-' * 50
for label in ref_labels:
    header = '{0}\n{1}'.format(label, rule)
    print(header)
    print(cc_biclusters[label].stats)
    print()

### Reference comparison

In [None]:
# Compare the biclusters in `cc_biclusters` to the reference clusters, per label.
rule = '-' * 50
for label in ref_labels:
    print('{0}\n{1}'.format(label, rule))
    comparison = validation.compare_clusters(cc_biclusters[label], ref_clusters)
    print(comparison)
    print()