# Reproduce

* How many true CpGs/genes are included in each detected bicluster?
* How many CpGs/genes are found in each detected bicluster, but are not part of the target bicluster?
* What is the min/max/std of the data points of each bicluster (cluster heterogeneity)?

In [None]:
# NOTE: Try Agglomerative algorithm
# Reference data matrix; loaded further down with pd.read_csv using the
# first column as the index.
path_ref_data = './../data/train/sel_pvalues_prep.csv'
# Files listing the target genes and target CpGs; both are parsed with
# fetch_targets further down.
path_target_genes = './../data/test/emQTL_Cluster_genes.txt'
path_target_cpgs = './../data/test/emQTL_Clusters_CpGs.txt'

In [None]:
import ast
import algorithms

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from collections import OrderedDict

from sklearn.cluster import SpectralBiclustering
from sklearn.cluster import SpectralCoclustering
from sklearn.cluster import AgglomerativeClustering

%matplotlib inline

In [None]:
def recovery_score(true, pred):
    """Return the share of the target set that was recovered.

    Counts the entries of `pred` that also occur in `true` and divides
    that count by the number of entries in `true`.
    """

    hits = np.isin(pred, true)
    n_targets = np.size(true)
    return hits.sum() / n_targets

In [None]:
def relevance_score(true, pred):
    """Return the share of the predicted set that consists of targets.

    Counts the entries of `true` that also occur in `pred` and divides
    that count by the number of entries in `pred`. The result is the
    fraction of the predicted cluster made up of true items (not the
    fraction of predicted items falling outside the target set).
    """

    matched = np.isin(true, pred)
    return matched.sum() / np.size(pred)

In [None]:
def cluster_coord_indicators(row_mat, col_mat):
    """Determine coordinates of row and column indicators
    for each bicluster.

    Args:
        row_mat: 2-D array indexed as [cluster, row position]; a nonzero
            entry marks the row as a member of that bicluster.
        col_mat: 2-D array indexed as [cluster, column position]; a
            nonzero entry marks the column as a member of that bicluster.

    Returns:
        A tuple (row_idx, col_idx) of lists. Each list holds, per
        bicluster, a list with the integer positions of its members.
    """

    # The number of biclusters is taken from the row indicator matrix.
    num_biclusters = row_mat.shape[0]

    row_idx, col_idx = [], []
    for cluster_num in range(num_biclusters):
        # Positions of the nonzero indicators, as plain Python ints.
        rows = np.flatnonzero(row_mat[cluster_num, :] != 0).tolist()
        cols = np.flatnonzero(col_mat[cluster_num, :] != 0).tolist()

        row_idx.append(rows)
        col_idx.append(cols)

    return row_idx, col_idx

In [None]:
def cluster_scores(preds, refs, targets):
    """Score every predicted cluster against the two target clusters.

    Args:
        preds: Sequence with one collection of integer positions per
            predicted cluster. The collections may differ in length.
        refs: Array of labels that is indexed with each collection of
            positions to obtain the labels of a predicted cluster.
        targets: Mapping holding the target labels under the keys '1'
            and '2'.

    Returns:
        A pandas DataFrame with one column per predicted cluster
        ('cluster_1', 'cluster_2', ...) and a two-level row index of
        target cluster ('cluster1', 'cluster2') by score ('recovery',
        'relevance').
    """
    true1, true2 = targets['1'], targets['2']

    scores = {
        'cl1_recovery': [], 'cl1_relevance': [],
        'cl2_recovery': [], 'cl2_relevance': [],
    }
    for pred_ids in preds:

        pred = refs[pred_ids]
        # Frac targets among predicted (true positives).
        scores['cl1_recovery'].append(recovery_score(true1, pred))
        scores['cl2_recovery'].append(recovery_score(true2, pred))
        # Frac detected targets compared to cluster size.
        scores['cl1_relevance'].append(relevance_score(true1, pred))
        scores['cl2_relevance'].append(relevance_score(true2, pred))

    df_scores = pd.DataFrame(scores).T
    # len() instead of np.shape(): the index collections are usually of
    # unequal length, and converting such a ragged sequence to an array
    # is an error in recent NumPy releases.
    df_scores.columns = [
        'cluster_{}'.format(str(num + 1))
        for num in range(len(preds))
    ]
    # Row order follows the insertion order of the `scores` keys above.
    df_scores.index = pd.MultiIndex.from_product(
        [('cluster1', 'cluster2'), ('recovery', 'relevance')]
    )

    return df_scores

In [None]:
def fetch_targets(path_to_targets, num_clusters=2):
    """Read target identifiers grouped by cluster label from a text file.

    The first line of the file is treated as a header and skipped. Every
    following line is split on whitespace: the first field is the
    identifier (parsed with ast.literal_eval) and the second field is
    the cluster label. Lines with three fields (CpG files) have their
    third field ignored; lines with two fields are gene records. Lines
    with any other number of fields, including blank lines, are skipped.

    Args:
        path_to_targets: Path to the text file to read.
        num_clusters: Number of cluster labels to prepare; labels are
            the strings '1' .. str(num_clusters).

    Returns:
        A dict mapping each cluster label to the list of identifiers
        assigned to it.

    Raises:
        KeyError: If a record carries a label outside the prepared ones.
        ValueError, SyntaxError: If an identifier is not a valid Python
            literal (raised by ast.literal_eval).
    """

    targets = {str(num + 1): [] for num in range(num_clusters)}
    with open(path_to_targets, 'r') as infile:
        # Skip header line (no-op for an empty file).
        next(infile, None)
        # Iterating the file keeps the final record even when the file
        # does not end with a newline.
        for row in infile:
            fields = row.split()
            # CpGs have three fields, genes have two.
            if len(fields) in (2, 3):
                value, idx = fields[0], fields[1]
                targets[idx].append(ast.literal_eval(value))

    return targets

In [None]:
from scipy.stats import rankdata

def avg_spearmans_rho(row_idx, col_idx, data):
    """Assess bicluster quality.

    Placeholder: both loops below have empty bodies and the function
    has no return statement, so it currently always returns None.
    """
    
    # Can detect shift, scale and shift-scale patterns.
    # From paper:
    # http://www.scitepress.org/Papers/2018/66625/66625.pdf
    
    # Accumulator for the row-wise part; not updated yet.
    rho_rows = 0
    for num, rows in enumerate(row_idx):
        pass
    
    # Accumulator for the column-wise part; not updated yet.
    # NOTE(review): this loop iterates row_idx again; presumably col_idx
    # was intended here -- confirm before implementing.
    rho_cols = 0
    for num, rows in enumerate(row_idx):
        pass

In [None]:
def bicluster_stats(row_idx, col_idx, data):
    """Summarise the data points of every bicluster in a table.

    For bicluster number k the sub-matrix of `data` given by the rows
    `row_idx[k]` and the columns `col_idx[k]` is extracted. The returned
    DataFrame has one column per bicluster ('cluster_1', ...) and one
    row per statistic: max, min, std, nonzeros, nrows, ncols and size
    (the fraction of all entries of `data` covered by the bicluster).
    """

    total_points = np.size(data)

    summary = {}
    for pos, row_ids in enumerate(row_idx):
        # Select the member rows first, then the member columns.
        block = data[row_ids, :][:, col_idx[pos]]
        height, width = np.shape(block)
        summary[str(pos)] = dict(
            max=np.max(block),
            min=np.min(block),
            std=np.std(block),
            # NOTE: despite its label this entry counts the cells that
            # are equal to zero.
            nonzeros=int(np.count_nonzero(block == 0)),
            nrows=int(height),
            ncols=int(width),
            size=(height * width) / total_points,
        )

    df_stats = pd.DataFrame(summary)
    # Replace the zero-based keys with one-based cluster names.
    df_stats.columns = [
        'cluster_{0}'.format(label) for label in range(1, len(summary) + 1)
    ]

    return df_stats

In [None]:
# Shapes of the row and column index lists of the first spectral bicluster.
# NOTE(review): bispec_cpg_idx and bispec_gene_idx are only assigned in a
# later cell, so this cell raises NameError on a fresh top-to-bottom run.
np.shape(bispec_cpg_idx[0]), np.shape(bispec_gene_idx[0])

In [None]:
data = pd.read_csv(path_ref_data, sep=',', index_col=0)
data.head()

In [None]:
# Keep the labels as arrays: cluster_scores indexes them with lists of
# integer positions, which needs array (fancy) indexing.
all_genes = np.array(data.columns, dtype=object) 
all_cpgs = np.array(data.index, dtype=object)
# Read target CpGs and genes from files.
target_cpgs = fetch_targets(path_target_cpgs)
target_genes = fetch_targets(path_target_genes)

## sklearn: spectral biclustering

In [None]:
# Spectral biclustering assumes the input data matrix has a hidden
# checkerboard structure.

# NB: n_clusters=(2, 2) presumably requests 2 row clusters and 2 column
# clusters, i.e. 2 x 2 = 4 biclusters. The earlier note here read
# "2^n clusters for n clusters", which coincides for n = 2 -- verify the
# general rule against the scikit-learn documentation.
bispec = SpectralBiclustering(
    n_clusters=(2, 2), method='log', random_state=0
)
bispec.fit(data)

In [None]:
bispec_cpg_idx, bispec_gene_idx = cluster_coord_indicators(
    bispec.rows_, bispec.columns_
)

In [None]:
bicluster_stats(
    bispec_cpg_idx, bispec_gene_idx, data.values
)

**Observations**
* All biclusters involve zeros.
* Each bicluster constitutes only a small part of the data matrix itself.
* Largest std in cluster 3.

In [None]:
bispec_cpg_scores = cluster_scores(
    bispec_cpg_idx, all_cpgs, target_cpgs
)
bispec_cpg_scores 

**Observations**
* Run $n=2$:
    * Clusters 3 and 4 contain $\approx 77 \%$ of the target CpGs of cluster 1, which constitute $\approx 97 \%$ of the total cluster contents.
    * Clusters 1 and 2 contain $\approx 99 \%$ of the target CpGs of cluster 2, but these clusters also contain many other samples.

In [None]:
bispec_gene_scores = cluster_scores(
    bispec_gene_idx, all_genes, target_genes
)
bispec_gene_scores 

**Observations**
* Run $n=2$:
    * Clusters 1 and 3 contain $\approx 82 \%$ of the target genes of cluster 1, which constitute $\approx 97 \%$ of the total cluster contents.
    * Clusters 2 and 4 contain $\approx 99 \%$ of the target genes of cluster 2, but these clusters also contain many other samples.

## sklearn: spectral coclustering

In [None]:
cospec = SpectralCoclustering(
    n_clusters=2, random_state=0
)
cospec.fit(data)

In [None]:
cospec_cpg_idx, cospec_gene_idx = cluster_coord_indicators(
    cospec.rows_, cospec.columns_
)

In [None]:
bicluster_stats(
    cospec_cpg_idx, cospec_gene_idx, data.values
)

**Observations**
* All biclusters include zeros.
* Cluster 2 is significantly smaller than cluster 1.

In [None]:
cospec_cpg_scores = cluster_scores(
    cospec_cpg_idx, all_cpgs, target_cpgs
)
cospec_cpg_scores 

In [None]:
cospec_gene_scores = cluster_scores(
    cospec_gene_idx, all_genes, target_genes
)
cospec_gene_scores 

**Observations**
* Run $n=2$:
    * All target cluster contents are captured in cluster 1, rendering cluster 2 as noise.
    * Cluster 1 contains many more samples than the target clusters and thus appears to be too coarse to capture any relevant information.

## R: Plaid

In [None]:
# Fit the Plaid model from the project-local `algorithms` module on the
# raw data matrix.
plaid = algorithms.Plaid()
plaid.fit(data.values)

# Convert the fitted row/column indicator matrices into per-bicluster
# index lists. The helper defined in this notebook is named
# cluster_coord_indicators; `cluster_indicators` is not defined here.
plaid_cpg_idx, plaid_gene_idx = cluster_coord_indicators(
    plaid.rows_, plaid.columns_
)

## R: xMotifs

In [None]:
xmot = algorithms.XMotifs(number=2)
xmot.fit(data.values)

In [None]:
# Convert the fitted row/column indicator matrices into per-bicluster
# index lists. The helper defined in this notebook is named
# cluster_coord_indicators; `cluster_indicators` is not defined here.
xmot_cpg_idx, xmot_gene_idx = cluster_coord_indicators(
    xmot.rows_, xmot.columns_
)

In [None]:
xmot_cpg_scores = cluster_scores(
    xmot_cpg_idx, all_cpgs, target_cpgs
)
xmot_cpg_scores 

In [None]:
xmot_gene_scores = cluster_scores(
    xmot_gene_idx, all_genes, target_genes
)
xmot_gene_scores 