# Reproduce

* How many true CpGs/genes are included in each detected bicluster?
* How many CpGs/genes are found in each detected bicluster but are not part of the target bicluster?
* What are the min/max/std of the data points in each bicluster (cluster heterogeneity)?

In [1]:
# NOTE: Try Agglomerative algorithm

# Reference matrix that the clustering models are fitted on.
path_ref_data = './../data/train/sel_pvalues_prep.csv'

# Files listing the target cluster membership of CpGs and genes.
path_target_cpgs = './../data/test/emQTL_Clusters_CpGs.txt'
path_target_genes = './../data/test/emQTL_Cluster_genes.txt'

In [2]:
import ast
import algorithms

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from collections import OrderedDict

from sklearn.cluster import SpectralBiclustering
from sklearn.cluster import SpectralCoclustering

%matplotlib inline

In [3]:
def cluster_similarity_score(true, pred):
    """Score how much of `true` is matched by `pred`.

    Counts the entries of `pred` that also occur in `true` and divides
    that count by the number of entries in `true`. The result is a
    fraction, not a value scaled to 100.
    """

    matched = np.isin(pred, true)
    n_true = np.size(true)

    return matched.sum() / n_true

In [4]:
def cluster_indicators(row_mat, col_mat):
    """Collect the member row and column positions of every bicluster.

    Row ``k`` of `row_mat` and of `col_mat` is read as the membership
    indicator of bicluster ``k``: each non-zero entry marks a member.
    The number of biclusters is taken from ``row_mat.shape[0]``.

    Returns a list with one ``(rows, cols)`` tuple per bicluster, where
    ``rows`` and ``cols`` are lists of integer positions.
    """

    def members(indicator):
        # Positions of the non-zero entries, as plain Python ints.
        return np.flatnonzero(indicator != 0).tolist()

    return [
        (members(row_mat[k, :]), members(col_mat[k, :]))
        for k in range(row_mat.shape[0])
    ]

In [5]:
def cluster_scores(row_mat, col_mat, target='cpgs'):
    """Score every detected bicluster against reference clusters 1 and 2.

    Parameters
    ----------
    row_mat, col_mat : 2-D indicator arrays
        Row ``k`` marks, with non-zero entries, the rows / columns that
        belong to bicluster ``k``. The number of biclusters is taken
        from ``row_mat.shape[0]``.
    target : {'cpgs', 'genes'}
        'cpgs' scores the row members (labelled through the module-level
        ``all_cpgs``) against ``target_cpgs``; 'genes' scores the column
        members (labelled through ``all_genes``) against
        ``target_genes``.

    Returns
    -------
    pandas.DataFrame
        One row per bicluster (index named 'bicluster', starting at 1).
        For reference cluster ``c`` in {1, 2}:
        ``cl<c>_matches`` is ``cluster_similarity_score(reference, members)``
        and ``cl<c>_superf`` is ``cluster_similarity_score(members, reference)``.

    Raises
    ------
    ValueError
        If `target` is neither 'cpgs' nor 'genes'.
    """

    # Resolve the target once, before the loop, so an unsupported value
    # fails with a clear message instead of an unbound local later on.
    if target == 'cpgs':
        # Rows
        indicators, labels, true = row_mat, all_cpgs, target_cpgs
    elif target == 'genes':
        # Cols
        indicators, labels, true = col_mat, all_genes, target_genes
    else:
        raise ValueError(
            "target must be 'cpgs' or 'genes', got {!r}".format(target)
        )

    num_biclusters = row_mat.shape[0]

    results = {
        'cl1_matches': [], 'cl1_superf': [],
        'cl2_matches': [], 'cl2_superf': [],
    }
    for cluster_num in range(num_biclusters):

        # Labels of the members (non-zero indicator entries) of this
        # bicluster.
        member_idx = np.flatnonzero(indicators[cluster_num, :] != 0)
        pred = [labels[index] for index in member_idx]

        for ref in ('1', '2'):
            results['cl{}_matches'.format(ref)].append(
                cluster_similarity_score(true[ref], pred)
            )
            results['cl{}_superf'.format(ref)].append(
                cluster_similarity_score(pred, true[ref])
            )

    output = pd.DataFrame(
        results, index=np.arange(num_biclusters) + 1
    )
    output.index.name = 'bicluster'

    return output

In [6]:
def bicluster_stats(clusters):
    """Determine min, max and std in biclusters.

    Placeholder: nothing is computed yet and `clusters` is ignored;
    the function always returns None.
    """

    return None

In [7]:
def bicluster_score(clusters):
    """See silhouette coeffs or other.

    Placeholder: nothing is computed yet and `clusters` is ignored;
    the function always returns None.
    """

    return None

In [8]:
# Load the reference matrix; the first CSV column becomes the row index.
data = pd.read_csv(path_ref_data, index_col=0, sep=',')
data.head()

Unnamed: 0,TNFRSF4,MXRA8,ATAD3A,MMP23B,SLC35E2,MORN1,KCNAB2,KLHL21,TNFRSF9,ENO1,...,ATP11C,CXorf40B,MAGEA4,MAGEA12,CETN2,PNMA5,PNMA3,TREX2,ARHGAP4,FLNA
cg00002224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cg00002426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cg00002593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cg00002719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cg00003287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Column labels are gene names, row labels are CpG identifiers.
all_genes = list(data.columns)
all_cpgs = list(data.index)

In [10]:
# Genes of each target cluster, keyed by cluster label.
target_genes = {'1': [], '2': []}
with open(path_target_genes, 'r') as infile:

    # Skip header line.
    next(infile, None)
    for row in infile:
        try:
            gene, cluster = row.split()
            target_genes[cluster].append(ast.literal_eval(gene))
        except (KeyError, SyntaxError, TypeError, ValueError):
            # Best-effort parse: ignore blank or malformed rows (wrong
            # number of fields, unknown cluster label, or a gene token
            # that is not a valid Python literal). Anything else, e.g.
            # KeyboardInterrupt, is allowed to propagate.
            continue

In [11]:
# CpGs of each target cluster, keyed by cluster label.
target_cpgs = {'1': [], '2': []}
with open(path_target_cpgs, 'r') as infile:

    # Skip header line.
    next(infile, None)
    for row in infile:
        try:
            # The third field is not used.
            cpg, cluster, _ = row.split()
            target_cpgs[cluster].append(ast.literal_eval(cpg))
        except (KeyError, SyntaxError, TypeError, ValueError):
            # Best-effort parse: ignore blank or malformed rows (wrong
            # number of fields, unknown cluster label, or a CpG token
            # that is not a valid Python literal). Anything else, e.g.
            # KeyboardInterrupt, is allowed to propagate.
            continue

## sklearn: spectral coclustering

In [13]:
# Spectral co-clustering of the data into two biclusters (fixed seed).
cospec = SpectralCoclustering(random_state=0, n_clusters=2)
cospec.fit(data)

SpectralCoclustering(init='k-means++', mini_batch=False, n_clusters=2,
           n_init=10, n_jobs=1, n_svd_vecs=None, random_state=0,
           svd_method='randomized')

In [16]:
# Membership indicators of the fitted model:
# cospec.rows_ has shape (num_biclusters, num_rows); entry [i, r] is
# set when row r belongs to bicluster i.
# cospec.columns_ has shape (num_biclusters, num_columns); entry [i, c]
# is set when column c belongs to bicluster i.
cospec.rows_.shape, cospec.columns_.shape

((2, 27561), (2, 2664))

In [28]:
model = cospec

# model.rows_ and model.columns_ hold one indicator row per bicluster:
# a truthy entry means that data row (or column) is a member of the
# bicluster, a falsy entry means it is not.
#
# For every bicluster, collect the positions of its member rows and
# member columns.
num_biclusters = model.rows_.shape[0]

row_ids, col_ids = [], []
for num in range(num_biclusters):

    row_mbers = model.rows_[num, :].astype(bool)
    col_mbers = model.columns_[num, :].astype(bool)

    # Positions of the True entries, as plain Python ints.
    row_idxs = np.flatnonzero(row_mbers).tolist()
    col_idxs = np.flatnonzero(col_mbers).tolist()

    row_ids.append(row_idxs)
    col_ids.append(col_idxs)

In [34]:
# Number of rows assigned to the first bicluster.
np.shape(row_ids[0])

(27468,)

## sklearn: spectral biclustering

In [None]:
# Spectral biclustering with the 'log' method and a fixed seed.
bispectral = SpectralBiclustering(
    method='log', random_state=0, n_clusters=2
)
bispectral.fit(data)

In [None]:
# Score the detected biclusters on their CpG (row) members.
cluster_scores(bispectral.rows_, bispectral.columns_, target='cpgs')

In [None]:
# Score the detected biclusters on their gene (column) members.
cluster_scores(bispectral.rows_, bispectral.columns_, target='genes')

## sklearn: spectral coclustering

In [None]:
# Spectral co-clustering into two biclusters (fixed seed). Unlike
# SpectralBiclustering, this estimator takes no `method` argument.
cospec = SpectralCoclustering(
    n_clusters=2, random_state=0
)
cospec.fit(data)

In [None]:
# Score the co-clustering result on its CpG (row) members. The fitted
# model is bound to `cospec`.
cluster_scores(
    cospec.rows_, cospec.columns_, target='cpgs'
)

In [None]:
# Score the co-clustering result on its gene (column) members. The
# fitted model is bound to `cospec`.
cluster_scores(
    cospec.rows_, cospec.columns_, target='genes'
)

## R: xmotifs

In [None]:
# Fit the project's XMotifs model on the raw value array of the data.
# NOTE(review): `number` presumably sets how many biclusters to look
# for -- confirm against the `algorithms` module.
xmot = algorithms.XMotifs(number=10)
xmot.fit(data.values)

In [None]:
# Shapes of the fitted model's row / column attributes.
# NOTE(review): presumably (num_biclusters, n) indicator matrices like
# the sklearn models above -- confirm against the `algorithms` module.
xmot.rows_.shape, xmot.columns_.shape