# Clustering Analysis

## DGUFS

**To Do**
* Compare detected clusters to non-redundant/redundant feature sets derived from model comparison experiments.
* Apply group LASSO to clusters from DGUFS. DGUFS performs siml. clustering and feature selection. Optimize DGUFS cluster quality according to metric from paper.


In [255]:
import numpy as np
import pandas as pd
import altair as alt

import concensus_clustering

from copy import deepcopy


from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.cluster.bicluster import SpectralBiclustering
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [144]:
feature_categories = [
    'shape',
    'firstorder',
    'glcm',
    'glrlm',
    'glszm',
    'gldm',
    'ngtdm',
    'PETparam', 
    'clinical'
]

In [242]:
def category_counts(_):
    
    return {
        'shape': 0,
        'firstorder': 0,
        'glcm': 0,
        'glrlm': 0,
        'glszm': 0,
        'gldm': 0,
        'ngtdm': 0,
        'PETparam': 0,
        'clinical': 0,
    }


def _update_count(pet_output, ct_output, key):

    if 'PET' in key:
        pet_output[key]['PET'] += 1
    else:
        results[key]['CT'] += 1
            
    return pet_output, ct_output


def _norm_count(results, key, tot_counts):
    
    if 'PET' in key:
        results[key]['PET'] /= tot_counts[key]
    else:
        results[key]['CT'] /= tot_counts[key]
            
    return results


def to_feature_categories(cluster_indices):
    
    pet_output = category_counts(None)
    ct_output = category_counts(None)
    for label in X.columns[cluster_indices]:
        if 'shape' in label:
            pet_output, ct_output = _update_count(pet_output, ct_output, 'shape')
        elif 'firstorder' in label:
            output = _update_count(output, 'firstorder')
        elif 'glcm' in label:
            output = _update_count(output, 'glcm')
        elif 'glrlm' in label:
            output = _update_count(output, 'glrlm')
        elif 'glszm' in label:
            output = _update_count(output, 'glszm')
        elif 'gldm' in label:
            output = _update_count(output, 'gldm')
        elif 'ngtdm' in label:
            output = _update_count(output, 'ngtdm')
        elif 'PETparam' in label:
            output = _update_count(output, 'PETparam')
        else:
            output = _update_count(output, 'clinical')
        
    feature_counts = {
        'shape': 13,
        'firstorder': 54,
        'glcm': 69,
        'glrlm': 48,
        'glszm': 48,
        'gldm': 42,
        'ngtdm': 15,
        'PETparam': 3, 
        'clinical': 42
    }
    for label in output.keys():
        output = _norm_count(output, label, feature_counts)
        

    return output

In [232]:
# NB: For average Pearson use -1.0 * abs(rho). As rho improves 
# (indicating better results) a larger amount is subtracted to the 
# output indicating better overall biclusters.
def meta_score(scores, weights=None):
    """The weighted arithmetic mean of multiple scores.""" 
    
    if weights is None:
        weights = [0.5] * len(scores)
    
    outcome = 0
    for score, weight in zip(scores, weights):
        outcome = outcome + weight * -1.0 * abs(score)
        
    return outcome / len(scores)

In [233]:
def biclusters(model, X, param_config):
    # Create Bicluster instances tracking detected clusters.
    
    # Start fresh with each clustering.
    _model = deepcopy(model)
    
    # Set number of clusters to detect and fit model to data.
    _model.set_params(**param_config)
    _model.fit(X)

    rows, cols = _model.rows_, _model.columns_
    # Sanity check.
    assert np.shape(rows)[0] == np.shape(cols)[0]
    
    biclusters = concensus_clustering.Biclusters(
        rows=rows, cols=cols, data=X
    )
    return biclusters

In [234]:
def checker_coords(model, num_clusters):
    # Collect coordinates for biclusters with a checkerborad structure.
    
    tot_num_clusters = num_clusters[0] * num_clusters[1]
    coords = pd.DataFrame(
        np.zeros((tot_num_clusters, 4)),
        columns=('y1', 'y2', 'x1', 'x2')
    )
    
    num, prev_rows = 0, 0
    for row_num in range(num_clusters[0]):
        nrows = np.sum(model.rows_[row_num])

        prev_cols = 0
        for col_num in range(num_clusters[1]):
            ncols = np.sum(model.columns_[col_num])

            coords.iloc[num, 0] = prev_rows + 1
            coords.iloc[num, 1] = prev_rows + nrows
            coords.iloc[num, 2] = prev_cols
            coords.iloc[num, 3] = prev_cols + ncols

            num += 1

            prev_cols += ncols
        prev_rows += nrows - 1

    return coords

In [149]:
def bic_coords(model, num_clusters):
    # Collect coordinates for block diagonal biclusters.

    coords = pd.DataFrame(
        np.zeros((num_clusters, 4)),
        columns=('y1', 'y2', 'x1', 'x2')
    )
    prev_rows, prev_cols = 0, 0
    for num, row_bic in enumerate(model.rows_):
        num_rows = np.sum(row_bic)
        num_cols = np.sum(model.columns_[num])

        coords.iloc[num, 0] = prev_rows
        coords.iloc[num, 1] = prev_rows + num_rows
        coords.iloc[num, 2] = prev_cols
        coords.iloc[num, 3] = prev_cols + num_cols

        prev_rows += num_rows
        prev_cols += num_cols

    return coords

In [150]:
SEED = 0

In [236]:
y = pd.read_csv('./../../../data_source/to_analysis/target_dfs.csv', index_col=0)
y = np.squeeze(y.values)

In [237]:
X = pd.read_csv('./../../../data_source/to_analysis/no_filter_concat.csv', index_col=0)
X.head()

Unnamed: 0_level_0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxis,original_shape_MajorAxis,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MinorAxis,original_shape_Sphericity,...,PET_original_gldm_SmallDependenceHighGrayLevelEmphasis.2,PET_original_gldm_SmallDependenceLowGrayLevelEmphasis.2,PET_original_ngtdm_Busyness.2,PET_original_ngtdm_Coarseness.2,PET_original_ngtdm_Complexity.2,PET_original_ngtdm_Contrast.2,PET_original_ngtdm_Strength.2,PETparam_SUVpeak,PETparam_MTV,PETparam_TLG
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.738882,0.723925,27.060529,37.380273,41.976184,44.598206,42.720019,45.617979,27.619612,0.661532,...,4430.229066,0.00037,0.025559,0.004672,27568.285932,0.296325,70.049351,21.616549,7.384,124.870726
4,0.7969,0.629917,19.845151,31.504408,38.587563,35.468296,29.410882,38.704005,25.105855,0.701721,...,4270.509796,0.000527,0.027591,0.007443,31578.673152,0.271854,156.965282,15.296275,3.406,41.554406
5,0.600926,0.53514,22.515072,42.073251,46.065171,43.011626,32.015621,46.454279,25.282894,0.762365,...,4096.292481,0.00046,0.022439,0.005178,24870.405544,0.230801,64.918103,14.473272,7.934,86.22842
8,0.784571,0.414247,30.263897,73.057649,74.1485,80.956779,65.764732,83.4386,57.318945,0.520001,...,1198.601513,0.000319,0.139365,0.001371,11651.53076,0.081103,15.731158,10.510859,26.926,205.413389
10,0.69032,0.539743,19.449801,36.035312,33.286634,38.013156,33.015148,43.150898,24.875896,0.643822,...,1122.798029,0.000459,0.048381,0.005789,7160.79179,0.118371,29.024761,7.21319,6.041,32.10377


In [153]:
X_std = StandardScaler().fit_transform(X.values)
X_std.shape

(198, 610)

# Spectral Biclustering

Ref: Kluger, Yuval, et. al., 2003. Spectral biclustering of microarray data: coclustering genes and conditions.
* [blog](http://www.kemaleren.com/post/spectral-biclustering-part-1/)

In [154]:
np.random.seed(0)
bic_grid = ParameterGrid(
    {
        'n_clusters': [
            np.random.choice(np.arange(30), size=2) for _ in range(25)
        ],
        'n_components': [6, 9, 10],
        'n_best': [1, 3, 6],
        
    }
)
# Id config with best score results and plot with grid.
bic_scores = {}
for num, bic_param_config in enumerate(bic_grid):
    try:
        bic_model = SpectralBiclustering(
            random_state=SEED, method='log', svd_method='arpack'
        )
        bic_clusters = biclusters(
            bic_model, X_std, bic_param_config
        )
        bic_scores[num] = bic_clusters.external_metrics
    except:
        pass

  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.st

  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) 

  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  smsr_values = msr_values / (avg_rows ** 2 * avg_cols ** 2)
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  smsr_values = msr_values / (avg_rows ** 2 * avg_cols ** 2)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg

  smsr_values = msr_values / (avg_rows ** 2 * avg_cols ** 2)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  avg_cols_std = (avg_cols - np.mean(avg_cols)) / np.std(avg_cols)
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))


In [155]:
# Determine best biclustering config.
avg_bic_scores = []
for scores in bic_scores.values():
    _, non_inf_idx = np.where(scores.values != float('inf'))
    score = sum(scores.values.ravel()[non_inf_idx]) / len(non_inf_idx)
    avg_bic_scores.append(score)
    
best_config = bic_grid[np.argmin(avg_bic_scores)]
best_config, min(avg_bic_scores)

({'n_components': 6, 'n_clusters': array([21, 18]), 'n_best': 3},
 0.6052992748819872)

In [156]:
bic_model = SpectralBiclustering(
    random_state=SEED, method='log', svd_method='arpack'
)
bic_model.set_params(**best_config)
bic_model.fit(X_std)
row_sorted = X_std[np.argsort(bic_model.row_labels_), :]
fit_data = row_sorted[:, np.argsort(bic_model.column_labels_)]

In [157]:
# The proportion of patient outcomes per row bicluster.

bic_row_ids = []
bic_pfs_outcome, bic_not_pfs_outcome = [], []

for bic_row_idx in np.unique(bic_model.row_labels_):
    # Store cluster index.
    bic_row_ids.append(bic_row_idx)
    
    # ID samples belonging to current cluster.
    row_cluster_samples = np.where(bic_model.row_labels_ == bic_row_idx)
    
    # Store fractions of each outcome for current cluster.
    bic_pfs_outcome.append(sum(y[row_cluster_samples] == 0) / np.size(y))
    bic_not_pfs_outcome.append(sum(y[row_cluster_samples] == 1) / np.size(y))
    
sum(bic_pfs_outcome), sum(bic_not_pfs_outcome), len(bic_row_ids)

(0.6767676767676767, 0.32323232323232315, 21)

In [246]:
# The number of features from each feature category per column bicluster.

pet_category_stats = {}
ct_category_stats = {}

for bic_col_idx in np.unique(bic_model.column_labels_):    
    # ID samples belonging to current cluster.
    col_cluster_samples = np.squeeze(np.where(bic_model.column_labels_ == bic_col_idx))
    # Store fractions of present feature categories per modality.
    pet_output, ct_output = to_feature_categories(col_cluster_samples)
    pet_category_stats[bic_col_idx] = pet_output
    ct_category_stats[bic_col_idx] = ct_output
    
pet_df = pd.DataFrame(pet_category_stats)
ct_df = pd.DataFrame(ct_category_stats)

In [269]:
pet_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
PETparam,0.0,0.0,0.333333,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0
clinical,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
firstorder,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
glcm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gldm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
glrlm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
glszm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ngtdm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
shape,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [273]:
pet_df = pet_df.reset_index()
pet_df.index = ['PET'] * pet_df.shape[0]

ct_df = ct_df.reset_index()
ct_df.index = ['CT'] * ct_df.shape[0]

df = pd.concat((pet_df, ct_df))
df.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
PET,PETparam,0.0,0.0,0.333333,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0
PET,clinical,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PET,firstorder,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PET,glcm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PET,gldm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [276]:
plt.figure(figsize=(8, 8))
sns.heatmap(
)

SyntaxError: invalid syntax (<ipython-input-276-d7c54853fb46>, line 4)

In [253]:
sorted_cluster_idx = np.concatenate((bic_row_ids, bic_row_ids))
comb_results = np.concatenate((bic_pfs_stats, bic_not_pfs_stats))

results_id =  np.concatenate((
    ['Progression-free Survival'] * len(bic_pfs_stats), 
    ['Other Event'] * len(bic_not_pfs_stats)
))
df_bic_stats = pd.DataFrame(
    {
        'comb_results': comb_results,
        'results_id': results_id
    },
    index=sorted_cluster_idx,
)
df_bic_stats.head()

NameError: name 'bic_pfs_stats' is not defined

In [113]:
# * Hue row (patient) clusters according to outcome. Capable of producing clusters with
#   pure outcomes? If so, which features separates a pure PFS outcome cluster from a 
#   cluster with no PFS outcomes?

plt.figure(figsize=(8, 6))
sns.barplot(
    x=sorted_cluster_idx,
    y='comb_results',
    hue='results_id',
    data=df_bic_stats,
    palette='muted',
)
plt.legend(
    loc='upper center', 
    fontsize=16,
    bbox_to_anchor=(0.5, 1.1),
    ncol=3, 
    fancybox=True, 
    shadow=True
)
plt.xlabel('Cluster Indicator', fontsize=16)
plt.ylabel('')
plt.tight_layout()

NameError: name 'sorted_cluster_idx' is not defined

<Figure size 576x432 with 0 Axes>

In [None]:
# * Possible to separate patients based on clinical outcomes whilst
#   identifying separating features?
fig, (cbar_ax, map_ax) = plt.subplots(
    nrows=2, figsize=(10, 10),  
    gridspec_kw={'height_ratios':[0.025, 1]}
)
sns.heatmap(
    fit_data, ax=map_ax, robust=True, 
    cmap=plt.cm.RdBu_r, fmt='f', 
    vmin=np.min(fit_data), 
    vmax=np.max(fit_data),
    cbar=False
)
#coords = checker_coords(bic_model, best_config['n_clusters'])
#for num in coords.index:
#    plt.plot(
#        (coords.loc[num, ['x1', 'x2', 'x2', 'x1', 'x1']]),
#        (coords.loc[num, ['y1', 'y1', 'y2', 'y2', 'y1']]),
#        linewidth=2, c='orangered' #darkred
#)
fig.colorbar(
    map_ax.get_children()[0], 
    cax=cbar_ax, 
    orientation='horizontal'
)
map_ax.set_xlabel('Features', fontsize=18)
map_ax.set_ylabel('Patients', fontsize=18)
map_ax.set_xticklabels('')
map_ax.set_yticklabels('')
plt.tight_layout()

# Spectral Coclustering

In [None]:
np.random.seed(0)   
co_grid = ParameterGrid(
    {
        'n_clusters': [
            np.random.choice(np.arange(30), size=1)[0] for _ in range(25)
        ],
    }
)
# Id config with best score results and plot with grid.
co_scores = {}
for num, co_param_config in enumerate(co_grid):
    try:
        co_model = SpectralCoclustering(
            random_state=SEED, svd_method='arpack'
        )
        co_clusters = biclusters(
            co_model, X_std, co_param_config
        )
        co_scores[num] = co_clusters.external_metrics
    except:
        pass

In [None]:
# Determine best coclustering config.
avg_co_scores = []
for scores in co_scores.values():
    _, non_inf_idx = np.where(scores.values != float('inf'))
    score = sum(scores.values.ravel()[non_inf_idx]) / len(non_inf_idx)
    avg_co_scores.append(score)
    
best_config = co_grid[np.argmin(avg_co_scores)]
best_config, min(avg_bic_scores)

In [None]:
co_model = SpectralCoclustering(
    random_state=SEED, svd_method='arpack'
)
co_model.set_params(**best_config)
co_model.fit(X_std)
row_sorted = X_std[np.argsort(co_model.row_labels_), :]
fit_data = row_sorted[:, np.argsort(co_model.column_labels_)]

In [None]:
# * Hue row (patient) clusters according to outcome. Capable of producing clusters with
#   pure outcomes? If so, which features separates a pure PFS outcome cluster from a 
#   cluster with no PFS outcomes?

# The proportion of outcomes per cocluster.
co_pfs_stats, co_not_pfs_stats, cluster_ids = [], [], []
for cluster_idx in np.unique(co_model.row_labels_):
    cluster_ids.append(cluster_idx)
    targets = np.where(co_model.row_labels_ == cluster_idx)
    co_pfs_stats.append(sum(y[targets] == 0) / np.size(y))
    co_not_pfs_stats.append(sum(y[targets] == 1) / np.size(y))
sum(co_pfs_stats), sum(co_not_pfs_stats), len(cluster_ids)

In [None]:
len(co_pfs_stats), len(co_not_pfs_stats)

In [None]:
sorted_cluster_idx = np.concatenate((cluster_ids, cluster_ids))
comb_results = np.concatenate((co_pfs_stats, co_not_pfs_stats))

results_id =  np.concatenate((
    ['Progression-free Survival'] * len(co_pfs_stats), 
    ['Other Event'] * len(co_not_pfs_stats)
))
df_co_stats = pd.DataFrame(
    {
        'comb_results': comb_results,
        'results_id': results_id
    },
    index=sorted_cluster_idx,
)
df_co_stats.head()

In [None]:
# * Hue row (patient) clusters according to outcome. Capable of producing clusters with
#   pure outcomes? If so, which features separates a pure PFS outcome cluster from a 
#   cluster with no PFS outcomes?

plt.figure(figsize=(8, 6))
sns.barplot(
    x=sorted_cluster_idx,
    y='comb_results',
    hue='results_id',
    data=df_co_stats,
    palette='muted',
)
plt.legend(
    loc='upper center', 
    fontsize=16,
    bbox_to_anchor=(0.5, 1.1),
    ncol=3, 
    fancybox=True, 
    shadow=True
)
plt.xlabel('Cluster Indicator', fontsize=16)
plt.ylabel('')
plt.tight_layout()

In [None]:
fig, (cbar_ax, map_ax) = plt.subplots(
    nrows=2, figsize=(10, 10),  
    gridspec_kw={'height_ratios':[0.025, 1]}
)
sns.heatmap(
    fit_data, ax=map_ax, robust=True, 
    cmap=plt.cm.RdBu_r, fmt='f', 
    vmin=np.min(fit_data), 
    vmax=np.max(fit_data),
    cbar=False
)
#coords = checker_coords(bic_model, best_config['n_clusters'])
#for num in coords.index:
#    plt.plot(
#        (coords.loc[num, ['x1', 'x2', 'x2', 'x1', 'x1']]),
#        (coords.loc[num, ['y1', 'y1', 'y2', 'y2', 'y1']]),
#        linewidth=2, c='orangered' #darkred
#)
fig.colorbar(
    map_ax.get_children()[0], 
    cax=cbar_ax, 
    orientation='horizontal'
)
map_ax.set_xlabel('Features', fontsize=18)
map_ax.set_ylabel('Patients', fontsize=18)
map_ax.set_xticklabels('')
map_ax.set_yticklabels('')
plt.tight_layout()

## DGUFS & Group LASSO

In [None]:
from dgufs import DGUFS

In [None]:
np.random.seed(0)
dgufs_param_grid = ParameterGrid(
    {
        'num_clusters': [
            np.random.choice(np.arange(30), size=1)[0] for _ in range(25)
        ],
        'num_features': [
            np.random.choice(np.arange(500), size=1)[0] for _ in range(25)
        ],
        
    }
)

In [None]:
for dgufs_param_config in dgufs_param_grid:
    model = DGUFS(**dgufs_param_config)
    model.fit(X_std)
    # Measure quality of clusters and ID best config.
    # Apply group LASSO to clusters.