In [2]:
import numpy as np
import pandas as pd
import sys,os
import random
import copy
from time import time

import matplotlib.pyplot as plt
import seaborn as sns


from utils.method import read_bic_table

from utils.eval import make_ref_groups
from utils.eval import calculate_perfromance, compare_gene_clusters

# 1. Reading expressions and annotations

In [3]:
exprs_file_t = "data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv"
exprs_t= pd.read_csv(exprs_file_t,sep = "\t",index_col=0)

exprs_file_m = "data/preprocessed_v6/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv"
exprs_m= pd.read_csv(exprs_file_m,sep = "\t",index_col=0)

m_subtypes = pd.read_csv("data/preprocessed_v6/METABRIC_1904_17Kgenes.subtypes_and_signatures_v6.tsv",sep = "\t",index_col=0)
m_annotation = pd.read_csv("data/preprocessed_v6/METABRIC_1904.annotation_v6.tsv",sep = "\t",index_col=0)

t_subtypes = pd.read_csv("data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv",sep = "\t",index_col=0)
t_annotation = pd.read_csv("data/preprocessed_v6/TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv",sep = "\t",index_col=0)

## 1.1 Preparing ground truth samples sets for performance evaluation

### Example of known_groups dictionary for TCGA-BRCA

*make_ref_groups(subtypes, annotation, exprs)*

**input:**
  - subtypes - subtypes dataframe
  - annotation - annotation dataframe
  - exprs - expression dataframe
  
**returns:**
  -  known_groups = {classificaton1:{"subt1":{s1,s2,...} , "subt2":{...}, "subt3":{...}, ...}, "classi2":{"subtA":{...}}, ... }
*known_groups* is a dictionary with known sample classifications. Each classification (e.g. PAM50 or IHC or Luminal) is a dict that can conatain one or several sample sets 
  -  all_samples = {} set of all samples in expression and annotation files; necessary for computing overlap p-values

In [75]:
known_groups_t, all_samples_t = make_ref_groups(t_subtypes, t_annotation,exprs_t)
known_groups_m, all_samples_m = make_ref_groups(m_subtypes, m_annotation,exprs_m)

# Example 1: 
## The sructure of known_groups dict for TCGA-BRCA:

We calculate performance for **classifications**:
    * PAM50 = [Luminal, Basal, Her2, Normal]
    * Intrinsic = [Luminal, Basal, Her2, Normal, Claudin-low]
    * PAM50_AB =  [LumA, LumB, Basal, Her2, Normal]
    * SCMOD2 = [ER-/HER2-, ER+/HER2- High Prolif, ER+/HER2- Low Prolif,  HER2+]
    * IHC = [IHC_HER2, IHC_ER, IHC_PR, IHC_TNBC]
And for **isolated sample sets** corresponding to Luminal, Basal, LumA, NEC subtypes etc. 

In [88]:
for cl in known_groups_t.keys():
    if len(known_groups_t[cl].keys())>1:
        print("classification", cl)
        print("\tsbtypes:"," ".join(known_groups_t[cl].keys()))
    else:
        print(" classification", cl, "(individual subtype)")

classification PAM50
	sbtypes: Basal Normal Her2 Luminal
classification Intrinsic
	sbtypes: Basal Normal Her2 Luminal Claudin-low
classification PAM50_AB
	sbtypes: Basal Normal LumA Her2 LumB
classification SCMOD2
	sbtypes: ER-/HER2- ER+/HER2- High Prolif ER+/HER2- Low Prolif HER2+
classification IHC
	sbtypes: IHC_HER2 IHC_ER IHC_PR IHC_TNBC
 classification Luminal (individual subtype)
 classification Basal (individual subtype)
 classification Her2 (individual subtype)
 classification LumA (individual subtype)
 classification LumB (individual subtype)
 classification Normal (individual subtype)
 classification Claudin-low (individual subtype)
 classification IHC_HER2 (individual subtype)
 classification IHC_ER (individual subtype)
 classification IHC_PR (individual subtype)
 classification IHC_TNBC (individual subtype)
 classification NET_kmeans (individual subtype)
 classification NET_ward (individual subtype)


# Example 2: 
## evaluation of the resulting sample set (on the example of UnPaSt file) 
reading the results 

In [102]:
file = "results_on_real_data_WGCNA2/TCGA.seed=670487.bin=kmeans,pval=0.01,clust=WGCNA,direction=UP-DOWN,ds=3,dch=0.995,max_power=10,precluster=True.biclusters.tsv"
result = read_bic_table(file) # reading UnPaSt outputs
print("sample clusters: ", result.shape[0])
# drop clusters too small with < 5 samples
result = result.loc[result["samples"].apply(lambda x: len(x))>=5,:]
print("sample clusters: ", result.shape[0])
result.head(2)

sample clusters:  168
sample clusters:  168


Unnamed: 0_level_0,SNR,n_genes,n_samples,genes,samples,direction,genes_up,genes_down,gene_indexes,sample_indexes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,4.042148,116,14,"{KRTAP11-1, OR13D1, C9orf57, LOC286083, CPS1-I...","{TCGA-A7-A13G-01, TCGA-A8-A08G-01, TCGA-A7-A0D...",UP,"{KRTAP11-1, OR13D1, C9orf57, LOC286083, CPS1-I...",{},"{8194, 10251, 13836, 13838, 15375, 2063, 2065,...","{130, 131, 132, 133, 262, 135, 137, 202, 138, ..."
1,3.314768,52,191,"{KLHDC7A, C9orf152, HOXB2, CYP4Z2P, AR, PAX2, ...","{TCGA-B6-A1KF-01, TCGA-S3-AA15-01, TCGA-EW-A1O...",DOWN,{},"{KLHDC7A, C9orf152, HOXB2, CYP4Z2P, AR, PAX2, ...","{2947, 7430, 17160, 17161, 15755, 7821, 12174,...","{1024, 514, 1018, 1030, 519, 8, 1035, 524, 103..."


* ensure that results file is a dataframe with "samples" column
* each row in samples column must contain a non-empty set of samples
## performance evaluation
* requires *known_groups* dict and *all_samples* set  
     - using *make_ref_groups()* is recommened for this breast cancer analysis
     - alternatively, *known_groups* dict and *all_samples* can be created manually
* if samples in (bi)clusters do not match *all_samples* set, trho

*calculate_perfromance(bi_clusters_df, annotation, exprs)*

**input:**
  - bi_clusters_df - a dataframe with sample clusters (sets in "sample" column)
  - *known_groups* is a dictionary with known sample classifications. Each classification (e.g. PAM50 or IHC or Luminal) is a dict that can conatain one or several sample sets 
  - *all_samples* = {} set of all samples in expression and annotation files; necessary for computing overlap p-values
  
**returns:**
  - performances - *pandas.Series* with overall perforamnce for each classification from *known_groups* 
  - best_matches - a dataframe with information about the best matching (bi)cluster for each sample set from *known_groups* (helpful for debugging and validation)

In [105]:
performances, best_matches = calculate_perfromance(result, known_groups_t,all_samples_t)
performances

PAM50          0.842163
Intrinsic      0.810808
PAM50_AB       0.630113
SCMOD2         0.652669
IHC            0.712046
Luminal        0.898305
Basal          0.949495
Her2           0.486842
LumA           0.680070
LumB           0.473404
Normal         0.082011
Claudin-low    0.118483
IHC_HER2       0.412371
IHC_ER         0.869464
IHC_PR         0.777910
IHC_TNBC       0.473430
NET_kmeans     0.671875
NET_ward       0.551724
dtype: float64

In [106]:
best_matches

Unnamed: 0,bm_id,J,weight,adj_pval,is_enriched,classification
Basal,1,0.949495,0.180723,0.0,True,PAM50
Normal,132,0.082011,0.029657,0.0,True,PAM50
Her2,8,0.486842,0.100093,0.0,True,PAM50
Luminal,4,0.898305,0.689527,0.0,False,PAM50
Basal,1,0.949495,0.173488,0.0,True,Intrinsic
Normal,132,0.082011,0.02847,0.0,True,Intrinsic
Her2,8,0.486842,0.096085,0.0,True,Intrinsic
Luminal,4,0.898305,0.661922,0.0,False,Intrinsic
Claudin-low,134,0.058997,0.040036,0.009062,False,Intrinsic
Basal,1,0.949495,0.180723,0.0,True,PAM50_AB


# 2. Evaluation of the results obtained with different parameters
(UnPaSt)

In [5]:
# selecting 5 seeds for probabilistic methods 
n_runs = 5
seeds = []
random.seed(42)
for i in range(n_runs):
    seeds.append(random.randint(0,1000000))
print("generate ",n_runs," seeds",seeds)

generate  5  seeds [670487, 116739, 26225, 777572, 288389]


In [19]:
subt_t = [] # Perfoemances for TCGA-BRCA
subt_m = [] # Perfoemances for METABRIC
clustering_similarities = [] # Similarities of gene clusters found in TCGA and METABRIC

# UnPaSt parameters 
from run_unpast import run
rpath="/home/olya/anaconda3/envs/r4_env/bin/"
out_dir= "results_on_real_data_WGCNA2/"
basename_t = "TCGA"
basename_m = "METABRIC" 
pvals = [0.05,0.01,0.005,0.001]
bin_methods = ["kmeans","GMM","ward"] 
directions =  [["UP","DOWN"],["BOTH"]]

Because UnPaSt parameters are different for Louvain and WGCNA feature clusterings, 
we run it for each clust_method in a separate *for* loop

In [22]:
### Louvain 
out_dir= "results_on_real_data_WGCNA2//"
modularities = [0,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

subt_t = []
subt_m = []
clustering_similarities = []


for pval in pvals:
    for bin_method in bin_methods:
        for d in directions:
            for m in modularities:
                # save parameters as a ;-separated string
                params = "bin="+bin_method+";pval="+str(pval)
                params += ";clust="+"Louvain"+";direction="+"-".join(d)+";m="+str(m)
                print()
                for r in range(n_runs):
                    seed = seeds[r]
                    params_dict = {"parameters":params, "seed":seed,"run":r}
                    ### running TCGA or reading results
                    try:
                        t0 = time()
                        fname = out_dir+basename_t+".seed="+str(seed)+\
                        ".bin="+bin_method +",pval="+str(pval)+",clust=Louvain"+",direction="+"-".join(d)+",m="+str(m)+".biclusters.tsv"
                        result_t = read_bic_table(fname)
                        """result_t = run_DESMOND(exprs_file_t, basename_t, out_dir=out_dir,
                                                    save=True, load = True,
                                                    ceiling = 3,
                                                    min_n_samples = 5,
                                                    bin_method = bin_method, pval = pval,
                                                    clust_method = "Louvain",
                                                    similarity_cutoffs = similarity_cutoffs,
                                                    seed = seed,
                                                    verbose = False)
                                                    """
                        time_t = time()-t0
                        # find the best matches between TCGA biclusters and subtypes
                        # and calculate overall performance == weighted sum of Jaccard indexes
                        performance_t,bm_dict_t = calculate_perfromance(result_t, known_groups_t,all_samples_t)
                        performance_t = performance_t.to_dict()
                        performance_t.update(params_dict)
                        performance_t["time"] = time_t
                        subt_t.append(performance_t)
                        t_failed = False
                    except:
                        print("TCGA biclustering failed with ",seed,  pval,bin_method ,file = sys.stderr)
                        print(fname)
                        t_failed = True
                        subt_t.append({params_dict})

                    ### running METABRIC or reading results
                    try:
                        t0 = time()
                        fname = out_dir+basename_m+".seed="+str(seed)+\
                        ".bin="+bin_method +",pval="+str(pval)+",clust=Louvain"+",direction="+"-".join(d)+",m="+str(m)+".biclusters.tsv"
                        result_m = read_bic_table(fname)
                        """result_m = run_DESMOND(exprs_file_m, basename_m, out_dir=out_dir,
                                                    save=True, load = True,
                                                    ceiling = 3,
                                                    min_n_samples = 5,
                                                    bin_method = bin_method, pval = pval,
                                                    clust_method = "Louvain",
                                                    similarity_cutoffs = similarity_cutoffs,
                                                    seed = seed,
                                                    verbose = False)"""
                        time_m = time()-t0
                        # find the best matches between METABRIC biclusters and subtypes
                        # and calculate overall performance == weighted sum of Jaccard indexes
                        performance_m,bm_dict_m = calculate_perfromance(result_m, known_groups_m,all_samples_m)
                        performance_m = performance_m.to_dict()
                        performance_m.update(params_dict)
                        performance_m["time"] = time_m
                        subt_m.append(performance_m)
                        m_failed = False
                    except:
                        print("METABRIC biclustering failed with ",seed,  pval,bin_method ,file = sys.stderr)
                        print(fname)
                        m_failed = True
                        subt_m.append(params_dict)
                    print(params,seed, round(performance_t["PAM50"],3),round(performance_m["PAM50"],3))    
                    # compare clustering results - only if gene sets are defined for each cluster
                    if not (t_failed or m_failed): 
                        N = exprs_m.shape[0]
                        clust_sim, bm, bm2 = compare_gene_clusters(result_t,result_m, N)                    
                    else:
                        clust_sim = {}
                    clust_sim.update(params_dict)
                    clustering_similarities.append(clust_sim)



bin=kmeans;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0 670487 0.738 0.835
bin=kmeans;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0 116739 0.841 0.834
bin=kmeans;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0 26225 0.734 0.835
bin=kmeans;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0 777572 0.75 0.834
bin=kmeans;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0 288389 0.73 0.835

bin=kmeans;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.3 670487 0.738 0.835
bin=kmeans;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.3 116739 0.841 0.834
bin=kmeans;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.3 26225 0.734 0.835
bin=kmeans;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.3 777572 0.75 0.834
bin=kmeans;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.3 288389 0.73 0.835

bin=kmeans;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.4 670487 0.738 0.835
bin=kmeans;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.4 116739 0.841 0.834
bin=kmeans;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.4 26225 0.734 0.835
bin=kmeans;pv


bin=GMM;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.8 670487 0.813 0.832
bin=GMM;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.8 116739 0.819 0.832
bin=GMM;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.8 26225 0.806 0.832
bin=GMM;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.8 777572 0.801 0.832
bin=GMM;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.8 288389 0.819 0.827

bin=GMM;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.9 670487 0.813 0.832
bin=GMM;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.9 116739 0.819 0.832
bin=GMM;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.9 26225 0.806 0.832
bin=GMM;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.9 777572 0.801 0.832
bin=GMM;pval=0.1;clust=Louvain;direction=UP-DOWN;m=0.9 288389 0.819 0.827

bin=GMM;pval=0.1;clust=Louvain;direction=BOTH;m=0 670487 0.721 0.831
bin=GMM;pval=0.1;clust=Louvain;direction=BOTH;m=0 116739 0.746 0.837
bin=GMM;pval=0.1;clust=Louvain;direction=BOTH;m=0 26225 0.747 0.837
bin=GMM;pval=0.1;clust=Louvain;direction=BOTH;m=0 777

bin=ward;pval=0.1;clust=Louvain;direction=BOTH;m=0.6 777572 0.735 0.815
bin=ward;pval=0.1;clust=Louvain;direction=BOTH;m=0.6 288389 0.712 0.815

bin=ward;pval=0.1;clust=Louvain;direction=BOTH;m=0.7 670487 0.712 0.815
bin=ward;pval=0.1;clust=Louvain;direction=BOTH;m=0.7 116739 0.735 0.815
bin=ward;pval=0.1;clust=Louvain;direction=BOTH;m=0.7 26225 0.712 0.815
bin=ward;pval=0.1;clust=Louvain;direction=BOTH;m=0.7 777572 0.735 0.815
bin=ward;pval=0.1;clust=Louvain;direction=BOTH;m=0.7 288389 0.712 0.815

bin=ward;pval=0.1;clust=Louvain;direction=BOTH;m=0.8 670487 0.712 0.815
bin=ward;pval=0.1;clust=Louvain;direction=BOTH;m=0.8 116739 0.735 0.815
bin=ward;pval=0.1;clust=Louvain;direction=BOTH;m=0.8 26225 0.712 0.815
bin=ward;pval=0.1;clust=Louvain;direction=BOTH;m=0.8 777572 0.735 0.815
bin=ward;pval=0.1;clust=Louvain;direction=BOTH;m=0.8 288389 0.712 0.815

bin=ward;pval=0.1;clust=Louvain;direction=BOTH;m=0.9 670487 0.712 0.815
bin=ward;pval=0.1;clust=Louvain;direction=BOTH;m=0.9 116739 0.7

bin=GMM;pval=0.001;clust=Louvain;direction=UP-DOWN;m=0.4 116739 0.819 0.831
bin=GMM;pval=0.001;clust=Louvain;direction=UP-DOWN;m=0.4 26225 0.815 0.831
bin=GMM;pval=0.001;clust=Louvain;direction=UP-DOWN;m=0.4 777572 0.819 0.834
bin=GMM;pval=0.001;clust=Louvain;direction=UP-DOWN;m=0.4 288389 0.818 0.834

bin=GMM;pval=0.001;clust=Louvain;direction=UP-DOWN;m=0.5 670487 0.82 0.835
bin=GMM;pval=0.001;clust=Louvain;direction=UP-DOWN;m=0.5 116739 0.819 0.831
bin=GMM;pval=0.001;clust=Louvain;direction=UP-DOWN;m=0.5 26225 0.815 0.831
bin=GMM;pval=0.001;clust=Louvain;direction=UP-DOWN;m=0.5 777572 0.819 0.834
bin=GMM;pval=0.001;clust=Louvain;direction=UP-DOWN;m=0.5 288389 0.818 0.834

bin=GMM;pval=0.001;clust=Louvain;direction=UP-DOWN;m=0.6 670487 0.82 0.835
bin=GMM;pval=0.001;clust=Louvain;direction=UP-DOWN;m=0.6 116739 0.819 0.831
bin=GMM;pval=0.001;clust=Louvain;direction=UP-DOWN;m=0.6 26225 0.815 0.831
bin=GMM;pval=0.001;clust=Louvain;direction=UP-DOWN;m=0.6 777572 0.819 0.834
bin=GMM;pval=0.

bin=ward;pval=0.001;clust=Louvain;direction=BOTH;m=0 116739 0.734 0.827
bin=ward;pval=0.001;clust=Louvain;direction=BOTH;m=0 26225 0.734 0.827
bin=ward;pval=0.001;clust=Louvain;direction=BOTH;m=0 777572 0.734 0.827
bin=ward;pval=0.001;clust=Louvain;direction=BOTH;m=0 288389 0.735 0.827

bin=ward;pval=0.001;clust=Louvain;direction=BOTH;m=0.3 670487 0.734 0.827
bin=ward;pval=0.001;clust=Louvain;direction=BOTH;m=0.3 116739 0.734 0.827
bin=ward;pval=0.001;clust=Louvain;direction=BOTH;m=0.3 26225 0.734 0.827
bin=ward;pval=0.001;clust=Louvain;direction=BOTH;m=0.3 777572 0.734 0.827
bin=ward;pval=0.001;clust=Louvain;direction=BOTH;m=0.3 288389 0.735 0.827

bin=ward;pval=0.001;clust=Louvain;direction=BOTH;m=0.4 670487 0.734 0.827
bin=ward;pval=0.001;clust=Louvain;direction=BOTH;m=0.4 116739 0.734 0.827
bin=ward;pval=0.001;clust=Louvain;direction=BOTH;m=0.4 26225 0.734 0.827
bin=ward;pval=0.001;clust=Louvain;direction=BOTH;m=0.4 777572 0.734 0.827
bin=ward;pval=0.001;clust=Louvain;direction=BOT


bin=kmeans;pval=0.05;clust=Louvain;direction=BOTH;m=0.8 670487 0.744 0.835
bin=kmeans;pval=0.05;clust=Louvain;direction=BOTH;m=0.8 116739 0.744 0.834
bin=kmeans;pval=0.05;clust=Louvain;direction=BOTH;m=0.8 26225 0.742 0.834
bin=kmeans;pval=0.05;clust=Louvain;direction=BOTH;m=0.8 777572 0.766 0.834
bin=kmeans;pval=0.05;clust=Louvain;direction=BOTH;m=0.8 288389 0.743 0.835

bin=kmeans;pval=0.05;clust=Louvain;direction=BOTH;m=0.9 670487 0.744 0.835
bin=kmeans;pval=0.05;clust=Louvain;direction=BOTH;m=0.9 116739 0.744 0.834
bin=kmeans;pval=0.05;clust=Louvain;direction=BOTH;m=0.9 26225 0.742 0.834
bin=kmeans;pval=0.05;clust=Louvain;direction=BOTH;m=0.9 777572 0.766 0.834
bin=kmeans;pval=0.05;clust=Louvain;direction=BOTH;m=0.9 288389 0.743 0.835

bin=GMM;pval=0.05;clust=Louvain;direction=UP-DOWN;m=0 670487 0.808 0.831
bin=GMM;pval=0.05;clust=Louvain;direction=UP-DOWN;m=0 116739 0.821 0.833
bin=GMM;pval=0.05;clust=Louvain;direction=UP-DOWN;m=0 26225 0.818 0.832
bin=GMM;pval=0.05;clust=Louvain

bin=ward;pval=0.05;clust=Louvain;direction=UP-DOWN;m=0.6 26225 0.81 0.806
bin=ward;pval=0.05;clust=Louvain;direction=UP-DOWN;m=0.6 777572 0.808 0.806
bin=ward;pval=0.05;clust=Louvain;direction=UP-DOWN;m=0.6 288389 0.81 0.806

bin=ward;pval=0.05;clust=Louvain;direction=UP-DOWN;m=0.7 670487 0.806 0.806
bin=ward;pval=0.05;clust=Louvain;direction=UP-DOWN;m=0.7 116739 0.81 0.806
bin=ward;pval=0.05;clust=Louvain;direction=UP-DOWN;m=0.7 26225 0.81 0.806
bin=ward;pval=0.05;clust=Louvain;direction=UP-DOWN;m=0.7 777572 0.808 0.806
bin=ward;pval=0.05;clust=Louvain;direction=UP-DOWN;m=0.7 288389 0.81 0.806

bin=ward;pval=0.05;clust=Louvain;direction=UP-DOWN;m=0.8 670487 0.806 0.806
bin=ward;pval=0.05;clust=Louvain;direction=UP-DOWN;m=0.8 116739 0.81 0.806
bin=ward;pval=0.05;clust=Louvain;direction=UP-DOWN;m=0.8 26225 0.81 0.806
bin=ward;pval=0.05;clust=Louvain;direction=UP-DOWN;m=0.8 777572 0.808 0.806
bin=ward;pval=0.05;clust=Louvain;direction=UP-DOWN;m=0.8 288389 0.81 0.806

bin=ward;pval=0.05;c

bin=kmeans;pval=0.01;clust=Louvain;direction=BOTH;m=0.4 26225 0.769 0.831
bin=kmeans;pval=0.01;clust=Louvain;direction=BOTH;m=0.4 777572 0.756 0.834
bin=kmeans;pval=0.01;clust=Louvain;direction=BOTH;m=0.4 288389 0.761 0.835

bin=kmeans;pval=0.01;clust=Louvain;direction=BOTH;m=0.5 670487 0.77 0.835
bin=kmeans;pval=0.01;clust=Louvain;direction=BOTH;m=0.5 116739 0.736 0.834
bin=kmeans;pval=0.01;clust=Louvain;direction=BOTH;m=0.5 26225 0.769 0.831
bin=kmeans;pval=0.01;clust=Louvain;direction=BOTH;m=0.5 777572 0.756 0.834
bin=kmeans;pval=0.01;clust=Louvain;direction=BOTH;m=0.5 288389 0.761 0.835

bin=kmeans;pval=0.01;clust=Louvain;direction=BOTH;m=0.6 670487 0.77 0.835
bin=kmeans;pval=0.01;clust=Louvain;direction=BOTH;m=0.6 116739 0.736 0.834
bin=kmeans;pval=0.01;clust=Louvain;direction=BOTH;m=0.6 26225 0.769 0.831
bin=kmeans;pval=0.01;clust=Louvain;direction=BOTH;m=0.6 777572 0.756 0.834
bin=kmeans;pval=0.01;clust=Louvain;direction=BOTH;m=0.6 288389 0.761 0.835

bin=kmeans;pval=0.01;clust=

bin=ward;pval=0.01;clust=Louvain;direction=UP-DOWN;m=0 288389 0.729 0.833

bin=ward;pval=0.01;clust=Louvain;direction=UP-DOWN;m=0.3 670487 0.803 0.833
bin=ward;pval=0.01;clust=Louvain;direction=UP-DOWN;m=0.3 116739 0.803 0.833
bin=ward;pval=0.01;clust=Louvain;direction=UP-DOWN;m=0.3 26225 0.803 0.833
bin=ward;pval=0.01;clust=Louvain;direction=UP-DOWN;m=0.3 777572 0.729 0.833
bin=ward;pval=0.01;clust=Louvain;direction=UP-DOWN;m=0.3 288389 0.729 0.833

bin=ward;pval=0.01;clust=Louvain;direction=UP-DOWN;m=0.4 670487 0.803 0.833
bin=ward;pval=0.01;clust=Louvain;direction=UP-DOWN;m=0.4 116739 0.803 0.833
bin=ward;pval=0.01;clust=Louvain;direction=UP-DOWN;m=0.4 26225 0.803 0.833
bin=ward;pval=0.01;clust=Louvain;direction=UP-DOWN;m=0.4 777572 0.729 0.833
bin=ward;pval=0.01;clust=Louvain;direction=UP-DOWN;m=0.4 288389 0.729 0.833

bin=ward;pval=0.01;clust=Louvain;direction=UP-DOWN;m=0.5 670487 0.803 0.833
bin=ward;pval=0.01;clust=Louvain;direction=UP-DOWN;m=0.5 116739 0.803 0.833
bin=ward;pval

bin=kmeans;pval=0.005;clust=Louvain;direction=UP-DOWN;m=0.8 777572 0.842 0.834
bin=kmeans;pval=0.005;clust=Louvain;direction=UP-DOWN;m=0.8 288389 0.842 0.835

bin=kmeans;pval=0.005;clust=Louvain;direction=UP-DOWN;m=0.9 670487 0.842 0.835
bin=kmeans;pval=0.005;clust=Louvain;direction=UP-DOWN;m=0.9 116739 0.841 0.835
bin=kmeans;pval=0.005;clust=Louvain;direction=UP-DOWN;m=0.9 26225 0.841 0.834
bin=kmeans;pval=0.005;clust=Louvain;direction=UP-DOWN;m=0.9 777572 0.842 0.834
bin=kmeans;pval=0.005;clust=Louvain;direction=UP-DOWN;m=0.9 288389 0.842 0.835

bin=kmeans;pval=0.005;clust=Louvain;direction=BOTH;m=0 670487 0.747 0.833
bin=kmeans;pval=0.005;clust=Louvain;direction=BOTH;m=0 116739 0.736 0.831
bin=kmeans;pval=0.005;clust=Louvain;direction=BOTH;m=0 26225 0.736 0.831
bin=kmeans;pval=0.005;clust=Louvain;direction=BOTH;m=0 777572 0.745 0.832
bin=kmeans;pval=0.005;clust=Louvain;direction=BOTH;m=0 288389 0.736 0.833

bin=kmeans;pval=0.005;clust=Louvain;direction=BOTH;m=0.3 670487 0.747 0.833


bin=GMM;pval=0.005;clust=Louvain;direction=BOTH;m=0.6 26225 0.698 0.837
bin=GMM;pval=0.005;clust=Louvain;direction=BOTH;m=0.6 777572 0.724 0.837
bin=GMM;pval=0.005;clust=Louvain;direction=BOTH;m=0.6 288389 0.722 0.837

bin=GMM;pval=0.005;clust=Louvain;direction=BOTH;m=0.7 670487 0.703 0.836
bin=GMM;pval=0.005;clust=Louvain;direction=BOTH;m=0.7 116739 0.702 0.836
bin=GMM;pval=0.005;clust=Louvain;direction=BOTH;m=0.7 26225 0.698 0.837
bin=GMM;pval=0.005;clust=Louvain;direction=BOTH;m=0.7 777572 0.724 0.837
bin=GMM;pval=0.005;clust=Louvain;direction=BOTH;m=0.7 288389 0.722 0.837

bin=GMM;pval=0.005;clust=Louvain;direction=BOTH;m=0.8 670487 0.703 0.836
bin=GMM;pval=0.005;clust=Louvain;direction=BOTH;m=0.8 116739 0.702 0.836
bin=GMM;pval=0.005;clust=Louvain;direction=BOTH;m=0.8 26225 0.698 0.837
bin=GMM;pval=0.005;clust=Louvain;direction=BOTH;m=0.8 777572 0.724 0.837
bin=GMM;pval=0.005;clust=Louvain;direction=BOTH;m=0.8 288389 0.722 0.837

bin=GMM;pval=0.005;clust=Louvain;direction=BOTH;m=0

In [23]:
clust_methods = ["WGCNA"]
dss = [0,1,2,3,4]
dchs = [0.95,0.995]
cseed = 0

pc = True

for pval in pvals:
    for ds in dss:
        for dch in dchs:
            for d in directions:
                for clust_method in clust_methods:
                    for bin_method in bin_methods:
                        # save parameters as a ;-separated string
                        params = "bin="+bin_method+";pval="+str(pval)+";direction="+str("-".join(d))
                        params += ";clust="+clust_method+";dch="+str(dch)+";ds="+str(ds)+";preClustering=T"
                        print(params)
                        biclusters_t = []
                        biclusters_m = []
                        for r in range(n_runs):
                            seed = seeds[r]
                            #print("run",run,bin_method,pval,m,seed)
                            params_dict = {"parameters":params, "seed":seed,"run":r}

                            ### running TCGA or reading results
                            try:
                                t0 = time()
                                fname = out_dir+basename_t+".seed="+str(seed)+".bin="+bin_method +",pval="+str(pval)+",clust=WGCNA,direction="+str("-".join(d))+",ds="+str(ds)+",dch="+str(dch)+",max_power=10,precluster=True"+".biclusters.tsv"
                                try:
                                    result_t = read_bic_table(fname)
                                except:
                                    print("not found")
                                    """result_t = run(exprs_file_t, basename_t, out_dir=out_dir,
                                                                save=True, load = True,
                                                                min_n_samples = 5,
                                                                bin_method = bin_method, pval = pval,
                                                                directions = d,
                                                                clust_method = clust_method,
                                                                precluster=pc,
                                                                ds=ds,dch=dch,
                                                                rpath=rpath,
                                                                seed = seed,
                                                                verbose = False)
                                                                
                                    """
                                performance_t,bm_dict_t = calculate_perfromance(result_t, known_groups_t,all_samples_t)
                                performance_t = performance_t.to_dict()
                                performance_t.update(params_dict)
                                performance_t["time"] = time_t
                                subt_t.append(performance_t)
                                t_failed = False
                            except:
                                print("TCGA biclustering failed with ",seed,  pval,bin_method ,file = sys.stderr)
                                print(fname)
                                t_failed = True
                                subt_t.append({params_dict})
                            
                            ### running METABRIC or reading results
                            try:
                                t0 = time()
                                fname = out_dir+basename_m+".seed="+str(seed)+".bin="+bin_method +",pval="+str(pval)+",clust=WGCNA,direction="+str("-".join(d))+",ds="+str(ds)+",dch="+str(dch)+",max_power=10,precluster=True"+".biclusters.tsv"
                                try:
                                    result_m = read_bic_table(fname)
                                except:
                                    print(fname)
                                    """result_m = run(exprs_file_m, basename_m, out_dir=out_dir,
                                                                save=True, load = True,
                                                                min_n_samples = 5,
                                                                bin_method = bin_method, pval = pval,
                                                                directions = d,
                                                                clust_method = clust_method,
                                                                precluster=pc,
                                                                ds=ds,dch=dch,
                                                                rpath=rpath,
                                                                seed = seed,
                                                                verbose = False)
                                    """
                                time_m = time()-t0
                                # find the best matches between METABRIC biclusters and subtypes
                                # and calculate overall performance == weighted sum of Jaccard indexes
                                performance_m,bm_dict_m = calculate_perfromance(result_m, known_groups_m,all_samples_m)
                                performance_m = performance_m.to_dict()
                                performance_m.update(params_dict)
                                performance_m["time"] = time_m
                                subt_m.append(performance_m)
                                m_failed = False
                            except:
                                print("METABRIC biclustering failed with ",seed,  pval,bin_method ,file = sys.stderr)
                                print(fname)
                                m_failed = True
                                subt_m.append(params_dict)
                            print(params,seed, round(performance_t["PAM50"],3),round(performance_m["PAM50"],3))    
                            # compare clustering results - only if gene sets are defined for each cluster
                            if not (t_failed or m_failed): 
                                N = exprs_m.shape[0]
                                clust_sim, bm, bm2 = compare_gene_clusters(result_t,result_m, N)                    
                            else:
                                clust_sim = {}
                            clust_sim.update(params_dict)
                            clustering_similarities.append(clust_sim)

bin=kmeans;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;preClustering=T
bin=kmeans;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;preClustering=T 670487 0.696 0.834
bin=kmeans;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;preClustering=T 116739 0.789 0.834
bin=kmeans;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;preClustering=T 26225 0.841 0.834
bin=kmeans;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;preClustering=T 777572 0.698 0.834
bin=kmeans;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;preClustering=T 288389 0.694 0.834
bin=GMM;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;preClustering=T
bin=GMM;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;preClustering=T 670487 0.84 0.827
bin=GMM;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;preClustering=T 116739 0.837 0.827
bin=GMM;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;preClustering=T 26225 0.837 0.828
bin=GMM;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;pr

bin=ward;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T 288389 0.838 0.832
bin=kmeans;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.95;ds=1;preClustering=T
bin=kmeans;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.95;ds=1;preClustering=T 670487 0.766 0.838
bin=kmeans;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.95;ds=1;preClustering=T 116739 0.747 0.838
bin=kmeans;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.95;ds=1;preClustering=T 26225 0.77 0.826
bin=kmeans;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.95;ds=1;preClustering=T 777572 0.749 0.84
bin=kmeans;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.95;ds=1;preClustering=T 288389 0.775 0.826
bin=GMM;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.95;ds=1;preClustering=T
bin=GMM;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.95;ds=1;preClustering=T 670487 0.71 0.814
bin=GMM;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.95;ds=1;preClustering=T 116739 0.712 0.817
bin=GMM;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.95;ds=1;preClustering=T 26225 0.71 0.812

bin=ward;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 777572 0.728 0.821
bin=ward;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 288389 0.835 0.824
bin=kmeans;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=2;preClustering=T
bin=kmeans;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=2;preClustering=T 670487 0.8 0.842
bin=kmeans;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=2;preClustering=T 116739 0.834 0.84
bin=kmeans;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=2;preClustering=T 26225 0.794 0.846
bin=kmeans;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=2;preClustering=T 777572 0.816 0.843
bin=kmeans;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=2;preClustering=T 288389 0.796 0.844
bin=GMM;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=2;preClustering=T
bin=GMM;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=2;preClustering=T 670487 0.832 0.821
bin=GMM;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=

bin=ward;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T 26225 0.842 0.826
bin=ward;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T 777572 0.826 0.824
bin=ward;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T 288389 0.839 0.831
bin=kmeans;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.995;ds=3;preClustering=T
bin=kmeans;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.995;ds=3;preClustering=T 670487 0.824 0.836
bin=kmeans;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.995;ds=3;preClustering=T 116739 0.847 0.836
bin=kmeans;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.995;ds=3;preClustering=T 26225 0.795 0.826
bin=kmeans;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.995;ds=3;preClustering=T 777572 0.821 0.827
bin=kmeans;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.995;ds=3;preClustering=T 288389 0.781 0.82
bin=GMM;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.995;ds=3;preClustering=T
bin=GMM;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.995;ds=3;preClusterin

bin=ward;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T 116739 0.811 0.83
bin=ward;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T 26225 0.831 0.802
bin=ward;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T 777572 0.849 0.823
bin=ward;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T 288389 0.849 0.833
bin=kmeans;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;preClustering=T
bin=kmeans;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;preClustering=T 670487 0.708 0.833
bin=kmeans;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;preClustering=T 116739 0.739 0.832
bin=kmeans;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;preClustering=T 26225 0.715 0.832
bin=kmeans;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;preClustering=T 777572 0.737 0.836
bin=kmeans;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=0;preClustering=T 288389 0.791 0.835
bin=GMM;pval=0.001;direction=UP-DOWN

bin=GMM;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T 288389 0.673 0.819
bin=ward;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T
bin=ward;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T 670487 0.744 0.833
bin=ward;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T 116739 0.737 0.823
bin=ward;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T 26225 0.767 0.832
bin=ward;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T 777572 0.711 0.833
bin=ward;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T 288389 0.849 0.837
bin=kmeans;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.95;ds=1;preClustering=T
bin=kmeans;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.95;ds=1;preClustering=T 670487 0.71 0.841
bin=kmeans;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.95;ds=1;preClustering=T 116739 0.708 0.837
bin=kmeans;pval=0.001;direction=BOTH;clust=WGCNA;dch=0

bin=GMM;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 116739 0.837 0.814
bin=GMM;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 26225 0.73 0.814
bin=GMM;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 777572 0.731 0.814
bin=GMM;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 288389 0.732 0.814
bin=ward;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T
bin=ward;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 670487 0.7 0.832
bin=ward;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 116739 0.772 0.832
bin=ward;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 26225 0.709 0.808
bin=ward;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 777572 0.837 0.809
bin=ward;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 288389 0.716 0.835
bin=kmeans;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=2;p

bin=kmeans;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T 288389 0.793 0.841
bin=GMM;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T
bin=GMM;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T 670487 0.84 0.827
bin=GMM;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T 116739 0.757 0.822
bin=GMM;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T 26225 0.775 0.82
bin=GMM;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T 777572 0.843 0.816
bin=GMM;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T 288389 0.781 0.818
bin=ward;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T
bin=ward;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T 670487 0.799 0.829
bin=ward;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T 116739 0.827 0.806
bin=ward;pval=0.001;direction=UP-DOWN;clust=W

bin=kmeans;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T 116739 0.732 0.842
bin=kmeans;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T 26225 0.762 0.837
bin=kmeans;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T 777572 0.74 0.838
bin=kmeans;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T 288389 0.71 0.833
bin=GMM;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T
bin=GMM;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T 670487 0.766 0.816
bin=GMM;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T 116739 0.839 0.827
bin=GMM;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T 26225 0.769 0.819
bin=GMM;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T 777572 0.773 0.814
bin=GMM;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T 288389 0.787 0.823
bin=ward;pval=0.001;direction=BOTH;clust=WGCNA;dch=

bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T 670487 0.835 0.837
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T 116739 0.823 0.834
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T 26225 0.836 0.834
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T 777572 0.829 0.838
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T 288389 0.727 0.834
bin=GMM;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T
bin=GMM;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T 670487 0.834 0.829
bin=GMM;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T 116739 0.841 0.814
bin=GMM;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T 26225 0.659 0.829
bin=GMM;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=

bin=ward;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 777572 0.838 0.834
bin=ward;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 288389 0.836 0.832
bin=kmeans;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T
bin=kmeans;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 670487 0.755 0.838
bin=kmeans;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 116739 0.835 0.837
bin=kmeans;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 26225 0.754 0.838
bin=kmeans;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 777572 0.769 0.845
bin=kmeans;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 288389 0.8 0.844
bin=GMM;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T
bin=GMM;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 670487 0.681 0.817
bin=GMM;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.95;ds=2;preClustering=T 

bin=ward;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 116739 0.844 0.826
bin=ward;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 26225 0.853 0.823
bin=ward;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 777572 0.854 0.823
bin=ward;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 288389 0.847 0.823
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T 670487 0.834 0.846
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T 116739 0.827 0.847
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T 26225 0.832 0.845
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T 777572 0.813 0.848
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T 288389 0.839 0.842
bin=GMM;pval=0.05;direction=UP-DOWN

bin=ward;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T
bin=ward;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T 670487 0.833 0.83
bin=ward;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T 116739 0.848 0.829
bin=ward;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T 26225 0.855 0.842
bin=ward;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T 777572 0.852 0.827
bin=ward;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T 288389 0.833 0.828
bin=kmeans;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T
bin=kmeans;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T 670487 0.765 0.84
bin=kmeans;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T 116739 0.846 0.84
bin=kmeans;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T 26225 0.854 0.838
bin=kmeans;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.995

bin=GMM;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T 777572 0.763 0.815
bin=GMM;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T 288389 0.685 0.815
bin=ward;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T
bin=ward;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T 670487 0.702 0.711
bin=ward;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T 116739 0.814 0.833
bin=ward;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T 26225 0.845 0.82
bin=ward;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T 777572 0.843 0.767
bin=ward;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T 288389 0.719 0.83
bin=kmeans;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T
bin=kmeans;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preClustering=T 670487 0.775 0.834
bin=kmeans;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=1;preCluster

bin=GMM;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 116739 0.669 0.815
bin=GMM;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 26225 0.665 0.815
bin=GMM;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 777572 0.686 0.813
bin=GMM;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 288389 0.839 0.814
bin=ward;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T
bin=ward;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 670487 0.824 0.824
bin=ward;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 116739 0.823 0.82
bin=ward;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 26225 0.824 0.823
bin=ward;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 777572 0.79 0.82
bin=ward;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 288389 0.792 0.814
bin=kmeans;pval=0.01;direction=BOTH;clust=WGCNA;

bin=GMM;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T
bin=GMM;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 670487 0.745 0.812
bin=GMM;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 116739 0.735 0.813
bin=GMM;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 26225 0.733 0.815
bin=GMM;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 777572 0.733 0.815
bin=GMM;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 288389 0.734 0.81
bin=ward;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T
bin=ward;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 670487 0.846 0.837
bin=ward;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 116739 0.751 0.837
bin=ward;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 26225 0.782 0.823
bin=ward;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 777572 0.789 0.838
bi

bin=kmeans;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T 777572 0.838 0.84
bin=kmeans;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T 288389 0.843 0.84
bin=GMM;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T
bin=GMM;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T 670487 0.783 0.817
bin=GMM;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T 116739 0.784 0.809
bin=GMM;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T 26225 0.793 0.808
bin=GMM;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T 777572 0.791 0.817
bin=GMM;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T 288389 0.836 0.811
bin=ward;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T
bin=ward;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T 670487 0.843 0.815
bin=ward;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=

bin=kmeans;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T 670487 0.823 0.833
bin=kmeans;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T 116739 0.816 0.837
bin=kmeans;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T 26225 0.728 0.823
bin=kmeans;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T 777572 0.824 0.827
bin=kmeans;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T 288389 0.815 0.835
bin=GMM;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T
bin=GMM;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T 670487 0.674 0.823
bin=GMM;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T 116739 0.839 0.815
bin=GMM;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T 26225 0.84 0.817
bin=GMM;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T 777572 0.674 0.817
bin=GMM;pval=0.005;direction=BOTH;clust=WGCNA;d

bin=ward;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.995;ds=1;preClustering=T 777572 0.756 0.747
bin=ward;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.995;ds=1;preClustering=T 288389 0.844 0.825
bin=kmeans;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T
bin=kmeans;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 670487 0.836 0.833
bin=kmeans;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 116739 0.838 0.834
bin=kmeans;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 26225 0.835 0.836
bin=kmeans;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 777572 0.838 0.836
bin=kmeans;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 288389 0.837 0.833
bin=GMM;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T
bin=GMM;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=2;preClustering=T 670487 0.67 0.819
bin=GMM;pval=0.005;direction=UP-DOWN;clust=W

bin=ward;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=3;preClustering=T 670487 0.847 0.838
bin=ward;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=3;preClustering=T 116739 0.858 0.832
bin=ward;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=3;preClustering=T 26225 0.85 0.834
bin=ward;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=3;preClustering=T 777572 0.853 0.833
bin=ward;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=3;preClustering=T 288389 0.855 0.823
bin=kmeans;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T
bin=kmeans;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 670487 0.735 0.838
bin=kmeans;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 116739 0.739 0.837
bin=kmeans;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 26225 0.727 0.84
bin=kmeans;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T 777572 0.737 0.84
bin=kmeans;pval=0.005;direction=BOTH;cl

bin=GMM;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.95;ds=4;preClustering=T 777572 0.73 0.813
bin=GMM;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.95;ds=4;preClustering=T 288389 0.731 0.813
bin=ward;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.95;ds=4;preClustering=T
bin=ward;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.95;ds=4;preClustering=T 670487 0.821 0.835
bin=ward;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.95;ds=4;preClustering=T 116739 0.828 0.835
bin=ward;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.95;ds=4;preClustering=T 26225 0.852 0.824
bin=ward;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.95;ds=4;preClustering=T 777572 0.801 0.835
bin=ward;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.95;ds=4;preClustering=T 288389 0.795 0.834
bin=kmeans;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T
bin=kmeans;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T 670487 0.837 0.839
bin=kmeans;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;pre

### Saving method performaces for all parameter combinations

In [64]:
pd.DataFrame.from_records(clustering_similarities).to_csv("UnPaSt_similarities.tsv",sep = "\t")
pd.DataFrame.from_records(subt_t).to_csv("UnPaSt_TCGA.tsv",sep = "\t")
pd.DataFrame.from_records(subt_m).to_csv("UnPaSt_METABRIC.tsv",sep = "\t")
#df = pd.read_csv("UnPaSt_TCGA.tsv",sep = "\t",index_col =0).groupby("parameters").agg("mean").sort_values(by= cl,ascending=False)
#df2 = pd.read_csv("UnPaSt_METABRIC.tsv",sep = "\t",index_col =0).groupby("parameters").agg("mean").sort_values(by= cl,ascending=False)

# 3. Selecting parameters for TCGA and METABRIC
* max. performance for PAM50 classification
* TCGA-BRCA = 0.852
* METABRIC = 0.845

In [107]:
# TCGA-BRCA 
cl = "PAM50"
df = pd.DataFrame.from_records(subt_t).groupby("parameters").agg("mean")
df = df.sort_values(cl,ascending = False)
df.head(10)

Unnamed: 0_level_0,PAM50,Intrinsic,PAM50_AB,SCMOD2,IHC,Luminal,Basal,Her2,LumA,LumB,...,Claudin-low,IHC_HER2,IHC_ER,IHC_PR,IHC_TNBC,NET_kmeans,NET_ward,seed,run,time
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bin=ward;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=3;preClustering=T,0.852806,0.818664,0.619256,0.667183,0.708085,0.912704,0.946757,0.48227,0.668896,0.456727,...,0.119802,0.396739,0.863619,0.78232,0.474118,0.663697,0.56465,375882.4,2.0,0.015121
bin=ward;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=3;preClustering=T,0.852708,0.818569,0.612114,0.672083,0.706543,0.914291,0.941046,0.48227,0.674865,0.436289,...,0.118305,0.396739,0.863433,0.785107,0.472746,0.646916,0.58466,375882.4,2.0,0.015121
bin=ward;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T,0.852124,0.820628,0.61873,0.661658,0.724461,0.914427,0.940465,0.479031,0.669067,0.458681,...,0.113962,0.391367,0.867202,0.784839,0.47093,0.634428,0.562521,375882.4,2.0,0.015121
bin=ward;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=4;preClustering=T,0.852082,0.817968,0.617381,0.666309,0.70797,0.911593,0.948596,0.48227,0.665307,0.456727,...,0.1192,0.396739,0.863479,0.782235,0.473166,0.663697,0.56465,375882.4,2.0,0.015121
bin=ward;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.95;ds=3;preClustering=T,0.851299,0.819054,0.617083,0.660899,0.704382,0.912378,0.944388,0.48227,0.64771,0.475956,...,0.113962,0.396739,0.863861,0.782256,0.473786,0.657962,0.551061,375882.4,2.0,0.015121
bin=ward;pval=0.05;direction=BOTH;clust=WGCNA;dch=0.95;ds=4;preClustering=T,0.850751,0.819123,0.624003,0.660449,0.716855,0.910201,0.947265,0.48227,0.663921,0.478168,...,0.114287,0.396739,0.867292,0.783678,0.470874,0.657962,0.551061,375882.4,2.0,0.015121
bin=kmeans;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=4;preClustering=T,0.85069,0.817778,0.633483,0.674252,0.701811,0.917107,0.929185,0.482478,0.718875,0.451254,...,0.120295,0.424035,0.855934,0.786618,0.463306,0.327332,0.275547,375882.4,2.0,0.015121
bin=ward;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=4;preClustering=T,0.850316,0.816273,0.612875,0.670878,0.706186,0.910563,0.941046,0.48227,0.674865,0.436289,...,0.118305,0.396739,0.862644,0.783256,0.472746,0.646916,0.58466,375882.4,2.0,0.015121
bin=kmeans;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=3;preClustering=T,0.849503,0.816637,0.633765,0.674636,0.703406,0.915425,0.929185,0.482478,0.719261,0.451248,...,0.120104,0.424035,0.855259,0.785502,0.463306,0.331898,0.269751,375882.4,2.0,0.015121
bin=ward;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T,0.849017,0.817501,0.617531,0.652607,0.714139,0.907314,0.941947,0.491428,0.673546,0.451109,...,0.120538,0.403471,0.861674,0.778864,0.470306,0.593159,0.542847,375882.4,2.0,0.015121


In [49]:
# METABRIC
df2 = pd.DataFrame.from_records(subt_m).groupby("parameters").agg("mean")
df2 = df2.sort_values(cl,ascending = False)
df2.head(10)

Unnamed: 0_level_0,PAM50,Intrinsic,PAM50_AB,SCMOD2,IHC,Luminal,Basal,Her2,LumA,LumB,...,Claudin-low,IHC_HER2,IHC_ER,IHC_PR,IHC_TNBC,NET_kmeans,NET_ward,seed,run,time
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T,0.845645,0.814071,0.578241,0.614674,0.763691,0.932988,0.869298,0.496223,0.571114,0.553441,...,0.160535,0.79027,0.93639,0.681838,0.681682,0.791445,0.636467,375882.4,2.0,0.047742
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=0;preClustering=T,0.845598,0.813851,0.582768,0.603657,0.740343,0.928953,0.863469,0.529044,0.571114,0.560592,...,0.160535,0.717883,0.929332,0.679421,0.681682,0.706168,0.496296,375882.4,2.0,0.019611
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=1;preClustering=T,0.845565,0.813987,0.584253,0.603499,0.742601,0.928953,0.863469,0.529044,0.571114,0.560519,...,0.160535,0.717883,0.929332,0.679421,0.681682,0.706168,0.496296,375882.4,2.0,0.02064
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=2;preClustering=T,0.845221,0.813657,0.582303,0.608683,0.754936,0.932988,0.864903,0.497984,0.571114,0.564405,...,0.160535,0.795081,0.93639,0.681838,0.681682,0.706193,0.496313,375882.4,2.0,0.030309
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T,0.843692,0.812269,0.579321,0.620045,0.779307,0.932129,0.868199,0.487593,0.571114,0.559239,...,0.160535,0.838778,0.938453,0.681908,0.681682,0.791445,0.636467,375882.4,2.0,0.066756
bin=kmeans;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=0;preClustering=T,0.842902,0.811304,0.563861,0.563194,0.73563,0.926916,0.856946,0.52654,0.502493,0.568243,...,0.15959,0.727946,0.924472,0.674109,0.675701,0.711591,0.500036,375882.4,2.0,0.018016
bin=kmeans;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=0;preClustering=T,0.842358,0.81078,0.555466,0.551287,0.707574,0.924308,0.853825,0.53873,0.490893,0.551685,...,0.16063,0.733361,0.920366,0.674554,0.681761,0.710235,0.499101,375882.4,2.0,0.017783
bin=kmeans;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=1;preClustering=T,0.842341,0.810716,0.55834,0.556616,0.738771,0.92789,0.856946,0.515215,0.504145,0.554576,...,0.15959,0.75441,0.928042,0.675513,0.675701,0.711591,0.500036,375882.4,2.0,0.020972
bin=kmeans;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=1;preClustering=T,0.841946,0.810344,0.551577,0.544238,0.728856,0.924471,0.852027,0.535004,0.488168,0.544972,...,0.161438,0.739655,0.923652,0.67332,0.679707,0.707498,0.497214,375882.4,2.0,0.022167
bin=kmeans;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T,0.840818,0.809557,0.56038,0.579753,0.784695,0.931479,0.858427,0.482719,0.508724,0.564797,...,0.160588,0.852329,0.939144,0.680764,0.678733,0.786067,0.627347,375882.4,2.0,0.06076


# 4. Optimal parameters selection

* minimal rank sum for TCGA and METABRIC

In [59]:
df["rank"] = range(df.shape[0])
df2["rank"] = range(df2.shape[0])
r = df["rank"]+df2["rank"]
r.sort_values().head(10)

parameters
bin=kmeans;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T      50
bin=kmeans;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T      53
bin=kmeans;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T     60
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T      65
bin=kmeans;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T      67
bin=kmeans;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T     71
bin=kmeans;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T     94
bin=kmeans;pval=0.001;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T    103
bin=kmeans;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=2;preClustering=T     104
bin=kmeans;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.95;ds=3;preClustering=T      116
Name: rank, dtype: int64

### Performance with selected parameters only slightly decreases compared to best performance

In [62]:
params = r.sort_values().index[0]
print(params)
df.loc[[params],:]

bin=kmeans;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T


Unnamed: 0_level_0,PAM50,Intrinsic,PAM50_AB,SCMOD2,IHC,Luminal,Basal,Her2,LumA,LumB,...,IHC_HER2,IHC_ER,IHC_PR,IHC_TNBC,NET_kmeans,NET_ward,seed,run,time,rank
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bin=kmeans;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T,0.841306,0.809262,0.631223,0.662526,0.710483,0.89918,0.945687,0.475058,0.689499,0.473313,...,0.417338,0.864536,0.778363,0.471613,0.43055,0.351626,375882.4,2.0,0.015121,40


In [63]:
df2.loc[[params],:]

Unnamed: 0_level_0,PAM50,Intrinsic,PAM50_AB,SCMOD2,IHC,Luminal,Basal,Her2,LumA,LumB,...,IHC_HER2,IHC_ER,IHC_PR,IHC_TNBC,NET_kmeans,NET_ward,seed,run,time,rank
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bin=kmeans;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T,0.840732,0.809425,0.561267,0.578005,0.777367,0.931168,0.857924,0.482644,0.506047,0.569946,...,0.857478,0.938055,0.68035,0.678733,0.77201,0.597819,375882.4,2.0,0.038138,10


### Similarities of (bi)clusters in TCGA and METABRIC 

In [108]:
#s = pd.read_csv("UnPaSt_similarities.tsv",sep = "\t",index_col=0).groupby("parameters").agg("mean")
s = pd.DataFrame.from_records(clustering_similarities).groupby("parameters").agg("mean")
s["avg_percent_matched"] = (s["percent_matched_1"]+s["percent_matched_2"])*0.5
s.sort_values(by = "avg_percent_matched",ascending = False)

Unnamed: 0_level_0,n_1,n_2,percent_matched_1,percent_matched_2,n_shared_genes_1,avg_bm_J_1,n_shared_genes_2,avg_bm_J_2,seed,run,avg_percent_matched
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
bin=kmeans;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=0;preClustering=T,44.8,51.0,0.544970,0.573866,293.0,0.171955,314.2,0.150679,375882.4,2.0,0.559418
bin=kmeans;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T,35.4,61.8,0.576392,0.528918,168.4,0.157806,246.4,0.113102,375882.4,2.0,0.552655
bin=kmeans;pval=0.005;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=1;preClustering=T,68.4,62.2,0.485745,0.576198,302.6,0.167821,319.0,0.160051,375882.4,2.0,0.530972
bin=kmeans;pval=0.01;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T,40.4,66.0,0.533764,0.514946,186.6,0.160966,261.2,0.114310,375882.4,2.0,0.524355
bin=ward;pval=0.005;direction=BOTH;clust=WGCNA;dch=0.995;ds=0;preClustering=T,37.0,70.8,0.597559,0.442186,191.8,0.146806,233.2,0.114953,375882.4,2.0,0.519873
...,...,...,...,...,...,...,...,...,...,...,...
bin=ward;pval=0.1;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T,483.6,421.8,0.161696,0.198435,282.2,0.250113,303.8,0.243000,375882.4,2.0,0.180065
bin=ward;pval=0.05;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=4;preClustering=T,428.6,386.6,0.161072,0.189657,246.2,0.236848,276.2,0.233937,375882.4,2.0,0.175365
bin=ward;pval=0.001;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T,214.2,240.6,0.149559,0.195537,159.8,0.222312,218.0,0.174894,375882.4,2.0,0.172548
bin=ward;pval=0.1;direction=BOTH;clust=WGCNA;dch=0.995;ds=4;preClustering=T,429.6,351.2,0.143510,0.194139,213.6,0.302687,246.0,0.292358,375882.4,2.0,0.168825


In [67]:
s.loc[[params],:]

Unnamed: 0_level_0,n_1,n_2,percent_matched_1,percent_matched_2,n_shared_genes_1,avg_bm_J_1,n_shared_genes_2,avg_bm_J_2,seed,run,avg_percent_matched
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
bin=kmeans;pval=0.01;direction=UP-DOWN;clust=WGCNA;dch=0.995;ds=3;preClustering=T,171.8,161.2,0.354259,0.378438,237.4,0.202671,263.8,0.199374,375882.4,2.0,0.366349
