In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors
from IPython.display import display, Markdown
import warnings

warnings.filterwarnings("ignore")

In [2]:
def background_gradient(s, m=None, M=None, cmap="Blues", low=0, high=0.5):
    if m is None:
        m = s.min().min()
    if M is None:
        M = s.max().max()
    rng = M - m
    norm = colors.Normalize(m - (rng * low), M + (rng * high))
    normed = s.apply(norm)

    cm = plt.cm.get_cmap(cmap)
    c = normed.applymap(lambda x: colors.rgb2hex(cm(x)))
    ret = c.applymap(lambda x: "background-color: %s" % x)
    return ret


def get_characteristic(clinical_view, assignment):
    data = clinical_view.copy()
    data["cluster"] = assignment["assignment"]
    clinical_view["cluster"] = assignment["assignment"].astype(int)
    cons = [
        "cons05.resp",
        "cons05.cvs",
        "cons05.cns",
        "cons05.ren",
        "cons05.hep",
        "cons05.hem",
    ]
    overall_mean = clinical_view[
        cons
        + ["cons05.score", "ccc.summary", "age.at.bc", "sex", "death.30.bc", "picu"]
    ].mean()

    report = clinical_view.groupby("cluster").mean()[
        cons
        + ["cons05.score", "ccc.summary", "age.at.bc", "sex", "death.30.bc", "picu"]
    ]
    report["size"] = clinical_view.groupby("cluster").size().astype(str)
    report.loc["overall"] = overall_mean.tolist() + [str(len(clinical_view))]
    pd.set_option("display.precision", 3)
    display(report.T)

In [3]:
data_path = "data/"
score_path = "{}/ClusteringResults/".format(data_path)
KCC_path = "{}/KCC/".format(data_path)

# physio_view_original = pd.read_csv("data/PhysioView.csv", index_col=0)
# physio_view = pd.read_csv("data/PhysioViewNormalImputed.csv", index_col=0)

# contextual_view = pd.read_csv("data/ContextualViewZeroImputed.csv", index_col=0)
# clinical_view = pd.concat([physio_view, contextual_view], 1)

# data_episode = pd.read_csv("../olinks/spss_mlcb_olink_episode.csv", index_col=0)
# data_episode = data_episode.loc[clinical_view.index]
# proteome_view = pd.read_csv("data/ProteomeViewStandardized.csv", index_col=0)
# clinical_view[["death.30.bc", "picu"]] = data_episode[["death.30.bc", "picu"]].replace(
#     ["no", "yes"], [0, 1]
# )
# clinical_view = clinical_view[clinical_view.index.isin(proteome_view.index)]

FileNotFoundError: [Errno 2] No such file or directory: 'data/PhysioView.csv'

In [4]:
configs = [
    ["clinical", 4, "DBSCAN"],
    ["contextual", 6, "DBSCAN"],
    ["physio", 4, "ConsensusKMeans"],
    ["proteome", 5, "DBSCAN"]
]


# cluster overlapping single view

In [5]:
for i in range(len(configs)):
    for j in range(len(configs)):
        if j > i:
            view1, KCC_space, method1 = configs[i]
            assignment1 = pd.read_csv(
                "{}/{}_{}_view_KCC_{}_assignments_ranked.csv".format(
                    score_path, method1, view1, KCC_space
                ),
                index_col=0,
            )
            if method1 == "DBSCAN":
                assignment1["assignment"] = assignment1["assignment"] + 1
                assignment1 = assignment1[assignment1["assignment"] != 0]

            view2, KCC_space, method2 = configs[j]
            assignment2 = pd.read_csv(
                "{}/{}_{}_view_KCC_{}_assignments_ranked.csv".format(
                    score_path, method2, view2, KCC_space
                ),
                index_col=0,
            )
            if method2 == "DBSCAN":
                assignment2["assignment"] = assignment2["assignment"] + 1
                assignment2 = assignment2[assignment2["assignment"] != 0]

            agreement_mat = pd.DataFrame(
                index=[
                    "{} {}".format(view1, i)
                    for i in sorted(assignment1["assignment"].unique())
                ],
                columns=[
                    "{} {}".format(view2, i)
                    for i in sorted(assignment2["assignment"].unique())
                ],
            )
            display(Markdown(
                "**{} view, KCC {}, {} / {} view, KCC {}, {}**".format(
                    *(configs[i] + configs[j])
                )
            ))

            for cluster1 in sorted(assignment1["assignment"].unique()):
                for cluster2 in sorted(assignment2["assignment"].unique()):
                    index_1 = assignment1[assignment1["assignment"] == cluster1].index
                    index_2 = assignment2[assignment2["assignment"] == cluster2].index
                    len_overlap = len(list(set(index_1).intersection(set(index_2))))
                    len_union = len(set(index_1).union(set(index_2)))
                    agreement_mat.loc[
                        "{} {}".format(view1, cluster1), "{} {}".format(view2, cluster2)
                    ] = (len_overlap / len_union)
            agreement_mat = agreement_mat.astype(float)
            display(
                agreement_mat.style.set_precision(3).apply(
                    background_gradient, axis=None
                )
            )
            print("-" * 80)

**clinical view, KCC 4, DBSCAN / contextual view, KCC 6, DBSCAN**

Unnamed: 0,contextual 1,contextual 2,contextual 3,contextual 4,contextual 5,contextual 6,contextual 7
clinical 1,0.173,0.0,0.429,0.042,0.028,0.027,0.008
clinical 2,0.325,0.223,0.021,0.039,0.125,0.042,0.0
clinical 3,0.0,0.0,0.01,0.614,0.077,0.01,0.0
clinical 4,0.0,0.049,0.03,0.009,0.02,0.0,0.676
clinical 5,0.024,0.0,0.018,0.008,0.144,0.456,0.061


--------------------------------------------------------------------------------


**clinical view, KCC 4, DBSCAN / physio view, KCC 4, ConsensusKMeans**

Unnamed: 0,physio 1,physio 2,physio 3,physio 4
clinical 1,0.013,0.43,0.07,0.014
clinical 2,0.658,0.107,0.012,0.0
clinical 3,0.028,0.006,0.623,0.0
clinical 4,0.013,0.092,0.04,0.356
clinical 5,0.045,0.087,0.045,0.355


--------------------------------------------------------------------------------


**clinical view, KCC 4, DBSCAN / proteome view, KCC 5, DBSCAN**

Unnamed: 0,proteome 1,proteome 2,proteome 3
clinical 1,0.077,0.177,0.106
clinical 2,0.438,0.146,0.019
clinical 3,0.0,0.243,0.018
clinical 4,0.007,0.137,0.255
clinical 5,0.132,0.137,0.135


--------------------------------------------------------------------------------


**contextual view, KCC 6, DBSCAN / physio view, KCC 4, ConsensusKMeans**

Unnamed: 0,physio 1,physio 2,physio 3,physio 4
contextual 1,0.187,0.237,0.03,0.014
contextual 2,0.232,0.015,0.01,0.009
contextual 3,0.045,0.186,0.073,0.058
contextual 4,0.054,0.084,0.419,0.0
contextual 5,0.138,0.093,0.073,0.058
contextual 6,0.075,0.083,0.043,0.2
contextual 7,0.007,0.058,0.049,0.404


--------------------------------------------------------------------------------


**contextual view, KCC 6, DBSCAN / proteome view, KCC 5, DBSCAN**

Unnamed: 0,proteome 1,proteome 2,proteome 3
contextual 1,0.333,0.097,0.016
contextual 2,0.211,0.026,0.012
contextual 3,0.081,0.11,0.105
contextual 4,0.0,0.258,0.043
contextual 5,0.008,0.166,0.082
contextual 6,0.123,0.122,0.048
contextual 7,0.007,0.109,0.318


--------------------------------------------------------------------------------


**physio view, KCC 4, ConsensusKMeans / proteome view, KCC 5, DBSCAN**

Unnamed: 0,proteome 1,proteome 2,proteome 3
physio 1,0.317,0.17,0.047
physio 2,0.14,0.269,0.105
physio 3,0.046,0.261,0.056
physio 4,0.082,0.155,0.265


--------------------------------------------------------------------------------


# cluster characteristics

## single view

In [10]:
configs = [
    ["clinical", 4, "DBSCAN"],
    ["contextual", 6, "DBSCAN"],
    ["physio", 4, "ConsensusKMeans"],
    ["proteome", 5, "DBSCAN"]
]
for i in range(len(configs)):
    view, KCC_space, method = configs[i]
    
    assignment = pd.read_csv(
        "{}/{}_{}_view_KCC_{}_assignments_ranked.csv".format(
            score_path, method, view, KCC_space
        ),
        index_col=0,
    )
    if method == "DBSCAN":
        assignment["assignment"] = assignment["assignment"] + 1
        assignment = assignment[assignment["assignment"] != 0]
    assignment['assignment'] = assignment['assignment'].astype(int)
    print('{} view, KCC {}, {}'.format(*configs[i]))
    get_characteristic(clinical_view, assignment)
    print("-" * 80)

clinical view, KCC 4, DBSCAN


cluster,1.0,2.0,3.0,4.0,5.0,overall
cons05.resp,0.046,0.0,0.0,0.603,0.606,0.229
cons05.cvs,0.062,0.03,0.094,0.138,0.515,0.162
cons05.cns,0.0,0.02,0.0,0.086,0.348,0.089
cons05.ren,0.031,0.04,0.0,0.034,0.061,0.034
cons05.hep,0.0,0.03,0.302,0.017,0.121,0.078
cons05.hem,0.015,0.01,0.717,0.052,0.318,0.182
cons05.score,0.154,0.131,1.113,0.931,1.97,0.774
ccc.summary,0.185,0.293,1.094,1.483,1.227,0.788
age.at.bc,105.292,2787.657,2370.302,25.845,892.061,1347.137
sex,0.677,0.657,0.623,0.655,0.591,0.631


--------------------------------------------------------------------------------
contextual view, KCC 6, DBSCAN


cluster,1.0,2.0,3.0,4.0,5.0,6.0,7.0,overall
cons05.resp,0.0,0.037,0.111,0.017,0.133,0.51,0.732,0.229
cons05.cvs,0.0,0.037,0.156,0.05,0.2,0.449,0.232,0.162
cons05.cns,0.062,0.037,0.022,0.0,0.067,0.286,0.125,0.089
cons05.ren,0.016,0.0,0.0,0.05,0.067,0.041,0.054,0.034
cons05.hep,0.0,0.037,0.0,0.2,0.089,0.143,0.0,0.078
cons05.hem,0.0,0.037,0.022,0.533,0.178,0.265,0.125,0.182
cons05.score,0.078,0.185,0.311,0.85,0.733,1.694,1.268,0.774
ccc.summary,0.031,0.074,0.111,1.067,1.156,1.082,1.607,0.788
age.at.bc,1467.156,3234.889,85.778,1954.7,1820.667,1517.204,11.839,1347.137
sex,0.609,0.778,0.667,0.6,0.467,0.735,0.625,0.631


--------------------------------------------------------------------------------
physio view, KCC 4, ConsensusKMeans


cluster,1,2,3,4,overall
cons05.resp,0.042,0.13,0.097,0.687,0.229
cons05.cvs,0.074,0.056,0.125,0.434,0.162
cons05.cns,0.021,0.028,0.014,0.313,0.089
cons05.ren,0.042,0.028,0.0,0.06,0.034
cons05.hep,0.032,0.019,0.222,0.084,0.078
cons05.hem,0.032,0.019,0.569,0.229,0.182
cons05.score,0.242,0.278,1.028,1.807,0.774
ccc.summary,0.4,0.509,0.875,1.518,0.788
age.at.bc,2907.558,364.167,1776.903,467.349,1347.137
sex,0.642,0.639,0.611,0.627,0.631


--------------------------------------------------------------------------------
proteome view, KCC 5, DBSCAN


cluster,1.0,2.0,3.0,overall
cons05.resp,0.125,0.192,0.5,0.229
cons05.cvs,0.114,0.163,0.233,0.162
cons05.cns,0.114,0.062,0.15,0.089
cons05.ren,0.0,0.034,0.083,0.034
cons05.hep,0.057,0.101,0.033,0.078
cons05.hem,0.091,0.25,0.083,0.182
cons05.score,0.5,0.803,1.083,0.774
ccc.summary,0.057,0.952,1.3,0.788
age.at.bc,2168.989,1337.909,216.05,1347.137
sex,0.648,0.615,0.65,0.631


--------------------------------------------------------------------------------


## MVKDR


### proteome + clinical

In [6]:
data_path = 'data'
result_path = '{}/MVKDR_results/'.format(data_path)
Ks = {'clinical':4}
for view in ['clinical']:
    k = Ks[view]
    assignment = pd.read_csv('{}/proteome_{}_K_{}_assignments_ranked.csv'.format(result_path, view, k),index_col=0)
    assignment["assignment"] = assignment["assignment"] .astype(int)
    print('proteome + {} view'.format(view))
    get_characteristic(clinical_view, assignment)
    print("-" * 80)

proteome + clinical view


cluster,1,2,3,4,overall
cons05.resp,0.152,0.242,0.128,0.367,0.229
cons05.cvs,0.139,0.179,0.116,0.204,0.162
cons05.cns,0.089,0.084,0.105,0.082,0.089
cons05.ren,0.0,0.042,0.0,0.082,0.034
cons05.hep,0.114,0.063,0.058,0.082,0.078
cons05.hem,0.443,0.095,0.093,0.133,0.182
cons05.score,0.937,0.705,0.5,0.949,0.774
ccc.summary,0.937,1.074,0.058,1.031,0.788
age.at.bc,2099.797,887.074,2153.349,478.888,1347.137
sex,0.57,0.611,0.663,0.673,0.631


--------------------------------------------------------------------------------


# cluster overlapping, MVKDR vs SingleView

In [7]:
result_path = '{}/MVKDR_results/'.format(data_path)
assignment_mv = pd.read_csv('{}/proteome_clinical_K_4_assignments_ranked.csv'.format(result_path),index_col=0)
assignment_mv["assignment"] = assignment_mv["assignment"] .astype(int)

In [8]:
for i in range(len(configs)):
    view, KCC_space, method = configs[i]
    assignment_sv = pd.read_csv(
        "{}/{}_{}_view_KCC_{}_assignments_ranked.csv".format(
            score_path, method, view, KCC_space
        ),
        index_col=0,
    )
    if method == "DBSCAN":
        assignment_sv["assignment"] = assignment_sv["assignment"] + 1
        assignment_sv = assignment_sv[assignment_sv["assignment"] != 0]
    agreement_mat = pd.DataFrame(
        index=[
            "{} {}".format('MV', i) for i in sorted(assignment_mv["assignment"].unique())
        ],
        columns=[
            "{} {}".format(view, i) for i in sorted(assignment_sv["assignment"].unique())
        ],
    )

    for cluster_mv in sorted(assignment_mv["assignment"].unique()):
        for cluster_sv in sorted(assignment_sv["assignment"].unique()):
            index_mv = assignment_mv[assignment_mv["assignment"] == cluster_mv].index
            index_sv = assignment_sv[assignment_sv["assignment"] == cluster_sv].index
            len_overlap = len(list(set(index_mv).intersection(set(index_sv))))
            len_union = len(set(index_mv).union(set(index_sv)))
            agreement_mat.loc[
                            "{} {}".format('MV', cluster_mv), "{} {}".format(view, cluster_sv)
                        ] = (len_overlap / len_union)
    agreement_mat = agreement_mat.astype(float)
    display(
        agreement_mat.style.set_precision(3).apply(
            background_gradient, axis=None
        )
    )

Unnamed: 0,clinical 1,clinical 2,clinical 3,clinical 4,clinical 5
MV 1,0.029,0.085,0.361,0.038,0.107
MV 2,0.159,0.135,0.05,0.117,0.142
MV 3,0.079,0.445,0.0,0.007,0.126
MV 4,0.207,0.026,0.079,0.3,0.101


Unnamed: 0,contextual 1,contextual 2,contextual 3,contextual 4,contextual 5,contextual 6,contextual 7
MV 1,0.036,0.029,0.042,0.337,0.088,0.133,0.031
MV 2,0.06,0.017,0.148,0.099,0.138,0.125,0.11
MV 3,0.327,0.215,0.083,0.0,0.008,0.116,0.007
MV 4,0.087,0.016,0.092,0.075,0.135,0.028,0.305


Unnamed: 0,physio 1,physio 2,physio 3,physio 4
MV 1,0.123,0.075,0.291,0.087
MV 2,0.159,0.187,0.099,0.141
MV 3,0.312,0.141,0.046,0.076
MV 4,0.038,0.234,0.104,0.248


Unnamed: 0,proteome 1,proteome 2,proteome 3
MV 1,0.012,0.367,0.0
MV 2,0.0,0.212,0.36
MV 3,0.977,0.0,0.0
MV 4,0.0,0.342,0.137
