In [1]:
import pandas as pd

pd.set_option("display.precision", 3)
import os
import warnings

warnings.filterwarnings("ignore")
from sklearn.cluster import SpectralClustering
from mvlearn.cluster import MultiviewKMeans
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from IPython.display import display
import sys
import math
import seaborn as sns

sns.set_style("white")
from ConsensusClusteringMultiView import ConsensusCluster
import scipy.stats as sps
import copy
from tqdm import tqdm
from sklearn.manifold import TSNE
import matplotlib.cm as cm
import matplotlib.lines as mlines
from sklearn.cluster import DBSCAN
from pathlib import Path
from random import shuffle

In [None]:
def cal_F_stat(data):
    """Compute a per-variable one-way ANOVA F statistic between two clusters.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per sample. Must contain a ``"cluster"`` column holding exactly
        two distinct labels; every other column is scored as a variable.

    Returns
    -------
    pandas.Series
        F statistic of each variable (index = variable name), sorted from the
        largest to the smallest value.

    Raises
    ------
    AssertionError
        If ``data["cluster"]`` does not hold exactly two distinct values.
    """
    labels = data["cluster"]
    n_groups = len(labels.unique())
    assert n_groups == 2, "cal_F_stat expects exactly two clusters"

    # Keyword form on purpose: pandas 2.0 removed the positional ``axis``
    # argument of ``DataFrame.drop``.
    features = data.drop(columns="cluster")

    # Group statistics are computed once and reused for both sums of squares.
    grouped = data.groupby("cluster")
    group_means = grouped.mean()
    group_sizes = grouped.size()

    # Between-group sum of squares: size-weighted squared distance of each
    # cluster mean from the overall mean, summed over clusters.
    SSB = ((group_means - features.mean()) ** 2).mul(group_sizes, axis=0).sum(axis=0)

    # Within-group sum of squares: squared distance of every sample from the
    # mean of its own cluster, summed over all samples.
    squared_residuals = [
        (features[labels == k] - group_means.loc[k]) ** 2 for k in group_means.index
    ]
    SSW = pd.concat(squared_residuals).sum()

    # Ratio of degrees of freedom: (N - k) / (k - 1).
    coef = (len(data) - n_groups) / (n_groups - 1)
    return (coef * (SSB / SSW)).sort_values(ascending=False)


def get_ovr_F_stat(data, num_var):
    """Build a permutation null distribution for the two-cluster F statistics.

    The F statistic of every variable is computed once with the real cluster
    labels and then ``19 * num_var`` more times with the labels shuffled.
    Permutation ``i`` is seeded with ``RandomState(i)``, so results are
    reproducible.

    Parameters
    ----------
    data : pandas.DataFrame
        Samples in rows, with a two-valued ``"cluster"`` column (see
        ``cal_F_stat``).
    num_var : int
        Multiplier for the number of permutations.
        NOTE(review): presumably the number of variables tested, so that the
        rank-based p-value resolution reaches 0.05 / num_var -- confirm.

    Returns
    -------
    pandas.DataFrame
        One row per variable, columns ``"random 1" ... "random N"`` with the
        permuted F statistics, followed by a ``"true"`` column with the
        F statistics of the real labels.
    """
    F_stat_true = cal_F_stat(data)
    n_test = 19 * num_var
    labels = data["cluster"].tolist()

    permuted = []
    for i in tqdm(range(n_test)):
        rs = np.random.RandomState(i)
        shuffled = list(labels)
        rs.shuffle(shuffled)
        permuted.append(cal_F_stat(data.assign(cluster=shuffled)))

    if permuted:
        # ``axis`` must be given by keyword: pandas 2.0 made it keyword-only.
        all_F_stat = pd.concat(permuted, axis=1)
        all_F_stat.columns = [
            "random {}".format(i + 1) for i in range(all_F_stat.shape[1])
        ]
    else:
        # No permutations requested: return only the observed statistics
        # instead of failing inside ``pd.concat`` on an empty list.
        all_F_stat = pd.DataFrame(index=F_stat_true.index)
    all_F_stat["true"] = F_stat_true
    return all_F_stat

def plot_top(data, target_var, save_path, name):
    """Save per-cluster histograms of the selected variables as one PNG.

    Parameters
    ----------
    data : pandas.DataFrame
        Samples in rows; must contain a ``"cluster"`` column and every
        variable listed in ``target_var``.
    target_var : iterable of str
        Variables to plot, one panel each on a 2 x 5 grid (so at most ten).
    save_path : str
        Directory the figure is written to.
    name : str
        Figure super-title and file name (without the ``.png`` extension).
    """
    fig = plt.figure(figsize=(30, 10))
    cluster_labels = np.unique(data["cluster"])

    for position, var in enumerate(target_var, start=1):
        ax = fig.add_subplot(2, 5, position)
        # Overlay one semi-transparent, density-normalised histogram per cluster.
        for label in cluster_labels:
            members = data[data["cluster"] == label]
            ax.hist(
                members[var],
                bins=25,
                alpha=0.5,
                density=True,
                histtype="stepfilled",
                label="cluster {}, size {}".format(label, len(members)),
            )
        ax.legend()
        ax.set_title(var, fontsize=15)

    fig.suptitle(name, fontsize=25)
    fig.savefig("{}/{}.png".format(save_path, name), dpi=300)
    # Close explicitly so figures do not pile up when called in a loop.
    plt.close(fig)
    

In [7]:
# Root folder; every other location below is derived from it.
data_path = "data/"

# Folders read by the cells below.
score_path = "{}/Clustering_silhouette/".format(data_path)
KCC_path = "{}/KCC/".format(data_path)

# Folders for plots.
cdf_path = "{}/CDF plots/".format(data_path)
tsne_path = "{}/TSNEplots/".format(data_path)

# Folders for the one-vs-rest statistics and the top-feature figures.
f_stat_path = "{}/F_stat_OVR/".format(data_path)
top_feature_plot_path = "{}/Top_features_OVR/".format(data_path)

In [8]:
# Physiological view: the standardized table defines which columns are used;
# the non-standardized (imputed) table is restricted to those same columns.
physio_view = pd.read_csv("data/PhysioViewStandardized.csv", index_col=0)
physio_view_original = pd.read_csv("data/PhysioViewNormalImputed.csv", index_col=0)[
    physio_view.columns
]
binary_vars = pd.read_csv("{}/ClinicalBinary.csv".format(data_path), index_col=0)

proteome_view = pd.read_csv("data/ProteomeViewMICEimputed.csv", index_col=0)
npx = pd.read_csv("../olinks/20191053_Giannoni_NPX.csv").set_index("Panel")

# Map every NPX column name to the value stored in its "Uniprot ID" row.
uniport = {p: npx.iloc[:, k]["Uniprot ID"] for k, p in enumerate(npx.columns)}

# Rename the proteome columns to their Uniprot IDs; an unknown column raises
# KeyError rather than being silently kept.
uniprot_col = [uniport[col] for col in proteome_view.columns]
proteome_view.columns = uniprot_col

# ``axis`` must be given by keyword: pandas 2.0 made it keyword-only in concat.
continous_vars = pd.concat([physio_view_original, proteome_view], axis=1)

In [9]:
# Number of variables of each kind (columns of the two feature tables) and
# their sum.
nb_continous_vars = len(continous_vars.columns)
nb_binary_vars = len(binary_vars.columns)
total_vars = nb_continous_vars + nb_binary_vars

In [10]:
# Clustering results to analyse. Each entry is [view, KCC_space, method];
# the three values are substituted into the assignment file name
# "<method>_<view>_view_KCC_<KCC_space>_assignments.csv".
configs = [
    ["clinical", 4, "ConsensusKMeans"],
    ["contextual", 5, "ConsensusKMeans"],
    ["physio", 3, "DBSCAN"],
    ["proteome", 3, "ConsensusKMeans"],
    ["proteome", 4, "ConsensusKMeans"],
]


# OVR (one-vs-rest) Fisher exact test

In [None]:
# One-vs-rest Fisher exact test on the binary variables: for every clustering
# in ``configs`` and every cluster, test each binary variable for association
# with membership of that cluster and save the p-values as one CSV per cluster.
for view, KCC_space, method in configs:
    assignment = pd.read_csv(
        "{}/{}_{}_view_KCC_{}_assignments.csv".format(
            score_path, method, view, KCC_space
        ),
        index_col=0,
    )
    print(assignment.head())

    labels = assignment["assignment"]
    if method == "DBSCAN":
        # Shift the labels by one and drop the samples that end up at 0
        # (presumably DBSCAN's noise label -1 -- confirm).
        labels = labels + 1
        labels = labels[labels != 0]
    labels = labels.astype(int)

    data = binary_vars.copy()
    data["cluster"] = labels
    # Samples without an assignment get NaN from the index alignment; drop them.
    data = data[data["cluster"].notnull()]

    all_clusters = sorted(data["cluster"].unique())

    for cluster_id in all_clusters:
        cluster_id = int(cluster_id)
        data_ovr = data.copy()
        # Collapse every other cluster into a single "rest" label.
        data_ovr.loc[data["cluster"] != cluster_id, "cluster"] = cluster_id + 1
        save_name = "{}_{}_KCC_{}_FisherExact_cluster_{}vsR.csv".format(
            method, view, KCC_space, cluster_id
        )

        # Fresh container per cluster so p-values never leak between files.
        p_values = {}
        for col in binary_vars.columns:
            contingency_table = pd.crosstab(data_ovr[col], data_ovr["cluster"])
            # fisher_exact needs a 2x2 table; a variable that is constant in
            # this subset (or a clustering with a single cluster) cannot be
            # tested and is skipped explicitly instead of via a bare except.
            if contingency_table.shape != (2, 2):
                continue
            # scipy.stats is imported as ``sps`` in this notebook.
            p_values[col] = sps.fisher_exact(contingency_table)[1]

        all_p_val = pd.Series(p_values, name="p-value", dtype=float)
        all_p_val.to_csv("{}/{}".format(f_stat_path, save_name))


# OVR (one-vs-rest) F-statistic with permutation p-values

In [None]:
# One-vs-rest F statistic on the continuous variables: for every clustering in
# ``configs`` and every cluster, compute the F statistic of each variable for
# "this cluster vs. the rest" together with a permutation-based p-value.
for view, KCC_space, method in configs:
    assignment = pd.read_csv(
        "{}/{}_{}_view_KCC_{}_assignments.csv".format(
            score_path, method, view, KCC_space
        ),
        index_col=0,
    )
    labels = assignment["assignment"]
    if method == "DBSCAN":
        # Shift the labels by one and drop the samples that end up at 0
        # (presumably DBSCAN's noise label -1 -- confirm).
        labels = labels + 1
        labels = labels[labels != 0]

    data = continous_vars.copy()
    data["cluster"] = labels
    # Drop label 0 as before, and also drop samples without any assignment:
    # ``NaN != 0`` is True, so they used to stay in and were then folded into
    # the "rest" group of every comparison.
    data = data[data["cluster"].notnull() & (data["cluster"] != 0)].copy()
    # The NaNs introduced by the alignment turn the column into floats; cast
    # back so file names read "cluster_1vsR", not "cluster_1.0vsR".
    data["cluster"] = data["cluster"].astype(int)
    all_clusters = sorted(data["cluster"].unique())

    for cluster_id in all_clusters:
        cluster_id = int(cluster_id)
        data_ovr = data.copy()
        # Collapse every other cluster into a single "rest" label.
        data_ovr.loc[data["cluster"] != cluster_id, "cluster"] = cluster_id + 1
        save_name = "{}_{}_KCC_{}_F_stat_cluster_{}vsR.csv".format(
            method, view, KCC_space, cluster_id
        )
        all_F_stat = get_ovr_F_stat(data_ovr, total_vars)

        # Permutation p-value: rank of the observed statistic among all
        # statistics of the same variable (1 = largest), divided by the
        # number of statistics.
        F_stat_true_pvalue = (
            all_F_stat.rank(axis=1, ascending=False)["true"] / all_F_stat.shape[1]
        )
        # ``axis`` must be given by keyword: pandas 2.0 made it keyword-only.
        F_stat_true = pd.concat([all_F_stat["true"], F_stat_true_pvalue], axis=1)
        F_stat_true.columns = ["F_stat", "p-value"]

        F_stat_true.to_csv("{}/{}".format(f_stat_path, save_name))

9120


 55%|██████████████████████████████████████████▌                                   | 4978/9120 [06:36<05:50, 11.81it/s]

# Plot top features (one-vs-rest)

In [None]:
# Plot, for every clustering and every cluster, the ten most discriminating
# variables of the one-vs-rest comparison.
#
# ``all_features`` was never defined in this notebook, so the cell failed on a
# fresh kernel. The plotted variables come from both the F-stat (continuous)
# and the Fisher (binary) result files, so both tables are needed.
# NOTE(review): assumed to be continuous + binary variables -- confirm.
all_features = pd.concat([continous_vars, binary_vars], axis=1)

for view, KCC_space, method in configs:
    assignment = pd.read_csv(
        "{}/{}_{}_view_KCC_{}_assignments.csv".format(
            score_path, method, view, KCC_space
        ),
        index_col=0,
    )
    labels = assignment["assignment"]
    if method == "DBSCAN":
        # Shift the labels by one and drop the samples that end up at 0
        # (presumably DBSCAN's noise label -1 -- confirm).
        labels = labels + 1
        labels = labels[labels != 0]
    labels = labels.astype(int)

    data = all_features.copy()
    data["cluster"] = labels
    data = data[data["cluster"].notnull()]
    print(data.shape)
    all_clusters = sorted(data["cluster"].unique())

    # The statistics cells write one-vs-rest files ("..._cluster_<k>vsR.csv")
    # into ``f_stat_path``; the pairwise "<i>vs<j>" names read here before do
    # not match anything those cells produce.
    for cluster_id in all_clusters:
        cluster_id = int(cluster_id)
        data_ovr = data.copy()
        # Same relabelling as in the statistics cells: everything that is not
        # ``cluster_id`` becomes the "rest" group, labelled ``cluster_id + 1``.
        data_ovr.loc[data["cluster"] != cluster_id, "cluster"] = cluster_id + 1

        fstat_name = "{}_{}_KCC_{}_F_stat_cluster_{}vsR.csv".format(
            method, view, KCC_space, cluster_id
        )
        F_stat = pd.read_csv("{}/{}".format(f_stat_path, fstat_name), index_col=0)

        fisher_name = "{}_{}_KCC_{}_FisherExact_cluster_{}vsR.csv".format(
            method, view, KCC_space, cluster_id
        )
        Fisher = pd.read_csv("{}/{}".format(f_stat_path, fisher_name), index_col=0)
        Fisher.columns = ["p-value"]
        # Binary variables have no F statistic; NaN sorts them after the
        # continuous ones below.
        Fisher["F_stat"] = np.nan

        # Significance filters: 0.05 on the permutation p-values, and
        # 0.05 / total_vars on the Fisher p-values.
        F_stat = F_stat[F_stat["p-value"] < 0.05]
        Fisher = Fisher[Fisher["p-value"] < 0.05 / total_vars]

        # Named ``top_stats`` (not ``stats``) to avoid shadowing the usual
        # scipy.stats alias. The second sort is stable so that rows with the
        # same or missing F_stat keep their p-value order.
        top_stats = (
            pd.concat([F_stat, Fisher])
            .sort_values("p-value")
            .sort_values("F_stat", ascending=False, kind="mergesort")
            .iloc[:10]
        )
        fig_name = "{}_{}_KCC_{}_TopFeatures_cluster_{}vsR".format(
            method, view, KCC_space, cluster_id
        )
        plot_top(data_ovr, top_stats.index.tolist(), top_feature_plot_path, fig_name)