In [None]:
import pandas as pd
import numpy as np
import pathlib as pl

from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from statannotations.Annotator import Annotator

from scipy.stats import mannwhitneyu, fisher_exact, pearsonr, kruskal

In [None]:
import sys
sys.path.append("../../FinalCode/")
import download.download as dwnl
import utils.plotting as plting
import adVMP.adVMP_discovery as discov
import adVMP.adVMP_plots as advmpplt

In [None]:
# For figures
# Seaborn "muted" palette; later cells pick entries from it by index for each group.
colors = sns.color_palette("muted")
# Root directory for saved figures (placeholder path — set before running).
fig_dir = pl.Path("/add/path/here/")

In [None]:
# Bare expression: shows the palette as this cell's output.
colors

# Download adVMP

In [None]:
# Root directory holding the "adVMP" and "variable_probes" CSV folders read below
# (placeholder path — set before running).
data_dir = pl.Path("/add/path/here/")

In [None]:
# get aDVMCs
# The first CSV column is used as the index; the remaining values are flattened to a 1-D array.
union_cpgs = pd.read_csv(data_dir / "adVMP" / "union_cpgs.csv", index_col=0).values.ravel()

In [None]:
# get the most variable probes present in healthy tissue
# Same layout as union_cpgs: index in the first column, values flattened to a 1-D array.
# Passed later as `background_cpgs` to cprdn.get_comparison_rdn_val.
background_cpgs = pd.read_csv(data_dir / "variable_probes" / "union_cpgs_5_pct_most_variable_onlyhealthy.csv",index_col=0).values.ravel()

# Download GSE199057

In [None]:
# Load the external GSE199057 cohort from local pickles and align it with its metadata.
# NOTE(review): pickle files execute code on load — only read files from a trusted source.
data_dir_GSE199057 = pl.Path("/add/path/here/")

# Methylation matrix: transpose the stored table, then drop every column containing a NaN.
test_ext_data = (
    pd.read_pickle(data_dir_GSE199057 / "GPL21145" / "beta_values.pkl")
    .T
    .dropna(axis=1)
)

# Clinical annotations, indexed by sample identifier.
test_ext_clin = pd.read_pickle(
    data_dir_GSE199057 / "GPL21145" / "GSE199057_GPL21145_meta_data.pkl"
).set_index("Sample_ID")

# Encode the tissue type as an integer label (0, 1, 2) used as phenotype downstream.
test_ext_clin["Ad_risk"] = test_ext_clin["tissue type"].replace(
    {
        "normal colon tissue from non-CRC patient": 0,
        "normal colon tissue from CRC patient": 1,
        "colon tumor sample from CRC patient": 2,
    }
)

# Put the methylation rows in the same order as the clinical table.
test_ext_data = test_ext_data.loc[test_ext_clin.index]

In [None]:
# Build the per-sample table for GSE199057 from the adVMP CpG set
# (`discov.get_heatmap_df` is a project helper; its internals are not visible here).
heatmap_df1, hit_fraction1 = discov.get_heatmap_df(
    selcpgs=union_cpgs,
    EPIC_m=test_ext_data,
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same 1-D array.
    phenotypes=test_ext_clin["Ad_risk"].to_numpy(),
    bal=True,
)

# Readable group labels used as plot hues in later cells.
heatmap_df1["Ad_plot"] = heatmap_df1["Ad"].replace({0: "Healthy", 1: "NAC", 2: "Cancer"})

In [None]:
# Performance plots for GSE199057 (project helper `advmpplt.get_performance_plots`).
# The output folder was the scratch name "testtodelete"; it now matches the per-cohort
# folder used by every other GSE199057 output in this notebook.
advmpplt.get_performance_plots(
    heatmap_df=heatmap_df1,
    fig_dir=fig_dir / "GSE199057",
    hue_worm="Ad_plot",
    hue_palette_worm={"Healthy": colors[0], "NAC": colors[1], "Cancer": colors[5]},
    ftsize=15,
    leg_ftsize=15,
    figsize=5,
    rocauc=False,
    leg_title="Tissue type",
    hue_order=["Healthy", "NAC", "Cancer"],
    order="Mixed Order",
)

In [None]:
# Age per tissue group in GSE199057, annotated with pairwise Mann-Whitney tests
# (no multiple-comparison correction is applied).
plot_df = (
    test_ext_clin[["age", "Ad_risk"]]
    .astype(float)
    .assign(Ad_risk=lambda d: d["Ad_risk"].replace({0: "Healthy", 1: "NAC", 2: "Cancer"}))
)

fig, ax = plt.subplots(1, 1)
sns.boxplot(
    data=plot_df,
    x="Ad_risk",
    y="age",
    order=["Healthy", "NAC", "Cancer"],
    ax=ax,
)

# The Annotator receives the same data/x/y/order as the boxplot it annotates.
annot = Annotator(
    ax,
    pairs=[("Healthy", "NAC"), ("Healthy", "Cancer"), ("NAC", "Cancer")],
    data=plot_df,
    x="Ad_risk",
    y="age",
    order=["Healthy", "NAC", "Cancer"],
)
annot.configure(
    test="Mann-Whitney",
    loc="inside",
    text_format="simple",
    show_test_name=False,
    verbose=2,
    comparisons_correction=None,
)
annot.apply_test()
ax, _ = annot.annotate()

plting.transform_plot_ax(ax, legend_title="", linew=2.5)

# Download GSE132804

In [None]:
# Load the GSE132804 cohort from local pickles (placeholder path — set before running).
# NOTE(review): pickle files execute code on load — only read files from a trusted source.
data_dir_GSE132804 = pl.Path("/add/path/here/")

# Methylation matrix: transpose, then drop every column containing a NaN.
data1 = (
    pd.read_pickle(data_dir_GSE132804 / "GPL21145" / "beta_values.pkl")
    .T
    .dropna(axis=1)
)

# Clinical annotations indexed by sample identifier, with the CRC risk level encoded as 0/1/2.
ext_clin1 = pd.read_pickle(
    data_dir_GSE132804 / "GPL21145" / "GSE132804_GPL21145_meta_data.pkl"
).set_index("Sample_ID")
ext_clin1["Ad_risk"] = ext_clin1["crc risk"].replace({"High": 2, "Medium": 1, "Low": 0})

# `red_clin` is an alias of the same object (no copy, no filtering).
red_clin = ext_clin1

# Put the methylation rows in the same order as the clinical table.
data1 = data1.loc[ext_clin1.index]

In [None]:
# Build the per-sample table for GSE132804 from the adVMP CpG set
# (`discov.get_heatmap_df` is a project helper; its internals are not visible here).
heatmap_df2, hit_fraction2 = discov.get_heatmap_df(
    selcpgs=union_cpgs,
    EPIC_m=data1,
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same 1-D array.
    phenotypes=red_clin["Ad_risk"].to_numpy(),
    bal=True,
)

# Readable group labels used as plot hues in later cells.
heatmap_df2["Ad_plot"] = heatmap_df2["Ad"].replace({0: "Healthy", 1: "NAA", 2: "NAC"})

In [None]:
# Performance plots for GSE132804 (project helper), written under fig_dir / "GSE132804".
advmpplt.get_performance_plots(
    heatmap_df=heatmap_df2,
    fig_dir=fig_dir / "GSE132804",
    hue_worm="Ad_plot",
    hue_palette_worm={
        "Healthy": colors[0],
        "NAA": colors[3],
        "NAC": colors[1],
    },
    ftsize=15,
    leg_ftsize=15,
    figsize=5,
    rocauc=False,
    leg_title="Tissue type",
    hue_order=["Healthy", "NAA", "NAC"],
    order="Mixed Order",
)

In [None]:
# Age per risk group in GSE132804, annotated with pairwise Mann-Whitney tests
# (no multiple-comparison correction is applied).
plot_df = (
    ext_clin1[["age", "Ad_risk"]]
    .astype(float)
    .assign(Ad_risk=lambda d: d["Ad_risk"].replace({0: "Healthy", 1: "NAA", 2: "NAC"}))
)

fig, ax = plt.subplots(1, 1)
sns.boxplot(
    data=plot_df,
    x="Ad_risk",
    y="age",
    order=["Healthy", "NAA", "NAC"],
    ax=ax,
)

# The Annotator receives the same data/x/y/order as the boxplot it annotates.
annot = Annotator(
    ax,
    pairs=[("Healthy", "NAA"), ("Healthy", "NAC"), ("NAC", "NAA")],
    data=plot_df,
    x="Ad_risk",
    y="age",
    order=["Healthy", "NAA", "NAC"],
)
annot.configure(
    test="Mann-Whitney",
    loc="inside",
    text_format="simple",
    show_test_name=False,
    verbose=2,
    comparisons_correction=None,
)
annot.apply_test()
ax, _ = annot.annotate()

plting.transform_plot_ax(ax, legend_title="", linew=2.5)

# Download GSE48684

In [None]:
# Load the GSE48684 cohort (placeholder path — set before running).
# NOTE(review): pickle files execute code on load — only read files from a trusted source.
data_dir_GSE48684 = pl.Path("/add/path/here/")

# Methylation table: use the "Unnamed: 0" column as index, then drop columns containing a NaN.
data = (
    pd.read_pickle(data_dir_GSE48684 / "methylation.pkl")
    .set_index("Unnamed: 0")
    .dropna(axis=1)
)

ext_clin = pd.read_csv(data_dir_GSE48684 / "metadata.csv", index_col=0)

# Harmonise the free-text region annotation (including the "Retum" misspelling).
ext_clin["Location"] = ext_clin["region"].replace(
    {
        'colon': "Unknown",
        "Retum": "Rectum",
        "Distal": "Left",
        "Proximal": "Right",
        "right": "Right",
        "left": "Left",
    }
)

# Encode the disease status as an integer label (0..3) used as phenotype downstream.
ext_clin["Ad_risk"] = ext_clin["disease"].replace(
    {"cancer": 3, "adenoma": 2, "normal-C": 1, "normal-H": 0}
)

# Put the methylation rows in the same order as the clinical table.
data = data.loc[ext_clin.index]

In [None]:
# Build the per-sample table for GSE48684 from the adVMP CpG set
# (`discov.get_heatmap_df` is a project helper; its internals are not visible here).
heatmap_df3, hit_fraction3 = discov.get_heatmap_df(
    selcpgs=union_cpgs,
    EPIC_m=data,
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same 1-D array.
    phenotypes=ext_clin["Ad_risk"].to_numpy(),
    bal=True,
)

# Readable group labels used as plot hues in later cells.
heatmap_df3["Ad_plot"] = heatmap_df3["Ad"].replace({0: "Healthy", 1: "NAC", 2: "Adenoma", 3: "Cancer"})

In [None]:
# Performance plots for GSE48684 (project helper), written under fig_dir / "GSE48684".
advmpplt.get_performance_plots(
    heatmap_df=heatmap_df3,
    fig_dir=fig_dir / "GSE48684",
    hue_worm="Ad_plot",
    hue_palette_worm={
        "Healthy": colors[0],
        "NAC": colors[1],
        "Adenoma": colors[7],
        "Cancer": colors[5],
    },
    ftsize=15,
    leg_ftsize=15,
    figsize=5,
    rocauc=False,
    leg_title="Tissue type",
    hue_order=["Healthy", "NAC", "Adenoma", "Cancer"],
    order="Mixed Order",
)

# Hit fraction

In [None]:
def _cohort_hit_table(heatmap_df, batch, keep_ad, relabel_ad=None):
    """Build the per-cohort table used for the hit-fraction and ROC plots.

    Parameters
    ----------
    heatmap_df : pd.DataFrame
        Table with at least the columns "Hit fraction", "Ad_plot", "Ad" and
        "Mean meth score".
    batch : str
        Cohort identifier written to the "Batch" column.
    keep_ad : list
        Values of "Ad" to keep; all other rows are dropped.
    relabel_ad : dict, optional
        Mapping applied to "Ad" after filtering (e.g. ``{2: 1}`` to obtain a
        binary 0/1 indicator).

    Returns
    -------
    pd.DataFrame
        Filtered copy sorted by ("Hit fraction", "Mean meth score"), with
        "Order" (row position before sorting) and "Mixed Order" (row position
        after sorting) added.
    """
    cohort = heatmap_df[heatmap_df["Ad"].isin(keep_ad)].copy()
    if relabel_ad is not None:
        cohort["Ad"] = cohort["Ad"].replace(relabel_ad)
    # .assign returns a new frame, so the assignments below never write into a slice.
    cohort = cohort[["Hit fraction", "Ad_plot", "Ad", "Mean meth score"]].assign(Batch=batch)
    cohort["Order"] = np.arange(cohort.shape[0])
    cohort = cohort.sort_values(by=["Hit fraction", "Mean meth score"])
    cohort["Mixed Order"] = np.arange(cohort.shape[0])
    return cohort


# Two groups per cohort. In GSE132804 the kept groups are coded 0 and 2,
# so 2 is relabelled to 1 to get a binary indicator.
df1 = _cohort_hit_table(heatmap_df1, "GSE199057", keep_ad=[0, 1])
df2 = _cohort_hit_table(heatmap_df2, "GSE132804", keep_ad=[0, 2], relabel_ad={2: 1})
df3 = _cohort_hit_table(heatmap_df3, "GSE48684", keep_ad=[0, 1])

# Group sizes, used for the x tick labels of the boxplot.
vc1 = df1.Ad_plot.value_counts()
vc2 = df2.Ad_plot.value_counts()
vc3 = df3.Ad_plot.value_counts()

dfs = [df1, df2, df3]

In [None]:
# Stack the per-cohort tables row-wise; the "Batch" column identifies the cohort.
df = pd.concat(dfs)

In [None]:
# Hit fraction per cohort, Healthy vs NAC, with a Mann-Whitney test inside each cohort.
# The tick labels below are positional, so the cohort order is passed explicitly to both
# the boxplot and the Annotator instead of relying on the row order of `df`.
batch_order = ["GSE199057", "GSE132804", "GSE48684"]

fig, ax = plt.subplots(1, 1, figsize=(8, 4))
sns.boxplot(
    data=df,
    x="Batch",
    y="Hit fraction",
    hue="Ad_plot",
    order=batch_order,
    hue_order=["Healthy", "NAC"],
    palette={"Healthy": colors[0], "NAC": colors[1]},
    ax=ax,  # draw on the axes created above rather than the implicit current axes
)
annot = Annotator(
    ax,
    pairs=[((batch, "Healthy"), (batch, "NAC")) for batch in batch_order],
    data=df,
    x="Batch",
    y="Hit fraction",
    hue="Ad_plot",
    order=batch_order,
    hue_order=["Healthy", "NAC"],
)
annot.configure(
    test="Mann-Whitney",
    loc="inside",
    show_test_name=False,
    verbose=2,
    text_format="simple",
    comparisons_correction=None,
    correction_format="replace",
)
annot.apply_test()
ax, _ = annot.annotate()
plting.transform_plot_ax(ax, legend_title="Tissue type", linew=2.5)

# One label per cohort: name plus group sizes, in the same order as `batch_order`.
ax.set_xticklabels(
    [
        f"{batch}\n" + "$N_{Healthy}$=" + f"{counts.loc['Healthy']}\n" + "$N_{NAC}$=" + f"{counts.loc['NAC']}"
        for batch, counts in zip(batch_order, [vc1, vc2, vc3])
    ],
    size=12,
)
ax.set_xlabel("")
fig.savefig(fig_dir / "validationCohorts_hit_fraction_dist.svg", bbox_inches="tight")

In [None]:
# One panel per cohort: hit fraction against the sample rank stored in "Mixed Order".
fig, ax = plt.subplots(3, 1, figsize=(16,10))
flatax = ax.flatten()
gsenames = ["GSE199057","GSE132804","GSE48684"]

for i, panel in enumerate(flatax):
    # dfs[i] and gsenames[i] refer to the same cohort.
    sns.scatterplot(
        data=dfs[i],
        x="Mixed Order",
        y="Hit fraction",
        hue="Ad_plot",
        s=50,
        palette={"Healthy": colors[0], "NAC": colors[1]},
        ax=panel,
    )
    plting.transform_plot_ax(panel, legend_title="Tissue", ftsize=20, leg_ftsize=20)
    panel.set_xlabel(gsenames[i], fontsize=20)

fig.tight_layout()
fig.savefig(fig_dir / "worm_plot_full_validationCohorts.svg", bbox_inches="tight")

In [None]:
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay

In [None]:
# ROC curves for the three validation cohorts: the binary "Ad" column is the label
# and the sample rank in "Mixed Order" is the score.
fig, ax = plt.subplots(1, 1, figsize=(7, 5))
for roc_df, roc_color, roc_name in [
    (df1, colors[0], 'GSE199057'),
    (df2, colors[1], 'GSE132804'),
    (df3, colors[5], 'GSE48684'),
]:
    RocCurveDisplay.from_predictions(
        # Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same 1-D array.
        roc_df["Ad"].astype(int).to_numpy(),
        roc_df["Mixed Order"].to_numpy(),
        ax=ax,
        c=roc_color,
        name=roc_name,
    )
# Diagonal reference line, drawn on `ax` explicitly rather than on the implicit current axes.
ax.plot(np.linspace(0, 1, 100), np.linspace(0, 1, 100), c=colors[3])
plting.transform_plot_ax(ax, legend_title="", ftsize=17, leg_ftsize=17, linew=3)
fig.savefig(fig_dir / "validationCohorts_ROC_AUC_curve_crossval.svg", bbox_inches="tight")

In [None]:
import adVMP.comparison_random as cprdn

In [None]:
# Value passed as `ref` to the random-CpG comparison.
# NOTE(review): presumably the ROC AUC obtained with the adVMP set on this cohort —
# confirm, and ideally compute it instead of hard-coding it.
gse199057_ref = 0.82

# `background_cpgs` is already loaded near the top of the notebook from the same CSV,
# so it is not re-read here.
cprdn.get_comparison_rdn_val(
    background_cpgs=background_cpgs,
    figdir=fig_dir / "GSE199057",
    ref=gse199057_ref,
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same 1-D array.
    phenotypes=test_ext_clin["Ad_risk"].to_numpy(),
    union_cpgs=union_cpgs,
    data=test_ext_data,
    clin=test_ext_clin,
    n_iter=200,
    nadj=True,
    hit_limit=4,
    risk_col="Ad_risk",
    exclude_one=True,
    age_col="age",
    order="Mixed Order",
)


In [None]:
# Value passed as `ref` to the random-CpG comparison.
# NOTE(review): presumably the ROC AUC obtained with the adVMP set on this cohort —
# confirm, and ideally compute it instead of hard-coding it.
gse132804_ref = 0.88

# `background_cpgs` is already loaded near the top of the notebook from the same CSV,
# so it is not re-read here.
# NOTE(review): this is the only cohort called with nadj=False — confirm this is intended.
cprdn.get_comparison_rdn_val(
    background_cpgs=background_cpgs,
    figdir=fig_dir / "GSE132804",
    ref=gse132804_ref,
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same 1-D array.
    phenotypes=red_clin["Ad_risk"].to_numpy(),
    union_cpgs=union_cpgs,
    data=data1,
    clin=red_clin,
    n_iter=200,
    nadj=False,
    hit_limit=4,
    risk_col="Ad_risk",
    exclude_one=True,
    age_col="age",
    order="Mixed Order",
)


In [None]:
# Value passed as `ref` to the random-CpG comparison.
# NOTE(review): presumably the ROC AUC obtained with the adVMP set on this cohort —
# confirm, and ideally compute it instead of hard-coding it.
gse48684_ref = 0.82

# `background_cpgs` is already loaded near the top of the notebook from the same CSV,
# so it is not re-read here.
cprdn.get_comparison_rdn_val(
    background_cpgs=background_cpgs,
    figdir=fig_dir / "GSE48684",
    ref=gse48684_ref,
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same 1-D array.
    phenotypes=ext_clin["Ad_risk"].to_numpy(),
    union_cpgs=union_cpgs,
    data=data,
    clin=ext_clin,
    n_iter=200,
    nadj=True,
    hit_limit=4,
    risk_col="Ad_risk",
    exclude_one=True,
    age_col=None,  # no age column is passed for this cohort
    order="Mixed Order",
)


# How about vs neoplastic tissue?

In [None]:
# Rebuild the per-cohort tables with every tissue group kept (no filtering on "Ad").
# This rebinds df1 / df3 / vc1 / vc3 / dfs from the earlier Healthy-vs-NAC section.
df1 = (
    heatmap_df1[["Hit fraction", "Ad_plot", "Ad", "Mean meth score"]]
    .assign(Batch="GSE199057", Order=lambda d: np.arange(d.shape[0]))
    .sort_values(by=["Hit fraction", "Mean meth score"])
    # "Mixed Order" is the row position after sorting.
    .assign(**{"Mixed Order": lambda d: np.arange(d.shape[0])})
)
vc1 = df1["Ad_plot"].value_counts()

df3 = (
    heatmap_df3[["Hit fraction", "Ad_plot", "Ad", "Mean meth score"]]
    .assign(Batch="GSE48684", Order=lambda d: np.arange(d.shape[0]))
    .sort_values(by=["Hit fraction", "Mean meth score"])
    .assign(**{"Mixed Order": lambda d: np.arange(d.shape[0])})
)
vc3 = df3["Ad_plot"].value_counts()

dfs = [df1, df3]

In [None]:
# Stack the two cohort tables row-wise; the "Batch" column identifies the cohort.
df = pd.concat(dfs)

In [None]:
# Hit fraction per cohort across all tissue groups, with pairwise Mann-Whitney tests
# (no multiple-comparison correction is applied).
# The tick labels below are positional, so the cohort order is passed explicitly to both
# the boxplot and the Annotator instead of relying on the row order of `df`.
batch_order = ["GSE199057", "GSE48684"]
tissue_order = ["Healthy", "NAC", "Adenoma", "Cancer"]

fig, ax = plt.subplots(1, 1, figsize=(8, 4))
sns.boxplot(
    data=df,
    x="Batch",
    y="Hit fraction",
    hue="Ad_plot",
    order=batch_order,
    hue_order=tissue_order,
    palette={
        "Healthy": colors[0],
        "NAC": colors[1],
        "Adenoma": colors[7],
        "Cancer": colors[5],
    },
    ax=ax,  # draw on the axes created above rather than the implicit current axes
)
annot = Annotator(
    ax,
    pairs=[
        # No Adenoma pairs are tested for GSE199057.
        (("GSE199057", "Healthy"), ("GSE199057", "NAC")),
        (("GSE199057", "NAC"), ("GSE199057", "Cancer")),
        (("GSE199057", "Healthy"), ("GSE199057", "Cancer")),
        (("GSE48684", "Healthy"), ("GSE48684", "NAC")),
        (("GSE48684", "Healthy"), ("GSE48684", "Adenoma")),
        (("GSE48684", "Healthy"), ("GSE48684", "Cancer")),
        (("GSE48684", "NAC"), ("GSE48684", "Adenoma")),
        (("GSE48684", "NAC"), ("GSE48684", "Cancer")),
        (("GSE48684", "Adenoma"), ("GSE48684", "Cancer")),
    ],
    data=df,
    x="Batch",
    y="Hit fraction",
    hue="Ad_plot",
    order=batch_order,
    hue_order=tissue_order,
)
annot.configure(
    test="Mann-Whitney",
    loc="inside",
    show_test_name=False,
    text_format="simple",
    verbose=2,
    comparisons_correction=None,
    correction_format="replace",
)
annot.apply_test()
ax, _ = annot.annotate()
plting.transform_plot_ax(ax, legend_title="Tissue type", linew=2.5)

# Cohort name plus group sizes, in the same order as `batch_order`.
ax.set_xticklabels(
    [
        "GSE199057\n$N_{Healthy}$=" + f"{vc1.loc['Healthy']}\n"
        + "$N_{NAC}$=" + f"{vc1.loc['NAC']}\n"
        + "$N_{Cancer}$=" + f"{vc1.loc['Cancer']}",
        "GSE48684\n$N_{Healthy}$=" + f"{vc3.loc['Healthy']}\n"
        + "$N_{NAC}$=" + f"{vc3.loc['NAC']}\n"
        + "$N_{Adenoma}$=" + f"{vc3.loc['Adenoma']}\n"
        + "$N_{Cancer}$=" + f"{vc3.loc['Cancer']}",
    ],
    size=12,
)
fig.savefig(fig_dir / "validationCohorts_WNeoplastic_hit_fraction_dist.svg", bbox_inches="tight")

In [None]:
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay

In [None]:
# compute the ROC AUC necessitates a binary indicator
# we thus transform the indicator as being neoplastic or not
red_df1 = df1[df1["Ad"].isin([0,2])]
red_df1["Ad"] = red_df1["Ad"].replace({2: 1})

red_df3_ad = df3[df3["Ad"].isin([0,2])]
red_df3_ad["Ad"] = red_df3_ad["Ad"].replace({2: 1})

red_df3_cr = df3[df3["Ad"].isin([0,3])]
red_df3_cr["Ad"] = red_df3_cr["Ad"].replace({3: 1})

In [None]:
# ROC curves, Healthy vs neoplastic tissue: the binary "Ad" column is the label
# and the sample rank in "Mixed Order" is the score.
fig, ax = plt.subplots(1, 1, figsize=(7, 5))
for roc_df, roc_color, roc_name in [
    (red_df1, colors[0], 'GSE199057'),
    (red_df3_ad, colors[1], 'GSE48684\n(Adenoma)'),
    (red_df3_cr, colors[5], 'GSE48684\n(Cancer)'),
]:
    RocCurveDisplay.from_predictions(
        # Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same 1-D array.
        roc_df["Ad"].astype(int).to_numpy(),
        roc_df["Mixed Order"].to_numpy(),
        ax=ax,
        c=roc_color,
        name=roc_name,
    )
# Diagonal reference line, drawn on `ax` explicitly rather than on the implicit current axes.
ax.plot(np.linspace(0, 1, 100), np.linspace(0, 1, 100), c=colors[3])
plting.transform_plot_ax(ax, legend_title="", ftsize=17, leg_ftsize=17, linew=3)
fig.savefig(fig_dir / "validationCohorts_WNeoplastic_ROC_AUC_curve_crossval.svg", bbox_inches="tight")

# Adenoma on the left

In [None]:
# GSE132804 table with all three groups kept (Healthy / NAA / NAC).
# This rebinds df2 / vc2 from the earlier Healthy-vs-NAC section.
df2 = (
    heatmap_df2[["Hit fraction", "Ad_plot", "Ad", "Mean meth score"]]
    .assign(Batch="GSE132804")
)
# Row position before sorting.
df2["Order"] = np.arange(df2.shape[0])
df2 = df2.sort_values(by=["Hit fraction", "Mean meth score"])
# Row position after sorting; used later as the ROC score.
df2["Mixed Order"] = np.arange(df2.shape[0])
vc2 = df2["Ad_plot"].value_counts()

In [None]:
# Hit fraction in GSE132804 for Healthy / NAA / NAC, with pairwise Mann-Whitney tests.
# Unlike the other boxplots in this notebook, p-values here are Benjamini-Hochberg corrected.
fig, ax = plt.subplots(1, 1, figsize=(8, 4))
sns.boxplot(
    data=df2,
    x="Batch",
    y="Hit fraction",
    hue="Ad_plot",
    palette={
        "Healthy": colors[0],
        "NAA": colors[3],
        "NAC": colors[1],
    },
    hue_order=["Healthy", "NAA", 'NAC'],
)
annot = Annotator(
    ax,
    pairs=[
        (("GSE132804", "Healthy"), ("GSE132804", "NAA")),
        (("GSE132804", "Healthy"), ("GSE132804", "NAC")),
        (("GSE132804", "NAA"), ("GSE132804", "NAC")),
    ],
    data=df2,
    x="Batch",
    y="Hit fraction",
    hue="Ad_plot",
    hue_order=["Healthy", "NAA", "NAC"],
)
annot.configure(
    test="Mann-Whitney",
    loc="inside",
    show_test_name=False,
    text_format="simple",
    verbose=2,
    comparisons_correction="BH",
    correction_format="replace",
)
annot.apply_test()
ax, _ = annot.annotate()
plting.transform_plot_ax(ax, legend_title="Tissue type", linew=2.5)

# Single tick: cohort name plus the three group sizes.
ax.set_xticklabels(
    [
        "GSE132804\n$N_{Healthy}$=" + f"{vc2.loc['Healthy']}\n"
        + "$N_{NAA}$=" + f"{vc2.loc['NAA']}\n"
        + "$N_{NAC}$=" + f"{vc2.loc['NAC']}",
    ],
    size=12,
)
fig.savefig(fig_dir / "validationCohorts_NACA_hit_fraction_dist.svg", bbox_inches="tight")

In [None]:
# Binary subsets of GSE132804 for the ROC curves (Healthy is Ad == 0).
# Healthy vs NAA: "Ad" is already 0/1, so no relabelling is needed.
red_df2_ad = df2[df2["Ad"].isin([0, 1])]

# Healthy vs NAC: relabel 2 -> 1. .copy() makes the subset an independent frame,
# so the assignment does not raise SettingWithCopyWarning.
red_df2_cr = df2[df2["Ad"].isin([0, 2])].copy()
red_df2_cr["Ad"] = red_df2_cr["Ad"].replace({2: 1})

In [None]:
# ROC curves for GSE132804, Healthy vs NAA and Healthy vs NAC: the binary "Ad" column
# is the label and the sample rank in "Mixed Order" is the score.
fig, ax = plt.subplots(1, 1, figsize=(7, 5))
for roc_df, roc_color, roc_name in [
    (red_df2_ad, colors[1], 'GSE132804\n(NAA)'),
    (red_df2_cr, colors[5], 'GSE132804\n(NAC)'),
]:
    RocCurveDisplay.from_predictions(
        # Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same 1-D array.
        roc_df["Ad"].astype(int).to_numpy(),
        roc_df["Mixed Order"].to_numpy(),
        ax=ax,
        c=roc_color,
        name=roc_name,
    )
# Diagonal reference line, drawn on `ax` explicitly rather than on the implicit current axes.
ax.plot(np.linspace(0, 1, 100), np.linspace(0, 1, 100), c=colors[3])
plting.transform_plot_ax(ax, legend_title="", ftsize=17, leg_ftsize=17, linew=3)
fig.savefig(fig_dir / "validationCohorts_NACA_ROC_AUC_curve_crossval.svg", bbox_inches="tight")

# Manhattan distances for stochastic to coordinated regime

In [None]:
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.utils import shuffle
from scipy.stats import ks_2samp

def manhattan_distance_plot(heatmap_df: pd.DataFrame, 
                            hit_limit: int=4, 
                            ad_col: str="Ad_plot", 
                            n_iter: int=100,
                            random_state=None) -> "plt.Figure":
    """Compare within-group Manhattan distances of hit profiles to a permutation null.

    Columns whose name starts with "cg" or "chr" are binarised (1 where the
    absolute value exceeds ``hit_limit``). For every group in ``ad_col`` the
    Manhattan distance between each pair of samples of that group is computed.
    The null is built by independently shuffling each sample's binary profile
    (so its number of hits is unchanged) and recomputing the same distances,
    ``n_iter`` times. For each group the Kolmogorov-Smirnov test between null
    and observed distances and both means are printed, and the distributions
    are drawn as KDE curves (dashed = null, solid = observed).

    Parameters
    ----------
    heatmap_df : pd.DataFrame
        One row per sample; probe columns plus the grouping column ``ad_col``.
    hit_limit : int
        Threshold on the absolute probe value above which a probe is a hit.
    ad_col : str
        Column holding the group label of each sample.
    n_iter : int
        Number of random permutations (must be >= 1).
    random_state : int, numpy.random.Generator or None
        Seed or generator for the permutations; ``None`` (default) gives a
        different null on every call, as before.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the KDE plot.

    Raises
    ------
    ValueError
        If ``n_iter`` is smaller than 1.
    """
    if n_iter < 1:
        raise ValueError("n_iter must be at least 1")

    def _pairwise_within(dist, idx):
        # Upper triangle (diagonal excluded) of the sub-matrix restricted to `idx`,
        # i.e. each unordered pair of samples once.
        sub = dist[np.ix_(idx, idx)]
        return sub[np.triu_indices(sub.shape[0], k=1)]

    is_probe = heatmap_df.columns.str.startswith("cg") | heatmap_df.columns.str.startswith(
        "chr"
    )

    # get the binarized matrix, 1 if the probe is a "hit"
    hits = (heatmap_df.loc[:, is_probe].abs() > hit_limit).astype(int).to_numpy()

    # Groups in order of first appearance; this order fixes colours and legend order.
    groups = heatmap_df[ad_col].unique()
    group_idx = {
        grp: np.flatnonzero((heatmap_df[ad_col] == grp).to_numpy()) for grp in groups
    }

    # Observed distances between each pair of samples of the same group.
    manh_dist = manhattan_distances(hits)
    true_manh_distances = {
        grp: _pairwise_within(manh_dist, idx) for grp, idx in group_idx.items()
    }

    # Null: shuffle each row independently, keeping each sample's total number of hits.
    rng = np.random.default_rng(random_state)
    random_manh_distances = {grp: [] for grp in groups}
    for _ in range(n_iter):
        rdn_manh_dist = manhattan_distances(rng.permuted(hits, axis=1))
        for grp, idx in group_idx.items():
            random_manh_distances[grp].append(_pairwise_within(rdn_manh_dist, idx))
    null_distances = {
        grp: np.concatenate(chunks) for grp, chunks in random_manh_distances.items()
    }

    for grp in groups:
        # compare the random and true manhattan distances with the KS test
        print(grp, ks_2samp(null_distances[grp], true_manh_distances[grp]))
        print(np.mean(null_distances[grp]), np.mean(true_manh_distances[grp]))

    # Keep the original four colours; use a generated palette only when there are more groups.
    line_colors = ['r', 'b', 'g', 'y']
    if len(groups) > len(line_colors):
        line_colors = sns.color_palette("husl", len(groups))

    fig, ax = plt.subplots(1, 1)
    # Null curves first, observed curves second, so the legend order is unchanged.
    for k, grp in enumerate(groups):
        sns.kdeplot(null_distances[grp], c=line_colors[k], linestyle="--",
                    ax=ax, label=f"Null-{grp}")
        ax.axvline(np.mean(null_distances[grp]), c=line_colors[k], linestyle="--")
    for k, grp in enumerate(groups):
        sns.kdeplot(true_manh_distances[grp], c=line_colors[k], ax=ax, label=f"{grp}")
        ax.axvline(np.mean(true_manh_distances[grp]), c=line_colors[k])
    plting.transform_plot_ax(ax=ax, legend_title="")
    ax.set_xlabel("Manhattan distance")
    return fig

In [None]:
# Manhattan-distance comparison for GSE199057 (200 permutations), saved as SVG.
# The permutations are random, so the printed statistics and the null curves vary between runs.
fig = manhattan_distance_plot(heatmap_df=heatmap_df1, hit_limit=4, ad_col="Ad_plot", n_iter=200)
fig.savefig(fig_dir / "GSE199057" / "manhattan_plot_GSE199057.svg", bbox_inches="tight")

In [None]:
# Manhattan-distance comparison for GSE48684 (200 permutations), saved as SVG.
# The permutations are random, so the printed statistics and the null curves vary between runs.
fig = manhattan_distance_plot(heatmap_df=heatmap_df3, hit_limit=4, ad_col="Ad_plot", n_iter=200)
fig.savefig(fig_dir / "GSE48684" / "manhattan_plot_GSE48684.svg", bbox_inches="tight")