In [None]:
import pandas as pd
import numpy as np
import pathlib as pl

from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from statannotations.Annotator import Annotator

from scipy.stats import mannwhitneyu, fisher_exact, pearsonr, kruskal

In [None]:
import sys
# Make the project's FinalCode/ directory importable. The path is relative,
# so it is resolved against the kernel's current working directory.
sys.path.append("../../FinalCode/")
# Project-local modules; these imports rely on the sys.path entry added above.
import download.download as dwnl
import utils.plotting as plting
import adVMP.adVMP_discovery as discov
import adVMP.adVMP_plots as advmpplt

In [None]:
# For figures: shared seaborn "muted" palette (indexed as colors[i] in the
# plotting cells below) and the directory where figures are saved.
colors = sns.color_palette("muted")
fig_dir = pl.Path("/add/path/here")  # placeholder -- set before running

In [None]:
# Placeholder locations -- point these at the local data before running.
data_dir = pl.Path("/add/path/here")
base_dir = pl.Path("/add/path/here")
base_dir4 = pl.Path("/add/path/here")

# Clinical tables (all under data_dir / "clinical").
sample_origin_path = data_dir / "clinical" / "sample_origin_wbatch.csv"
clinical_path = data_dir / "clinical" / "cleaned_clinical_reduced_diet.csv"
target_path = data_dir / "clinical" / "targets.csv"

# Probe list from the auxiliary folder, flattened to a 1-D array.
bad_probes_table = pd.read_csv(
    data_dir / "auxiliary" / "sketchy_probe_list_epic.csv", index_col=0
)
bad_probes = bad_probes_table.values.ravel()

In [None]:
# Load the two cohorts stored under base_dir (EPIC4=False returns eight objects:
# beta values, clinical table, samples and phenotypes for EPIC2 and EPIC3).
epic23_kwargs = dict(
    sample_origin_path=sample_origin_path,
    base_dir=base_dir,
    clinical_path=clinical_path,
    target_path=target_path,
    bad_probes=bad_probes,
    EPIC4=False,
)
(
    EPIC2_b, EPIC2_clin, EPIC2_samples, EPIC2_phenotypes,
    EPIC3_b, EPIC3_clin, EPIC3_samples, EPIC3_phenotypes,
) = dwnl.download_EPIC(**epic23_kwargs)

In [None]:
# Load the cohort stored under base_dir4 (EPIC4=True returns four objects).
epic4_kwargs = dict(
    sample_origin_path=sample_origin_path,
    base_dir=base_dir4,
    clinical_path=clinical_path,
    target_path=target_path,
    bad_probes=bad_probes,
    EPIC4=True,
)
(
    EPIC4_b, EPIC4_clin, EPIC4_samples, EPIC4_phenotypes,
) = dwnl.download_EPIC(**epic4_kwargs)

In [None]:
# CpGs that are columns of both EPIC4_b and EPIC2_b, saved as the background
# probe list.
shared_columns = EPIC4_b.columns.intersection(EPIC2_b.columns)
background_cpgs = shared_columns.to_numpy()
background_out = data_dir / "auxiliary" / "full_background_probes.csv"
pd.Series(background_cpgs).to_csv(background_out)

# Find adVMP

In [None]:
# Directory where the adVMP result tables are written and later re-read.
# NOTE(review): placeholder; unlike the other placeholders it has no leading
# "/", so it is a relative path -- set it explicitly before running.
resdir = pl.Path("add/path/here")

In [None]:
# Run the adVMP discovery test on each cohort and save one table per cohort
# (adVMP_SWEPIC{1,2,3}_right.csv in resdir).
#
# NOTE(review): the SWEPIC2 call previously used `selected_patients2` and
# `red_phenotype2`, which are not defined anywhere in this notebook and raise
# NameError on a fresh kernel. The full EPIC3 cohort is used here, mirroring
# SWEPIC1/SWEPIC3 and the later heatmap cells -- confirm that no patient
# subset was intended for SWEPIC2.
discovery_inputs = {
    "SWEPIC1": (EPIC2_b, EPIC2_phenotypes),
    "SWEPIC2": (EPIC3_b, EPIC3_phenotypes),
    "SWEPIC3": (EPIC4_b, EPIC4_phenotypes),
}
for cohort_name, (cohort_b, cohort_phenotypes) in discovery_inputs.items():
    test_results = discov.get_hyper_vDMC(methylation=cohort_b, phenotypes=cohort_phenotypes)
    test_results.to_csv(resdir / f"adVMP_{cohort_name}_right.csv")

# Ensembling probes

In [None]:
# Re-read the saved discovery tables, keyed by cohort number ('1', '2', '3').
test_results = {
    cohort_id: pd.read_csv(resdir / f"adVMP_SWEPIC{cohort_id}_right.csv", index_col=0)
    for cohort_id in ('1', '2', '3')
}

In [None]:
# keep only significant probes, that are sign. differentially variable,
# differentially methylated, and more variable in NAA
sign_probes = {}
for cohort_id, cohort_results in test_results.items():
    passes_q = cohort_results["q"] < 0.05
    passes_ttest = cohort_results["ttest_p"] < 0.05
    positive_diffv = cohort_results["diffV"] > 0
    sign_probes[cohort_id] = cohort_results[passes_q & passes_ttest & positive_diffv]

In [None]:
# Union of the pairwise intersections: CpGs that are significant in at least
# two of the three cohorts.
sel_cpgs = [
    sign_probes[first].index.intersection(sign_probes[second].index)
    for first, second in (("1", "2"), ("1", "3"), ("2", "3"))
]
union_cpgs = np.unique(np.concatenate(sel_cpgs))

# Only keep CpGs that are present in all datasets. Previously only EPIC4_b was
# checked; the later `.loc[:, union_cpgs]` selections on EPIC2_b and EPIC3_b
# raise KeyError for any missing column, so every cohort is checked here.
for cohort_b in (EPIC2_b, EPIC3_b, EPIC4_b):
    union_cpgs = np.intersect1d(union_cpgs, cohort_b.columns)

In [None]:
# Sorted, de-duplicated list of every probe that is significant in any cohort.
per_cohort_hits = [sign_probes[cohort_id].index for cohort_id in ("1", "2", "3")]
all_advmps = np.unique(np.concatenate(per_cohort_hits))

In [None]:
# Persist both probe lists in resdir: union_cpgs (probes shared by at least
# two cohorts) and all_advmps (probes significant in any cohort).
pd.Series(union_cpgs).to_csv(resdir / "union_cpgs.csv")
pd.Series(all_advmps).to_csv(resdir / "all_advmps.csv")

In [None]:
# library
import matplotlib.pyplot as plt
from matplotlib_venn import venn3

In [None]:
# Venn diagram of the significant probes per cohort. SWEPIC1/SWEPIC2 probes are
# first restricted to CpGs that are also columns of EPIC4_b.
set1 = set(sign_probes["1"].index.intersection(EPIC4_b.columns))
set2 = set(sign_probes["2"].index.intersection(EPIC4_b.columns))
set3 = set(sign_probes["3"].index)
venn_fig = plt.figure(figsize=(3,3))
v = venn3([set1, set2, set3], ('SWEPIC1', 'SWEPIC2', 'SWEPIC3'), 
      set_colors=(colors[6], colors[7], colors[9]), alpha=0.9)
# Highlight the regions shared by exactly two cohorts. get_patch_by_id returns
# None for a region that is not drawn (e.g. an empty overlap), so guard
# against it instead of failing with AttributeError.
# NOTE(review): region '111' (shared by all three cohorts) is also part of
# union_cpgs but is not highlighted -- confirm this is intended.
for region_id in ('110', '101', '011'):
    region_patch = v.get_patch_by_id(region_id)
    if region_patch is not None:
        region_patch.set_color("r")
plt.text(0.5,0,'adVMPs',c="r",fontsize=13)
venn_fig.savefig(fig_dir / "venn_diagram_union_probes.svg", dpi=250, bbox_inches="tight")

# Visualize probe performance

In [None]:
# Per-cohort heatmap table and hit fraction for the selected CpGs, computed in
# the order SWEPIC1, SWEPIC2, SWEPIC3.
(
    (heatmap_df1, hit_fraction1),
    (heatmap_df2, hit_fraction2),
    (heatmap_df3, hit_fraction3),
) = (
    discov.get_heatmap_df(selcpgs=union_cpgs, EPIC_m=cohort_b, phenotypes=cohort_phenotypes, bal=True)
    for cohort_b, cohort_phenotypes in (
        (EPIC2_b, EPIC2_phenotypes),
        (EPIC3_b, EPIC3_phenotypes),
        (EPIC4_b, EPIC4_phenotypes),
    )
)

In [None]:
# Performance plots for each cohort, written to fig_dir/<cohort name>.
# The three previously copy-pasted cells differed only in the heatmap table and
# the output sub-directory, so they are generated in one loop.
performance_inputs = {
    "SWEPIC1": heatmap_df1,
    "SWEPIC2": heatmap_df2,
    "SWEPIC3": heatmap_df3,
}
for cohort_name, cohort_heatmap in performance_inputs.items():
    advmpplt.get_performance_plots(
        heatmap_df=cohort_heatmap,
        fig_dir=fig_dir / cohort_name, hue_worm="Ad_plot",
        hue_palette_worm={"No": colors[0], "Yes": colors[3]}, 
        ftsize=15, leg_ftsize=15, figsize=5, order="Mixed Order",
    )

# Hit fraction

In [None]:
# Build one table per cohort with the hit-fraction columns plus a constant
# "Batch" column, and count the "Ad_plot" values in each.
hit_columns = ["Hit fraction","Ad_plot","Mixed Order"]
dfs = []
for batch_name, cohort_heatmap in (
    ("SWEPIC1", heatmap_df1),
    ("SWEPIC2", heatmap_df2),
    ("SWEPIC3", heatmap_df3),
):
    hit_table = cohort_heatmap[hit_columns]
    batch_column = pd.DataFrame([batch_name]*hit_table.shape[0],
                                index=hit_table.index,columns=["Batch"])
    dfs.append(pd.concat([hit_table,batch_column],axis=1))

df1, df2, df3 = dfs
vc1, vc2, vc3 = (batch_table.Ad_plot.value_counts() for batch_table in dfs)

In [None]:
# Stack the three per-cohort tables row-wise into one long table; it feeds the
# box plot below and the join with the deconvolution estimates.
df = pd.concat(dfs)

In [None]:
# Hit fraction per cohort, split by adenoma status, with an uncorrected
# Mann-Whitney test between "No" and "Yes" inside each cohort.
batch_counts = {"SWEPIC1": vc1, "SWEPIC2": vc2, "SWEPIC3": vc3}
box_kwargs = dict(data=df, x="Batch", y="Hit fraction", hue="Ad_plot")

fig, ax = plt.subplots(1, 1, figsize=(8,4))
sns.boxplot(ax=ax, palette={"No": colors[0], "Yes": colors[3]}, **box_kwargs)

annot = Annotator(
    ax,
    pairs=[((batch_name, "No"), (batch_name, "Yes")) for batch_name in batch_counts],
    **box_kwargs,
)
annot.configure(
    test="Mann-Whitney",
    loc="inside",
    text_format="simple",
    show_test_name=False,
    verbose=2,
    comparisons_correction=None,
    correction_format="replace",
)
annot.apply_test()
ax, _ = annot.annotate()
plting.transform_plot_ax(ax, legend_title="Adenoma (right)",linew=2.5)

# Tick labels: cohort name followed by the number of "No" and "Yes" samples.
tick_labels = [
    f"{batch_name}\n" + "$N_{No}$=" + f"{counts.loc['No']}\n" + "$N_{Yes}$=" + f"{counts.loc['Yes']}"
    for batch_name, counts in batch_counts.items()
]
ax.set_xticklabels(tick_labels, size=12)
ax.set_xlabel("")
fig.savefig(fig_dir / "hit_fraction_dist.svg", bbox_inches="tight")

In [None]:
# One scatter ("worm") panel per cohort: hit fraction against "Mixed Order",
# coloured by adenoma status.
fig, ax = plt.subplots(3, 1, figsize=(16,10))
flatax = ax.flatten()
adenoma_palette = {"No": colors[0], "Yes": colors[3]}
for panel_idx, panel in enumerate(flatax):
    sns.scatterplot(
        data=dfs[panel_idx],
        x="Mixed Order",
        y="Hit fraction",
        hue="Ad_plot",
        s=50,
        palette=adenoma_palette,
        ax=panel,
    )
    plting.transform_plot_ax(panel, legend_title="Adenoma", ftsize=20, leg_ftsize=20)
    panel.set_xlabel(f"SWEPIC{panel_idx+1}", fontsize=20)
fig.tight_layout()
fig.savefig(fig_dir / "worm_plot_full_swepic.svg", bbox_inches="tight")

In [None]:
from scipy.stats import spearmanr

In [None]:
# Spearman correlation between "Ad" and "Mixed Order" in each cohort
# (r = coefficient, p = p-value), in the order SWEPIC1, SWEPIC2, SWEPIC3.
(r1, p1), (r2, p2), (r3, p3) = (
    spearmanr(cohort_heatmap["Ad"], cohort_heatmap["Mixed Order"])
    for cohort_heatmap in (heatmap_df1, heatmap_df2, heatmap_df3)
)

In [None]:
# Display the Spearman coefficients for SWEPIC1, SWEPIC2 and SWEPIC3.
r1, r2, r3

In [None]:
# Display the matching Spearman p-values for SWEPIC1, SWEPIC2 and SWEPIC3.
p1, p2, p3

# Hit fraction by age group

In [None]:
from typing import List
def get_plot_by_age_group(EPIC_clin: pd.DataFrame, 
                          age_bins: List, age_cat_labels: List, 
                          heatmap_df: pd.DataFrame, title: str,
                          right: bool = True) -> plt.Axes:
    """Box-plot the hit fraction per age group, split by adenoma status.

    Parameters
    ----------
    EPIC_clin : pd.DataFrame
        Clinical table; its "Age at visit" column is binned with ``pd.cut``.
    age_bins : List
        Bin edges passed to ``pd.cut``.
    age_cat_labels : List
        One label per bin; also used as the x-axis categories.
    heatmap_df : pd.DataFrame
        Table with "Hit fraction" and "Ad_plot" ("No"/"Yes") columns. It is
        aligned with ``EPIC_clin`` on the index.
    title : str
        Title of the axes.
    right : bool, default True
        Forwarded to ``pd.cut``. True (the previous, implicit behaviour) makes
        every bin include its right edge; pass False for left-closed bins.
        The labels should describe whichever closure is used.

    Returns
    -------
    plt.Axes
        The axes holding the box plot with Mann-Whitney annotations.
    """
    age_cat = pd.cut(EPIC_clin["Age at visit"],
       bins=age_bins, labels=age_cat_labels, right=right)

    # Left join: only samples present in heatmap_df are kept, so clinical-only
    # rows (which would carry no hit fraction) never enter the plot.
    df = heatmap_df[["Hit fraction","Ad_plot"]].join(age_cat, how="left")

    # Sample counts per (age group, status). Missing combinations count as 0
    # instead of raising KeyError.
    observed_counts = df.value_counts(["Age at visit","Ad_plot"]).to_dict()
    group_sizes = {
        (cat, status): int(observed_counts.get((cat, status), 0))
        for cat in age_cat_labels
        for status in ("No", "Yes")
    }
    xticklabs = [
        f"{cat}\n"+"$N_{No}$="+f"{group_sizes[(cat, 'No')]}\n"+"$N_{Yes}$="+f"{group_sizes[(cat, 'Yes')]}"
        for cat in age_cat_labels
    ]

    # Only compare No vs Yes where both groups have data; the statistical test
    # cannot be run on an empty group.
    pairs = [
        ((cat,"No"),(cat,"Yes"))
        for cat in age_cat_labels
        if group_sizes[(cat, "No")] > 0 and group_sizes[(cat, "Yes")] > 0
    ]

    fig, ax = plt.subplots(1,1)
    sns.boxplot(data=df, x="Age at visit",y="Hit fraction",hue="Ad_plot",
                palette={"No": colors[0], "Yes": colors[3]},
                ax=ax)

    if pairs:
        annot = Annotator(
                ax,
                pairs=pairs,
                data=df, x="Age at visit", y="Hit fraction", hue="Ad_plot",
            )
        # Uncorrected Mann-Whitney test per age group.
        annot.configure(
                test="Mann-Whitney",
                loc="inside",
            text_format="simple",
                show_test_name=False,
                verbose=2,
                comparisons_correction=None,
                correction_format="replace",
            )
        annot.apply_test()
        ax, _ = annot.annotate()

    plting.transform_plot_ax(ax, legend_title="Adenoma")
    ax.set_xticklabels(xticklabs)
    ax.set_xlabel("")
    ax.set_title(title)
    
    return ax

In [None]:
# Age groups for the per-age-group plots. pd.cut (used with its default
# right=True in get_plot_by_age_group) builds right-closed bins, i.e.
# (0, 55], (55, 65], (65, 120]. The labels spell out that closure: a
# 55-year-old falls in the first group and a 65-year-old in the second.
# The previous labels "<55" / ">=65" mis-described both boundaries.
age_bins = [0,55,65,120]
age_cat_labels = ["<=55",">55-65",">65"]

In [None]:
# Hit fraction by age group for each cohort, saved as
# <cohort>_age_cat_hit_fraction_dist.svg in fig_dir. The three previously
# copy-pasted cells differed only in the cohort inputs and the name.
age_plot_inputs = {
    "SWEPIC1": (EPIC2_clin, heatmap_df1),
    "SWEPIC2": (EPIC3_clin, heatmap_df2),
    "SWEPIC3": (EPIC4_clin, heatmap_df3),
}
for cohort_name, (cohort_clin, cohort_heatmap) in age_plot_inputs.items():
    ax = get_plot_by_age_group(EPIC_clin=cohort_clin, 
                              age_bins=age_bins, age_cat_labels=age_cat_labels, 
                              heatmap_df=cohort_heatmap, title=cohort_name)
    ax.figure.savefig(fig_dir / f"{cohort_name}_age_cat_hit_fraction_dist.svg", bbox_inches="tight")

# Compare with polyp size

In [None]:
# Both palettes share the same three colours, one per category in order.
first_color, second_color, third_color = colors[0], colors[3], colors[1]
palette_size = {"None": first_color, "5mm": second_color, ">=6mm": third_color}
palette_nr = {"0": first_color, "1": second_color, ">=2": third_color}

In [None]:
def _none_or_right_patients(clin):
    """Return the index labels of rows whose "Polyp Location" is "None" or "Right"."""
    keep = clin["Polyp Location"].isin(["None","Right"])
    return clin[keep].index.to_numpy()


subset_1 = _none_or_right_patients(EPIC2_clin)
subset_2 = _none_or_right_patients(EPIC3_clin)
subset_3 = _none_or_right_patients(EPIC4_clin)

In [None]:
# Link each cohort's heatmap table with its polyp size / number information,
# restricted to the patient subset selected above.
heatmap_polyp1, heatmap_polyp2, heatmap_polyp3 = (
    discov.get_polyp_size_nr_link(EPIC_clin=cohort_clin, heatmap_df=cohort_heatmap, subset_pat=cohort_subset)
    for cohort_clin, cohort_heatmap, cohort_subset in (
        (EPIC2_clin, heatmap_df1, subset_1),
        (EPIC3_clin, heatmap_df2, subset_2),
        (EPIC4_clin, heatmap_df3, subset_3),
    )
)

In [None]:
# Polyp size / number plots for each cohort, written to fig_dir/<cohort name>.
# The three previously copy-pasted cells differed only in the table and the
# output sub-directory.
polyp_plot_inputs = {
    "SWEPIC1": heatmap_polyp1,
    "SWEPIC2": heatmap_polyp2,
    "SWEPIC3": heatmap_polyp3,
}
for cohort_name, cohort_polyp in polyp_plot_inputs.items():
    advmpplt.plot_polyp_size_nr_link(heatmap_df=cohort_polyp, 
                                fig_dir=fig_dir/cohort_name, 
                            palette_size=palette_size, 
                            palette_nr=palette_nr, ftsize=15, leg_ftsize=15)

In [None]:
# Columns shared by the three per-cohort tables; the cohorts are stacked
# row-wise on these columns only.
cols = [
    'Ad', 'Hit fraction', 'Ad_plot', 'Order',
    'polyps_total_nr', 'polyps_total_size',
    'polyps_right_nr', 'size_py_rght',
    'Polyp Nr Right', 'Polyp Nr Total',
    'Polyp Size cat', 'Polyp Size Total cat',
]
heatmap_polyp = pd.concat(
    [cohort_polyp[cols] for cohort_polyp in (heatmap_polyp1, heatmap_polyp2, heatmap_polyp3)]
)

In [None]:
# Same polyp size / number plot on the three cohorts pooled together
# (heatmap_polyp); output goes directly into fig_dir.
advmpplt.plot_polyp_size_nr_link(heatmap_df=heatmap_polyp, 
                            fig_dir=fig_dir, figsize=(3,5),
                        palette_size=palette_size, 
                        palette_nr=palette_nr, ftsize=15, leg_ftsize=15)

# Link with cell type composition (deconvolution)

In [None]:
deconv_path = pl.Path("/add/path/here/")


def _read_estimates(file_name):
    """Read one estimates table from deconv_path and cast its index to str."""
    estimates = pd.read_csv(deconv_path / file_name, index_col=0)
    estimates.index = estimates.index.astype(str)
    return estimates


deconv123 = _read_estimates("epic123_estimates.csv")
deconv4 = _read_estimates("epic4_estimates.csv")

# Stack both estimate tables, then put the rows matching df's index next to
# the hit-fraction table.
all_estimates = pd.concat([deconv123, deconv4])
deconv = pd.concat([df, all_estimates.loc[df.index]], axis=1)

In [None]:
# Pearson correlation between the hit fraction and each cell-type estimate.
# `pearsonr` is already imported in the first cell, so it is not re-imported
# here. `Series.ravel()` is deprecated (a Series is already 1-D), so the
# columns are converted with `to_numpy()` instead.
hit_fraction_values = deconv["Hit fraction"].to_numpy()
for ct in ["EC","Epi","Lym","Mye","Stromal"]:
    print(pearsonr(hit_fraction_values, deconv[ct].to_numpy()))


# Getting info about adVMPs

In [None]:
def _describe_by_adenoma(beta, phenotypes, cohort_label, cpgs):
    """Summarise the selected CpGs per adenoma group for one cohort.

    Selects the columns ``cpgs`` from ``beta``, appends ``phenotypes`` as an
    "Adenoma (right)" column, runs ``describe()`` per phenotype group and
    returns the transposed result with two columns named
    "(<cohort_label>) No Ad" and "(<cohort_label>) Ad".

    NOTE(review): the renaming assumes exactly two phenotype groups, with the
    no-adenoma group sorting first -- confirm against the phenotype encoding.
    """
    per_sample = beta.loc[:, cpgs]
    per_sample = pd.concat([per_sample,
                            pd.DataFrame(phenotypes,
                                         columns=["Adenoma (right)"],
                                         index=per_sample.index)],axis=1)
    summary = per_sample.groupby(by="Adenoma (right)").describe().T
    summary.columns = [f"({cohort_label}) No Ad", f"({cohort_label}) Ad"]
    return summary


# One summary table per cohort. Each name is assigned exactly once, so this
# cell can be re-run safely (the previous cells overwrote the per-sample
# tables with their own summaries and failed when run twice).
union2_desc = _describe_by_adenoma(EPIC2_b, EPIC2_phenotypes, "SWEPIC1", union_cpgs)
union3_desc = _describe_by_adenoma(EPIC3_b, EPIC3_phenotypes, "SWEPIC2", union_cpgs)
union4_desc = _describe_by_adenoma(EPIC4_b, EPIC4_phenotypes, "SWEPIC3", union_cpgs)

# Side-by-side summary for the three cohorts.
full_union_desc = pd.concat([union2_desc,
           union3_desc,
           union4_desc],axis=1)

In [None]:
# Save the combined per-cohort description table under data_dir / "adVMP".
full_union_desc.to_csv(data_dir / "adVMP" / "full_adVMP_description.csv")