In [None]:
import pandas as pd
import numpy as np
import pathlib as pl

from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from statannotations.Annotator import Annotator

from scipy.stats import mannwhitneyu, fisher_exact, pearsonr, kruskal

In [None]:
import sys
sys.path.append("../../FinalCode/")
import download.download as dwnl
import utils.plotting as plting

In [None]:
# Shared figure settings: the seaborn "muted" palette (entries are picked by
# position throughout the notebook) and the directory figures are saved to.
# The path below is a placeholder and has to be set before running.
colors = sns.color_palette("muted")
fig_dir = pl.Path("/add/path/here/")

In [None]:
def print_characteristic(EPIC_clin: pd.DataFrame,
                         EPIC_phenotypes: np.ndarray,
                         charac: str, count: bool) -> None:
    """Print summary statistics of one clinical variable, overall and per phenotype group.

    Four sections are printed, in this order: "Total", "Healthy"
    (phenotype == 0), "Adenoma pat" (phenotype == 1) and
    "pval healthy vs adenoma".

    Parameters
    ----------
    EPIC_clin : pd.DataFrame
        Clinical table; must contain the column ``charac``.
    EPIC_phenotypes : np.ndarray
        0/1 labels, positionally aligned with the rows of ``EPIC_clin``.
    charac : str
        Name of the column to summarise.
    count : bool
        If True the column is treated as a 0/1 indicator: the number of
        positives and their fraction are printed and the two groups are
        compared with Fisher's exact test. If False ``describe()`` is printed
        and the groups are compared with a Mann-Whitney U test.

    Notes
    -----
    Missing values are excluded everywhere. In count mode the fractions and
    the Fisher table are based on the non-missing entries only, so a sample
    with an unknown value is never counted as a negative.
    """
    def _summarise(values: pd.Series) -> None:
        # One summary section: count + fraction for indicators, describe() otherwise.
        if count:
            n_pos = values.sum()
            print(n_pos, n_pos / values.count())
        else:
            print(values.describe())

    # np.asarray so that a list or Series of labels yields an element-wise mask.
    labels = np.asarray(EPIC_phenotypes)
    healthy = EPIC_clin.iloc[labels == 0][charac]
    adenoma = EPIC_clin.iloc[labels == 1][charac]

    print("Total")
    _summarise(EPIC_clin[charac])
    print("Healthy")
    _summarise(healthy)
    print("Adenoma pat")
    _summarise(adenoma)

    print("pval healthy vs adenoma")
    if count:
        # 2x2 table: rows = positive / negative, columns = adenoma / healthy.
        a = adenoma.sum()
        b = healthy.sum()
        c = adenoma.count() - a
        d = healthy.count() - b
        print(fisher_exact(np.array([[a, b], [c, d]])))
    else:
        # Series.ravel() is deprecated; to_numpy() yields the same 1-D array.
        print(mannwhitneyu(healthy.dropna().to_numpy(), adenoma.dropna().to_numpy()))

In [None]:
# Input locations (placeholders, set before running): one methylation
# directory for the EPIC4=False load, one for the EPIC4=True load, and the
# directory holding the auxiliary and clinical tables.
base_dir = pl.Path("/add/path/here/")
base_dir4 = pl.Path("/add/path/here/")

data_dir = pl.Path("/add/path/here/")

# Probe list flattened into a 1-D array; the first CSV column is the index.
probe_table = pd.read_csv(data_dir.joinpath("auxiliary", "sketchy_probe_list_epic.csv"), index_col=0)
bad_probes = probe_table.values.ravel()

# Clinical tables handed to the loader.
sample_origin_path = data_dir.joinpath("clinical", "sample_origin_wbatch.csv")
clinical_path = data_dir.joinpath("clinical", "cleaned_clinical_reduced_diet.csv")
target_path = data_dir.joinpath("clinical", "targets.csv")

In [None]:
# With EPIC4=False the loader returns eight objects: matrix, clinical table,
# samples and phenotypes for EPIC2, then the same four for EPIC3.
(
    EPIC2_b, EPIC2_clin, EPIC2_samples, EPIC2_phenotypes,
    EPIC3_b, EPIC3_clin, EPIC3_samples, EPIC3_phenotypes,
) = dwnl.download_EPIC(
    sample_origin_path=sample_origin_path,
    base_dir=base_dir,
    clinical_path=clinical_path,
    target_path=target_path,
    bad_probes=bad_probes,
    EPIC4=False,
)

In [None]:
# With EPIC4=True the loader returns the four objects of the EPIC4 dataset,
# read from its own base directory.
(
    EPIC4_b, EPIC4_clin, EPIC4_samples, EPIC4_phenotypes,
) = dwnl.download_EPIC(
    sample_origin_path=sample_origin_path,
    base_dir=base_dir4,
    clinical_path=clinical_path,
    target_path=target_path,
    bad_probes=bad_probes,
    EPIC4=True,
)

In [None]:
def get_barplot_gender(df: pd.DataFrame, name: str, color) -> None:
    """Draw a single-colour bar plot of ``df`` and save it under ``name``.

    ``df`` is transposed before plotting, so each index entry of ``df``
    becomes one bar. Bars are annotated with their value rounded to an
    integer, the y axis is labelled "%", and the figure is written to
    ``fig_dir / "clinical_barplots" / name``.
    """
    figure, axis = plt.subplots(nrows=1, ncols=1, figsize=(2, 3))
    sns.barplot(data=df.T, color=color, ax=axis)
    plting.transform_plot_ax(axis, legend_title="")
    axis.set_ylabel("%", fontsize=15)

    # Write the bar heights on top of the bars.
    bars = axis.containers[0]
    axis.bar_label(bars, fmt='%.0f', fontsize=15)

    out_path = fig_dir / "clinical_barplots" / name
    axis.figure.savefig(out_path, bbox_inches="tight")

In [None]:
# Hard-coded (F %, M %) per dataset, with the output file and bar colour.
gender_panels = [
    ([100, 0], "SWEPIC1_gender.svg", colors[6]),
    ([100, 0], "SWEPIC2_gender.svg", colors[7]),
    ([0, 100], "SWEPIC3_gender.svg", colors[9]),
]

for percentages, file_name, bar_color in gender_panels:
    df = pd.DataFrame(np.array(percentages), index=["F","M"])
    get_barplot_gender(df, file_name, bar_color)

In [None]:
def get_barplot_ad_swepic(df: pd.DataFrame, name: str) -> None:
    """Draw a two-colour bar plot of ``df`` on a fixed 0-100 scale and save it.

    ``df`` is transposed before plotting, so each index entry becomes one
    bar; the first bar uses ``colors[0]`` and the second ``colors[3]``. The
    y axis and left spine are hidden and bars are annotated with their value
    rounded to an integer. The figure is written to
    ``fig_dir / "clinical_barplots" / name``.
    """
    bar_colors = [colors[0], colors[3]]

    figure, axis = plt.subplots(nrows=1, ncols=1, figsize=(2, 3))
    sns.barplot(data=df.T, palette=bar_colors, ax=axis)
    axis.set_ylim([0, 100])
    plting.transform_plot_ax(axis, legend_title="")

    # The numbers on the bars replace the y axis.
    axis.get_yaxis().set_visible(False)
    axis.spines["left"].set_visible(False)
    bars = axis.containers[0]
    axis.bar_label(bars, fmt='%.0f', fontsize=15)

    out_path = fig_dir / "clinical_barplots" / name
    axis.figure.savefig(out_path, bbox_inches="tight")

In [None]:
# Share of adenoma-negative / adenoma-positive samples per dataset, as percentages.
for phenotypes, file_name in (
    (EPIC2_phenotypes, "SWEPIC1_ad.svg"),
    (EPIC3_phenotypes, "SWEPIC2_ad.svg"),
    (EPIC4_phenotypes, "SWEPIC3_ad.svg"),
):
    n_total = len(phenotypes)
    n_ad = np.sum(phenotypes.astype(int))
    percpos = n_ad / n_total
    percneg = (n_total - n_ad) / n_total

    df = pd.DataFrame(np.array([percneg, percpos]), index=["No Ad","Ad"]) * 100
    get_barplot_ad_swepic(df=df, name=file_name)

In [None]:
# Age at visit by adenoma status, one box plot per dataset on a common 30-90 axis.
age_panels = (
    (EPIC2_clin, EPIC2_phenotypes, "SWEPIC1_age_ad_dist.svg"),
    (EPIC3_clin, EPIC3_phenotypes, "SWEPIC2_age_ad_dist.svg"),
    (EPIC4_clin, EPIC4_phenotypes, "SWEPIC3_age_ad_dist.svg"),
)

for clin, phenotypes, file_name in age_panels:
    # 0/1 phenotype labels renamed to the category names shown on the x axis.
    ad_labels = pd.DataFrame(phenotypes, index=clin.index, columns=["Ad"]).replace({0: "No Ad", 1: "Ad"})
    df = pd.concat([clin[["Age at visit"]], ad_labels], axis=1)

    fig, ax = plt.subplots(1, 1, figsize=(2, 3))
    sns.boxplot(data=df, y="Age at visit", x="Ad",
                palette={"No Ad": colors[0], "Ad": colors[3]}, ax=ax)
    ax.set_ylim([30, 90])
    plting.transform_plot_ax(ax, legend_title="")
    ax.set_xlabel("")
    ax.figure.savefig(fig_dir / "clinical_barplots" / file_name, bbox_inches="tight")


# tSNE data

In [None]:
# Stack the three methylation matrices row-wise and keep only the columns
# that have no missing value after stacking.
full_EPIC_b = pd.concat([EPIC2_b,EPIC3_b,EPIC4_b]).dropna(axis=1)

In [None]:
# Dimensions of the combined matrix.
full_EPIC_b.shape

In [None]:
# Clinical tables and phenotype vectors concatenated in the same dataset
# order (EPIC2, EPIC3, EPIC4) as the matrices above.
full_EPIC_clin = pd.concat([EPIC2_clin, EPIC3_clin, EPIC4_clin])
full_EPIC_phenotypes = np.concatenate([EPIC2_phenotypes, EPIC3_phenotypes, EPIC4_phenotypes])

In [None]:
from sklearn.manifold import TSNE

# t-SNE is stochastic: without a fixed seed the embedding, and every figure
# drawn from it, changes on each run. The seed makes the plots reproducible.
tsne = TSNE(n_components=2, random_state=0)
tsne_coords = tsne.fit_transform(full_EPIC_b)
X_tsne = pd.DataFrame(tsne_coords, index=full_EPIC_b.index, columns=["TSNE1","TSNE2"])

# Attach the clinical variables (aligned on the sample index) for colouring.
X_tsne = pd.concat([X_tsne, full_EPIC_clin],axis=1)

In [None]:
# One dataset label per sample, in the order the datasets were concatenated.
batch = np.concatenate([
    [label] * clin.shape[0]
    for label, clin in (("SWEPIC1", EPIC2_clin), ("SWEPIC2", EPIC3_clin), ("SWEPIC3", EPIC4_clin))
])
X_tsne["Batch"] = batch
X_tsne["Adenoma"] = full_EPIC_phenotypes

# Human-readable category names for the plot legends.
X_tsne["Gender"] = X_tsne["Gender"].replace({1: "F", 0: "M"})
for column in ("Adenoma", "Metabolic syndrome", "Analgesic >=2 years (overall)"):
    X_tsne[column] = X_tsne[column].replace({0: "No", 1: "Yes"})

In [None]:
# t-SNE map coloured by dataset of origin; marker shape encodes gender.
ax = sns.scatterplot(data=X_tsne,x="TSNE1",y="TSNE2",hue="Batch", style="Gender", palette={"SWEPIC1": colors[6], 
                                                                           "SWEPIC2": colors[7], 
                                                                           "SWEPIC3": colors[9]}, 
                     markers={"F": "o", "M": "v"}, edgecolor="black")
plting.tsne_plot_ax(ax=ax, legend_title="SWEPIC dataset", leg_ftsize=10, ftsize=15, linew=4)
ax.figure.savefig(fig_dir / "TSNE_EPIC_batch.png", dpi=250, bbox_inches="tight")

In [None]:
# t-SNE map coloured by age at visit on a continuous light palette.
ax = sns.scatterplot(data=X_tsne,x="TSNE1",y="TSNE2",hue="Age at visit", palette=sns.light_palette(colors[0], as_cmap=True))
plting.tsne_plot_ax(ax=ax, legend_title="Age at visit", leg_ftsize=12, ftsize=15, linew=4)
ax.figure.savefig(fig_dir / "TSNE_EPIC_age.png", dpi=250, bbox_inches="tight")

In [None]:
# t-SNE map coloured by adenoma status (same colours as the adenoma bar/box plots).
ax = sns.scatterplot(data=X_tsne,x="TSNE1",y="TSNE2",hue="Adenoma", palette = {"Yes": colors[3], "No": colors[0]})
plting.tsne_plot_ax(ax=ax, legend_title="Adenoma (right)", leg_ftsize=12, ftsize=15, linew=4)
ax.figure.savefig(fig_dir / "TSNE_EPIC_adenoma.png", dpi=250, bbox_inches="tight")

In [None]:
# t-SNE map coloured by metabolic syndrome (Yes/No).
ax = sns.scatterplot(data=X_tsne,x="TSNE1",y="TSNE2",hue="Metabolic syndrome", palette={"No": colors[7], "Yes": colors[8]})
plting.tsne_plot_ax(ax=ax, legend_title="Metabolic syndrome", leg_ftsize=12, ftsize=15, linew=4)
ax.figure.savefig(fig_dir / "TSNE_EPIC_Metabolic syndrome.png", dpi=250, bbox_inches="tight")

In [None]:
# t-SNE map coloured by the "Analgesic >=2 years (overall)" indicator (Yes/No).
ax = sns.scatterplot(data=X_tsne,x="TSNE1",y="TSNE2",hue="Analgesic >=2 years (overall)", palette={"No": colors[7], "Yes": colors[8]})
plting.tsne_plot_ax(ax=ax, legend_title="Analgesic use (over 2 years)", leg_ftsize=12, ftsize=15, linew=4)
ax.figure.savefig(fig_dir / "TSNE_EPIC_Analgesic >=2 years (overall).png", dpi=250, bbox_inches="tight")

In [None]:
# t-SNE map coloured by the continuous "inflammatory_n" column.
ax = sns.scatterplot(data=X_tsne,x="TSNE1",y="TSNE2",hue="inflammatory_n", palette=sns.light_palette(colors[0], as_cmap=True))
plting.tsne_plot_ax(ax=ax, legend_title="Inflammatory diet index", leg_ftsize=12, ftsize=15, linew=4)
ax.figure.savefig(fig_dir / "TSNE_EPIC_inflammatory_n.png", dpi=250, bbox_inches="tight")

# PCA data

In [None]:
from sklearn.decomposition import PCA

# With the default svd_solver="auto", scikit-learn may pick the randomized
# SVD for large matrices; fixing the seed keeps the components, and every
# statistic derived from them, identical between runs.
pca = PCA(n_components=50, random_state=0)
pca_scores = pca.fit_transform(full_EPIC_b)
X_pca = pd.DataFrame(pca_scores, index=full_EPIC_b.index,
                     columns=[f"PCA{i+1}" for i in range(pca_scores.shape[1])])

# Attach the clinical variables (aligned on the sample index).
X_pca = pd.concat([X_pca, full_EPIC_clin],axis=1)

In [None]:
# Scree plot of the leading principal components.
# - PCs are numbered from 1 so the axis matches the PCA1, PCA2, ... column names.
# - explained_variance_ratio_ holds fractions; scale to percent to match the label.
# - ax is passed explicitly instead of relying on the current axes.
n_pcs_shown = 10
pc_numbers = np.arange(1, n_pcs_shown + 1)

fig, ax = plt.subplots(1,1,figsize=(2,2))
sns.scatterplot(x=pc_numbers, y=100 * pca.explained_variance_ratio_[:n_pcs_shown], ax=ax)
plting.pretty_ax(ax)
ax.set_ylabel("% variance explained")
ax.set_xlabel("PC")
ax.set_xticks(pc_numbers)
fig.savefig(fig_dir / "PCA_variance_explained.png", dpi=200, bbox_inches="tight")

In [None]:
# One dataset label per sample, in the order the datasets were concatenated.
batch = np.concatenate([
    [label] * clin.shape[0]
    for label, clin in (("SWEPIC1", EPIC2_clin), ("SWEPIC2", EPIC3_clin), ("SWEPIC3", EPIC4_clin))
])

# Both columns stay numeric / raw here; they are used by the association tests.
X_pca["Batch"] = batch
X_pca["Adenoma"] = full_EPIC_phenotypes

In [None]:
# Clinical variables tested against the principal components.
# clin_params is the full list; bin_params (0/1 variables, compared between
# the two groups) and cont_params (continuous variables, correlated with the
# PC) together cover the same names.
clin_params = ["Age at visit","Gender","BMI","Ever smoked cigarettes",
               "Metabolic syndrome","Analgesic >=2 years (overall)",
               "Pack years","inflammatory_n","anti-inflammatory_n",
               "western_n","prudent_n","Adenoma"]
bin_params = ["Adenoma","Gender","Ever smoked cigarettes","Metabolic syndrome",
              "Analgesic >=2 years (overall)",]
cont_params = ["Age at visit","BMI","Pack years","inflammatory_n",
               "anti-inflammatory_n",
               "western_n","prudent_n"]

In [None]:
# Working table for the association tests: the 50 PC scores, the clinical
# variables and the dataset label. It is taken while the binary columns are
# still coded 0/1 (they are relabelled to strings on X_pca further down).
df = X_pca[[f"PCA{i+1}" for i in range(50)] + clin_params + ["Batch"]]

In [None]:
# Association of every principal component with the dataset label and the
# clinical variables, stored as -log10(p):
#   - Batch: Kruskal-Wallis test across the three datasets,
#   - 0/1 variables: Kruskal-Wallis test between the 0 and 1 groups,
#   - continuous variables: Pearson correlation on complete pairs.
# The result has one column per PC and one row per tested variable.
pc_names = [f"PCA{i+1}" for i in range(50)]
batch_labels = ["SWEPIC1", "SWEPIC2", "SWEPIC3"]

associations = {}
for pc in pc_names:
    pc_pvals = {}
    # .loc[mask, column] selects rows and column in one step (no chained indexing).
    pc_pvals["Batch"] = kruskal(*[df.loc[df["Batch"] == label, pc] for label in batch_labels])[1]
    for col in bin_params:
        pc_pvals[col] = kruskal(df.loc[df[col] == 0, pc], df.loc[df[col] == 1, pc])[1]
    for col in cont_params:
        # Drop samples with a missing clinical value before correlating.
        dfred = df[[pc, col]].dropna()
        pc_pvals[col] = pearsonr(dfred[pc], dfred[col])[1]
    associations[pc] = pc_pvals

# Vectorised transform; DataFrame.applymap is deprecated in recent pandas.
associations = -np.log10(pd.DataFrame.from_dict(associations))

In [None]:
# Significance threshold on the -log10 scale: 0.05 divided by the number of
# rows of `associations` (one row per tested variable).
# NOTE(review): the divisor does not include the 50 PCs (columns) - confirm
# that correcting per PC rather than over all tests is intended.
alpha_bonf = -np.log10(0.05/associations.shape[0])

# Cells below the threshold are masked out of the heatmap.
not_significant = associations < alpha_bonf

fig, ax = plt.subplots(1, 1, figsize=(12, 4))
sns.heatmap(
    associations,
    mask=not_significant,
    cmap="vlag",
    vmax=5,
    center=0.9*alpha_bonf,
    ax=ax,
    cbar_kws={"label": "-log10(p)"},
)
fig.savefig(fig_dir / "PCA_heatmap_clin_associations.png", dpi=250, bbox_inches="tight")

In [None]:
# Relabel the 0/1 columns with category names for the PCA plot legends.
X_pca["Adenoma"] = full_EPIC_phenotypes
X_pca["Gender"] = X_pca["Gender"].replace({1: "F", 0: "M"})
for column in ("Adenoma", "Metabolic syndrome", "Analgesic >=2 years (overall)"):
    X_pca[column] = X_pca[column].replace({0: "No", 1: "Yes"})

In [None]:
# First two principal components coloured by dataset of origin; marker shape
# encodes gender. The t-SNE axis helper from plting is reused for styling.
ax = sns.scatterplot(data=X_pca,x="PCA1",y="PCA2",hue="Batch", style="Gender", palette={"SWEPIC1": colors[6], 
                                                                           "SWEPIC2": colors[7], 
                                                                           "SWEPIC3": colors[9]}, 
                     markers={"F": "o", "M": "v"}, edgecolor="black")
plting.tsne_plot_ax(ax=ax, legend_title="SWEPIC dataset", leg_ftsize=10, ftsize=15, linew=4)
ax.figure.savefig(fig_dir / "pca_EPIC_batch.png", dpi=250, bbox_inches="tight")

In [None]:
# First two principal components coloured by age at visit.
ax = sns.scatterplot(data=X_pca, x="PCA1",y="PCA2",hue="Age at visit", palette=sns.light_palette(colors[0], as_cmap=True))
plting.tsne_plot_ax(ax=ax, legend_title="Age at visit", leg_ftsize=12, ftsize=15, linew=4)
ax.figure.savefig(fig_dir / "pca_EPIC_age.png", dpi=250, bbox_inches="tight")

In [None]:
# First two principal components coloured by adenoma status.
ax = sns.scatterplot(data=X_pca,x="PCA1",y="PCA2",hue="Adenoma", palette = {"Yes": colors[3], "No": colors[0]})
plting.tsne_plot_ax(ax=ax, legend_title="Adenoma (right)", leg_ftsize=12, ftsize=15, linew=4)
ax.figure.savefig(fig_dir / "pca_EPIC_adenoma.png", dpi=250, bbox_inches="tight")

# General values

In [None]:
# Number of rows (samples) and columns of each dataset's methylation matrix.
(n2, ncpg2), (n3, ncpg3), (n4, ncpg4)= EPIC2_b.shape, EPIC3_b.shape, EPIC4_b.shape

In [None]:
# Display the three (rows, columns) pairs.
(n2, ncpg2), (n3, ncpg3), (n4, ncpg4)

In [None]:
# Number of adenoma-positive samples per dataset (phenotypes cast to int and summed).
ad2, ad3, ad4 = np.sum(EPIC2_phenotypes.astype(int)), np.sum(EPIC3_phenotypes.astype(int)), np.sum(EPIC4_phenotypes.astype(int))

In [None]:
# Adenoma count and percentage of samples, per dataset.
(ad2, ad2/n2*100), (ad3, ad3/n3*100), (ad4, ad4/n4*100)

In [None]:
# Summaries over the three datasets pooled together. Continuous variables are
# passed with count=False, 0/1 indicator variables with count=True.
print_characteristic(EPIC_clin=full_EPIC_clin, EPIC_phenotypes=full_EPIC_phenotypes, charac="Age at visit", count=False)

In [None]:
print_characteristic(EPIC_clin=full_EPIC_clin, EPIC_phenotypes=full_EPIC_phenotypes, charac="BMI", count=False)

In [None]:
print_characteristic(EPIC_clin=full_EPIC_clin, EPIC_phenotypes=full_EPIC_phenotypes, charac="Metabolic syndrome", count=True)

In [None]:
print_characteristic(EPIC_clin=full_EPIC_clin, EPIC_phenotypes=full_EPIC_phenotypes, charac="Ever smoked cigarettes", count=True)

In [None]:
print_characteristic(EPIC_clin=full_EPIC_clin, EPIC_phenotypes=full_EPIC_phenotypes, charac="Pack years", count=False)

In [None]:
print_characteristic(EPIC_clin=full_EPIC_clin, EPIC_phenotypes=full_EPIC_phenotypes, charac="Analgesic >=2 years (overall)", count=True)

In [None]:
print_characteristic(EPIC_clin=full_EPIC_clin, EPIC_phenotypes=full_EPIC_phenotypes, charac="western_n", count=False)

In [None]:
print_characteristic(EPIC_clin=full_EPIC_clin, EPIC_phenotypes=full_EPIC_phenotypes, charac="prudent_n", count=False)

In [None]:
print_characteristic(EPIC_clin=full_EPIC_clin, EPIC_phenotypes=full_EPIC_phenotypes, charac="inflammatory_n", count=False)

In [None]:
print_characteristic(EPIC_clin=full_EPIC_clin, EPIC_phenotypes=full_EPIC_phenotypes, charac="anti-inflammatory_n", count=False)

# EPIC dataset-specific values

In [None]:
# Same summaries for a single dataset. Most cells below use EPIC4; the
# dataset is chosen by editing the EPIC_clin / EPIC_phenotypes arguments.
print_characteristic(EPIC_clin=EPIC4_clin, EPIC_phenotypes=EPIC4_phenotypes, charac="Age at visit", count=False)

In [None]:
print_characteristic(EPIC_clin=EPIC4_clin, EPIC_phenotypes=EPIC4_phenotypes, charac="BMI", count=False)

In [None]:
print_characteristic(EPIC_clin=EPIC4_clin, EPIC_phenotypes=EPIC4_phenotypes, charac="Metabolic syndrome", count=True)

In [None]:
# NOTE(review): this cell uses EPIC3 while the neighbouring cells use EPIC4 - confirm the intended dataset.
print_characteristic(EPIC_clin=EPIC3_clin, EPIC_phenotypes=EPIC3_phenotypes, charac="Ever smoked cigarettes", count=True)

In [None]:
print_characteristic(EPIC_clin=EPIC4_clin, EPIC_phenotypes=EPIC4_phenotypes, charac="Pack years", count=False)

In [None]:
print_characteristic(EPIC_clin=EPIC4_clin, EPIC_phenotypes=EPIC4_phenotypes, charac="Analgesic >=2 years (overall)", count=True)

In [None]:
print_characteristic(EPIC_clin=EPIC4_clin, EPIC_phenotypes=EPIC4_phenotypes, charac="western_n", count=False)

In [None]:
# NOTE(review): this cell uses EPIC2 while the neighbouring cells use EPIC4 - confirm the intended dataset.
print_characteristic(EPIC_clin=EPIC2_clin, EPIC_phenotypes=EPIC2_phenotypes, charac="prudent_n", count=False)

In [None]:
# NOTE(review): this cell uses EPIC3 while the neighbouring cells use EPIC4 - confirm the intended dataset.
print_characteristic(EPIC_clin=EPIC3_clin, EPIC_phenotypes=EPIC3_phenotypes, charac="inflammatory_n", count=False)

In [None]:
print_characteristic(EPIC_clin=EPIC4_clin, EPIC_phenotypes=EPIC4_phenotypes, charac="anti-inflammatory_n", count=False)