In [None]:
import pathlib as pl

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import fisher_exact, kruskal, mannwhitneyu, median_abs_deviation, pearsonr
from statannotations.Annotator import Annotator
from tqdm.notebook import tqdm

In [None]:
import sys
# Extend the module search path so the two project-local imports below resolve.
# The path is relative, so it is resolved against the kernel's working directory.
# NOTE(review): presumably `download` and `utils` live under FinalCode/ - confirm.
sys.path.append("../../FinalCode/")
import download.download as dwnl
import utils.plotting as plting

In [None]:
# For figures
# Seaborn's "muted" colour palette.
colors = sns.color_palette("muted")
# Root directory for saved figures; the plotting cells write into per-cohort
# sub-folders below it. Placeholder value - replace with a real path before running.
fig_dir = pl.Path("/add/path/here")

In [None]:
# Input locations. All three are placeholders and must be replaced before running.
# base_dir is passed to the EPIC2/EPIC3 download call, base_dir4 to the EPIC4 one.
base_dir = pl.Path("/add/path/here")
base_dir4 = pl.Path("/add/path/here")

data_dir = pl.Path("/add/path/here")

# Probe list to exclude: the CSV's first column is used as the index and the
# remaining values are flattened into a 1-D array.
bad_probes = (
    pd.read_csv(
        data_dir / "auxiliary" / "sketchy_probe_list_epic.csv",
        index_col=0,
    )
    .values
    .ravel()
)

# Clinical tables handed to the download helper.
sample_origin_path = data_dir / "clinical" / "sample_origin_wbatch.csv"
clinical_path = data_dir / "clinical" / "cleaned_clinical_reduced_diet.csv"
target_path = data_dir / "clinical" / "targets.csv"

In [None]:
# Load the EPIC2 and EPIC3 data sets in a single call (EPIC4=False).
# The helper returns eight objects: four per data set, in the order unpacked below.
(
    EPIC2_b, EPIC2_clin, EPIC2_samples, EPIC2_phenotypes,
    EPIC3_b, EPIC3_clin, EPIC3_samples, EPIC3_phenotypes,
) = dwnl.download_EPIC(
    sample_origin_path=sample_origin_path,
    base_dir=base_dir,
    clinical_path=clinical_path,
    target_path=target_path,
    bad_probes=bad_probes,
    EPIC4=False,
)

In [None]:
# Load the EPIC4 data set from its own base directory (EPIC4=True).
# With this flag the helper returns four objects instead of eight.
(
    EPIC4_b, EPIC4_clin, EPIC4_samples, EPIC4_phenotypes,
) = dwnl.download_EPIC(
    sample_origin_path=sample_origin_path,
    base_dir=base_dir4,
    clinical_path=clinical_path,
    target_path=target_path,
    bad_probes=bad_probes,
    EPIC4=True,
)

# Find most variable probes (all samples)

In [None]:
# Median absolute deviation (MAD) of every probe, computed separately per data set
# over all of its samples.
# median_abs_deviation reduces along axis 0, i.e. it yields one value per column of
# the beta-value frame; the result is a bare array, so it is re-labelled with the
# column (probe) names to allow label-based selection later on.
mad2 = pd.Series(median_abs_deviation(EPIC2_b, axis=0), index=EPIC2_b.columns)
mad3 = pd.Series(median_abs_deviation(EPIC3_b, axis=0), index=EPIC3_b.columns)
mad4 = pd.Series(median_abs_deviation(EPIC4_b, axis=0), index=EPIC4_b.columns)

In [None]:
# Histogram of the per-probe MAD in mad2 with its 95% quantile marked in red.
mad2_q95 = mad2.quantile(0.95)

fig, ax = plt.subplots(1, 1)
# Values above 0.15 are capped at 0.15, so they all land in the last bin.
sns.histplot(data=mad2.clip(None, 0.15), bins=50, ax=ax)
ax.axvline(mad2_q95, c="r")
# The label height (53000) is in data coordinates, i.e. on the histogram's count axis.
ax.text(1.1 * mad2_q95, 53000, "95% quantile", c="r", fontdict={"size": 15})
plting.transform_plot_ax(ax, legend_title="", ftsize=15)

# savefig does not create missing folders, so make sure the target folder exists.
(fig_dir / "SWEPIC1").mkdir(parents=True, exist_ok=True)
fig.savefig(fig_dir / "SWEPIC1" / "mad_95_qt.svg", bbox_inches="tight")

In [None]:
# Histogram of the per-probe MAD in mad3 with its 95% quantile marked in red.
mad3_q95 = mad3.quantile(0.95)

fig, ax = plt.subplots(1, 1)
# Values above 0.15 are capped at 0.15, so they all land in the last bin.
sns.histplot(data=mad3.clip(None, 0.15), bins=50, ax=ax)
ax.axvline(mad3_q95, c="r")
# The label height (50000) is in data coordinates, i.e. on the histogram's count axis.
ax.text(1.1 * mad3_q95, 50000, "95% quantile", c="r", fontdict={"size": 15})
plting.transform_plot_ax(ax, legend_title="", ftsize=15)

# savefig does not create missing folders, so make sure the target folder exists.
(fig_dir / "SWEPIC2").mkdir(parents=True, exist_ok=True)
fig.savefig(fig_dir / "SWEPIC2" / "mad_95_qt.svg", bbox_inches="tight")

In [None]:
# Histogram of the per-probe MAD in mad4 with its 95% quantile marked in red.
mad4_q95 = mad4.quantile(0.95)

fig, ax = plt.subplots(1, 1)
# Values above 0.15 are capped at 0.15, so they all land in the last bin.
sns.histplot(data=mad4.clip(None, 0.15), bins=50, ax=ax)
ax.axvline(mad4_q95, c="r")
# The label height (60000) is in data coordinates, i.e. on the histogram's count axis.
ax.text(1.1 * mad4_q95, 60000, "95% quantile", c="r", fontdict={"size": 15})
plting.transform_plot_ax(ax, legend_title="", ftsize=15)

# savefig does not create missing folders, so make sure the target folder exists.
(fig_dir / "SWEPIC3").mkdir(parents=True, exist_ok=True)
fig.savefig(fig_dir / "SWEPIC3" / "mad_95_qt.svg", bbox_inches="tight")

In [None]:
# For each data set keep the probes whose MAD is strictly above that data set's own
# 95% quantile, then merge the three selections. np.unique removes probes selected
# in more than one data set and returns the labels sorted.
union_cpgs_5_pct = np.unique(
    np.concatenate(
        [mad[mad > mad.quantile(0.95)].index for mad in (mad2, mad3, mad4)]
    )
)

# to_csv does not create missing folders, so make sure the output folder exists.
(data_dir / "variable_probes").mkdir(parents=True, exist_ok=True)
pd.Series(union_cpgs_5_pct).to_csv(data_dir / "variable_probes" / "union_cpgs_5_pct_most_variable.csv")

# Find most variable probes in healthy tissue

In [None]:
# Per-probe median absolute deviation (MAD), restricted to the samples whose
# phenotype label is 0 - the group this notebook treats as healthy tissue.
# NOTE: this rebinds mad2 / mad3 / mad4, replacing the all-sample values computed
# earlier in the notebook. Re-running an earlier plotting or export cell after this
# one would silently use the healthy-only values, so run the notebook top to bottom.
mad2 = pd.Series(
    median_abs_deviation(EPIC2_b.loc[EPIC2_phenotypes == 0], axis=0),
    index=EPIC2_b.columns,
)
mad3 = pd.Series(
    median_abs_deviation(EPIC3_b.loc[EPIC3_phenotypes == 0], axis=0),
    index=EPIC3_b.columns,
)
mad4 = pd.Series(
    median_abs_deviation(EPIC4_b.loc[EPIC4_phenotypes == 0], axis=0),
    index=EPIC4_b.columns,
)

In [None]:
# Histogram of the healthy-only per-probe MAD in mad2 with its 95% quantile marked in red.
mad2_q95 = mad2.quantile(0.95)

fig, ax = plt.subplots(1, 1)
# Values above 0.15 are capped at 0.15, so they all land in the last bin.
sns.histplot(data=mad2.clip(None, 0.15), bins=50, ax=ax)
ax.axvline(mad2_q95, c="r")
# The label height (60000) is in data coordinates, i.e. on the histogram's count axis.
ax.text(1.1 * mad2_q95, 60000, "95% quantile", c="r", fontdict={"size": 15})
plting.transform_plot_ax(ax, legend_title="", ftsize=15)

# savefig does not create missing folders, so make sure the target folder exists.
(fig_dir / "SWEPIC1").mkdir(parents=True, exist_ok=True)
fig.savefig(fig_dir / "SWEPIC1" / "mad_95_qt_onlyhealthy.svg", bbox_inches="tight")

In [None]:
# Histogram of the healthy-only per-probe MAD in mad3 with its 95% quantile marked in red.
mad3_q95 = mad3.quantile(0.95)

fig, ax = plt.subplots(1, 1)
# Values above 0.15 are capped at 0.15, so they all land in the last bin.
sns.histplot(data=mad3.clip(None, 0.15), bins=50, ax=ax)
ax.axvline(mad3_q95, c="r")
# The label height (50000) is in data coordinates, i.e. on the histogram's count axis.
ax.text(1.1 * mad3_q95, 50000, "95% quantile", c="r", fontdict={"size": 15})
plting.transform_plot_ax(ax, legend_title="", ftsize=15)

# savefig does not create missing folders, so make sure the target folder exists.
(fig_dir / "SWEPIC2").mkdir(parents=True, exist_ok=True)
fig.savefig(fig_dir / "SWEPIC2" / "mad_95_qt_onlyhealthy.svg", bbox_inches="tight")

In [None]:
# Histogram of the healthy-only per-probe MAD in mad4 with its 95% quantile marked in red.
mad4_q95 = mad4.quantile(0.95)

fig, ax = plt.subplots(1, 1)
# Values above 0.15 are capped at 0.15, so they all land in the last bin.
sns.histplot(data=mad4.clip(None, 0.15), bins=50, ax=ax)
ax.axvline(mad4_q95, c="r")
# The label height (60000) is in data coordinates, i.e. on the histogram's count axis.
ax.text(1.1 * mad4_q95, 60000, "95% quantile", c="r", fontdict={"size": 15})
plting.transform_plot_ax(ax, legend_title="", ftsize=15)

# savefig does not create missing folders, so make sure the target folder exists.
(fig_dir / "SWEPIC3").mkdir(parents=True, exist_ok=True)
fig.savefig(fig_dir / "SWEPIC3" / "mad_95_qt_onlyhealthy.svg", bbox_inches="tight")

In [None]:
# Same selection as for the all-sample analysis, but applied to the healthy-only MAD
# values currently held in mad2 / mad3 / mad4: keep the probes strictly above each
# data set's own 95% quantile and merge them. np.unique de-duplicates and sorts.
# NOTE: this rebinds union_cpgs_5_pct, replacing the all-sample union.
union_cpgs_5_pct = np.unique(
    np.concatenate(
        [mad[mad > mad.quantile(0.95)].index for mad in (mad2, mad3, mad4)]
    )
)

# to_csv does not create missing folders, so make sure the output folder exists.
(data_dir / "variable_probes").mkdir(parents=True, exist_ok=True)
pd.Series(union_cpgs_5_pct).to_csv(data_dir / "variable_probes" / "union_cpgs_5_pct_most_variable_onlyhealthy.csv")