In [None]:
import pandas as pd
import numpy as np
import pathlib as pl


from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from statannotations.Annotator import Annotator
from statsmodels.stats.multitest import multipletests

from scipy.stats import mannwhitneyu, fisher_exact, pearsonr, kruskal

In [None]:
import sys
import os
# Extend the import search path so the project-local packages imported below
# (download, utils, adVMP) can be resolved -- presumably they live under ../../FinalCode/.
sys.path.append("../../FinalCode/")
import download.download as dwnl
import utils.plotting as plting
import adVMP.adVMP_discovery as discov
import adVMP.adVMP_plots as advmpplt

In [None]:
# For figures
# `fig_dir` is the folder the figures of this notebook are saved to (placeholder path: set it before running).
colors = sns.color_palette("muted")
fig_dir = pl.Path("/add/path/here/")

In [None]:
# Placeholder locations: point these at the local data before running the notebook.
base_dir = pl.Path("/add/path/here/")   # handed to the EPIC4=False download call
base_dir4 = pl.Path("/add/path/here/")  # handed to the EPIC4=True download call

data_dir = pl.Path("/add/path/here/")

# Flat array with the content of the probe list file (passed on as `bad_probes`).
bad_probes = (
    pd.read_csv(data_dir / "auxiliary" / "sketchy_probe_list_epic.csv", index_col=0)
    .to_numpy()
    .ravel()
)
sample_origin_path = data_dir / "clinical" / "sample_origin_wbatch.csv"

clinical_path = data_dir / "clinical" / "cleaned_clinical_reduced_diet.csv"
target_path = data_dir / "clinical" / "targets.csv"

In [None]:
# One call returns the four objects (b, clin, samples, phenotypes) for EPIC2 followed by EPIC3.
(
    EPIC2_b, EPIC2_clin, EPIC2_samples, EPIC2_phenotypes,
    EPIC3_b, EPIC3_clin, EPIC3_samples, EPIC3_phenotypes,
) = dwnl.download_EPIC(
    sample_origin_path=sample_origin_path,
    base_dir=base_dir,
    clinical_path=clinical_path,
    target_path=target_path,
    bad_probes=bad_probes,
    EPIC4=False,
)

In [None]:
# Same loader with EPIC4=True and its own base directory; returns a single set of four objects.
(
    EPIC4_b,
    EPIC4_clin,
    EPIC4_samples,
    EPIC4_phenotypes,
) = dwnl.download_EPIC(
    sample_origin_path=sample_origin_path,
    base_dir=base_dir4,
    clinical_path=clinical_path,
    target_path=target_path,
    bad_probes=bad_probes,
    EPIC4=True,
)

# Load CpGs

In [None]:
# Flattened values of the union CpG table (used further down as probe labels).
union_cpgs = (
    pd.read_csv(data_dir / "adVMP" / "union_cpgs_q005.csv", index_col=0)
    .to_numpy()
    .ravel()
)

# Map Roadmap Epigenomics to EPIC array

In [None]:
# Read the BED file without its first line, keep the first four columns and name them.
roadmap = (
    pd.read_csv(
        data_dir / "NIH_Epigenomics_Roadmap" / "E075_15_coreMarks_dense.bed.gz",
        sep="\t",
        skiprows=1,
        header=None,
    )
    .iloc[:, :4]
    .set_axis(["chrom", "start", "end", "state"], axis=1)
)

In [None]:
# Manifest table: the first 7 lines of the file are skipped and the first column becomes the index.
epic_manifest = pd.read_csv(
    data_dir / "illumina_manifests" / "GPL21145_MethylationEPIC_15073387_v-1-0.csv.gz",
    skiprows=7,
    index_col=0,
)

In [None]:
# Keep probes with a known chromosome and position, one row per probe ID.
probes = epic_manifest[['CHR', 'MAPINFO']].dropna()
probes = probes.loc[~probes.index.duplicated()]

# get a probe df for each chromosome
probes_pc = {}
for chrom in sorted(probes.CHR.unique()):
    probes_pc[chrom] = probes[probes["CHR"]==chrom].sort_values(by="MAPINFO")

# find probes that are located in the roadmap annotated regions for each chromosome
# A probe at position `pos` belongs to the segment with start <= pos < end. Instead of
# scanning every segment for every probe, segments are sorted by start and located
# with a binary search, which gives the same answer when segments do not overlap.
state_chunks = []
for chrom, chrom_probes in tqdm(probes_pc.items()):
    chromosome = f'chr{chrom}'
    if chromosome=="chr0":
        continue

    segments = roadmap[roadmap["chrom"]==chromosome]
    # zero-length segments can never contain a position; drop them so they cannot
    # hide a real segment that shares the same start coordinate
    segments = segments[segments["end"] > segments["start"]].sort_values(by="start")
    seg_start = segments["start"].to_numpy()
    seg_end = segments["end"].to_numpy()
    seg_state = segments["state"].to_numpy()
    if (seg_start[1:] < seg_end[:-1]).any():
        # the binary search below is only valid for non-overlapping segments
        raise ValueError(f"Overlapping annotated regions found on {chromosome}")

    positions = chrom_probes["MAPINFO"].to_numpy()
    # index of the last segment whose start is <= position (-1 if there is none)
    candidate = np.searchsorted(seg_start, positions, side="right") - 1
    inside = candidate >= 0
    inside[inside] = positions[inside] < seg_end[candidate[inside]]

    # some probes will fall outside of annotated regions
    states = np.full(len(positions), "None", dtype=object)
    states[inside] = seg_state[candidate[inside]]
    print(f"{chromosome}: no mapping for {int((~inside).sum())} of {len(positions)} probes")

    state_chunks.append(pd.Series(states, index=chrom_probes.index, name="State"))

if state_chunks:
    mapping_dict = pd.concat(state_chunks).to_frame()
else:
    mapping_dict = pd.DataFrame({"State": pd.Series(dtype=object)})

# probes that were skipped above (chr0) end up with a missing State after this join
mapping_dict = pd.concat([probes,mapping_dict],axis=1)

In [None]:
# Remove rows with any missing value, e.g. probes that received no State
# because their chromosome was skipped in the mapping loop.
mapping_dict = mapping_dict.dropna()

In [None]:
# Persist the probe-to-state table; the next section reads it back from this file.
mapping_dict.to_csv(data_dir / "NIH_Epigenomics_Roadmap" / "EPIC_to_state_mapping.csv")

# Get enrichment

In [None]:
# Reload the saved probe-to-state table (first column = probe ID index).
mapping_dict = pd.read_csv(data_dir / "NIH_Epigenomics_Roadmap" / "EPIC_to_state_mapping.csv",index_col=0)

In [None]:
# rename the Roadmap Epigenomics states into 8 main categories
# NOTE(review): there is no key for a state numbered 9 (the keys jump from "8_ZNF/Rpts" to
# "10_TssBiv"), and the "None" placeholder used for unmapped probes is not listed either.
# Values without a key are left unchanged by `.replace` -- confirm that this is intended.
meaningful_groups = {"1_TssA": "Active promoter", "2_TssAFlnk": "Active promoter", "3_TxFlnk": "Active promoter", 
                     "4_Tx": "Transcription (body)", "5_TxWk": "Transcription (body)", 
                     "6_EnhG": "Enhancer", "7_Enh": "Enhancer", "8_ZNF/Rpts": "ZNF/Repeats", 
                     "10_TssBiv": "Bivalent promoter", "11_BivFlnk": "Bivalent promoter", 
                     "12_EnhBiv": "Bivalent enhancer", "13_ReprPC": "Repressed polycomb", 
                     "14_ReprPCWk": "Repressed polycomb", "15_Quies": "Quiescent"}

In [None]:
# Add the reduced state: each State value that is a key of `meaningful_groups` is replaced
# by its category name; any other value is carried over as is.
mapping_dict["Red_State"] = mapping_dict["State"].replace(meaningful_groups)

In [None]:
# Number of union CpGs per reduced state, as a Series and as a plain dict.
df = mapping_dict.loc[union_cpgs, "Red_State"].value_counts()
dict_states = df.to_dict()

In [None]:
# Wedge labels and sizes, in the order of `dict_states`
names = list(dict_states)
size = list(dict_states.values())

# White disc drawn over the middle of the pie to obtain a donut
my_circle = plt.Circle((0, 0), 0.7, color="white")

def func(pct, allvals):
    """Return the label of a pie wedge: the percentage with one decimal, e.g. ``"12.3%"``.

    Parameters
    ----------
    pct : float
        Percentage of the wedge.
    allvals : sequence
        Not used for the label; kept so that existing calls ``func(pct, allvals)`` still work.
    """
    # The absolute count was previously derived from `allvals` but never shown;
    # that unused computation was dropped because it could fail on non-finite totals.
    return f"{pct:.1f}%"

# Pie with thick white wedge borders and percentage labels
plt.pie(
    size,
    labels=names,
    autopct=lambda pct: func(pct, size),
    wedgeprops={"linewidth": 7, "edgecolor": "white"},
    textprops={"color": "black", "size": 12},
)
# Overlay the white disc on the current figure, then save it
p = plt.gcf()
p.gca().add_artist(my_circle)
p.savefig(fig_dir / "donut_plot_enrichment_roadmap.svg", bbox_inches="tight")

In [None]:
cl_bright = sns.color_palette()
# make sure the colors correspond
# Fixed colour per reduced state; this dict is passed as `palette` to the enrichment bar plot.
# NOTE(review): the name `pl` rebinds the `pathlib` alias created by `import pathlib as pl`,
# so any `pl.Path(...)` cell fails if it is (re-)run after this one -- consider renaming
# this variable together with its use in the bar plot.
pl = {"Active promoter": cl_bright[0], "Bivalent promoter": cl_bright[1], 
      "Quiescent": cl_bright[2], "Transcription (body)": cl_bright[3], 
      "Repressed polycomb": cl_bright[4], "Enhancer": cl_bright[5], 
      "Bivalent enhancer": cl_bright[6], "ZNF/Repeats": cl_bright[7]}

In [None]:
# Background set: column labels present in both EPIC4_b and EPIC2_b, as a numpy array.
background_cpgs = EPIC4_b.columns.intersection(EPIC2_b.columns).to_numpy()

In [None]:
def get_fisher_enrichment(mapping_dict: pd.DataFrame,
                          background: np.ndarray,
                          union_cpgs: np.ndarray, state_col: str="State") -> pd.DataFrame:
    """Test, per state, whether aDVMCs are over- or under-represented among background probes.

    Only probes that are both in ``mapping_dict`` and in ``background`` are considered.
    For every state that contains at least one aDVMC, a 2x2 table is built

        [[aDVMCs in state,     aDVMCs not in state],
         [non-aDVMCs in state, non-aDVMCs not in state]]

    and passed to ``scipy.stats.fisher_exact``. The p-values are then adjusted with
    ``multipletests(..., method="fdr_bh")``.

    Parameters
    ----------
    mapping_dict : pd.DataFrame
        Indexed by probe ID, with a column ``state_col`` holding the state of each probe.
    background : np.ndarray
        Probe IDs forming the background set.
    union_cpgs : np.ndarray
        Probe IDs of the aDVMCs.
    state_col : str
        Column of ``mapping_dict`` to use as state label.

    Returns
    -------
    pd.DataFrame
        One row per state, with columns ``OR``, ``p``, ``q`` and ``log2(OR)``.
        Empty (but with these four columns) when no aDVMC is found in the background.
    """
    common_probes = mapping_dict.index.intersection(background)
    background_states = mapping_dict.loc[common_probes, state_col]
    is_advmc = background_states.index.isin(union_cpgs)

    # per-state counts of background probes that are / are not aDVMCs
    vc_uc = background_states[is_advmc].value_counts()
    vc_nuc = background_states[~is_advmc].value_counts()
    n_uc = vc_uc.sum()
    n_nuc = vc_nuc.sum()

    records = {}
    for state in vc_uc.index:
        # aDVMCs in the mapping in a specific state
        a = vc_uc.loc[state]
        # non-aDVMCs in that state; a state may contain aDVMCs only, in which case
        # it is absent from vc_nuc and the count is 0
        b = vc_nuc.get(state, 0)
        # aDVMCs in the mapping not in that specific state
        c = n_uc - a
        # non-aDVMCs in the mapping not in that specific state
        d = n_nuc - b
        # compute the Fisher exact test on the contingency table
        OR, p = fisher_exact(np.array([[a, c], [b, d]]))
        records[state] = [OR, p]

    fisher_enrichment = pd.DataFrame(list(records.values()),
                                     index=list(records.keys()),
                                     columns=["OR", "p"], dtype=float)

    if fisher_enrichment.empty:
        # nothing to test: return the expected columns without running the correction
        fisher_enrichment["q"] = pd.Series(dtype=float)
        fisher_enrichment["log2(OR)"] = pd.Series(dtype=float)
        return fisher_enrichment

    # FDR correction
    fisher_enrichment["q"] = multipletests(fisher_enrichment["p"], method="fdr_bh")[1]
    # an odds ratio of 0 or inf maps to -inf / +inf here
    fisher_enrichment["log2(OR)"] = fisher_enrichment["OR"].apply(np.log2)

    return fisher_enrichment

In [None]:
def convert_pvalue_to_asterisks(pvalue):
    """Translate a p-value into a significance label.

    Returns "****" for values <= 0.0001, "***" for <= 0.001, "**" for <= 0.01,
    "*" for <= 0.05 and "ns" for anything else.
    """
    # ordered from the strictest to the loosest cutoff; the first match wins
    star_cutoffs = ((0.0001, "****"), (0.001, "***"), (0.01, "**"), (0.05, "*"))
    for cutoff, stars in star_cutoffs:
        if pvalue <= cutoff:
            return stars
    return "ns"

In [None]:
# Enrichment on the reduced states, ordered from the most enriched to the most depleted.
fisher_enrichment = get_fisher_enrichment(
    mapping_dict=mapping_dict,
    background=background_cpgs,
    union_cpgs=union_cpgs,
    state_col="Red_State",
).sort_values(by="log2(OR)", ascending=False)

In [None]:
# Significance label for each state, in the row order of `fisher_enrichment`.
# `.to_numpy()` replaces `Series.ravel()`, which is deprecated in pandas.
annotations = fisher_enrichment['q'].apply(convert_pvalue_to_asterisks).to_numpy()

In [None]:
# Bar plot of log2(OR) per state, with a significance label next to the tip of each bar.
fig, ax = plt.subplots(1,1)
sns.barplot(data=fisher_enrichment.sort_values(by="log2(OR)",ascending=False).reset_index(),
            palette = pl, x="index",y="log2(OR)",ax=ax)
ax.spines[['right', 'top']].set_visible(False)
ax.spines[["bottom", "left"]].set_linewidth(4)
ax.set_xticklabels(ax.get_xticklabels(),
                   rotation=45,horizontalalignment="right",fontsize=15)
ax.set_ylim(bottom=-2.1, top=2.1)
# change only the size of the y tick labels; re-setting the label texts would freeze
# them to their current values and detach them from the tick locator
ax.tick_params(axis="y", labelsize=15)
ax.set_xlabel("")
ax.set_ylabel("log2(OR)",fontsize=15)


# place each label 0.1 beyond the end of its bar (above positive bars, below the others)
y_positions = fisher_enrichment["log2(OR)"]
y_positions= y_positions.apply(lambda x: x+0.1 if x>0 else x-0.1)
# `.to_numpy()` replaces `Series.ravel()`, which is deprecated in pandas
y_positions = y_positions.to_numpy()
for idx, pval in enumerate(annotations):
    ax.text(x=idx, y=y_positions[idx], s=pval, ha="center", va="center", fontsize=12)
fig.savefig(fig_dir / "adVMP_roadmap_enrichment.svg", bbox_inches="tight")