In [1]:
# Notebook: Notebook to prepare and annotate identified CpGs
# Description: Notebook to prepare CpG [DMPs] annotations based on the EPIC B4 manifest.

In [1]:
%load_ext lab_black

In [2]:
from typing import List, Set

import pandas as pd
import numpy as np

from src.utils import Utils

In [3]:
def convert_set_to_df(set_: Set[str]) -> pd.DataFrame:
    """
    Build a column-less DataFrame whose index, named "CpG", holds the
    members of ``set_``.

    The row order follows the iteration order of the set.
    """
    index_name = "CpG"
    # Materialise the set as a single column first, then promote that column
    # to the index so the resulting frame carries the identifiers only.
    frame = pd.DataFrame(list(set_), columns=[index_name])
    return frame.set_index(index_name)


def extract(df, threshold=0.1, alpha=0.05):
    """
    Collect the index labels of the report rows that pass both filters.

    A row is kept when the absolute value of its "Delta mean" is strictly
    greater than ``threshold`` and its "q-value" is less than or equal to
    ``alpha``. The kept labels are returned as a set.
    """
    large_effect = df["Delta mean"].abs() > threshold
    significant = df["q-value"] <= alpha
    selected = df.loc[large_effect & significant]
    return set(selected.index)


def annotate(cpgs: pd.DataFrame, manifest: pd.DataFrame) -> pd.DataFrame:
    """
    Select the manifest rows that describe the given CpGs.

    Parameters
    ----------
    cpgs : pd.DataFrame
        Object whose index holds the CpG identifiers of interest; only the
        index is read.
    manifest : pd.DataFrame
        Annotation table indexed by CpG identifier.

    Returns
    -------
    pd.DataFrame
        The rows of ``manifest`` whose index label also occurs in
        ``cpgs.index``, kept in the manifest's own row order. Identifiers
        missing from the manifest are dropped without an error.
    """
    # A boolean mask is used instead of `manifest.loc[<set>]`: pandas 2.0
    # rejects set indexers with a TypeError, and iterating a set made the
    # row order of the result (and of the CSVs written from it) arbitrary.
    in_both = manifest.index.isin(cpgs.index)
    return manifest.loc[in_both]

In [4]:
# Load EPIC

In [5]:
# Read the EPIC B4 manifest (the first CSV column becomes the index) and keep
# only the three annotation columns used when annotating DMPs below.
epic = pd.read_csv(
    "../data/additional/EPIC/MethylationEPIC_v-1-0_B4.csv",
    index_col=0,
    low_memory=False,
).loc[:, ["UCSC_RefGene_Group", "Relation_to_UCSC_CpG_Island", "UCSC_RefGene_Name"]]

In [6]:
# Display the trimmed manifest to check the retained annotation columns.
epic

Unnamed: 0_level_0,UCSC_RefGene_Group,Relation_to_UCSC_CpG_Island,UCSC_RefGene_Name
IlmnID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cg07881041,Body;Body;Body;Body,N_Shore,PTPRS;PTPRS;PTPRS;PTPRS
cg18478105,TSS200,Island,YTHDF1
cg23229610,,N_Shelf,
cg03513874,,S_Shelf,
cg09835024,TSS1500,Island,EIF2S3
...,...,...,...
71678368,,,
72748406,,,
73635489,,,
73784382,,,


In [7]:
# DMPs common for each COVID-19 group in comparison with healthy controls

In [8]:
# One set of significant CpGs per COVID-19 cohort, each compared against
# healthy controls; the reports are read in the order USA, PL, Spain.
usa, pl, spain = (
    extract(pd.read_csv(report_path, index_col=0))
    for report_path in (
        "statistics/output/USACoV_vs_HB_ALLProbes.csv",
        "statistics/output/PLCoV_vs_HB_ALLProbes.csv",
        "statistics/output/SpainCoV_vs_HB_ALLProbes.csv",
    )
)

In [9]:
# CpGs reported for all three cohorts, annotated from the EPIC manifest.
intersection = annotate(convert_set_to_df(usa & pl & spain), epic)
intersection

Unnamed: 0_level_0,UCSC_RefGene_Group,Relation_to_UCSC_CpG_Island,UCSC_RefGene_Name
IlmnID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cg03860054,Body,S_Shore,SLC6A2
cg11075029,5'UTR,S_Shore,SHF
cg11962649,Body,N_Shore,HSPA5
cg08609238,,Island,
cg02035102,,,
...,...,...,...
cg21465162,TSS200;TSS200,S_Shore,UBQLN1;UBQLN1
cg14648311,,,
cg11325989,Body,,KRT6A
cg01517139,Body,,THSD7A


In [10]:
# Save the annotated DMPs shared by all three COVID-19 cohorts.
intersection.to_csv("../Files/DMPs_In_All_COV_Kohorts.csv")

In [20]:
# Restrict the shared DMPs to the TSS200 / TSS1500 regions via the project
# helper, re-annotate the selected probes from the manifest, then save and
# display the result.
intersection_tss_only = annotate(
    Utils.extract_probes_in_specific_region(intersection, ["TSS200", "TSS1500"]),
    epic,
)
intersection_tss_only.to_csv("../Files/DMPs_In_All_COV_Kohorts_TSS_Only.csv")
intersection_tss_only

Unnamed: 0_level_0,UCSC_RefGene_Group,Relation_to_UCSC_CpG_Island,UCSC_RefGene_Name
IlmnID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cg22930808,5'UTR;5'UTR;TSS1500;5'UTR;5'UTR;5'UTR;5'UTR,N_Shore,PARP9;PARP9;DTX3L;PARP9;PARP9;PARP9;PARP9
cg08585897,5'UTR;TSS1500;Body,N_Shore,KARS;TERF2IP;KARS
cg18642567,TSS1500,,RPGRIP1
cg13407664,TSS1500,,HTN1
cg03782202,TSS1500,Island,HOXD11
cg24315703,TSS200,Island,LIMK2
cg17515347,TSS1500,,AIM2
cg12981595,TSS200,,KRTAP4-8
cg19556901,TSS1500,,SNORD115-1
cg21465162,TSS200;TSS200,S_Shore,UBQLN1;UBQLN1


In [21]:
# DMPs COVID-19 USA vs non-COVID-19 USA

In [23]:
# Significant CpGs from the USA COVID-19 vs non-COVID-19 report, annotated
# from the EPIC manifest, written to disk and displayed.
usa_cov_vs_noncov = annotate(
    convert_set_to_df(
        extract(
            pd.read_csv("statistics/output/USANoNCoV_vs_CoV_ALLProbes.csv", index_col=0)
        )
    ),
    epic,
)
usa_cov_vs_noncov.to_csv("../Files/DMPs_COV_vs_nonCOV_USA.csv")
usa_cov_vs_noncov

Unnamed: 0_level_0,UCSC_RefGene_Group,Relation_to_UCSC_CpG_Island,UCSC_RefGene_Name
IlmnID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cg02770216,Body;Body,,AKAP9;AKAP9
cg24388175,Body;Body;Body,,PCCA;PCCA;PCCA
cg24145401,TSS1500,,AIM2
cg04445427,,,
cg05450804,Body,N_Shore,GXYLT2
...,...,...,...
cg13982456,,S_Shelf,
cg07992500,5'UTR,N_Shore,CDC42EP3
cg04595719,Body,,CDCA2
cg11251971,Body,,CYSTM1


In [80]:
# Count the distinct probe IDs returned by the region filter when it is called
# with its default region argument.
# NOTE(review): the helper's default regions and its return type live in
# src.utils and are not visible here — confirm. Assuming it returns a pandas
# object, sort_values() only reorders rows and cannot change the nunique()
# result.
Utils.extract_probes_in_specific_region(usa_cov_vs_noncov).sort_values().index.nunique()

26

In [24]:
# DMPs non-COVID-19 vs healthy controls

In [25]:
# Significant CpGs from the USA non-COVID-19 vs healthy-controls report,
# annotated from the EPIC manifest, written to disk and displayed.
usa_noncov_vs_hb = annotate(
    convert_set_to_df(
        extract(
            pd.read_csv("statistics/output/USANoNCoV_vs_HB_ALLProbes.csv", index_col=0)
        )
    ),
    epic,
)
usa_noncov_vs_hb.to_csv("../Files/DMPs_nonCOV_vs_HB_USA.csv")
usa_noncov_vs_hb

Unnamed: 0_level_0,UCSC_RefGene_Group,Relation_to_UCSC_CpG_Island,UCSC_RefGene_Name
IlmnID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cg03860054,Body,S_Shore,SLC6A2
cg16499677,Body,,C14orf37
cg02052531,,,
cg05521474,,,
cg27374674,Body,,GAS7
...,...,...,...
cg23819092,Body,Island,MEX3A
cg14648311,,,
cg11325989,Body,,KRT6A
cg01517139,Body,,THSD7A
