In [1]:
# Notebook: Notebook to annotate CpGs
# Description:  ...

In [2]:
%load_ext lab_black

In [104]:
import pandas as pd
import numpy as np

from typing import List, Set

In [136]:
def structurize(set_: Set[str]) -> pd.DataFrame:
    """
    Convert set to DataFrame.
    """
    return pd.DataFrame(list(set_), columns=["CpG"]).set_index("CpG")


def exract_probes_in_specific_region(
    df: pd.DataFrame, regions_to_extract: List[str]
) -> pd.DataFrame:

    df = df.dropna()
    results_frame = pd.DataFrame()

    for idx, row in df.iterrows():
        gene_regions = np.asarray(row.loc["UCSC_RefGene_Group"].split(";"))

        all_idx = []
        for region in regions_to_extract:
            where = np.where(gene_regions == region)
            where = [loc for loc in where[0]]
            all_idx.extend(where)

        if all_idx != []:
            gene = np.asarray(row.loc["UCSC_RefGene_Name"].split(";")).flatten()[
                all_idx
            ]

            gene = set(gene)
            results_frame.loc[idx, "UCSC_RefGene_Name"] = " ".join(gene)
            results_frame.loc[idx, "UCSC_RefGene_Group"] = " | ".join(
                regions_to_extract
            )

    return results_frame


def extract(df, threshold=0.1, alpha=0.05):
    """Function to extract CpGs from report"""
    df = df[(df["Delta mean"].abs() > threshold) & (df["q-value"] <= alpha)]
    return set(df.index)

In [5]:
# Load EPIC

In [5]:
epic = pd.read_csv(
    "../data/Additional/EPIC/MethylationEPIC_v-1-0_B4.csv",
    index_col=0,
    low_memory=False,
)
epic = epic[["UCSC_RefGene_Name", "UCSC_RefGene_Group", "Relation_to_UCSC_CpG_Island"]]

In [6]:
epic

Unnamed: 0_level_0,UCSC_RefGene_Name,UCSC_RefGene_Group,Relation_to_UCSC_CpG_Island
IlmnID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cg07881041,PTPRS;PTPRS;PTPRS;PTPRS,Body;Body;Body;Body,N_Shore
cg18478105,YTHDF1,TSS200,Island
cg23229610,,,N_Shelf
cg03513874,,,S_Shelf
cg09835024,EIF2S3,TSS1500,Island
...,...,...,...
71678368,,,
72748406,,,
73635489,,,
73784382,,,


In [3]:
# DMPs in ALL covid groups vs HB

In [7]:
report = pd.read_csv("../Files/DMPsInAllCoVGroup", index_col=0).set_index("CpG")
report = pd.concat((report, epic.loc[report.index]), axis=1)
report.to_csv("../Annots/DMPsInAllStudiesAnnots.csv")
report

Unnamed: 0_level_0,UCSC_RefGene_Name,UCSC_RefGene_Group,Relation_to_UCSC_CpG_Island
CpG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cg08876558,,,
cg26548134,SLC25A17,Body,Island
cg05909891,,,
cg05112967,,,
cg18731202,TGFA;TGFA,Body;Body,
...,...,...,...
cg15004555,AIM2,Body,
cg11962649,HSPA5,Body,N_Shore
cg03889742,MYO5A;MYO5A,Body;Body,
cg09622330,GRIN2A;GRIN2A;GRIN2A,Body;Body;Body,


In [10]:
# DMPs non-Covid vs HB

In [11]:
report = structurize(
    extract(pd.read_csv("output/USANoNCoV_vs_HB_ALLProbes.csv", index_col=0))
)
report = pd.concat((report, epic.loc[report.index]), axis=1)
report.to_csv("../Annots/DMPsNonCovVsHBAnnots.csv")
report

Unnamed: 0_level_0,UCSC_RefGene_Name,UCSC_RefGene_Group,Relation_to_UCSC_CpG_Island
CpG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cg22726533,YAE1D1;YAE1D1,Body;Body,S_Shelf
cg10211530,TNRC18,Body,N_Shore
cg20232119,HSPA1L,Body,S_Shelf
cg05521474,,,
cg08609238,,,Island
...,...,...,...
cg04783624,RANBP9,Body,
cg10685380,IL12B,Body,
cg25940202,,,
cg08022717,CYP27C1,Body,


In [47]:
# DMPs non-Cov vs CoV [USA]

In [12]:
report = structurize(
    extract(pd.read_csv("output/USANoNCoV_vs_CoV_ALLProbes.csv", index_col=0))
)
report = pd.concat((report, epic.loc[report.index]), axis=1)
report.to_csv("../Annots/DMPsNonCovVsCoVAnnots.csv")
report

Unnamed: 0_level_0,UCSC_RefGene_Name,UCSC_RefGene_Group,Relation_to_UCSC_CpG_Island
CpG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cg14541119,LOC100507477;LOC100507477,Body;Body,
cg06712932,PLXNC1;PLXNC1,Body;Body,
cg01176329,GABBR1;GABBR1;GABBR1,Body;Body;Body,N_Shelf
cg24103563,TRIM34;TRIM34;TRIM6-TRIM34,TSS1500;5'UTR;Body,
cg20426042,PGCP,5'UTR,
...,...,...,...
cg16168081,,,N_Shore
cg03037271,,,
cg18557047,TET2;TET2;TET2-AS1,5'UTR;5'UTR;Body,
cg17114584,IRF7;IRF7;IRF7,Body;Body;Body,N_Shore
