In [None]:
# Identification of COVID-19 specific DMPs

In [None]:
import os

from tqdm import tqdm
import numpy as np
import pandas as pd
from glob import glob

import plotly.express as px

# Show every column when a DataFrame is displayed. The fully-qualified key is
# required: the abbreviated pattern "max_column" matches more than one
# registered option in recent pandas releases and raises OptionError.
pd.set_option("display.max_columns", None)

In [None]:
def extract(
    report: pd.DataFrame, threshold: float, alpha: float = 0.05
) -> pd.DataFrame:
    """Keep the rows of ``report`` that pass both filters.

    Args:
        report: Table with a "Delta mean" and an "Adj. p-value" column.
        threshold: Minimum absolute value of "Delta mean" (inclusive).
        alpha: Maximum value of "Adj. p-value" (inclusive).

    Returns:
        The subset of ``report`` rows satisfying both conditions, in their
        original order.
    """
    effect_is_large = report["Delta mean"].abs() >= threshold
    is_significant = report["Adj. p-value"] <= alpha
    passes_both = effect_is_large & is_significant
    return report[passes_both]

In [None]:
# The parquet path is taken from the POETRY_EPIC environment variable.
# Indexing os.environ (instead of .get) makes an unset variable fail with a
# KeyError naming POETRY_EPIC rather than passing None on to read_parquet.
epic = pd.read_parquet(os.environ["POETRY_EPIC"])

In [None]:
# All CSV reports found in the statistics output directory.
reports = glob("statistics/output/*.csv")

# Split the reports by a keyword in the file path: "controls" vs "infection".
reports_cov = {path for path in reports if "controls" in path}
reports_oi = {path for path in reports if "infection" in path}

In [None]:
def _load_reports(paths):
    """Read each report CSV once, using its first column as the index."""
    return [pd.read_csv(path, index_col=0) for path in sorted(paths)]


def _common_dmps(tables, threshold):
    """Return the "<index>-<Status>" labels that pass `extract` in every table."""
    per_cohort = []
    for table in tables:
        hits = extract(table, threshold=threshold)
        per_cohort.append(set(hits.index + "-" + hits["Status"]))
    return set.intersection(*per_cohort)


# set.intersection() needs at least one set; fail early with a readable
# message instead of an opaque TypeError when no report files were matched.
if not reports_cov or not reports_oi:
    raise ValueError(
        'No report CSVs found: expected file names containing "controls" '
        'and "infection" under statistics/output/'
    )

# The CSV contents do not depend on the threshold, so read them once up front
# rather than once per threshold inside the loop.
cov_tables = _load_reports(reports_cov)
oi_tables = _load_reports(reports_oi)

# np.arange with a float step can yield values such as 0.015000000000000001;
# rounding keeps the thresholds (and the reported "Delta") at 3 decimals.
thresholds = [round(float(value), 3) for value in np.arange(0.01, 0.1005, 0.005)]

data = []

for min_delta in tqdm(thresholds):

    cov_vs_hb_dmps = _common_dmps(cov_tables, min_delta)  # probes common in all cov cohorts
    cov_vs_oi_dmps = _common_dmps(oi_tables, min_delta)  # probes common in all OI cohorts

    intersection_probes = cov_vs_hb_dmps & cov_vs_oi_dmps

    # A stricter threshold can only remove labels, so once the intersection
    # is empty every later threshold would be empty as well.
    if not intersection_probes:
        break

    data.append(
        {
            "Delta": min_delta,
            # Sorted so the stored string is reproducible across runs
            # (set iteration order is not stable for strings).
            "COVID-19 specific DMPs": ";".join(sorted(intersection_probes)),
            "DMPs number": len(intersection_probes),
        }
    )

# Explicit columns keep the schema even when no threshold produced a hit.
df = pd.DataFrame(data, columns=["Delta", "COVID-19 specific DMPs", "DMPs number"])
df

In [None]:
# Area chart of the number of shared DMPs at each delta threshold.
fig = px.area(df, x="Delta", y="DMPs number").update_layout(font={"size": 14})
fig.show()

In [None]:
def extract_probes_id(data: str) -> list:
    """Return the part before the first "-" of every ";"-separated item.

    Args:
        data: String of items joined with ";", each shaped like "<id>-<suffix>".

    Returns:
        List with one leading "<id>" fragment per item, in input order.
    """
    probe_ids = []
    for label in data.split(";"):
        probe_id, _, _ = label.partition("-")
        probe_ids.append(probe_id)
    return probe_ids

In [None]:
# Delta threshold whose DMP set is carried forward. 0.05 is the row that was
# previously addressed by position as df.iloc[8, 1] (0.01 + 8 * 0.005).
SELECTED_DELTA = 0.05

# Look the row up by value (with float tolerance) rather than by position, so
# a scan that stopped early fails with a clear message instead of IndexError.
dmps_at_delta = df.loc[
    np.isclose(df["Delta"], SELECTED_DELTA), "COVID-19 specific DMPs"
]
if dmps_at_delta.empty:
    raise ValueError(f"No DMPs were recorded at Delta = {SELECTED_DELTA}")

# Annotation columns kept for the selected probes.
selected_probes = epic.loc[
    extract_probes_id(dmps_at_delta.iloc[0]),
    ["UCSC_RefGene_Name", "UCSC_RefGene_Group", "Relation_to_UCSC_CpG_Island"],
]
selected_probes

In [None]:
# Quick size check: (rows, columns) of the selected-probe annotation table.
selected_probes.shape

In [None]:
# Persist the selected-probe annotation table (the index is written as the first CSV column).
selected_probes.to_csv("../Files/COVSpecificDMPs.csv")

In [None]:
# Turn the ";"-separated gene annotations into one gene per row and keep only
# the first occurrence of each gene name.
gene_names = selected_probes["UCSC_RefGene_Name"].dropna()
genes = gene_names.str.split(";").explode().drop_duplicates().rename("Genes")
print(genes.nunique())
genes.to_csv("../Files/COVSpecificGenes.csv")