In [None]:
import os

from tqdm import tqdm
import numpy as np
import pandas as pd
from glob import glob

import plotly.express as px

# Show every column when displaying DataFrames. The full option name is used
# because abbreviated keys are resolved by regex and "max_column" also matches
# "styler.render.max_columns" on recent pandas, which raises OptionError.
pd.set_option("display.max_columns", None)

In [None]:
def extract(report: pd.DataFrame, threshold) -> pd.DataFrame:
    """Keep the rows of ``report`` that pass both the effect-size and p-value cut.

    Args:
        report: Table with a "Delta mean" and an "Adj. p-value" column.
        threshold: Lower bound (exclusive) on the absolute "Delta mean".

    Returns:
        The rows where ``|Delta mean| > threshold`` and
        ``Adj. p-value <= 0.05``, with all columns preserved.
    """
    large_effect = report["Delta mean"].abs().gt(threshold)
    significant = report["Adj. p-value"].le(0.05)
    return report.loc[large_effect & significant]

In [None]:
# The location of the annotation table is supplied through the POETRY_EPIC
# environment variable. Fail early with a clear message when it is missing
# instead of handing None to read_parquet.
epic_path = os.environ.get("POETRY_EPIC")
if not epic_path:
    raise RuntimeError(
        "Environment variable POETRY_EPIC must be set to the path of the EPIC parquet file"
    )

# Later cells look rows up in this table with .loc on probe identifiers and
# read its UCSC_* annotation columns.
epic = pd.read_parquet(epic_path)

In [None]:
# All CSV reports found in the output folder (path is relative to the
# notebook's working directory).
reports = glob("statistics/output/*.csv")

# Subset of reports whose path contains "controls".
reports_cov = {path for path in reports if "controls" in path}
reports_cov

In [None]:
# Read every "controls" report once up front: the sweep below only re-filters
# these frames, so re-reading the CSV files for each threshold is wasted I/O.
cov_reports = [pd.read_csv(path, index_col=0) for path in sorted(reports_cov)]
if not cov_reports:
    # set.intersection() needs at least one set; fail with a readable message.
    raise FileNotFoundError(
        'No report with "controls" in its path was found in statistics/output/'
    )

data = []

# Sweep the minimum |Delta mean| from 0.01 up to 0.10 in steps of 0.005.
for min_delta in tqdm(np.arange(0.01, 0.105, 0.005)):

    cov_vs_hb_dmps = []
    for report in cov_reports:
        significant = extract(report, threshold=min_delta)
        # Label each row as "<index>-<Status>" so that a row only counts as
        # shared when its Status agrees across reports as well.
        # NOTE(review): Status is presumably the direction of change - confirm.
        cov_vs_hb_dmps.append(set(significant.index + "-" + significant["Status"]))

    # Labels common to all loaded reports at this threshold.
    intersection = set.intersection(*cov_vs_hb_dmps)

    # A stricter threshold can only shrink each set, so once the intersection
    # is empty no later threshold can repopulate it.
    if not intersection:
        break

    data.append(
        {
            "Delta": min_delta,
            # Sorted so the stored string (and anything derived from it) does
            # not depend on set iteration order, which varies between runs.
            "COVID-19 specific DMPs": ";".join(sorted(intersection)),
            "Number": len(intersection),
        }
    )

df = pd.DataFrame(data)
df

In [None]:
# Area chart of how many shared DMPs remain at each delta threshold.
fig = px.area(
    data_frame=df,
    x="Delta",
    y="Number",
    labels={"Number": "DMPs number"},
)
fig.update_layout(font={"size": 18})
fig.show()

In [None]:
def extract_probes(data: str, val: int) -> list:
    """Pull one "-"-separated field out of every ";"-separated entry.

    Args:
        data: String of entries joined with ";", each entry itself made of
            fields joined with "-" (e.g. "cg01-x;cg02-y").
        val: Index of the field to take from each entry (0 for the part
            before the first "-", 1 for the part after it, ...).

    Returns:
        List with the selected field of each entry, in input order. Empty
        entries (an empty ``data`` string or a stray ";") are skipped rather
        than producing "" or raising IndexError.
    """
    return [entry.split("-")[val] for entry in data.split(";") if entry]

In [None]:
# Row 8 of the sweep table is the delta threshold of about 0.05
# (0.01 + 8 * 0.005); column 1 holds the ";"-joined "<probe>-<Status>" labels.
dmp_labels = df.iloc[8, 1]
statuses = extract_probes(dmp_labels, val=1)

# How many of the shared DMPs fall into each Status.
pd.Series(statuses).value_counts()

In [None]:
# Identifiers (the part before "-") of the shared DMPs stored in row 8,
# column 1 of the sweep table, used to look up their rows in `epic`.
probe_ids = extract_probes(df.iloc[8, 1], val=0)
selected_probes = epic.loc[probe_ids]

# Display only the gene / genomic-context annotation columns.
annotation_columns = [
    "UCSC_RefGene_Name",
    "UCSC_RefGene_Group",
    "Relation_to_UCSC_CpG_Island",
]
selected_probes[annotation_columns]

In [None]:
# Write the annotation rows of the selected probes to disk (path is relative
# to the notebook's working directory).
dmps_csv_path = "../Files/DMPs_COV_vs_HC.csv"
selected_probes.to_csv(dmps_csv_path)

In [None]:
# A probe can list several genes separated by ";": split them, put each gene
# on its own row and keep the first occurrence of every name.
gene_lists = selected_probes["UCSC_RefGene_Name"].dropna().str.split(";")
genes = gene_lists.explode().drop_duplicates()
genes

In [None]:
# Number of distinct gene names (missing values are not counted).
genes.nunique(dropna=True)

In [None]:
# Write the de-duplicated gene names to disk (path is relative to the
# notebook's working directory).
genes_csv_path = "../Files/Genes_COV_vs_HC.csv"
genes.to_csv(genes_csv_path)