In [None]:
# Analysis of global promoter methylation levels [genes associated with ICU]

In [None]:
import os

import pandas as pd
import statsmodels.api as sts
from statsmodels.stats.multitest import multipletests as mp

from src.figures import boxplot

In [None]:
# Load the sample sheet and keep only the samples used in the ICU vs non-ICU
# comparison: rows with a known ICU label whose Status mentions "COVID" but not "PL".
samplesheet_raw = pd.read_csv("../data/raw/SampleSheet.csv", index_col=0)

# na=False treats a missing Status as "no match" instead of raising on a NaN mask.
keep = (
    samplesheet_raw["ICU"].isin(["non-ICU", "ICU"])
    & samplesheet_raw["Status"].str.contains("COVID", na=False)
    & ~samplesheet_raw["Status"].str.contains("PL", na=False)
)

# Work on an explicit copy so the column assignments below never write into a
# view of the raw frame (avoids SettingWithCopyWarning).
samplesheet = samplesheet_raw.loc[keep].copy()

# Encode the categorical columns numerically for the regression further down.
# .map yields numeric columns directly; .replace relied on implicit downcasting,
# which is deprecated and can leave an object-dtype column.
samplesheet["Sex"] = samplesheet["Sex"].map({"F": 1, "M": 0})
samplesheet["ICU"] = samplesheet["ICU"].map({"non-ICU": 0, "ICU": 1}).astype(int)
samplesheet["intercept"] = 1  # constant term for the regression design matrix
samplesheet

In [None]:
# Load the methylation matrix, reading only the columns of the retained samples.
# read_parquet documents `columns` as a list, so the sample index is converted
# explicitly rather than passed as a pandas Index.
mynorm = pd.read_parquet(
    "../data/processed/CorrectedMyNorms/mynorm.parquet",
    columns=samplesheet.index.tolist(),
)

In [None]:
# Gene symbols taken from the "Gene" column of the DEG table
# (presumably genes differentially expressed between ICU and non-ICU; confirm).
deg_table = pd.read_csv("../Files/DEGs_ICU_nonICU.csv", index_col=0)
genes = deg_table["Gene"].values
genes

In [None]:
# Load the three annotation columns of the EPIC manifest used to map probes to
# gene promoters. os.environ[...] fails with a clear KeyError when POETRY_EPIC
# is unset, instead of handing None to read_parquet.
epic = pd.read_parquet(os.environ["POETRY_EPIC"])[
    ["UCSC_RefGene_Name", "UCSC_RefGene_Group", "Regulatory_Feature_Group"]
]
# Probes missing any of the three annotations are discarded.
epic = epic.dropna()

# Keep probes annotated to at least one gene of interest, using exact symbol
# matching. A substring/regex test would also assign e.g. IFI44L probes to IFI44,
# would interpret gene symbols as regular expressions, and would match every
# probe if `genes` were empty.
# Assumes multiple symbols per probe are ";"-separated (Illumina manifest
# convention) -- TODO confirm; a single symbol per probe works as well.
target_genes = set(genes)
probe_gene_lists = epic["UCSC_RefGene_Name"].str.split(";")
promotors = epic[
    probe_gene_lists.map(lambda names: not target_genes.isdisjoint(names)).astype(bool)
]

promotors = promotors[
    promotors["Regulatory_Feature_Group"].str.contains("promoter", case=False)
]

# Restrict to probes present in the methylation matrix. Index.intersection
# replaces the former set indexer, which .loc rejects on pandas >= 2.0.
promotors = promotors.loc[promotors.index.intersection(mynorm.index), :]
promotors.UCSC_RefGene_Name.unique()  # no probes for IFI44L in mynorm

In [None]:
# Number of retained samples per ICU class.
samplesheet["ICU"].value_counts()

In [None]:
# For every Status group, model ICU membership with a logistic regression on
# intercept + Sex + Age + mean promoter methylation of one gene, collect the
# p-value of the methylation term, and apply Benjamini-Hochberg correction
# across the genes tested within that group.
covariates = ["intercept", "Sex", "Age"]
result_columns = ["Promotor", "Group", "p-value", "FDR"]

# One row per (probe, gene symbol) pair, indexed by probe, so that genes are
# matched by exact symbol. A substring test would also assign e.g. IFI44L
# probes to IFI44.
# Assumes ";"-separated symbols in UCSC_RefGene_Name -- TODO confirm.
probe_gene_pairs = promotors["UCSC_RefGene_Name"].str.split(";").explode()

frames = []

for group in samplesheet.Status.unique():

    samples = samplesheet[samplesheet.Status == group].index
    temp_mynorm = mynorm[samples].T
    # Covariates and outcome are identical for every gene of the group.
    design = samplesheet.loc[samples, covariates + ["ICU"]]
    records = []

    for gene in genes:
        # unique(): a probe may list the same symbol more than once.
        probes = probe_gene_pairs.index[(probe_gene_pairs == gene).to_numpy()].unique()

        if len(probes) == 0:
            continue

        # Per-sample mean over all promoter probes of the gene.
        promotor_methylation_level = temp_mynorm[probes].mean(axis=1).rename(gene)
        temp = pd.concat((promotor_methylation_level, design), axis=1)

        # disp=False silences the optimiser's per-fit convergence banner, which
        # would otherwise be printed once per gene and group.
        fit = sts.Logit(endog=temp["ICU"], exog=temp[covariates + [gene]]).fit(
            disp=False
        )

        records.append(
            {"Promotor": gene, "Group": group, "p-value": fit.pvalues.loc[gene]}
        )

    if not records:
        # No gene had promoter probes in this group; nothing to correct.
        continue

    results_df = pd.DataFrame(records)
    results_df["FDR"] = mp(results_df["p-value"], method="fdr_bh")[1]
    frames.append(results_df)

# ignore_index gives every row a unique label instead of repeating 0..n-1 per
# group. Fall back to an empty, correctly shaped frame when nothing was tested.
results = (
    pd.concat(frames, ignore_index=True)
    if frames
    else pd.DataFrame(columns=result_columns)
)

In [None]:
# Display the per-group p-value and FDR of every tested promoter.
results

In [None]:
# Rows whose FDR is at or below 0.05.
results.loc[results["FDR"].le(0.05)]