In [None]:
# Analysis to find probes correlated with the set of COVID-19-specific DMPs

In [None]:
import multiprocessing
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as sts
import seaborn as sns
from tqdm import tqdm

from src.col_palette import pal
from src.figures import clustermap, scatterplot

In [None]:
# DMP identifiers are the index labels (first CSV column) of the DMP table,
# kept as a plain Python list; the cell output is the number of DMPs.
dmps = pd.read_csv("../Files/COVSpecificDMPs.csv", index_col=0).index.tolist()
len(dmps)

In [None]:
# Load the sample sheet (first CSV column becomes the index) and keep only
# the rows whose Status contains the substring "COVID".
samplesheet = pd.read_csv("../data/raw/SampleSheet.csv", index_col=0)
samplesheet = samplesheet.loc[samplesheet["Status"].str.contains("COVID")]
samplesheet

In [None]:
# Read only the parquet columns named after the retained sample-sheet index,
# then transpose so those labels become the rows. `cpgs` holds the column
# labels of the transposed table.
mynorm = pd.read_parquet(
    "../data/processed/CorrectedMyNorms/mynorm.parquet",
    columns=samplesheet.index,
).T
cpgs = mynorm.columns

In [None]:
# Pairwise correlation between the DMP columns (DataFrame.corr default method).
corr = mynorm[dmps].corr()

# Empty axis names; presumably so the heatmap axes carry no label text.
corr.index.name = ""
corr.columns.name = ""

# Boolean mask hiding the upper triangle, diagonal included: the correlation
# matrix is symmetric, so the lower triangle already shows every pair once.
mask = np.triu(np.ones_like(corr, dtype=bool))

with sns.axes_style("white"):
    fig, ax = plt.subplots(figsize=(16, 16))
    # Draw on the axes created above instead of relying on the implicit
    # "current axes" state.
    sns.heatmap(corr, mask=mask, square=True, annot=False, ax=ax)

fig.savefig("../Plots/CorrelationMap.png", dpi=300)

In [None]:
sns.set_context("paper", rc={"axes.labelsize": 20})

# `sns.pairplot` creates and returns its own grid/figure, so no figure is
# opened beforehand: a pre-created `plt.figure(...)` would stay empty, would
# not control the pairplot size, and would still be rendered by `plt.show()`.
pair_grid = sns.pairplot(
    mynorm[dmps],
    corner=True,
)

pair_grid.savefig("../Plots/PairplotCpGs.png", dpi=300)
plt.show()

In [None]:
# Distinct Status values remaining after the COVID filter; these are the
# group names the correlation scan is run for.
samplesheet["Status"].unique()

In [None]:
def find_correlated_cpgs(
    group_name: str, min_abs_corr: float = 0.7, max_pval: float = 0.05
) -> set:
    """Collect the CpGs that correlate with at least one DMP within one group.

    The samples used are the `samplesheet` rows whose ``Status`` equals
    ``group_name`` and whose ``ICU`` value is not ``"Home"``. For every DMP in
    `dmps`, the Pearson correlation against every column listed in `cpgs` is
    computed on those samples with ``scipy.stats.pearsonr``.

    The function reads the notebook-level names `samplesheet`, `mynorm`,
    `dmps` and `cpgs`.

    Parameters
    ----------
    group_name : str
        Value of ``samplesheet.Status`` selecting the group.
    min_abs_corr : float, default 0.7
        Minimum absolute Pearson coefficient for a CpG to be kept.
    max_pval : float, default 0.05
        Maximum p-value for a CpG to be kept.

    Returns
    -------
    set
        Labels of the CpGs passing both thresholds for at least one DMP.
        A DMP correlates perfectly with itself, so the DMPs are normally
        part of the result.
    """
    samples = samplesheet[
        (samplesheet.Status == group_name) & (samplesheet.ICU != "Home")
    ].index
    group_mynorm = mynorm.loc[samples, :]

    print(f"Set: {group_name}, mynorm: {group_mynorm.shape}")
    print("Looking for correlated DMPs ...")

    correlated_cpgs = set()

    for dmp in dmps:
        dmp_met = group_mynorm[dmp]

        for cpg in cpgs:
            # The result is a set, so a CpG that already qualified through an
            # earlier DMP cannot change it; skip the redundant test.
            if cpg in correlated_cpgs:
                continue

            corr, pval = sts.pearsonr(dmp_met, group_mynorm[cpg])

            # A NaN coefficient (e.g. constant column) fails the comparison
            # and is therefore never added.
            if abs(corr) >= min_abs_corr and pval <= max_pval:
                correlated_cpgs.add(cpg)

    print("DONE")
    return correlated_cpgs


# Run the scan for every distinct Status value on a pool of 4 worker
# processes; `map` returns one result per group, in input order.
# NOTE(review): on platforms that start workers with "spawn", a function
# defined in the notebook itself may not be importable by the workers - confirm.
with multiprocessing.Pool(processes=4) as p:
    correlated_cpgs = p.map(
        find_correlated_cpgs,
        samplesheet["Status"].unique(),
    )

In [None]:
# Normalise every per-group result to a set so they can be intersected.
sets_of_correlated_cpgs = list(map(set, correlated_cpgs))

In [None]:
# Probes present in every per-group set, i.e. flagged in all groups.
# Despite its name, `list_of_probes` is a set (the result of set.intersection).
list_of_probes = set.intersection(*sets_of_correlated_cpgs)
len(list_of_probes)

In [None]:
# The location of the table used below for probe annotations comes from the
# POETRY_EPIC environment variable. Index the environment directly so that an
# unset variable fails with a KeyError naming it, instead of handing None to
# `read_parquet`.
epic = pd.read_parquet(os.environ["POETRY_EPIC"])

In [None]:
# Annotation columns for the shared probes. `.loc` needs an ordered,
# list-like indexer (pandas 2.x rejects a set), and sorting makes the row
# order - and therefore the exported CSV - reproducible between runs.
annotations = epic.loc[
    sorted(list_of_probes),
    [
        "UCSC_RefGene_Name",
        "UCSC_RefGene_Group",
        "Relation_to_UCSC_CpG_Island",
        "Regulatory_Feature_Group",
    ],
]

In [None]:
# Write the annotation subset to disk, then show it as the cell output.
annotations.to_csv("../Files/ExtendedDMPS.csv")
annotations

In [None]:
# Annotated probes that are not among the input DMPs. A boolean mask is used
# because `.loc` does not accept a set as indexer in pandas 2.x; it also keeps
# the rows in the order of `annotations`.
annotations.loc[~annotations.index.isin(dmps)]

In [None]:
# One gene name per row: split the ";"-separated entries, drop missing values
# and repeated names, then write the remaining names to CSV.
(
    annotations["UCSC_RefGene_Name"]
    .str.split(";")
    .explode()
    .dropna()
    .drop_duplicates()
    .to_csv("../Files/ExtendedGenes.csv")
)

In [None]:
# TSNE
from sklearn.manifold import TSNE


def tsne(df: pd.DataFrame, poi_column: str = "Status") -> pd.DataFrame:
    """Project the feature columns of ``df`` onto two t-SNE components.

    Every column except ``poi_column`` is used as a feature. The perplexity
    is set to the size of the smallest ``poi_column`` group and printed.

    Parameters
    ----------
    df : pd.DataFrame
        Features plus the ``poi_column`` label column.
    poi_column : str, default "Status"
        Column excluded from the features and appended to the result.

    Returns
    -------
    pd.DataFrame
        Columns ``"t-SNE 1"``, ``"t-SNE 2"`` and ``poi_column``, indexed like
        ``df``.
    """
    group_sizes = df[poi_column].value_counts()
    perplexity = min(group_sizes)
    print("Perplexity: ", perplexity)

    features = df.drop(columns=poi_column)
    model = TSNE(
        n_components=2,
        method="exact",
        random_state=101,  # fixed seed for a repeatable embedding
        perplexity=perplexity,
    )
    embedding = pd.DataFrame(
        model.fit_transform(features),
        index=df.index,
        columns=["t-SNE 1", "t-SNE 2"],
    )

    return pd.concat((embedding, df[poi_column]), axis=1)

In [None]:
# Reload the complete sample sheet; this replaces the COVID-only subset kept
# earlier in the notebook. The cell output lists every Status value.
samplesheet = pd.read_csv("../data/raw/SampleSheet.csv", index_col=0)
samplesheet["Status"].unique()

In [None]:
# From here on `dmps` is the DMP table itself (a DataFrame), not the list of
# labels used earlier in the notebook.
dmps = pd.read_csv("../Files/COVSpecificDMPs.csv", index_col=0)

# Load every sample, keep only the rows labelled by the DMP index and
# transpose so that those labels become the columns.
# NOTE(review): the t-SNE plots below are saved as "*_correlatedProbes.png"
# but are built from COVSpecificDMPs.csv - confirm this is the intended input.
mynorm = pd.read_parquet("../data/processed/CorrectedMyNorms/mynorm.parquet")
mynorm = mynorm.loc[dmps.index].T

In [None]:
# Attach the Status label to the values column-wise; rows are aligned on the
# index labels of both tables.
df = pd.concat([mynorm, samplesheet[["Status"]]], axis="columns")

In [None]:
# Keep only the two "USA 1" groups.
df_usa1 = df.loc[
    df["Status"].isin(["COVID-19 USA 1", "Other respiratory infections USA 1"])
]

# Two-component t-SNE of the selected rows, coloured by Status with the
# shared palette; `path` is where the figure is written.
scatterplot(
    tsne(df_usa1, poi_column="Status"),
    x="t-SNE 1",
    y="t-SNE 2",
    color_column="Status",
    color_discrete_map=pal,
    trendline=None,
    path="../Plots/tSNE_USA1_correlatedProbes.png",
)

In [None]:
# Keep only the two "USA 2" groups.
df_usa2 = df.loc[
    df["Status"].isin(["COVID-19 USA 2", "Other respiratory infections USA 2"])
]

# Two-component t-SNE of the selected rows, coloured by Status with the
# shared palette; `path` is where the figure is written.
scatterplot(
    tsne(df_usa2, poi_column="Status"),
    x="t-SNE 1",
    y="t-SNE 2",
    color_column="Status",
    color_discrete_map=pal,
    trendline=None,
    path="../Plots/tSNE_USA2_correlatedProbes.png",
)