In [None]:
# Analysis of cell fraction correction impact on methylation levels

In [None]:
import os
import random

# Seed Python's global RNG so random draws are repeatable.
# `random.seed` must be *called*: assigning to it replaces the function with
# an integer and leaves the generator unseeded.
random.seed(44)

import typing as t

import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import scipy.stats as sts

import plotly.express as px
import matplotlib.pyplot as plt

from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

In [None]:
# Load data

In [None]:
# Per-sample "Status" column of the sample sheet (first CSV column is the index).
labels = pd.read_csv("../data/raw/SampleSheet.csv", index_col=0).loc[:, "Status"]
labels

In [None]:
# Count how many samples carry each Status value.
labels.value_counts()

In [None]:
# raw WBC fractions - before refBase correction

In [None]:
# Attach the Status label to every row of the raw cell-fraction table and
# drop rows that have a missing value in any column after the merge.
cf = pd.concat(
    [pd.read_csv("../data/processed/CF/raw_CF.csv", index_col=0), labels],
    axis="columns",
).dropna()
cf

In [None]:
# WBC fractions - estimated using methylation profiles before correction

In [None]:
# Box plot of every column of `cf`, coloured by Status in a fixed order.
fig = px.box(
    cf,
    color="Status",
    points=False,
    labels=dict(variable="", value="Frequency"),
    category_orders=dict(
        Status=[
            "COVID-19 ES",
            "COVID-19 PL",
            "COVID-19 USA 1",
            "COVID-19 USA 2",
            "Healthy controls",
        ]
    ),
)

# The update_* methods return the figure itself, so the styling is chained.
fig.update_layout(
    width=1600,
    height=600,
    font={"size": 22},
    legend={"title": "", "font": {"size": 24}},
).update_traces(marker={"size": 10}).update_yaxes(range=[0, 1])

# Save a static copy, then render the figure in the notebook.
fig.write_image("../Plots/CF_before_CFC.jpg")
fig.show()

In [None]:
# WBC fractions - estimated using methylation profiles after correction

In [None]:
# Attach the Status label to every row of the corrected cell-fraction table
# and drop rows that have a missing value in any column after the merge.
cfc = pd.concat(
    [pd.read_csv("../data/processed/CF/corrected_CF.csv", index_col=0), labels],
    axis="columns",
).dropna()
cfc

In [None]:
# Distinct Status values that remain after the merge with the labels.
cfc["Status"].unique()

In [None]:
# Box plot of every column of `cfc`, coloured by Status in a fixed order.
fig = px.box(
    cfc,
    color="Status",
    points=False,
    labels=dict(variable="", value="Frequency"),
    category_orders=dict(
        Status=[
            "COVID-19 ES",
            "COVID-19 PL",
            "COVID-19 USA 1",
            "COVID-19 USA 2",
            "Healthy controls",
        ]
    ),
)

# The update_* methods return the figure itself, so the styling is chained.
fig.update_layout(
    width=1600,
    height=600,
    font={"size": 22},
    legend={"title": "", "font": {"size": 24}},
).update_traces(marker={"size": 10}).update_yaxes(range=[0, 1])

# Save a static copy, then render the figure in the notebook.
fig.write_image("../Plots/CF_after_CFC.jpg")
fig.show()

In [None]:
# LR model to estimate methylation level variance explained by WBC composition

In [None]:
# Load data before and after correction
mynorm_no_cfc = pd.read_csv("../data/interim/NEW_ALL/myNorm.csv", index_col=0)[
    labels.index
]

mynorm_cfc = pd.read_parquet(
    "../data/processed/CorrectedMyNorms/mynorm.parquet",
    columns=labels.index.tolist(),
)  # mynorm after correction

In [None]:
# Draw a reproducible random subset of CpGs to keep the per-CpG modelling fast.
# * random.sample() no longer accepts a set (TypeError on Python >= 3.11), and
#   set iteration order for strings changes between interpreter runs, so the
#   population is turned into a sorted list first.
# * A dedicated, locally seeded generator makes the draw identical on every
#   execution of this cell, independent of the global RNG state.
cpg_pool = sorted(set(mynorm_cfc.index))
cpgs_to_test = random.Random(44).sample(cpg_pool, min(10000, len(cpg_pool)))

# Keep only the sampled CpGs (rows) in both methylation matrices.
mynorm_cfc = mynorm_cfc.loc[cpgs_to_test, :]
mynorm_no_cfc = mynorm_no_cfc.loc[cpgs_to_test, :]

In [None]:
# Type alias for array-like inputs accepted by the modelling helpers.
# np.ndarray is the array *type*; np.array is only a factory function.
dataset = t.Union[pd.DataFrame, pd.Series, np.ndarray]


def calculate_adj_r2(r2: float, n: int, p: int) -> float:
    """
    Return adj. R2 coefficient, clipped to the [0, 1] interval.
    r2 - R2 coefficient: float
    n - number of observations: int
    p - number of explanatory variables (excluding constant): int

    Raises ValueError when n - p - 1 <= 0, because the adjustment is
    undefined without residual degrees of freedom.
    """

    residual_dof = n - p - 1
    if residual_dof <= 0:
        raise ValueError(
            f"Adjusted R2 requires n > p + 1 (got n={n}, p={p})."
        )

    param = 1 - (1 - r2) * ((n - 1) / residual_dof)

    # Clip to [0, 1]; always return a float, as the annotation promises.
    # A NaN input fails both comparisons and is passed through unchanged.
    if param < 0:
        return 0.0

    if param > 1:
        return 1.0

    return float(param)


def model(X: dataset, y: dataset) -> float:
    """
    Fit a linear regression (with intercept) of y on X and return its adj. R2.
    X - matrix of WBC fractions per sample.
    y - vector [methylation levels per sample].
    The score is computed on the same data the model was fitted to.
    """

    regressor = LinearRegression(fit_intercept=True).fit(X, y)
    fitted_values = regressor.predict(X)

    return calculate_adj_r2(
        r2=r2_score(y, fitted_values),
        n=len(y),
        p=X.shape[1],
    )


# Use 10.000 random CpGs
# Per each CpG fit model: CpG methylation level ~ WBC fractions

results = []
wbc_composition = cf.drop(
    "Status", axis=1
)  # Estimated WBC fractions by EpiDish package [using raw methylation profiles]

# Samples present in the WBC table and in both methylation matrices. Aligning
# once on this shared set avoids the NaN rows an outer-join concat creates
# when the sample sets differ (NaNs make the regression fail).
shared_samples = wbc_composition.index.intersection(
    mynorm_cfc.columns
).intersection(mynorm_no_cfc.columns)
assert len(shared_samples) > 0, "no samples shared by WBC fractions and myNorm"

# Predictors are identical for every CpG and for both data sets, so the
# matrix is built once outside the loop.
X_wbc = wbc_composition.loc[shared_samples, :].astype(float).values

for cpg in tqdm(cpgs_to_test):

    # Corrected beta values for this CpG - response variable
    y_cfc = mynorm_cfc.loc[cpg, shared_samples].astype(float).values.reshape(-1, 1)

    # Fit model using corrected beta values
    r2_cfc = model(X_wbc, y_cfc)

    # Raw beta values [before correction] - response variable.
    # Cast to float exactly like the corrected data so both fits are comparable.
    y_no_cfc = (
        mynorm_no_cfc.loc[cpg, shared_samples].astype(float).values.reshape(-1, 1)
    )

    # Fit model using raw beta values before cf-correction
    r2_no_cfc = model(X_wbc, y_no_cfc)

    results.append(
        {"Marker": cpg, "Raw data": r2_no_cfc, "Cell-fraction corrected data": r2_cfc}
    )

In [None]:
# Turn the per-CpG records into a table indexed by marker name.
results = pd.DataFrame(results)
results = results.set_index("Marker")
results

In [None]:
# Visualise results: one box per column of `results`, with every point drawn.
fig = px.box(
    results,
    points="all",
    labels=dict(variable="", value="Adjusted R2"),
)

# The update_* methods return the figure itself, so the styling is chained.
fig.update_layout(
    font={"size": 22},
    legend={"title": "", "font": {"size": 24}},
).update_traces(marker={"size": 10}).update_yaxes(range=[0, 1])

# Save a static copy, then render the figure in the notebook.
fig.write_image("../Plots/AdjR2.jpg")
fig.show()