In [None]:
# Identification of DEGs between ICU and non-ICU COVID-19 patients

In [None]:
import os

from statsmodels.stats.outliers_influence import variance_inflation_factor
import scipy.stats as sts
import pandas as pd
import numpy as np

from src.stats import LogModel

# Seed NumPy's global random number generator so stochastic steps are reproducible.
# (Assigning `np.seed = 101` only creates an attribute on the numpy module and seeds nothing.)
np.random.seed(101)


def calculate_vif(df: pd.DataFrame) -> pd.DataFrame:
    """Compute the variance inflation factor (VIF) of every column of ``df``.

    Args:
        df: Table whose columns are the explanatory variables. ``df.values``
            is handed to statsmodels' ``variance_inflation_factor`` as the
            design matrix, one call per column position.

    Returns:
        A DataFrame with one row per column of ``df`` and two columns:
        ``"variables"`` (the column label) and ``"VIF"`` (its VIF).
    """
    # Work on the argument itself rather than on the notebook-level `cf`,
    # so the function is usable on any frame and on a fresh kernel.
    design = df.values
    n_columns = df.shape[1]

    return pd.DataFrame(
        {
            "variables": df.columns,
            "VIF": [variance_inflation_factor(design, i) for i in range(n_columns)],
        }
    )

In [None]:
# GSE ids: dictionary from the second comma-separated field of each
# `description` entry to the row label of that entry in the annotation file
id_ = pd.read_csv("../data/Annotations/GSE174818.csv", index_col=0)
id_ = {
    description.split(",")[1]: row_label
    for row_label, description in id_["description"].items()
}

In [None]:
# SampleSheet: load the sheet (first column as index), then discard every
# row that has at least one missing value
samplesheet = pd.read_csv("../data/raw/SampleSheet.csv", index_col=0)
samplesheet = samplesheet.dropna()
samplesheet

In [None]:
# Covid Specific Genes: gene name / gene region annotation of the
# COVID-specific DMP table; rows lacking either annotation are dropped
deg = (
    pd.read_csv("../Files/COVSpecificDMPs.csv", index_col=0)
    .loc[:, ["UCSC_RefGene_Name", "UCSC_RefGene_Group"]]
    .dropna()
)

deg

In [None]:
# Unique gene names: each entry may list several names separated by ";",
# so split every entry and flatten before de-duplicating
genes_set = set(deg["UCSC_RefGene_Name"].str.split(";").explode())
len(genes_set)

In [None]:
# Expression table, transposed so that the columns of the file become rows
raw_ttable = pd.read_table("../Expression/GSE157103_genes.tpm.tsv", index_col=0).T

# Relabel the rows through the SampleSheet (ID -> SampleSheet index);
# labels that are absent from the mapping become NaN
raw_ttable.index = raw_ttable.index.map(
    dict(zip(samplesheet["ID"], samplesheet.index))
)

# Keep only the rows that could be matched to the SampleSheet
raw_ttable = raw_ttable[raw_ttable.index.notna()]
raw_ttable

In [None]:
# Select differentially methylated genes

In [None]:
# Restrict the expression table to genes that are also in `genes_set`.
# A boolean column mask is used instead of indexing with a set: set indexers
# were deprecated in pandas 1.5 and raise TypeError from pandas 2.0 on. The
# mask also keeps the columns in the order they have in `raw_ttable`.
ttable = raw_ttable.loc[:, raw_ttable.columns.isin(genes_set)]

# Keep only genes whose mean value across the rows is greater than 1
ttable = ttable.loc[:, ttable.mean() > 1]

genes_set = set(ttable.columns)  # Update genes
print("Updated number of genes: ", len(genes_set))

In [None]:
# clinical data: the first spreadsheet row is skipped and the second column
# is used as the index
clinical = pd.read_excel(
    "../data/Annotations/GSE174818_Clinical.xlsx",
    index_col=1,
    skiprows=1,
)

# Relabel rows through the SampleSheet (ID -> SampleSheet index) and discard
# the rows that have no match there
clinical.index = clinical.index.map(dict(zip(samplesheet["ID"], samplesheet.index)))
clinical = clinical[clinical.index.notna()]

# Only rows diagnosed as COVID-19 are kept
clinical = clinical.loc[clinical["Diagnosis"].eq("COVID-19")]

In [None]:
# DEG [differentially expressed genes] analysis

In [None]:
# Estimated WBC fractions for each sample, obtained with the RPC method from
# the EpiDish package [using raw methylation profiles]
cf = pd.read_csv("../data/processed/CF/raw_CF.csv", index_col=0)

# Select the rows of the expression table (in its order), then remove the
# "Eosino" and "Mono" fractions from the covariates
cf = cf.loc[ttable.index].drop(columns=["Eosino", "Mono"])

# VIF
calculate_vif(cf)

In [None]:
# Assemble the covariate table: cell fractions, sex, age, steroid use and the
# ICU flag. The three sources are joined column-wise on their index, and rows
# with any missing value are discarded.
data = pd.concat(
    (cf, samplesheet[["Sex", "Age"]], clinical[["Steroids", "ICU (1=yes)"]]),
    axis=1,
).dropna()

# Encode sex numerically (F -> 1, M -> 0). Any other code maps to NaN, so the
# rows are filtered again afterwards. The filter is applied to `data` itself,
# because `data` is what the model receives; previously the filtered copy was
# only stored in the otherwise unused `df`.
data["Sex"] = data["Sex"].map({"F": 1, "M": 0})
data = data.dropna()
df = data  # alias kept so that any code still reading `df` keeps working
data

In [None]:
# Display the filtered expression table (the `data` argument of the LogModel call)
ttable

In [None]:
# Set up the per-gene model: expression values go in as `data`, the covariate
# table as `pheno_table`, and the ICU flag is the response variable.
# NOTE(review): how LogModel aligns the rows of the two tables is not visible
# here — confirm in src.stats.
analysis = LogModel(
    data=ttable,
    pheno_table=data,
    response_var="ICU (1=yes)",
)

# Execute the analysis and keep whatever `run()` returns
results = analysis.run()

In [None]:
# Display the object returned by `analysis.run()`
results

In [None]:
# Save the results as CSV (assumes `results` offers a pandas-style `to_csv`
# — TODO confirm against LogModel.run)
results.to_csv("../Files/GenesICU_nonICU_stats.csv")