In [None]:
# Notebook: Expression Analysis [including PARP9 gene]
# Description: Notebook contains differential expression analysis for COVID-19 and non-COVID-19 patients
# Data collected from: GSE157103

In [None]:
import os

import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

from src.stats import LogModel

# Seed NumPy's global random state so any np.random-based step is repeatable.
# (Assigning a number to `np.seed` only creates an attribute; it seeds nothing.)
np.random.seed(101)


def calculate_vif(df: pd.DataFrame) -> pd.DataFrame:
    """Compute the variance inflation factor (VIF) for every column of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose columns are the explanatory variables. Its ``.values``
        array is passed to ``variance_inflation_factor`` once per column.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` with two columns: ``"variables"``
        (the column names of ``df``) and ``"VIF"`` (the matching factor).
    """
    # Work on the argument rather than a notebook-level variable, so the
    # result does not depend on which cells have been executed.
    design = df.values

    vif = pd.DataFrame()
    vif["variables"] = df.columns
    vif["VIF"] = [
        variance_inflation_factor(design, position)
        for position in range(design.shape[1])
    ]

    return vif

In [None]:
# GSE ids
# Lookup: second comma-separated field of each `description` -> that row's index label.
id_ = pd.read_csv("../data/Annotations/GSE174818.csv", index_col=0)["description"].map(
    lambda text: text.split(",")[1]
)
id_ = {label: row_id for row_id, label in id_.items()}

In [None]:
# SampleSheet
# Loaded with the first CSV column as the row index.
samplesheet = pd.read_csv(
    filepath_or_buffer="../data/raw/SampleSheet.csv",
    index_col=0,
)
samplesheet

In [None]:
# Rows whose Status text contains "USA"
samplesheet.loc[samplesheet["Status"].str.contains("USA")]

In [None]:
# Covid Specific Genes
# Keep only the two gene-annotation columns and drop rows missing either one.
deg = (
    pd.read_csv("../Files/COVSpecificDMPs.csv", index_col=0)
    .loc[:, ["UCSC_RefGene_Name", "UCSC_RefGene_Group"]]
    .dropna()
)

deg

In [None]:
# A row may list several ";"-separated gene names; flatten them into one set.
genes_set = set(
    deg["UCSC_RefGene_Name"]
    .str.split(";")
    .explode()
    .tolist()
)
len(genes_set)

In [None]:
# Read the table, transpose it so the file's columns become rows, relabel the
# rows through `id_`, and keep only rows that received a label.
raw_ttable = pd.read_table(
    "../Expression/GSE157103_genes.tpm.tsv", index_col=0
).transpose()
raw_ttable.index = raw_ttable.index.map(id_)
raw_ttable = raw_ttable.loc[raw_ttable.index.notna()]

In [None]:
# Select differentially methylated genes

In [None]:
# Restrict the expression table to genes that are both in `genes_set` and
# present as columns of `raw_ttable`. pandas does not accept a `set` as a
# column indexer (TypeError since pandas 2.0), so index with a sorted list;
# sorting also makes the column order reproducible between runs.
ttable = raw_ttable[sorted(genes_set.intersection(raw_ttable.columns))]
# Keep only genes whose mean value across the retained samples exceeds 1.
ttable = ttable.loc[:, ttable.mean() > 1]

genes_set = set(ttable.columns)  # Update genes
print("Updated number of genes: ", len(genes_set))

ttable

In [None]:
# clinical
# Skip the first spreadsheet row and index by the second column, then relabel
# rows through `id_` and discard rows that have no mapping.
clinical = pd.read_excel(
    "../data/Annotations/GSE174818_supplement.xlsx",
    skiprows=1,
    index_col=1,
)
clinical.index = clinical.index.map(id_)
clinical = clinical.loc[clinical.index.notna()]
clinical

In [None]:
# DEG [differentially expressed genes] analysis

In [None]:
# Estimated WBC fractions for each sample using RPC method from EpiDish package
# [using raw methylation profiles]
cf = pd.read_csv("../data/processed/CF/raw_CF.csv", index_col=0)

# Align rows with the expression table, then remove the "Eosino" and "Mono" columns.
cf = cf.loc[ttable.index].drop(columns=["Eosino", "Mono"])

# VIF
calculate_vif(cf)

In [None]:
# Distinct Status labels present in the sample sheet
samplesheet["Status"].unique()

In [None]:
# Assemble the phenotype table: cell fractions + sample-sheet covariates +
# clinical covariates, joined column-wise on the index; incomplete rows dropped.
data = pd.concat(
    [
        cf,
        samplesheet[["Sex", "Age", "Status"]],
        clinical[["Steroids", "ICU (1=yes)"]],
    ],
    axis=1,
).dropna()

# Phenotype of interest: 1 when the Status text contains "COVID", else 0.
# NOTE(review): this is a substring test — a label such as "non-COVID" would
# also be coded 1; confirm against samplesheet.Status.unique().
data["POI"] = [int("COVID" in status) for status in data["Status"]]

data = data.drop(columns=["Status"])
# Encode sex numerically; values other than "F"/"M" become NaN and are dropped below.
data["Sex"] = data["Sex"].map({"F": 1, "M": 0})
data = data.dropna()
data

In [None]:
# Configure the project's LogModel with the expression table, the phenotype
# table and "POI" as the response variable, then run it.
analysis = LogModel(
    data=ttable,
    pheno_table=data,
    response_var="POI",
)
results = analysis.run()

In [None]:
# Display the object returned by analysis.run()
results

In [None]:
# Persist the model output as CSV next to the other result files.
results.to_csv(
    "../Files/DEGs_COV_OI_stats.csv",
)