In [None]:
import os
from multiprocessing import Pool
from typing import Tuple

import pandas as pd

from src.stats import StatsAnalysis

In [None]:
# Read the mynorm table from the processed-data directory and display it.
mynorm = pd.read_parquet(
    path="../../data/processed/CorrectedMyNorms/mynorm.parquet"
)
mynorm

In [None]:
# Read the sample sheet (first CSV column becomes the index) and keep every
# row whose ICU value is not "Home", in a single chained expression.
sampleSheet = pd.read_csv("../../data/raw/SampleSheet.csv", index_col=0).loc[
    lambda sheet: sheet["ICU"] != "Home"
]
sampleSheet

In [None]:
# Distinct Status labels remaining in the filtered sample sheet.
sampleSheet["Status"].unique()

In [None]:
# Number of samples per Status label in the filtered sample sheet.
sampleSheet["Status"].value_counts()

In [None]:
# Every (target, control) comparison to run: each COVID-19 cohort is paired
# with each reference group. The reference group varies slowest, so the
# resulting order is: all cohorts vs. healthy controls, then vs. each of the
# two "Other respiratory infections" groups.
runs = tuple(
    (covid_cohort, reference_group)
    for reference_group in (
        "Healthy controls",
        "Other respiratory infections USA 1",
        "Other respiratory infections USA 2",
    )
    for covid_cohort in (
        "COVID-19 USA 1",
        "COVID-19 USA 2",
        "COVID-19 ES",
        "COVID-19 PL",
    )
)

print(len(runs))
runs

In [None]:
# The location of the EPIC parquet file is supplied through the POETRY_EPIC
# environment variable. Fail early with an explicit message when it is missing
# instead of handing None to pandas, which produces an error that does not
# mention the variable at all.
epic_path = os.environ.get("POETRY_EPIC")
if not epic_path:
    raise RuntimeError(
        "Environment variable POETRY_EPIC is not set; "
        "it must hold the path to the EPIC parquet file."
    )
epic = pd.read_parquet(epic_path)

In [None]:
def process(design: Tuple[str, str]) -> None:
    """Run one target-vs-control comparison and store the result as CSV.

    Reads the notebook-level ``sampleSheet``, ``mynorm`` and ``epic`` objects.
    Samples are selected by matching ``sampleSheet.Status`` against the two
    group labels; the matching index values are used as column labels of
    ``mynorm``.

    Args:
        design: ``(target_group, control_group)`` pair of ``Status`` labels.
            The two labels joined with ``"_vs_"`` form the run name and the
            output file name.

    Returns:
        None. The value returned by ``StatsAnalysis.run`` is written to
        ``output/<target>_vs_<control>.csv``. If that file already exists the
        comparison is skipped entirely.
    """
    name = "_vs_".join(design)
    output_path = f"output/{name}.csv"

    # Existing CSVs act as a cache: a finished comparison is never recomputed.
    if os.path.exists(output_path):
        return

    # Create the destination before doing any work, so a missing directory is
    # not discovered only after the analysis has finished. exist_ok tolerates
    # several pool workers reaching this line at the same time.
    os.makedirs("output", exist_ok=True)

    target_group, control_group = design
    target_samples = sampleSheet[sampleSheet.Status == target_group].index
    control_samples = sampleSheet[sampleSheet.Status == control_group].index

    # Pick each group's columns from mynorm and transpose them before they are
    # handed to StatsAnalysis.
    target_mynorm = mynorm[target_samples].T
    control_mynorm = mynorm[control_samples].T

    analysis = StatsAnalysis(
        df_target=target_mynorm, df_control=control_mynorm, epic=epic
    )
    results = analysis.run(name=name)

    # Write under a temporary name and rename afterwards: an interrupted run
    # must not leave a truncated CSV that the existence check above would
    # later mistake for a finished result.
    partial_path = f"{output_path}.tmp"
    results.to_csv(partial_path)
    os.replace(partial_path, output_path)

In [None]:
# Distribute the comparisons over eight worker processes; map() blocks until
# every entry of `runs` has been handled by `process`.
with Pool(processes=8) as worker_pool:
    worker_pool.map(process, runs)