In [1]:
import os

# Capture the kernel's starting directory exactly once. A bare relative
# os.chdir("../../src") is not idempotent: re-running the cell would climb two
# more levels from wherever the previous run left the process.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()

# Switch the working directory to the project's `src` folder, resolved relative
# to the notebook's starting directory (presumably so the `utils` package
# imported in the next cell resolves -- confirm against the repo layout).
os.chdir(os.path.join(_NOTEBOOK_START_DIR, "../../src"))

In [2]:
import utils.data as data_utils
import utils.data.extraction, utils.data.splits, utils.data.dataset, utils.data.dataloader
import utils.dev_notebooks_utils as dev
from utils.paths import RAW_DATA_PATH, PREPROCESSED_DATA_PATH

In [3]:
# Configuration for reading obs columns from the preprocessed `.h5` file:
# the file location plus the names of the columns to extract.
extraction_cfg = dev.dict_to_namespace(
    {
        "path": PREPROCESSED_DATA_PATH
        / "GSE194122_openproblems_neurips2021_cite_BMMC_processed_processed.h5",
        "obs": {"columns": ["cell_type", "batch", "Site"]},
    }
)

# Load the requested obs columns and preview the first six rows. Only the last
# expression of a cell is rendered, so the preview is the final statement (a
# bare `extraction_cfg` in the middle of the cell would display nothing).
obs = data_utils.extraction.get_dataset_obs(extraction_cfg)
obs.head(6)

Unnamed: 0,cell_type,batch,Site
GCATTAGCATAAGCGG-1-s1d1,27,0,0
TACAGGTGTTAGAGTA-1-s1d1,15,0,0
AGGATCTAGGTCTACT-1-s1d1,27,0,0
GTAGAAAGTGACACAG-1-s1d1,19,0,0
TCCGAAAAGGATCATA-1-s1d1,35,0,0
CTCCCAATCCATTGGA-1-s1d1,29,0,0


In [4]:
# Split the row indices 0..len(obs) into train and validation index sets,
# requesting a validation fraction of 0.2.
# NOTE(review): no seed is passed here -- confirm whether the split helper is
# deterministic before relying on these indices across runs.
fraction_split_cfg = dev.dict_to_namespace(dict(val_fraction=0.2))
train_idxs, val_idxs = data_utils.splits.naive_mixing_fraction_split(
    cfg=fraction_split_cfg, max_idx=len(obs)
)

In [5]:
# Dataset configuration: source file, row size, and the obs columns to expose.
# Each column spec maps an original column name to the key used in the yielded
# samples; `Site` is exposed as lowercase `site`, the others keep their names.
# NOTE(review): this reads from RAW_DATA_PATH (`.h5ad`) while the obs used for
# the split came from PREPROCESSED_DATA_PATH (`.h5`) -- confirm this is intended.
# NOTE(review): rowsize=14087 is presumably the number of values per row in the
# file -- TODO confirm and consider deriving it instead of hard-coding.
train_dataset_cfg = dev.dict_to_namespace(
    dict(
        path=RAW_DATA_PATH
        / "GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad",
        rowsize=14087,
        obs=dict(
            columns=[
                dict(org_name=source, new_name=target, remap_categories=False)
                for source, target in (
                    ("cell_type", "cell_type"),
                    ("batch", "batch"),
                    ("Site", "site"),
                )
            ]
        ),
    )
)

# Build the training dataset restricted to the training indices from the split.
train_dataset = data_utils.dataset.hdf5SparseDataset(
    cfg=train_dataset_cfg, dataset_idxs=train_idxs
)

In [6]:
# Sanity check: pull the first item the dataset yields and display it.
next(iter(train_dataset))

{'data': tensor([[0.0000, 0.0000, 0.0000,  ..., 0.1544, 0.9172, 0.6069]]),
 'cell_type': tensor([27]),
 'batch': tensor([0]),
 'site': tensor([0])}

In [7]:
# Inspect the indices held by the dataset.
# NOTE(review): `_dataset_idxs` is a private attribute; fine for a dev
# notebook, but it may change without notice.
train_dataset._dataset_idxs

array([    0,     1,     2, ..., 90257, 90258, 90260])

In [8]:
# Dataloader settings: batches of 16 samples, loaded with a single worker.
train_dataloader_cfg = dev.dict_to_namespace(
    dict(dataloader=dict(batch_size=16, num_workers=1))
)

# Wrap the training dataset in the project's HDF5 sparse dataloader.
train_dataloader = data_utils.dataloader.get_hdf5SparseDataloader(
    train_dataloader_cfg, train_dataset
)

In [9]:
# Fetch the first batch produced by the dataloader for inspection.
batch = next(iter(train_dataloader))

In [10]:
# Display the fetched batch.
batch

{'data': tensor([[0.0000, 0.0000, 0.0000,  ..., 1.5427, 1.1026, 1.2521],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 2.4066, 0.5532],
         [0.0000, 0.0000, 0.0000,  ..., 0.8319, 0.5547, 0.1701],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 1.1480, 0.8132, 0.9377],
         [0.0000, 0.0000, 0.0000,  ..., 1.7889, 0.3393, 1.0597],
         [0.0000, 0.0000, 0.0000,  ..., 0.8540, 0.6415, 0.9881]]),
 'cell_type': tensor([15, 35, 13, 17,  5, 34, 35,  5, 15, 19,  9, 36, 15,  9,  8,  8]),
 'batch': tensor([ 0,  0,  1,  3,  3,  4,  4,  4,  6,  6,  7,  8,  8, 10, 10, 11]),
 'site': tensor([0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3])}