In [1]:
import os

# Move the kernel into the project's `src` directory so the `utils` package
# imported by the following cell can be resolved.
# The target is a relative path, so blindly re-running this cell would resolve
# it against the already-changed directory. Guard on the current directory
# name to keep the cell idempotent.
# NOTE(review): assumes the notebook's starting directory is not itself named
# "src" — confirm against the repository layout.
if os.path.basename(os.getcwd()) != "src":
    os.chdir("../../src")

In [2]:
import utils.data as data_utils
import utils.data.extraction, utils.data.splits, utils.data.dataset, utils.data.dataloader
import utils.dev_notebooks_utils as dev
from utils.paths import RAW_DATA_PATH, PREPROCESSED_DATA_PATH

In [3]:
# Build the extraction config: which preprocessed HDF5 file to open and which
# `obs` (per-row annotation) columns to pull from it.
extraction_cfg = dev.dict_to_namespace(
    {
        "path": PREPROCESSED_DATA_PATH
        / "GSE194122_openproblems_neurips2021_cite_BMMC_processed_processed.h5",
        "obs": {"columns": ["cell_type", "batch", "Site"]},
    }
)
# Load only the annotation table; the later split cells take it as `df` and
# use its length as `max_idx`.
obs = data_utils.extraction.get_dataset_obs(extraction_cfg)
# Preview the first rows. A bare `extraction_cfg` expression used to sit above
# the load; only a cell's last expression is displayed, so it had no effect
# and was removed.
obs.head(6)

Unnamed: 0,cell_type,batch,Site
GCATTAGCATAAGCGG-1-s1d1,27,0,0
TACAGGTGTTAGAGTA-1-s1d1,15,0,0
AGGATCTAGGTCTACT-1-s1d1,27,0,0
GTAGAAAGTGACACAG-1-s1d1,19,0,0
TCCGAAAAGGATCATA-1-s1d1,35,0,0
CTCCCAATCCATTGGA-1-s1d1,29,0,0


In [4]:
# Simple train/validation split over row positions 0..len(obs).
# NOTE(review): the config carries only `val_fraction`; no seed is visible
# here, so the split may differ between runs — confirm whether
# `naive_mixing_fraction_split` accepts one.
n_obs_rows = len(obs)
fraction_split_cfg = dev.dict_to_namespace({"val_fraction": 0.2})
train_idxs, val_idxs = data_utils.splits.naive_mixing_fraction_split(
    max_idx=n_obs_rows,
    cfg=fraction_split_cfg,
)
# Show both index arrays as the cell output.
(train_idxs, val_idxs)

(array([ 4881, 15365, 41140, ..., 44380, 25727,  6321]),
 array([58618, 31221, 36590, ..., 21952, 64949,  7565]))

In [5]:
# K-fold split configured over the `cell_type` and `batch` columns of `obs`,
# both passed with `as_codes=True`, using 2 folds and a fixed random state.
grid_variables = [
    {"name": column_name, "as_codes": True}
    for column_name in ("cell_type", "batch")
]
composite_kfold_split_cfg = dev.dict_to_namespace(
    {"grid_variables": grid_variables, "n_splits": 2, "random_state": 0}
)
# `split` is consumed with `iter()` further down. It is not unpacked into
# `train_idxs` / `val_idxs` here, so those names still hold the fraction split.
split = data_utils.splits.composite_k_fold_split(
    df=obs,
    cfg=composite_kfold_split_cfg,
)

In [6]:
# Pull one (train indices, validation indices) pair out of the k-fold split.
# NOTE(review): if `split` is a one-shot generator, re-running this cell
# yields the next fold rather than the first — confirm.
first_fold = next(iter(split))
first_fold

(array([    2,     4,     7, ..., 90254, 90257, 90258]),
 array([    0,     1,     3, ..., 90256, 90259, 90260]))

In [6]:
# Describe the `obs` columns the dataset should expose: each entry maps a
# source column name to the key used in the samples. Only `Site` is renamed
# (to lower-case `site`); category remapping is disabled for all three.
obs_column_specs = [
    {"org_name": source_name, "new_name": target_name, "remap_categories": False}
    for source_name, target_name in (
        ("cell_type", "cell_type"),
        ("batch", "batch"),
        ("Site", "site"),
    )
]
raw_h5ad_path = (
    RAW_DATA_PATH / "GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad"
)
train_dataset_cfg = dev.dict_to_namespace(
    {
        "path": raw_h5ad_path,
        # NOTE(review): presumably the number of features per row in the
        # file — confirm and consider deriving it instead of hard-coding.
        "rowsize": 14087,
        "obs": {"columns": obs_column_specs},
    }
)
# `train_idxs` here is the output of the fraction split; the k-fold split
# result is not assigned to it.
train_dataset = data_utils.dataset.hdf5SparseDataset(
    dataset_idxs=train_idxs,
    cfg=train_dataset_cfg,
)

In [7]:
# Fetch a single sample to sanity-check the dataset: the output below is a
# dict with a `data` tensor plus one tensor per configured obs column.
first_sample = next(iter(train_dataset))
first_sample

{'data': tensor([[0.0000, 0.0000, 0.0000,  ..., 0.1544, 0.9172, 0.6069]]),
 'cell_type': tensor([27]),
 'batch': tensor([0]),
 'site': tensor([0])}

In [8]:
# Inspect the indices the dataset actually stored (private attribute, read
# here for debugging only).
# NOTE(review): the displayed array looks sorted, while `train_idxs` was
# shown shuffled — presumably the dataset sorts them internally; confirm.
stored_dataset_idxs = train_dataset._dataset_idxs
stored_dataset_idxs

array([    0,     1,     2, ..., 90257, 90258, 90260])

In [9]:
# Wrap the training dataset in a dataloader: 16 samples per batch, one worker.
loader_options = {"batch_size": 16, "num_workers": 1}
train_dataloader_cfg = dev.dict_to_namespace({"dataloader": loader_options})
train_dataloader = data_utils.dataloader.get_hdf5SparseDataloader(
    train_dataloader_cfg,
    train_dataset,
)

In [10]:
# Draw one batch from the dataloader for inspection. The iterator is a
# temporary and is not kept around after the first batch is taken.
# NOTE(review): the name `batch` (a minibatch) is easy to confuse with the
# 'batch' obs column carried inside it.
batch = next(iter(train_dataloader))

In [11]:
# Display the fetched minibatch: a dict with a `data` tensor and one label
# tensor each for `cell_type`, `batch` and `site` (16 entries, matching the
# configured batch_size).
batch

{'data': tensor([[0.0000, 0.0000, 0.0000,  ..., 0.8635, 3.2267, 0.8213],
         [0.0000, 0.0000, 0.0000,  ..., 1.8377, 2.0756, 1.1691],
         [0.0000, 0.0000, 0.0000,  ..., 2.2885, 0.6505, 1.6634],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.7781, 2.4880, 0.9627],
         [0.0000, 0.0000, 0.0000,  ..., 0.5656, 2.1396, 1.0073],
         [0.0000, 0.0000, 0.0000,  ..., 0.9567, 0.6283, 1.1119]]),
 'cell_type': tensor([35, 15,  3,  5, 13,  1, 22, 15, 25, 15,  1, 12, 13, 35, 34, 22]),
 'batch': tensor([ 1,  1,  1,  2,  2,  3,  4,  5,  5,  6,  7,  9,  9, 11, 11, 11]),
 'site': tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3])}