In [1]:
import os

# Move the kernel's working directory to the project's `src` folder
# (presumably so the `utils.*` imports in the next cell resolve -- confirm).
#
# A bare relative `os.chdir("../../src")` is not idempotent: re-running the
# cell in a live kernel would climb two more levels from the *new* cwd.
# Remember the directory the kernel started in on the first run and always
# resolve the target relative to it, so re-running the cell is a no-op.
if "_NOTEBOOK_DIR" not in globals():
    _NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_DIR, "..", "..", "src"))

In [2]:
import h5py
from utils.paths import RAW_DATA_PATH, PREPROCESSED_DATA_PATH
import utils.data.hdf5
import utils.data.preprocessing
import utils.dev.notebook as dev
import utils.data.dataset
import numpy as np

#### Preprocessing config specification

In [3]:
# Preprocessing configuration, converted to a namespace so that nested entries
# can be read as attributes (e.g. `preprocessing_cfg.read_file.filename`).
preprocessing_cfg = dev.dict_to_namespace(
    dict(
        # Transform names handed to the preprocessing routine
        # ("divide_by_nonzero_median" is currently left out).
        transforms=["normalize_total", "log1p"],
        # Top-level HDF5 groups carried over to the output file
        # (presumably copied unchanged -- confirm in utils.data.preprocessing).
        groups_to_clone=["obs", "var", "uns"],
        # Presumably the number of rows handled per processing step -- confirm.
        chunk_size=1000,
        # Input file name, joined with RAW_DATA_PATH when the file is opened.
        read_file=dict(
            filename="GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad"
        ),
        # Output file name, joined with PREPROCESSED_DATA_PATH when opened.
        write_file=dict(
            filename="GSE194122_openproblems_neurips2021_cite_BMMC_processed_processed.h5"
        ),
    )
)

#### Examining structure of read hdf5 file

In [4]:
# Print the group/dataset hierarchy of the raw input file (opened read-only).
raw_h5_path = RAW_DATA_PATH / preprocessing_cfg.read_file.filename
with h5py.File(raw_h5_path, "r") as raw_h5:
    utils.data.hdf5.print_hdf5_structure_tree("", raw_h5)



\X
\X
\X\data
\X\indices
\X\indptr
\layers
\layers
\layers\counts
\layers\counts
\layers\counts\data
\layers\counts\indices
\layers\counts\indptr
\obs
\obs
\obs\ADT_iso_count
\obs\ADT_n_antibodies_by_counts
\obs\ADT_pseudotime_order
\obs\ADT_total_counts
\obs\DonorAge
\obs\DonorBMI
\obs\DonorBloodType
\obs\DonorGender
\obs\DonorID
\obs\DonorNumber
\obs\DonorRace
\obs\DonorSmoker
\obs\Ethnicity
\obs\GEX_n_genes_by_counts
\obs\GEX_pct_counts_mt
\obs\GEX_phase
\obs\GEX_pseudotime_order
\obs\GEX_size_factors
\obs\Modality
\obs\QCMeds
\obs\Samplename
\obs\Site
\obs\VendorLot
\obs\__categories
\obs\__categories
\obs\__categories\DonorBloodType
\obs\__categories\DonorGender
\obs\__categories\DonorNumber
\obs\__categories\DonorRace
\obs\__categories\DonorSmoker
\obs\__categories\Ethnicity
\obs\__categories\GEX_phase
\obs\__categories\Modality
\obs\__categories\QCMeds
\obs\__categories\Samplename
\obs\__categories\Site
\obs\__categories\batch
\obs\__categories\cell_type
\obs\__categories\is_t

#### Running configured preprocessing

In [5]:
# Run the preprocessing described by `preprocessing_cfg`.
# Presumably this reads `read_file` from RAW_DATA_PATH, applies the configured
# transforms chunk by chunk and writes `write_file` under
# PREPROCESSED_DATA_PATH -- confirm in utils.data.preprocessing.
# NOTE(review): the captured run below took over a minute; it is repeated on
# every "Run All".
utils.data.preprocessing.preprocess_and_save_dataset(cfg=preprocessing_cfg)

Data preprocessing:  99%|█████████▉| 90/91 [01:12<00:00,  1.24it/s]


#### Examining structure of write hdf5 file

In [6]:
# Print the group/dataset hierarchy of the preprocessed output file
# (opened read-only) so it can be compared with the raw file's layout.
preprocessed_h5_path = PREPROCESSED_DATA_PATH / preprocessing_cfg.write_file.filename
with h5py.File(preprocessed_h5_path, "r") as preprocessed_h5:
    utils.data.hdf5.print_hdf5_structure_tree("", preprocessed_h5)



\X
\X
\X\data
\X\indices
\X\indptr
\obs
\obs
\obs\ADT_iso_count
\obs\ADT_n_antibodies_by_counts
\obs\ADT_pseudotime_order
\obs\ADT_total_counts
\obs\DonorAge
\obs\DonorBMI
\obs\DonorBloodType
\obs\DonorGender
\obs\DonorID
\obs\DonorNumber
\obs\DonorRace
\obs\DonorSmoker
\obs\Ethnicity
\obs\GEX_n_genes_by_counts
\obs\GEX_pct_counts_mt
\obs\GEX_phase
\obs\GEX_pseudotime_order
\obs\GEX_size_factors
\obs\Modality
\obs\QCMeds
\obs\Samplename
\obs\Site
\obs\VendorLot
\obs\__categories
\obs\__categories
\obs\__categories\DonorBloodType
\obs\__categories\DonorGender
\obs\__categories\DonorNumber
\obs\__categories\DonorRace
\obs\__categories\DonorSmoker
\obs\__categories\Ethnicity
\obs\__categories\GEX_phase
\obs\__categories\Modality
\obs\__categories\QCMeds
\obs\__categories\Samplename
\obs\__categories\Site
\obs\__categories\batch
\obs\__categories\cell_type
\obs\__categories\is_train
\obs\_index
\obs\batch
\obs\cell_type
\obs\is_train
\uns
\uns
\uns\dataset_id
\uns\genome
\uns\organism
\v

#### Loading the two datasets to compare whether the preprocessing works

In [8]:
def _make_dataset_cfg(path):
    """Build the `hdf5SparseDataset` config for the HDF5 file at `path`.

    The raw and the preprocessed file are read with exactly the same settings;
    only the file location differs. Building both configs through this helper
    keeps them from drifting apart.

    Args:
        path: Location of the HDF5 file to read.

    Returns:
        The namespace produced by `dev.dict_to_namespace` for that file.
    """
    # (name in the file's `obs` group, name exposed in each returned sample)
    obs_columns = (("cell_type", "cell_type"), ("batch", "batch"), ("Site", "site"))
    return dev.dict_to_namespace(
        {
            "path": path,
            # Presumably the number of columns of `X` -- TODO confirm.
            "rowsize": 14087,
            "obs": {
                "columns": [
                    {
                        "org_name": org_name,
                        "new_name": new_name,
                        "remap_categories": False,
                    }
                    for org_name, new_name in obs_columns
                ]
            },
        }
    )


# Take the file names from `preprocessing_cfg` instead of repeating the string
# literals, so this comparison always targets the files that were actually
# read and written by the preprocessing step.
read_file_dataset_cfg = _make_dataset_cfg(
    RAW_DATA_PATH / preprocessing_cfg.read_file.filename
)
write_file_dataset_cfg = _make_dataset_cfg(
    PREPROCESSED_DATA_PATH / preprocessing_cfg.write_file.filename
)

# The same indices are requested from both files so the samples line up.
# A fresh array is built per dataset so the two objects never share state.
_SAMPLE_IDXS = (0, 4)

read_file_dataset = utils.data.dataset.hdf5SparseDataset(
    dataset_idxs=np.array(_SAMPLE_IDXS), cfg=read_file_dataset_cfg
)
write_file_dataset = utils.data.dataset.hdf5SparseDataset(
    dataset_idxs=np.array(_SAMPLE_IDXS), cfg=write_file_dataset_cfg
)

In [9]:
# Show the first sample of each dataset, raw first and preprocessed second,
# so the two can be compared by eye.
for _dataset in (read_file_dataset, write_file_dataset):
    display(next(iter(_dataset)))

{'data': tensor([[0.0000, 0.0000, 0.0000,  ..., 0.1544, 0.9172, 0.6069]]),
 'cell_type': tensor([27]),
 'batch': tensor([0]),
 'site': tensor([0])}

{'data': tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 1.6722e-05, 9.9350e-05,
          6.5732e-05]]),
 'cell_type': tensor([27]),
 'batch': tensor([0]),
 'site': tensor([0])}

The values at least seem to be different.