## View Demo

In [None]:
from CellTOSG_Loader import CellTOSGSubsetBuilder

# Subset builder rooted at the ./OmniCellTOSG_v1 directory.
query = CellTOSGSubsetBuilder(root="./OmniCellTOSG_v1")

# Filters handed to view(). Optional entries that are currently disabled:
#     "cell_type": ["Microglia", "Astrocyte"],
#     "sex": "female",
#     "age": "adult",
#     "source": "CellxGene",
view_filters = {
    "tissue_general": "brain",
    "disease": "Alzheimer's Disease",
}
query.view(view_filters)


In [None]:
# Keyword options forwarded to extract(). The optional `sample_size=10`
# argument is intentionally left out; only `sample_ratio` is passed.
extract_kwargs = {
    "shuffle": True,
    "balanced": True,
    "sample_ratio": 0.1,
    "output_dir": "./output/brain_ad",
}
expression_matrix, label_df = query.extract(**extract_kwargs)

In [None]:
# Show the shape of the extracted matrix ...
matrix_shape = expression_matrix.shape
print(matrix_shape)

# ... followed by the first rows of the label table.
label_preview = label_df.head()
print(label_preview)

## Dataset Demo

In [None]:
from CellTOSG_Loader import CellTOSGDataLoader
import numpy as np
import pandas as pd
import os

# Disease to load; change this to your desired disease name.
disease_name = "Alzheimer's Disease"

# NOTE(review): absolute, machine-specific path -- the other loader cells in
# this notebook use "./OmniCellTOSG_v1"; confirm which root is intended here.
data_root = "/storage1/fs1/fuhai.li/Active/tianqi.x/OmniCellTOSG/dataset_outputs"
# NOTE(review): not referenced again within this cell.
mapping_table_path = os.path.join(data_root, "mapping_table.csv")

# Filters handed to the loader. Optional entries that are currently disabled:
#     "tissue": "Cerebral cortex",
#     "cell_type": "glutamatergic neuron",
#     "gender": "female",
loader_conditions = {
    "tissue_general": "brain",
    "disease": disease_name,
}

dataset = CellTOSGDataLoader(
    root=data_root,
    conditions=loader_conditions,
    # Each of the next two is one of {"disease", "gender", "cell_type"}.
    downstream_task="disease",
    label_column="disease",
    sample_ratio=0.01,
    sample_size=None,
    balanced=True,
    shuffle=True,
    random_state=2025,
    train_text=False,
    train_bio=False,
    # Change this to your desired output directory.
    # NOTE(review): capital "O" here, while other cells write to "./output/...".
    output_dir="./Output/data_ad_disease",
)

# Pull the pieces used by the following cells off the loader.
X = dataset.data
Y = dataset.labels
metadata = dataset.metadata

# The s_name / s_desc / s_bio / x_bio_emb attributes are not read in this
# cell (train_text and train_bio are both False above).

print(metadata)


In [None]:
# Number of metadata rows for each value of "sex_normalized", one per line.
for sex_value in ("male", "female", "unknown"):
    sex_mask = metadata["sex_normalized"] == sex_value
    print(len(metadata[sex_mask]))

In [None]:
from CellTOSG_Loader import CellTOSGDataLoader
import numpy as np
import pandas as pd
import os

# Disease to load; change this to your desired disease name.
disease_name = "Alzheimer's Disease"
data_root = "./OmniCellTOSG_v1"

# Filters handed to the loader. Optional entries that are currently disabled:
#     "suspension_type": "nucleus",
#     "cell_type": "glutamatergic neuron",
#     "gender": "female",
loader_conditions = {
    "tissue_general": "brain",
    "tissue": None,
    "disease": disease_name,
}

dataset = CellTOSGDataLoader(
    root=data_root,
    conditions=loader_conditions,
    # Each of the next two is one of {"disease", "gender", "cell_type"}.
    downstream_task="disease",
    label_column="disease",
    sample_ratio=0.1,
    sample_size=None,
    balanced=True,
    shuffle=True,
    random_state=2025,
    train_text=True,
    train_bio=False,
    # Change this to your desired output directory.
    output_dir="./output/data_ad_disease",
)

# Pull the pieces used by the following cells off the loader.
X = dataset.data
Y = dataset.labels
metadata = dataset.metadata

# Text-side attributes; dataset.s_bio is deliberately not read here.
s_name = dataset.s_name
s_desc = dataset.s_desc

# NOTE(review): x_bio_emb is read although train_bio=False -- confirm the
# loader populates it in this configuration.
x_bio_emb = dataset.x_bio_emb

print(f"s_name shape: {s_name.shape}")
print(f"s_desc shape: {s_desc.shape}")

print(f"x_bio_emb shape: {x_bio_emb.shape}")

print(metadata)


In [None]:
# Number of metadata rows for each "disease_BMG_name" value, one per line.
for disease_value in ("Alzheimer's Disease", "normal"):
    disease_mask = metadata["disease_BMG_name"] == disease_value
    print(len(metadata[disease_mask]))

In [None]:
# Number of metadata rows for each value of "sex_normalized", one per line.
for sex_value in ("male", "female", "unknown"):
    sex_mask = metadata["sex_normalized"] == sex_value
    print(len(metadata[sex_mask]))

In [None]:
def print_celltype_sex_age_distribution(metadata, label_column="label"):
    """Print row counts per label, broken down by cell type, sex and stage.

    Rows of ``metadata`` are counted for every combination of ``CMT_name``,
    ``sex_normalized``, ``development_stage_category`` and ``label_column``.
    The counts are printed under one ``Label: ...`` heading per distinct
    label value.

    Args:
        metadata: pandas DataFrame holding the three breakdown columns named
            above plus ``label_column``.
        label_column: Name of the column whose values define the headings.
            It may be one of the breakdown columns themselves (for example
            ``"sex_normalized"``).

    Returns:
        None. The summary is written to standard output.

    Raises:
        KeyError: Propagated from pandas when a grouping column is missing.
    """
    strata_cols = ["CMT_name", "sex_normalized", "development_stage_category"]
    # De-duplicate while keeping order: grouping by the same column twice
    # makes reset_index() fail when label_column is one of the strata columns.
    group_cols = list(dict.fromkeys([*strata_cols, label_column]))

    # dropna=False keeps rows with a missing value in any grouping column;
    # the default would silently leave them out of the summary.
    grouped = (
        metadata.groupby(group_cols, dropna=False)
        .size()
        .reset_index(name="count")
    )

    print("\n[Distribution Summary]")
    # sort=False keeps labels in the order they first appear in `grouped`;
    # dropna=False lets a missing label get its own heading.
    for label, subset in grouped.groupby(label_column, sort=False, dropna=False):
        print(f"\nLabel: {label}")
        # Plain tuples (name=None) avoid building a Series per row and avoid
        # any clash between the "count" column and tuple.count.
        rows = subset[[*strata_cols, "count"]].itertuples(index=False, name=None)
        for cell_type, sex, stage, count in rows:
            print(f"  {cell_type} | {sex} | {stage} -> {count}")

# Print the breakdown with "disease_BMG_name" as the label column.
print_celltype_sex_age_distribution(
    metadata,
    label_column="disease_BMG_name",
)


In [None]:
from CellTOSG_Loader import CellTOSGDataLoader
import numpy as np
import pandas as pd
import os

# Disease to load; change this to your desired disease name.
disease_name = "Alzheimer's Disease"
data_root = "./OmniCellTOSG_v1"

# Filters handed to the loader. Optional entries that are currently disabled:
#     "tissue": "Cerebral cortex",
#     "suspension_type": "nucleus",
#     "cell_type": "glutamatergic neuron",
#     "gender": "female",
loader_conditions = {
    "tissue_general": "brain",
    "disease": disease_name,
}

dataset = CellTOSGDataLoader(
    root=data_root,
    conditions=loader_conditions,
    # Each of the next two is one of {"disease", "gender", "cell_type"}.
    downstream_task="cell_type",
    label_column="cell_type",
    sample_ratio=0.1,
    sample_size=None,
    balanced=True,
    shuffle=True,
    random_state=2025,
    train_text=True,
    train_bio=False,
    # Change this to your desired output directory.
    output_dir="./output/data_ad_cell_type",
)

# Pull the pieces used by the following cells off the loader.
X = dataset.data
Y = dataset.labels
metadata = dataset.metadata

# Text-side attributes; dataset.s_bio is deliberately not read here.
s_name = dataset.s_name
s_desc = dataset.s_desc

# NOTE(review): x_bio_emb is read although train_bio=False -- confirm the
# loader populates it in this configuration.
x_bio_emb = dataset.x_bio_emb

print(f"s_name shape: {s_name.shape}")
print(f"s_desc shape: {s_desc.shape}")

print(f"x_bio_emb shape: {x_bio_emb.shape}")

print(metadata)


In [None]:
# Same directory string that the cell_type loader cell passes as output_dir.
output_dir = "./output/data_ad_cell_type"

# Read the label-mapping CSV from that directory and render it in the notebook.
label_mapping_path = os.path.join(output_dir, "label_mapping_cell_type.csv")
cell_type_labels = pd.read_csv(label_mapping_path)
display(cell_type_labels)