In [None]:
import scanpy as sc
# Load the downloaded h5ad file into an AnnData object.
adata = sc.read_h5ad('data/LucaExtended_downloaded.06-21-2024.h5ad')

# Distinct sample and donor identifiers in the per-observation table
# (`nunique` skips missing values by default).
n_samples, n_donors = (
    adata.obs[column].nunique() for column in ('sample', 'donor_id')
)

# Report both totals, one per line.
print(
    f"Number of unique samples: {n_samples}",
    f"Number of unique donors: {n_donors}",
    sep="\n",
)


In [None]:
# Disease labels of interest. The order of this list is the order in which
# per-disease results are presented further down.
disease_categories = [
    "chronic obstructive pulmonary disease",
    "lung adenocarcinoma",
    "squamous cell lung carcinoma",
    "non-small cell lung carcinoma",
    "normal",
]


In [None]:
import pandas as pd

# Per-disease summary: number of distinct samples and donors for each disease
# listed in `disease_categories`, shown in that list's order.

# Keep only the observations annotated with one of the selected diseases.
disease_mask = adata.obs['disease'].isin(disease_categories)
adata_filtered = adata[disease_mask]

result = (
    adata_filtered.obs
    # observed=True is passed explicitly so that, when `disease` is a
    # categorical column, only labels that actually occur form groups and the
    # outcome does not depend on the pandas-version-specific default.
    .groupby('disease', observed=True)
    .agg(
        unique_samples=('sample', 'nunique'),
        unique_donors=('donor_id', 'nunique'),
    )
    # Restore the original order. A listed disease with no matching rows is
    # reported as 0 rather than NaN, which also keeps the counts as integers.
    .reindex(disease_categories, fill_value=0)
)

print(result)
