# Subset

## Load Libraries

In [1]:
import anndata as ad
import numpy as np

## Input Data

In [2]:
file_path = "/home/ubuntu/projects/project_data/thesis/global_raw.h5ad"

adata = ad.read_h5ad(filename=file_path)

In [3]:
adata

AnnData object with n_obs × n_vars = 486134 × 33538
    obs: 'NRP', 'age_group', 'cell_source', 'cell_type', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score', 'source', 'type', 'version', 'cell_states', 'Used'
    var: 'gene_ids-Harvard-Nuclei', 'feature_types-Harvard-Nuclei', 'gene_ids-Sanger-Nuclei', 'feature_types-Sanger-Nuclei', 'gene_ids-Sanger-Cells', 'feature_types-Sanger-Cells', 'gene_ids-Sanger-CD45', 'feature_types-Sanger-CD45'
    uns: 'cell_type_colors'
    obsm: 'X_pca', 'X_umap'

## Subset

In [4]:
shuffle = True

# Set subset size
number_rows = 50000
number_cols = 5000

if shuffle:
    # Shuffle indices before subsetting
    shuffled_rows = np.random.permutation(adata.n_obs)
    shuffled_cols = np.random.permutation(adata.n_vars)
    print(shuffled_rows)
    print(shuffled_cols)
    adata = adata[shuffled_rows, shuffled_cols]
    adata = adata[:number_rows, :number_cols]
else:
    # Indices are not shuffled
    adata = adata[:number_rows, :number_cols]

[196898 406930 229465 ... 439113 280043  86485]
[ 8815  3589 30451 ... 33006 17397  9241]


In [5]:
adata

View of AnnData object with n_obs × n_vars = 50000 × 5000
    obs: 'NRP', 'age_group', 'cell_source', 'cell_type', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score', 'source', 'type', 'version', 'cell_states', 'Used'
    var: 'gene_ids-Harvard-Nuclei', 'feature_types-Harvard-Nuclei', 'gene_ids-Sanger-Nuclei', 'feature_types-Sanger-Nuclei', 'gene_ids-Sanger-Cells', 'feature_types-Sanger-Cells', 'gene_ids-Sanger-CD45', 'feature_types-Sanger-CD45'
    uns: 'cell_type_colors'
    obsm: 'X_pca', 'X_umap'

## Save Data

In [6]:
file_name = f"adata_{number_rows}x{number_cols}_sample.h5ad"

adata.write_h5ad(file_name)