# Subset

## Load Libraries

In [2]:
import anndata as ad
import numpy as np

## Input Data

In [3]:
file_path = "/home/ubuntu/projects/project_data/thesis/global_raw.h5ad"

adata = ad.read_h5ad(filename=file_path)

In [4]:
adata

AnnData object with n_obs × n_vars = 486134 × 33538
    obs: 'NRP', 'age_group', 'cell_source', 'cell_type', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score', 'source', 'type', 'version', 'cell_states', 'Used'
    var: 'gene_ids-Harvard-Nuclei', 'feature_types-Harvard-Nuclei', 'gene_ids-Sanger-Nuclei', 'feature_types-Sanger-Nuclei', 'gene_ids-Sanger-Cells', 'feature_types-Sanger-Cells', 'gene_ids-Sanger-CD45', 'feature_types-Sanger-CD45'
    uns: 'cell_type_colors'
    obsm: 'X_pca', 'X_umap'

In [6]:
adata.var.columns

Index(['gene_ids-Harvard-Nuclei', 'feature_types-Harvard-Nuclei',
       'gene_ids-Sanger-Nuclei', 'feature_types-Sanger-Nuclei',
       'gene_ids-Sanger-Cells', 'feature_types-Sanger-Cells',
       'gene_ids-Sanger-CD45', 'feature_types-Sanger-CD45'],
      dtype='object')

## Subset

In [7]:
shuffle = True

# Set subset size
number_rows = 50000
number_cols = 5000

if shuffle:
    # Shuffle indices before subsetting
    shuffled_rows = np.random.permutation(adata.n_obs)
    shuffled_cols = np.random.permutation(adata.n_vars)
    print(shuffled_rows)
    print(shuffled_cols)
    adata = adata[shuffled_rows, shuffled_cols]
    adata = adata[:number_rows, :number_cols]
else:
    # Indices are not shuffled
    adata = adata[:number_rows, :number_cols]

[265456  16745  16635 ...  87587 131868  92741]
[26914  2285 20453 ... 24693 29950  8813]


In [8]:
adata

View of AnnData object with n_obs × n_vars = 50000 × 5000
    obs: 'NRP', 'age_group', 'cell_source', 'cell_type', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score', 'source', 'type', 'version', 'cell_states', 'Used'
    var: 'gene_ids-Harvard-Nuclei', 'feature_types-Harvard-Nuclei', 'gene_ids-Sanger-Nuclei', 'feature_types-Sanger-Nuclei', 'gene_ids-Sanger-Cells', 'feature_types-Sanger-Cells', 'gene_ids-Sanger-CD45', 'feature_types-Sanger-CD45'
    uns: 'cell_type_colors'
    obsm: 'X_pca', 'X_umap'

In [11]:
adata.var

Unnamed: 0,gene_ids-Harvard-Nuclei,feature_types-Harvard-Nuclei,gene_ids-Sanger-Nuclei,feature_types-Sanger-Nuclei,gene_ids-Sanger-Cells,feature_types-Sanger-Cells,gene_ids-Sanger-CD45,feature_types-Sanger-CD45
PIK3R5,ENSG00000141506,Gene Expression,ENSG00000141506,0,ENSG00000141506,0,ENSG00000141506,0
AL031275.1,ENSG00000235736,Gene Expression,ENSG00000235736,0,ENSG00000235736,0,ENSG00000235736,0
PLEKHA5,ENSG00000052126,Gene Expression,ENSG00000052126,0,ENSG00000052126,0,ENSG00000052126,0
VCPKMT,ENSG00000100483,Gene Expression,ENSG00000100483,0,ENSG00000100483,0,ENSG00000100483,0
CREB1,ENSG00000118260,Gene Expression,ENSG00000118260,0,ENSG00000118260,0,ENSG00000118260,0
...,...,...,...,...,...,...,...,...
IL16,ENSG00000172349,Gene Expression,ENSG00000172349,0,ENSG00000172349,0,ENSG00000172349,0
LINC01840,ENSG00000230215,Gene Expression,ENSG00000230215,0,ENSG00000230215,0,ENSG00000230215,0
ARMH2,ENSG00000260286,Gene Expression,ENSG00000260286,0,ENSG00000260286,0,ENSG00000260286,0
WDR63,ENSG00000162643,Gene Expression,ENSG00000162643,0,ENSG00000162643,0,ENSG00000162643,0


## Save Data

In [6]:
file_name = f"adata_{number_rows}x{number_cols}_sample.h5ad"

adata.write_h5ad(file_name)