In [1]:
import sys
import os
import pandas as pd
import numpy as np
import scanpy as sc
from scipy import sparse
import gc 

In [2]:
%load_ext memory_profiler

In [3]:
%%memit
fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/geneformer_adata/processed.anndata.h5ad"

adata = sc.read_h5ad(fpath)
adata

peak memory: 44357.14 MiB, increment: 44108.54 MiB


In [4]:
sc.logging.print_memory_usage()

Memory usage: current 43.31 GB, difference +43.31 GB


In [5]:
%%memit
del adata.obsp['connectivities']
del adata.obsp['distances']

del adata.layers['log_norm']
del adata.layers['raw_counts']

del adata.obsm['X_pca']
del adata.obsm['X_umap']
del adata.obsm['X_tsne']

del adata.uns['tsne']
del adata.uns['pca']
del adata.uns['umap']
del adata.uns['scrublet']
del adata.uns['neighbors']
del adata.uns['HSC_vs_FB']
del adata.uns['HSC_vs_FB_pure']
del adata.uns['fb_vs_hsc_up']
del adata.uns['hsc_v_fib_up']
del adata.uns['hvg']
del adata.uns['log1p']
del adata.uns['scenic_transcription_factors']
del adata.uns['tabula_sapiens_deg']

del adata.varm['PCs']

var_to_remove = [
    'n_cells',
    'mt',
    'n_cells_by_counts',
    'mean_counts',
    'pct_dropout_by_counts',
    'total_counts',
    'highly_variable',
    'means',
    'dispersions',
    'dispersions_norm',
    'ensembl_id',
]

adata.var = adata.var.drop(columns=var_to_remove)

obs_to_remove = [
    'n_genes',
    'doublet_score', 
    'predicted_doublet',
    'n_genes_by_counts',
    'total_counts',
    'total_counts_mt',
    'pct_counts_mt',
    'Barcode',
    'Library',
    'method',
    'donor',
    'anatomical_information',
    'n_counts_UMIs',
    'cell_ontology_class',
    'free_annotation',
    'manually_annotated',
    'compartment',
    'gender',
    'celltype',
    'record_id', 
    'cell_id',
    'nCount_RNA',
    'nFeature_RNA',
    'nCount_ATAC',
    'nFeature_ATAC',
    'nCount_SCT',
    'nFeature_SCT',
    'SCT.weight',
    'ATAC.weight',
    'seurat_clusters',
    'STD.CellType',
    'STD_Cat', 
    'STD_Cat2', 
    'Sample',
    'HLF',
    'CRHBP',
    'CD34',
    'MitoCoverage',
    'ClonalGroup',
    'Sig.HSC1',
    'Sig.Prog1',
    'Sig.EarlyE1',
    'Sig.LateE1',
    'Sig.ProMono1', 
    'Sig.Mono1',
    'Sig.ncMono1',
    'Sig.cDC1',
    'Sig.pDC1',
    'Sig.ProB1',
    'Sig.PreB1',
    'Sig.B1',
    'Sig.Plasma1',
    'Sig.T1',
    'Sig.CTL1',
    'Sig.NK1',
    'meanCov', 
    'ClonalGroup.Prob',
    'wsnn_res.0.8',
    'Origin.Seurat',
]

adata.obs = adata.obs.drop(columns=obs_to_remove)

adata

peak memory: 44354.01 MiB, increment: 0.00 MiB


In [6]:
sc.logging.print_memory_usage()

Memory usage: current 35.91 GB, difference -7.40 GB


In [7]:
%%memit
gc.collect()

peak memory: 36771.89 MiB, increment: 0.00 MiB


In [8]:
sc.logging.print_memory_usage()

Memory usage: current 35.91 GB, difference +0.00 GB


In [9]:
adata

AnnData object with n_obs × n_vars = 166685 × 19146
    obs: 'cell_type', 'dataset', 'organ_tissue'
    var: 'gene_name'
    uns: 'go_annotations', 'panglaodb'

In [10]:
%%memit
keep = [
    'iHSC',
    'LinNegCD34lowCD164high',
    'HSC',
    'LinNegCD34PosCD164Pos',
    'MPP',
    'MLP',
    'FB',
    'MKP',
    'Refined.HSC',
    'LMPP',
]

adata = adata[adata.obs['cell_type'].isin(keep), :]

adata

peak memory: 36772.72 MiB, increment: 0.82 MiB


In [11]:
%%memit
gc.collect()

peak memory: 36772.74 MiB, increment: 0.00 MiB


In [12]:
# downcast
adata.X = adata.X.astype('float32')
adata

View of AnnData object with n_obs × n_vars = 81376 × 19146
    obs: 'cell_type', 'dataset', 'organ_tissue'
    var: 'gene_name'
    uns: 'go_annotations', 'panglaodb'

In [13]:
%%memit
gc.collect()

peak memory: 36764.27 MiB, increment: 0.03 MiB


In [14]:
sc.logging.print_memory_usage()

Memory usage: current 35.90 GB, difference -0.01 GB


In [15]:
adata.to_df().head()

gene_name,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
AAACCCAAGGTTACCT_iHSC,0.0,0.000712,0.11865,0.150425,0.0,0.060194,0.0,0.026687,0.059761,0.324561,...,0.07042,0.055333,0.011605,0.029785,0.121771,4.7e-05,1.107014,0.04766,0.299082,0.001992
AAACCCAAGTTGAAGT_iHSC,0.0,0.000712,0.11865,0.177985,0.0,0.060194,0.0,0.026687,0.059761,0.0,...,0.07042,0.055333,0.011605,0.029785,0.121771,4.7e-05,0.0,0.04766,0.299082,0.001992
AAACCCAAGTTGTCGT_iHSC,0.0,0.000712,0.11865,0.0,0.0,0.060194,0.0,0.026687,0.059761,0.27006,...,0.07042,0.055333,0.011605,0.029785,0.121771,4.7e-05,0.0,0.04766,0.299082,0.001992
AAACCCACAGAAGCGT_iHSC,0.0,0.000712,0.11865,0.258408,0.0,0.060194,0.0,0.026687,0.059761,0.464914,...,0.07042,0.055333,0.011605,0.029785,0.121771,4.7e-05,0.0,0.04766,0.299082,0.001992
AAACCCACAGGAGGTT_iHSC,0.0,0.000712,0.11865,0.199586,0.0,0.060194,0.0,0.026687,0.059761,0.0,...,0.07042,0.055333,0.011605,0.029785,1.220238,4.7e-05,0.0,0.876175,0.299082,0.001992
