In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable, axes_size
from matplotlib.transforms import Bbox
from matplotlib.transforms import Affine2D
from matplotlib.collections import QuadMesh
import seaborn as sns
import scanpy as sc
import scanpy.external as sce
import scipy
from scipy.spatial.distance import cdist, pdist, squareform
from scipy.sparse import csr_matrix
import sklearn
from importlib import reload

# locals
import plotting as plt2

sc.settings.verbosity = 3  

In [2]:
fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/geneformer_adata/merged.anndata.h5ad"

adata = sc.read_h5ad(fpath)
sc.logging.print_memory_usage()
adata.X = csr_matrix(adata.X) 
sc.logging.print_memory_usage()

adata

Memory usage: current 4.25 GB, difference +4.25 GB
Memory usage: current 4.25 GB, difference +0.00 GB


AnnData object with n_obs × n_vars = 166703 × 19311
    obs: 'n_genes', 'doublet_score', 'predicted_doublet', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_counts', 'cell_type', 'Barcode', 'Library', 'dataset', 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender', 'celltype', 'record_id', 'cell_id', 'nCount_RNA', 'nFeature_RNA', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_SCT', 'nFeature_SCT', 'SCT.weight', 'ATAC.weight', 'seurat_clusters', 'STD.CellType', 'STD_Cat', 'STD_Cat2', 'Sample', 'HLF', 'CRHBP', 'CD34', 'MitoCoverage', 'ClonalGroup', 'Sig.HSC1', 'Sig.Prog1', 'Sig.EarlyE1', 'Sig.LateE1', 'Sig.ProMono1', 'Sig.Mono1', 'Sig.ncMono1', 'Sig.cDC1', 'Sig.pDC1', 'Sig.ProB1', 'Sig.PreB1', 'Sig.B1', 'Sig.Plasma1', 'Sig.T1', 'Sig.CTL1', 'Sig.NK1', 'meanCov', 'ClonalGroup.Prob', 'wsnn_res.0.8', 'Origin.Seurat'
    var: 'gene_name', 'ensemble_id'

In [3]:
adata.obs['cell_type'].value_counts()

cell_type
FB                        26586
CD4                       11925
HSC                       11863
EryP                      10788
MPP                        9553
Refined.HSC                9194
iHSC                       8574
CD8                        8376
MEP                        7593
Mono                       7251
ProB                       6862
LinNegCD34PosCD164Pos      6343
GMP                        4984
LinNegCD34NegCD164high     4434
MDP                        4273
LinNegCD34lowCD164high     4266
NK                         4246
MKP                        4058
CLP                        3702
B                          3466
CMP                        2931
pDC                        1882
cDC                         936
LMPP                        817
Plasma                      727
PreBNK                      592
LinNegCD34NegCD164low       358
MLP                         123
Name: count, dtype: int64

In [4]:
cell_type_map = {
    'B': 'B_cell',
    'CD4': 'T_cell',
    'CD8': 'T_cell',
    'CLP': 'CLP',
    'CMP': 'CMP',
    'EryP': 'EryP',
    'FB': 'Fib',
    'GMP': 'GMP',
    'HSC': 'HSC',
    'LMPP': 'LMPP',
    'LinNegCD34NegCD164high': 'LinNeg',
    'LinNegCD34NegCD164low': 'LinNeg',
    'LinNegCD34PosCD164Pos': 'LinNeg',
    'LinNegCD34lowCD164high': 'LinNeg',
    'MDP': 'MDP',
    'MEP': 'MEP',
    'MKP': 'MKP',
    'MLP': 'MLP',
    'MPP': 'MPP',
    'Mono': 'Mono',
    'NK': 'NK',
    'Plasma': 'B_cell',
    'PreBNK': 'PreBNK',
    'ProB': 'B_cell',
    'Refined.HSC': 'HSC',
    'cDC': 'Dendritic_cell',
    'iHSC': 'iHSC',
    'pDC': 'Dendritic_cell',
}

adata.obs['cell_type_standard'] = adata.obs['cell_type'].map(cell_type_map)
adata.obs['cell_type_standard'].value_counts().sort_index()

cell_type_standard
B_cell            11055
CLP                3702
CMP                2931
Dendritic_cell     2818
EryP              10788
Fib               26586
GMP                4984
HSC               21057
LMPP                817
LinNeg            15401
MDP                4273
MEP                7593
MKP                4058
MLP                 123
MPP                9553
Mono               7251
NK                 4246
PreBNK              592
T_cell            20301
iHSC               8574
Name: count, dtype: int64

In [5]:
gene_mask, counts = sc.pp.filter_genes(
    adata, 
    min_cells=100, 
    inplace=False,
)

sc.pp.highly_variable_genes(
    adata,
    n_top_genes=5000,
    batch_key='dataset',
    flavor='seurat_v3',
)

selected_genes = adata.var[gene_mask]
print(f"{selected_genes.shape=}")

print(f"Number highly variable genes: {adata.var['highly_variable'].sum()}")

selected_genes.head()

filtered out 1203 genes that are detected in less than 100 cells
extracting highly variable genes
--> added
    'highly_variable', boolean vector (adata.var)
    'highly_variable_rank', float vector (adata.var)
    'means', float vector (adata.var)
    'variances', float vector (adata.var)
    'variances_norm', float vector (adata.var)
selected_genes.shape=(18108, 8)
Number highly variable genes: 5000


Unnamed: 0_level_0,gene_name,ensemble_id,highly_variable,highly_variable_rank,means,variances,variances_norm,highly_variable_nbatches
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A1BG,A1BG,ENSG00000121410,True,602.0,0.046814,0.098671,0.909736,1
A2M,A2M,ENSG00000175899,True,1363.5,2.903289,46604.678173,1.566125,8
A2ML1,A2ML1,ENSG00000166535,False,2698.5,0.038596,1.061809,1.070525,4
A3GALT2,A3GALT2,ENSG00000184389,True,1880.0,0.009802,0.011709,0.818573,1
A4GALT,A4GALT,ENSG00000128274,False,2490.5,0.179235,31.156865,0.826409,6


In [6]:
target_sum = 1e6
sc.pp.normalize_total(adata, target_sum=target_sum)
sc.pp.log1p(adata) 

sc.logging.print_memory_usage()

adata

normalizing counts per cell
    finished (0:00:03)
Memory usage: current 4.32 GB, difference +0.07 GB


AnnData object with n_obs × n_vars = 166703 × 19311
    obs: 'n_genes', 'doublet_score', 'predicted_doublet', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_counts', 'cell_type', 'Barcode', 'Library', 'dataset', 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender', 'celltype', 'record_id', 'cell_id', 'nCount_RNA', 'nFeature_RNA', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_SCT', 'nFeature_SCT', 'SCT.weight', 'ATAC.weight', 'seurat_clusters', 'STD.CellType', 'STD_Cat', 'STD_Cat2', 'Sample', 'HLF', 'CRHBP', 'CD34', 'MitoCoverage', 'ClonalGroup', 'Sig.HSC1', 'Sig.Prog1', 'Sig.EarlyE1', 'Sig.LateE1', 'Sig.ProMono1', 'Sig.Mono1', 'Sig.ncMono1', 'Sig.cDC1', 'Sig.pDC1', 'Sig.ProB1', 'Sig.PreB1', 'Sig.B1', 'Sig.Plasma1', 'Sig.T1', 'Sig.CTL1', 'Sig.NK1', 'meanCov', 'ClonalGroup.Prob', 'wsnn_res.0.8', 'Origin.Seurat', 'cell_type_standard'
    var: 'gene_name',

In [7]:
columns_to_drop = [
    'Barcode',
    'Library',
    'method',
    'donor',
    'anatomical_information', 
    'n_counts_UMIs',
    'manually_annotated',
    'gender', 
    'record_id', 
    'cell_id',
    'nCount_RNA', 
    'nFeature_RNA', 
    'nCount_ATAC', 
    'nFeature_ATAC',
    'nCount_SCT',
    'nFeature_SCT',
    'SCT.weight', 
    'ATAC.weight',
    'seurat_clusters', 
    'STD.CellType', 
    'STD_Cat', 
    'STD_Cat2',
    'Sample',
    'HLF',
    'CRHBP',
    'CD34', 
    'MitoCoverage', 
    'ClonalGroup', 
    'Sig.HSC1',
    'Sig.Prog1', 
    'Sig.EarlyE1',
    'Sig.LateE1',
    'Sig.ProMono1',
    'Sig.Mono1',
    'Sig.ncMono1', 
    'Sig.cDC1', 
    'Sig.pDC1',
    'Sig.ProB1',
    'Sig.PreB1',
    'Sig.B1',
    'Sig.Plasma1',
    'Sig.T1',
    'Sig.CTL1', 
    'Sig.NK1',
    'meanCov',
    'ClonalGroup.Prob', 
    'wsnn_res.0.8',
    'Origin.Seurat',
    
]

adata.obs = adata.obs.drop(columns=columns_to_drop)

sc.logging.print_memory_usage()

adata

Memory usage: current 4.27 GB, difference -0.05 GB


AnnData object with n_obs × n_vars = 166703 × 19311
    obs: 'n_genes', 'doublet_score', 'predicted_doublet', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_counts', 'cell_type', 'dataset', 'organ_tissue', 'cell_ontology_class', 'free_annotation', 'compartment', 'celltype', 'cell_type_standard'
    var: 'gene_name', 'ensemble_id', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'hvg', 'log1p'

In [8]:
outpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/psuedotime/clean_data.h5ad"
adata.write(outpath)
adata

AnnData object with n_obs × n_vars = 166703 × 19311
    obs: 'n_genes', 'doublet_score', 'predicted_doublet', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_counts', 'cell_type', 'dataset', 'organ_tissue', 'cell_ontology_class', 'free_annotation', 'compartment', 'celltype', 'cell_type_standard'
    var: 'gene_name', 'ensemble_id', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'hvg', 'log1p'

In [9]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

# Magic

In [None]:
gene_list = adata.var[adata.var['highly_variable']].index.to_list()

# sce.pp.magic(
#     adata, 
#     name_list='highly_variable', 
#     solver='approximate',
#     n_pca=100,
#     knn=5,
# )

# adata

In [None]:
break

In [None]:
# get the closest cells to the HSC centroid

hsc_data = adata[adata.obs['cell_type'] == 'HSC', :]
X = hsc_data.obsm['X_pca']
centroid = X.mean(axis=0)
print(f"{X.shape=} {centroid.shape=}")

# Calculate Distances (using cdist for efficiency)
distances = cdist(X, [centroid], metric='euclidean')  # Euclidean distance

# Rank Rows (Ascending: closest first)
ranked_indices = np.argsort(distances, axis=0).flatten()
hsc_data.obs.iloc[ranked_indices].head(10)

In [None]:
center_cell = "CCGTTTGGTGATGAAA-3_young2_HSC"
adata.uns['iroot'] = np.flatnonzero(adata.obs.loc[center_cell])[0]

components = 7

sc.tl.diffmap(
    adata,  
    n_comps=components,
)
sc.tl.dpt(
    adata, 
    n_dcs=components-1, 
    n_branchings=4,
)

In [None]:
plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = 5, 5

plt2.plot_umap_scatter(
    adata, 
    s=8, 
    alpha=0.9, 
    ec='none',
    cmap='inferno',
    vmin=0,
    vmax=0.075,
    # colorbar=False,
    color='dpt_pseudotime',
)

In [None]:
sns.boxplot(
    data=adata.obs,
    x='cell_type',
    y='dpt_pseudotime',
    showfliers=False,
)

plt.gca().tick_params(axis='x', rotation=90)

adata