In [1]:
import pandas as pd
import sys
import numpy as np
import os
import scanpy as sc
from cytotrace2_py.cytotrace2_py import *

# load the data

In [2]:
fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/scanpy/hsc_landscape.anndata.h5ad"

adata = sc.read_h5ad(fpath)
sc.logging.print_memory_usage()

adata

Memory usage: current 8.90 GB, difference +8.90 GB


AnnData object with n_obs × n_vars = 54347 × 19070
    obs: 'n_counts', 'cell_type', 'dataset', 'organ_tissue', 'n_genes', 'new_cluster', 'cell_label', 'cluster_str', 'UMAP 1', 'UMAP 2'
    var: 'gene_name', 'ensemble_id', 'n_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'B_mem', 'B_naive', 'Baso', 'DC', 'Endo', 'Eos', 'Ery', 'Fibro', 'HSC', 'Macro', 'Mega', 'Mono', 'MDSC', 'Neut', 'Osteo', 'Plasma', 'pDC', 'Retic', 'Stromal', 'cluster_genes'
    uns: 'cell_type_colors', 'dataset_colors', 'go_annotations', 'hvg', 'log1p', 'neighbors', 'new_cluster', 'new_cluster_colors', 'organ_tissue_colors', 'panglaodb', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'log_norm', 'raw_counts'
    obsp: 'connectivities', 'distances'

# Set Up

In [3]:
# set the model directory
use_model_dir = pkg_resources.resource_filename("cytotrace2_py","resources/17_models_weights/")

# get the data
df = adata.to_df()
df.head()

gene_name,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
AAACCCAAGGTTACCT_iHSC,0.0,0.000675,0.138474,0.203302,0.0,0.099126,0.0,0.034469,0.066149,0.460292,...,0.032653,0.039942,0.007543,0.024654,0.011357,7.3e-05,0.752474,0.115562,0.089065,0.009961
AAACCCAAGTTGAAGT_iHSC,0.0,0.000675,0.138474,0.241128,0.0,0.099126,0.0,0.034469,0.066149,0.0,...,0.032653,0.039942,0.007543,0.024654,0.011357,7.3e-05,0.007223,0.115562,0.089065,0.009961
AAACCCAAGTTGTCGT_iHSC,0.0,0.000675,0.138474,0.0,0.0,0.099126,0.0,0.034469,0.066149,0.377951,...,0.032653,0.039942,0.007543,0.024654,0.011357,7.3e-05,0.007223,0.115562,0.089065,0.009961
AAACCCACAGAAGCGT_iHSC,0.0,0.000675,0.138474,0.355235,0.0,0.099126,0.0,0.034469,0.066149,0.680807,...,0.032653,0.039942,0.007543,0.024654,0.011357,7.3e-05,0.007223,0.115562,0.089065,0.009961
AAACCCACAGGAGGTT_iHSC,0.0,0.000675,0.138474,0.271254,0.0,0.099126,0.0,0.034469,0.066149,0.0,...,0.032653,0.039942,0.007543,0.024654,0.459458,7.3e-05,0.007223,0.794285,0.089065,0.009961


# get predictions

In [4]:
def iter_chunks(df, chunk_size=10000):
    for start in range(0, len(df), chunk_size):
        yield df.iloc[start:start + chunk_size]
        
results = []

chunk_num = 0
for chunk in iter_chunks(df):
    chunk_num += 1
    print(f"{chunk_num=}")
    
    cell_names, gene_names, ranked_data = preprocess(chunk, 'human')
    
    # top variable genes
    top_col_inds = top_var_genes(ranked_data)
    top_col_names = gene_names[top_col_inds]
    
    # predict the 
    predicted_df = predict(
        ranked_data, 
        cell_names, 
        use_model_dir , 
        chunk.shape[0],
    )
    
    # smoothing
    smooth_score = smoothing_by_diffusion(
        predicted_df, 
        ranked_data, 
        top_col_inds, 
        3000, # smoothing size  
        42, # random seed
    )
    
    binned_score_pred_df = binning(
        predicted_df, 
        smooth_score,
    )
    
    binned_score_pred_df = binned_score_pred_df.reset_index(names='cell_id')
    results.append(binned_score_pred_df)

results = pd.concat(results)
results.head()

chunk_num=1
    Mapped 13954 input gene names to mouse orthologs
    13953 input genes are present in the model features.
Please consider reducing the smooth_batch_size to a number in range 1000 - 3000 for runtime and memory efficiency.
chunk_num=2
    Mapped 13954 input gene names to mouse orthologs
    13953 input genes are present in the model features.
Please consider reducing the smooth_batch_size to a number in range 1000 - 3000 for runtime and memory efficiency.
chunk_num=3
    Mapped 13954 input gene names to mouse orthologs
    13953 input genes are present in the model features.
Please consider reducing the smooth_batch_size to a number in range 1000 - 3000 for runtime and memory efficiency.
chunk_num=4
    Mapped 13954 input gene names to mouse orthologs
    13953 input genes are present in the model features.
Please consider reducing the smooth_batch_size to a number in range 1000 - 3000 for runtime and memory efficiency.
chunk_num=5
    Mapped 13954 input gene names to mou

Unnamed: 0,cell_id,preKNN_CytoTRACE2_Score,preKNN_CytoTRACE2_Potency
0,AAACCCAAGGTTACCT_iHSC,0.540109,Multipotent
1,AAACCCAAGTTGAAGT_iHSC,0.354983,Oligopotent
2,AAACCCAAGTTGTCGT_iHSC,0.057476,Differentiated
3,AAACCCACAGAAGCGT_iHSC,0.04624,Differentiated
4,AAACCCACAGGAGGTT_iHSC,0.644041,Multipotent


In [5]:
outpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/cytotrace/cytotrace_results.csv"

results.to_csv(outpath, index=False)
results.head()

Unnamed: 0,cell_id,preKNN_CytoTRACE2_Score,preKNN_CytoTRACE2_Potency
0,AAACCCAAGGTTACCT_iHSC,0.540109,Multipotent
1,AAACCCAAGTTGAAGT_iHSC,0.354983,Oligopotent
2,AAACCCAAGTTGTCGT_iHSC,0.057476,Differentiated
3,AAACCCACAGAAGCGT_iHSC,0.04624,Differentiated
4,AAACCCACAGGAGGTT_iHSC,0.644041,Multipotent


In [6]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

# Set up

In [None]:
cell_names, gene_names, ranked_data = preprocess(X, 'human')

# top variable genes
top_col_inds = top_var_genes(ranked_data)
top_col_names = gene_names[top_col_inds]

In [None]:
# predict by unrandomized chunked batches
predicted_df = predict(ranked_data, cell_names, use_model_dir , X.shape[0])
predicted_df.head()