In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable, axes_size
import matplotlib.colors as mcolors
from matplotlib.transforms import Bbox
from matplotlib.colors import to_rgba
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
import scvi
import scanpy as sc
import anndata as an
import scanpy.external as sce
import scipy
import scipy.sparse as sp
import time
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
import torch
from scipy.sparse import csr_matrix
import logging

from importlib import reload

# local imports
import utils as ut
import plotting as plt2

# Set scanpy's verbosity setting to 3 for the rest of the session.
sc.settings.verbosity = 3 
# Select torch's "high" float32 matmul precision mode.
torch.set_float32_matmul_precision("high")

In [2]:
# Print the value of torch.version.cuda for this torch install.
print(torch.version.cuda) 

12.0


In [3]:
# Record and print the CPU count reported by os.cpu_count().
num_processors = os.cpu_count()
print(f"Number of processors: {num_processors}") 

Number of processors: 36


In [4]:
# Check CUDA availability
# Ask torch whether a CUDA device can be used from this process.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

# The GPU count is only queried (and `num_gpus` only defined) when CUDA is usable.
if cuda_available:
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs: {num_gpus}")

CUDA available: False




# load the data

In [5]:
# Read the reference AnnData object from a hard-coded absolute path.
fpath = "/scratch/indikar_root/indikar1/shared_data/sc_HSC/SCANVI/model/reference_adata.h5ad"
adata = sc.read_h5ad(fpath)
# Report memory usage after the load, then print the AnnData summary.
sc.logging.print_memory_usage()
print(adata)

Memory usage: current 2.25 GB, difference +2.25 GB
AnnData object with n_obs × n_vars = 81442 × 3000
    obs: 'n_genes', 'dataset', 'n_genes_by_counts', 'total_counts', 'obs_index', 'cell_type', 'standard_cell_type', 'cell_label', '_scvi_batch', '_scvi_labels', 'scvi_clusters', 'scanvi_clusters', '_scvi_raw_norm_scaling'
    var: 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'gene_id', 'token_id', 'gene_biotype', 'Chromosome', 'Start', 'End', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: '_scvi_manager_uuid', '_scvi_uuid', 'dataset_colors', 'hvg', 'neighbors', 'scanvi_clusters', 'scanvi_clusters_colors', 'scvi_clusters', 'scvi_clusters_colors', 'standard_cell_type_colors', 'umap'
    obsm: 'X_scANVI', 'X_scVI', 'X_umap'
    layers: 'SCANVI_counts', 'counts'
    obsp: 'connectivities', 'distances'


# Filtering

In [6]:
# Keep only cells labelled 'Fib' or 'HSC' and drop every cell that comes from
# the 'tabula_sapiens' dataset. Both conditions are combined into one boolean
# mask so the AnnData object is subset and copied once instead of twice.
is_fib_or_hsc = adata.obs['cell_label'].isin(['Fib', 'HSC'])
is_tabula_sapiens = adata.obs['dataset'].isin(['tabula_sapiens'])
adata = adata[is_fib_or_hsc & ~is_tabula_sapiens, :].copy()

adata

AnnData object with n_obs × n_vars = 22086 × 3000
    obs: 'n_genes', 'dataset', 'n_genes_by_counts', 'total_counts', 'obs_index', 'cell_type', 'standard_cell_type', 'cell_label', '_scvi_batch', '_scvi_labels', 'scvi_clusters', 'scanvi_clusters', '_scvi_raw_norm_scaling'
    var: 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'gene_id', 'token_id', 'gene_biotype', 'Chromosome', 'Start', 'End', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: '_scvi_manager_uuid', '_scvi_uuid', 'dataset_colors', 'hvg', 'neighbors', 'scanvi_clusters', 'scanvi_clusters_colors', 'scvi_clusters', 'scvi_clusters_colors', 'standard_cell_type_colors', 'umap'
    obsm: 'X_scANVI', 'X_scVI', 'X_umap'
    layers: 'SCANVI_counts', 'counts'
    obsp: 'connectivities', 'distances'

# Load the model

In [7]:
# Load the saved SCANVI model and attach the filtered `adata` to it.
# Note: `fpath` is rebound here from the .h5ad file to the model directory.
fpath = "/scratch/indikar_root/indikar1/shared_data/sc_HSC/SCANVI/model"
model = scvi.model.SCANVI.load(
    fpath, 
    adata=adata,
    # Saved files in the directory carry this prefix (e.g. reference_model.pt).
    prefix="reference_",
)

[34mINFO    [0m File [35m/scratch/indikar_root/indikar1/shared_data/sc_HSC/SCANVI/model/[0m[95mreference_model.pt[0m already downloaded 


/home/cstansbu/miniconda3/envs/scanpy/lib/python3.12/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/cstansbu/miniconda3/envs/scanpy/lib/python3.12 ...
  model = torch.load(model_path, map_location=map_location)


In [8]:
# Store the model's predicted label per cell in adata.obs['prediction']
# (presumably computed on the AnnData passed at load time — confirm),
# then tally how many cells fall into each predicted label.
adata.obs['prediction'] = model.predict()
adata.obs['prediction'].value_counts()

prediction
HSC     14096
Fib      7736
MPP       205
MKP        24
MEP        20
LMPP        2
GMP         1
CLP         1
CMP         1
Name: count, dtype: int64

In [9]:
# Cross-tabulate predicted labels (rows) against the annotated cell labels
# (columns). Raw arrays carry no names, so the axes are named explicitly;
# otherwise the table is headed by the placeholders 'row_0' / 'col_0'.
pd.crosstab(
    adata.obs['prediction'].values,
    adata.obs['cell_label'].values,
    rownames=['prediction'],
    colnames=['cell_label'],
)

col_0,Fib,HSC
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
CLP,0,1
CMP,0,1
Fib,7736,0
GMP,0,1
HSC,0,14096
LMPP,0,2
MEP,0,20
MKP,0,24
MPP,0,205


In [10]:
# Deliberate stop: `break` outside a loop raises SyntaxError, presumably used
# here to halt a "Run All" before the cells below.
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

# Feature Importance

In [None]:
def train_gbt_on_anndata(adata, target_column, random_state=None, **gbt_kwargs):
  """
  Trains a Gradient Boosting Tree classifier on an AnnData object.

  The feature matrix is taken from adata.X as-is and the labels from
  adata.obs[target_column]. Labels stored with object dtype are encoded
  numerically before fitting; the fitted encoder is not returned, so the
  classifier's classes refer to the encoded values in that case.

  Args:
    adata: An AnnData object. The .X attribute should contain the feature data.
    target_column: The name of the column in adata.obs to use as the target variable.
    random_state: Optional seed forwarded to GradientBoostingClassifier so that
      repeated fits give the same model. Defaults to None (unseeded).
    **gbt_kwargs: Any further keyword arguments are forwarded unchanged to
      GradientBoostingClassifier (for example n_estimators or max_depth).

  Returns:
    A trained GradientBoostingClassifier object.
  """

  # Extract features and target variable
  X = adata.X
  y = adata.obs[target_column]

  # Encode the target variable if it is stored as generic Python objects (strings)
  if y.dtype == 'object':
    y = LabelEncoder().fit_transform(y)

  # Build the classifier; with no extra arguments this equals the library defaults
  gbt = GradientBoostingClassifier(random_state=random_state, **gbt_kwargs)

  # Train the model
  gbt.fit(X, y)

  return gbt


# Fit the classifier with adata.obs['prediction'] as the target and display it.
gbt = train_gbt_on_anndata(adata, 'prediction')
gbt

In [None]:
# One row per gene with its importance from the fitted classifier,
# ordered from most to least important.
fdf = (
    pd.DataFrame({
        'gene_name': adata.var_names,
        'weight': gbt.feature_importances_,
    })
    .sort_values(by='weight', ascending=False)
)

# Show the 15 highest-weighted genes
fdf.head(15)

In [None]:
# Deliberate stop: `break` outside a loop raises SyntaxError, presumably used
# here to halt a "Run All" before the cells below.
break

# Perturbation Experiment

In [None]:
sample_size = 10

# NOTE(review): selects cells whose obs name contains 'fib'; this assumes the
# naming scheme encodes the cell type — confirm against adata.obs['cell_label'].
fibs = [x for x in adata.obs_names if 'fib' in x]

# Seeded generator so the same cells are drawn on every run of this cell.
rng = np.random.default_rng(0)
sample_cells = rng.choice(fibs, sample_size, replace=False)

sdata = adata[sample_cells, :].copy()
# toarray() gives a plain ndarray; todense() would return the legacy np.matrix type.
sdata.X = sdata.X.toarray()

# Per-class probabilities for the sampled cells
pred = model.predict(
    sdata,
    soft=True,
)

# Mean probability assigned to the two classes of interest
pred[['Fib', 'HSC']].mean()

# perturb each gene

In [None]:
# 30 is presumably the logging.WARNING level, silencing scvi info messages — confirm.
scvi.settings.verbosity = 30

# Amount added to the perturbed gene in every sampled cell
up_by = 100

# Only the first `max_genes` genes are perturbed; 1 keeps this a quick
# single-gene check. Set to None to sweep every gene in sdata.
max_genes = 1

rows = []

for gene in sdata.var.index[:max_genes]:

    # Work on a fresh copy so each perturbation starts from the unmodified sample
    pert = sdata.copy()
    pert[:, gene].X = pert[:, gene].X + up_by

    pred = model.predict(
        pert,
        soft=True,
    )

    # Mean class probabilities across the sampled cells after the perturbation
    row = pred[['Fib', 'HSC']].mean().to_dict()
    row['gene_name'] = gene
    rows.append(row)


result = pd.DataFrame(rows)
result.head()

In [None]:
def generate_sparse_normal_matrix(n, m, density, mean=0, std=1, random_state=None):
  """
  Generates a random sparse matrix with normally distributed integer values.

  Exactly int(n * m * density) distinct cells are chosen and each receives a
  normal sample rounded to the nearest integer. A sample that rounds to 0 is
  still stored, so the share of truly non-zero cells can be below `density`.

  Args:
    n: Number of rows.
    m: Number of columns.
    density: The desired density of the matrix (proportion of cells that
      receive a sampled value). Must lie in [0, 1].
    mean: Mean of the normal distribution.
    std: Standard deviation of the normal distribution.
    random_state: Optional seed for a dedicated NumPy Generator. When None,
      the global np.random state is used, so np.random.seed still applies.

  Returns:
    A sparse matrix in CSR format with shape (n, m) and integer dtype.

  Raises:
    ValueError: If density is outside [0, 1].
  """

  if not 0 <= density <= 1:
    raise ValueError(f"density must be between 0 and 1, got {density}")

  # Both the np.random module and a Generator expose choice() and normal()
  rng = np.random if random_state is None else np.random.default_rng(random_state)

  n_entries = int(n * m * density)
  if n_entries == 0:
    return sp.csr_matrix((n, m), dtype=int)

  # Draw distinct flat cell positions. Sampling rows and columns independently
  # can repeat a cell, and the CSR constructor sums repeated entries.
  flat_ind = rng.choice(n * m, size=n_entries, replace=False)
  row_ind, col_ind = np.divmod(flat_ind, m)

  # Round to the nearest integer; a bare astype(int) would truncate toward zero
  data = np.rint(rng.normal(loc=mean, scale=std, size=n_entries)).astype(int)

  # Create the sparse matrix in CSR format
  return sp.csr_matrix((data, (row_ind, col_ind)), shape=(n, m))

# Fraction of cells in the noise matrix that receive a sampled value
density = 0.10

# The noise matrix must match the sampled data: one row per cell, one column per gene
n, m = sdata.shape

noise = generate_sparse_normal_matrix(n, m, density)
print(f"{noise.shape=}")

# NOTE(review): this overwrites sdata.X, so re-running the cell stacks a fresh
# layer of noise on top of the previous one.
sdata.X = sdata.X + noise

# Per-class probabilities for the noised cells
pred = model.predict(sdata, soft=True)

pred[['Fib', 'HSC']]

In [None]:
# IPython help lookup for model.predict; an interactive aid, not an analysis step.
?model.predict

In [None]:
# Deliberate stop: `break` outside a loop raises SyntaxError, presumably used
# here to halt a "Run All" before the cells below.
break

# Load data

In [None]:
"""
DATA
"""
# Read the raw AnnData file; this rebinds `adata`, replacing the filtered
# reference object used by the cells above.
fpath = "/scratch/indikar_root/indikar1/shared_data/sc_HSC/SCANVI/raw_data.h5ad"
adata = sc.read_h5ad(fpath)
# Make the 'counts' layer the active matrix (copied so the layer is left untouched).
adata.X = adata.layers['counts'].copy()
# Report memory usage after the load, then print the AnnData summary.
sc.logging.print_memory_usage()
print(adata)