In [1]:
import pandas as pd
import scanpy as sc
import numpy as np
import scipy.sparse as sp

In [2]:
fpath = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/extract/TS_Spleen.h5ad"

adata = sc.read_h5ad(fpath)
adata

AnnData object with n_obs × n_vars = 34004 × 58870
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: '_scvi', '_training_mode', 'cell_ontology_class_colors', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'dendrogram_tissue_cell_type', 'donor_colors', 'donor_method_colors', 'hvg', 'method_colors', 'neighbors', 'organ_tissue_colors', 'sex_colors', 'tissue_colors', 'umap'
    obsm: 'X_pca', 'X_scvi', 'X_scvi_umap', 'X_umap'
    layers: 'decontXcounts', 'raw_counts'
    obsp: 'connectivities', 'distances'

In [5]:
# get all cells of given type
cell_type = 'endothelial cell'
cell_mask = (adata.obs['free_annotation'] == cell_type)

# get non-zero genes
nonzero_indices = np.argwhere(np.ravel(adata.layers['raw_counts'].sum(axis=0)) > 0)
nonzero_indices = np.ravel(nonzero_indices)
genes_to_keep = adata.var.iloc[nonzero_indices]

# drop mitochondrial and ribosomal genes
genes_to_keep = genes_to_keep[~genes_to_keep['gene_symbol'].str.startswith('MT')]
genes_to_keep = genes_to_keep[~genes_to_keep['gene_symbol'].str.startswith('RP')]

gene_mask = genes_to_keep.index

print(f"{len(cell_mask)=}")
print(f"{len(gene_mask)=}")

# extract the data 
pdf = adata[cell_mask, gene_mask]
pdf

len(cell_mask)=34004
len(gene_mask)=49340


View of AnnData object with n_obs × n_vars = 596 × 49340
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: '_scvi', '_training_mode', 'cell_ontology_class_colors', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'dendrogram_tissue_cell_type', 'donor_colors', 'donor_method_colors', 'hvg', 'method_colors', 'neighbors', 'organ_tissue_colors', 'sex_colors', 'tissue_colors', 'umap'
    obsm: 'X_pca', 'X_scvi', 'X_scvi_umap', 'X_umap'
    layers: 'decontXcounts', 'raw_counts'
    obsp: 'connectivities', 'distances'

In [6]:
df = pdf.to_df(layer='raw_counts')
outpath = "/home/cstansbu/temp/tabula_sapeins_endothelial_cell.csv" 
df = df.astype(int)
df = df.reset_index()
df.to_csv(outpath, index=False)
df.head()

Unnamed: 0,cell_id,DDX11L1,WASH7P,MIR6859-1,MIR1302-2HG,OR4F5,AL627309.1,AL627309.3,CICP27,AL627309.6,...,AC006386.2,LINC00266-4P,CICP1,AC007562.1,AC024067.1,RF00156-25,ANKRD36P1,PARP4P1,CCNQP2,SPRY3-1
0,AAACCCATCCGTGTCT_TSP7_Spleen_NA_10X_1_1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AAAGGTAAGGTCATAA_TSP7_Spleen_NA_10X_1_1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AACAGGGAGTGCCGAA_TSP7_Spleen_NA_10X_1_1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AACCAACGTAGAGATT_TSP7_Spleen_NA_10X_1_1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AATCACGCAACTCGAT_TSP7_Spleen_NA_10X_1_1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
df.set_index('cell_id').to_numpy().max()

80518

In [None]:
break

In [None]:
def nonzero_median(df):
    """
    Calculates the median of nonzero values for each column in a Pandas DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame to compute medians for.

    Returns:
        pandas.Series: A Series containing the median of nonzero values for each column.
    """
    # Replace zeros with NaN
    df_nonzero = df.replace(0, np.nan) 

    # Calculate medians, skipping NaNs (which were the zeros)
    medians = df_nonzero.median(skipna=True)  
    return medians


def normalize_counts(X: sp.csr_matrix, 
                     norm_factor_vector: np.ndarray, 
                     n_counts: np.ndarray, 
                     target_sum: int = 10000) -> sp.csr_matrix:
    """
    Normalizes and scales count data in a sparse matrix.

    This function is commonly used for library size normalization and scaling 
    in single-cell RNA sequencing analysis. It adjusts raw counts to account 
    for differences in sequencing depth and scales them to a target sum 
    (e.g., CPM or TPM normalization).

    Args:
        X: A sparse matrix (csr_matrix) where rows are features (e.g., genes) 
           and columns are samples (e.g., cells). Values represent raw counts.
        norm_factor_vector: A 1D array containing the normalization factor for 
                             each sample (e.g., library size, sequencing depth).
        n_counts: A 1D array containing the total count per sample.
        target_sum: The desired sum of normalized counts per sample. 
                     Defaults to 10,000.

    Returns:
        A sparse matrix (csr_matrix) with normalized and scaled counts.

    Raises:
        ValueError: If dimensions of `X`, `norm_factor_vector`, and `n_counts`
                    are incompatible.
    """
    X_norm = (X / n_counts) * target_sum / norm_factor_vector
    return sp.csr_matrix(X_norm)


n_cells = 1000
X = df.sample(n_cells)
X = X.set_index('cell_id')

# row sums 
n_counts = X.sum(axis=1)

# nonzero column medians - needs work
norm_factor_vector = nonzero_median(X)

X_norm = normalize_counts(X, norm_factor_vector, n_counts)
X.shape

In [None]:
break

In [None]:
A = "cooper"
?A

In [None]:
dir()

In [None]:
?pd.DataFrame