A pipeline for processing single-cell RNA-seq data from bladder cancer samples

This pipeline is based on the GEO dataset [GSE146137](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE146137).


# Preprocessing

In [1]:
import os
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from scipy.stats import median_abs_deviation as mad
%matplotlib inline

In [2]:
# Directories for input data and saved checkpoints. Both are relative paths,
# so they are resolved against the working directory selected below.
data_path = "data/"
check_path = "checkpoints/"

# os.makedirs(check_path)

# Switch to the parent directory of the notebook's start directory.
# A bare os.chdir("../") climbs one more level every time this cell is
# re-run in the same kernel, which silently breaks the relative paths above.
# The target is therefore computed once per kernel session and reused, so
# re-running the cell always lands in the same directory.
if "_project_root" not in globals():
    _project_root = os.path.abspath("../")
os.chdir(_project_root)
print(os.getcwd())

/Users/flynnzhang/CMU/Spring24/02620-ML4Scientists/scRNA-seq_ML


In [3]:
import warnings
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", RuntimeWarning)

## Data preprocessing done: load the annotated checkpoint

In [4]:
# Read the 'cell_type.h5ad' checkpoint from the checkpoint directory.
adata = sc.read_h5ad(f"{check_path}cell_type.h5ad")

In [11]:
# Display the AnnData summary (dimensions and the obs/var/uns/obsm/layers keys)
# of the checkpoint that was just loaded.
adata

AnnData object with n_obs × n_vars = 23873 × 2723
    obs: 'Sample', 'Title', 'Marker', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb', 'batch', 'leiden', 'major_celltype'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'batch_colors', 'dendrogram_leiden', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [7]:
# Number of cells assigned to each major cell type, most frequent first.
adata.obs.loc[:, "major_celltype"].value_counts()

major_celltype
immune cells          10372
fibroblasts            9046
epithelial cells       3181
endothelial cells      1039
smooth muscle cell      235
Name: count, dtype: int64

## Add cluster and cell-type labels

In [9]:
# Read the 'combined.h5ad' checkpoint from the checkpoint directory.
adata_combined = sc.read_h5ad(f"{check_path}combined.h5ad")

In [12]:
# Map the cluster results from `adata` onto `adata_combined`.
#
# Column assignment aligns on the obs index, so any cell of `adata_combined`
# that is missing from `adata` would silently receive NaN labels. Fail loudly
# here instead of carrying partially labelled data forward.
unlabeled_mask = ~adata_combined.obs_names.isin(adata.obs_names)
if unlabeled_mask.any():
    raise ValueError(
        f"{int(unlabeled_mask.sum())} cells in adata_combined are missing from adata; "
        "cannot transfer 'leiden' / 'major_celltype' labels."
    )

# NOTE: 'kmeans_clusters' stores the values of adata.obs['leiden']. The column
# name is kept unchanged because later steps may look it up by this name.
adata_combined.obs['kmeans_clusters'] = adata.obs['leiden']
adata_combined.obs['cell_types'] = adata.obs['major_celltype']
adata_combined

AnnData object with n_obs × n_vars = 23873 × 19724
    obs: 'Sample', 'Title', 'Marker', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb', 'batch', 'kmeans_clusters', 'cell_types'
    var: 'n_cells'
    uns: 'log1p'
    layers: 'counts'

In [13]:
# Save the labelled object as 'combined_labeled.h5ad' in the checkpoint directory.
adata_combined.write_h5ad(f"{check_path}combined_labeled.h5ad")