## Data integration

### -- Scanorama

In [2]:
import scanpy as sc
import numpy as np
import scanorama

In [3]:
# Global scanpy configuration for this notebook.
# Logging level for scanpy messages.
sc.settings.verbosity = 3 # errors (0), warnings (1), info (2), hints (3)
# Print the versions of scanpy and its main dependencies into the cell output,
# so the environment used for this run is recorded alongside the results.
sc.logging.print_header()
# Figure defaults: 80 dpi with a white figure background.
sc.settings.set_figure_params(dpi=80, facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.8 anndata==0.10.5.post1 umap==0.5.5 numpy==1.26.4 scipy==1.12.0 pandas==2.2.1 scikit-learn==1.4.1.post1 statsmodels==0.14.1 pynndescent==0.5.11


In [4]:
# Stroma compartment: read the AnnData object from the batch-correction
# working directory and display its summary.
stroma_h5ad_path = "2.BatchCorrection/stroma.h5ad"
stroma = sc.read_h5ad(stroma_h5ad_path)
stroma

AnnData object with n_obs × n_vars = 106638 × 1000
    obs: 'method', 'sampleID', 'disease', 'dataset', 'sex', 'age', 'smoking_status', 'major_cell_type', 'cellID', 'histology'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'dataset_colors', 'disease_colors', 'hvg', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [5]:
# Split the stroma object into one independent AnnData copy per category of
# obs["dataset"]; `batch_cats` and `stroma_list` share the same ordering.
stroma_dataset_labels = stroma.obs["dataset"]
batch_cats = stroma_dataset_labels.cat.categories.tolist()
stroma_list = []
for dataset_name in batch_cats:
    stroma_list.append(stroma[stroma_dataset_labels == dataset_name].copy())

In [6]:
# Run Scanorama over the per-dataset AnnData objects (dimred=40, knn=100).
# The per-dataset embeddings are read back from each element's
# .obsm["X_scanorama"] below.
scanorama.integrate_scanpy(stroma_list, dimred=40, knn=100)

# Stitch the per-dataset embeddings into a single array whose rows follow the
# cell order of `stroma`, then attach it as obsm["X_scanorama"].
stroma_n_dims = stroma_list[0].obsm["X_scanorama"].shape[1]
stroma_embedding = np.zeros((stroma.shape[0], stroma_n_dims))
for dataset_name, dataset_adata in zip(batch_cats, stroma_list):
    in_dataset = stroma.obs["dataset"] == dataset_name
    stroma_embedding[in_dataset] = dataset_adata.obsm["X_scanorama"]
stroma.obsm["X_scanorama"] = stroma_embedding
stroma

Found 1000 genes among all datasets
[[0.00000000e+00 8.66242038e-01 3.80087209e-01 8.42105263e-01
  7.35294118e-01 1.37124755e-01 5.71785268e-01 6.61904762e-01
  8.98967844e-01 7.00000000e-01 7.58201701e-01 4.11157025e-01
  6.00000000e-01 4.09144893e-01 7.81954887e-01 6.37740612e-01
  6.70866659e-01 7.51708428e-01 7.38060518e-01 6.31505899e-01
  2.60869565e-01 5.69364162e-01 4.88467875e-01 3.82004483e-01
  4.21947674e-01 7.94602699e-01 4.42528736e-01 3.38411819e-01
  1.00000000e+00 1.00000000e+00 6.60256410e-01]
 [0.00000000e+00 0.00000000e+00 9.41401274e-01 2.10526316e-01
  2.05882353e-01 1.27388535e-03 2.66242038e-01 2.16560510e-01
  6.63694268e-01 3.33333333e-01 2.34394904e-01 1.10828025e-01
  0.00000000e+00 3.05732484e-01 1.65605096e-01 4.80254777e-01
  4.34394904e-01 8.42824601e-02 3.97452229e-01 7.14649682e-01
  5.47770701e-02 1.21019108e-01 1.07006369e-01 4.26751592e-01
  9.79617834e-01 0.00000000e+00 2.87356322e-02 1.97452229e-01
  0.00000000e+00 0.00000000e+00 1.47770701e-01]


AnnData object with n_obs × n_vars = 106638 × 1000
    obs: 'method', 'sampleID', 'disease', 'dataset', 'sex', 'age', 'smoking_status', 'major_cell_type', 'cellID', 'histology'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'dataset_colors', 'disease_colors', 'hvg', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap', 'X_scanorama'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [7]:
# Save the stroma object, now carrying obsm["X_scanorama"], to disk and
# display its summary.
stroma_out_path = "2.BatchCorrection/stroma_scanorama_unclustered.h5ad"
stroma.write_h5ad(stroma_out_path)
stroma

AnnData object with n_obs × n_vars = 106638 × 1000
    obs: 'method', 'sampleID', 'disease', 'dataset', 'sex', 'age', 'smoking_status', 'major_cell_type', 'cellID', 'histology'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'dataset_colors', 'disease_colors', 'hvg', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap', 'X_scanorama'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [8]:
# Endothelial compartment: read the AnnData object from the batch-correction
# working directory and display its summary.
endo_h5ad_path = "2.BatchCorrection/endo.h5ad"
endo = sc.read_h5ad(endo_h5ad_path)
endo

AnnData object with n_obs × n_vars = 97190 × 1000
    obs: 'method', 'sampleID', 'disease', 'dataset', 'sex', 'age', 'smoking_status', 'major_cell_type', 'cellID', 'histology'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'dataset_colors', 'disease_colors', 'hvg', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [9]:
# Split the endo object into one independent AnnData copy per category of
# obs["dataset"]; `batch_cats` and `endo_list` share the same ordering.
endo_dataset_labels = endo.obs["dataset"]
batch_cats = endo_dataset_labels.cat.categories.tolist()
endo_list = []
for dataset_name in batch_cats:
    endo_list.append(endo[endo_dataset_labels == dataset_name].copy())

In [10]:
# Run Scanorama over the per-dataset AnnData objects (dimred=40, knn=100).
# The per-dataset embeddings are read back from each element's
# .obsm["X_scanorama"] below.
scanorama.integrate_scanpy(endo_list, dimred=40, knn=100)

# Stitch the per-dataset embeddings into a single array whose rows follow the
# cell order of `endo`, then attach it as obsm["X_scanorama"].
endo_n_dims = endo_list[0].obsm["X_scanorama"].shape[1]
endo_embedding = np.zeros((endo.shape[0], endo_n_dims))
for dataset_name, dataset_adata in zip(batch_cats, endo_list):
    in_dataset = endo.obs["dataset"] == dataset_name
    endo_embedding[in_dataset] = dataset_adata.obsm["X_scanorama"]
endo.obsm["X_scanorama"] = endo_embedding
endo

Found 1000 genes among all datasets
[[0.00000000e+00 8.45306122e-01 4.96420299e-01 7.77777778e-01
  1.92505231e-01 8.41004184e-01 8.02666667e-01 8.80765988e-01
  9.10112360e-01 7.07328386e-01 6.88253012e-01 6.26707132e-01
  7.85714286e-01 8.24662813e-01 7.62566138e-01 8.35868695e-01
  7.87649189e-01 5.00555761e-01 6.17511521e-01 8.84328358e-01
  9.18918919e-01 4.41640379e-01 0.00000000e+00 0.00000000e+00
  5.74458683e-01 7.22222222e-01 7.90322581e-01 5.87719298e-01
  7.79631761e-01]
 [0.00000000e+00 0.00000000e+00 4.15102041e-01 6.66666667e-01
  8.16326531e-04 1.21224490e-01 2.17959184e-01 5.21224490e-01
  3.03370787e-01 1.72653061e-01 9.87755102e-02 2.74693878e-01
  1.95918367e-01 4.11836735e-01 3.67346939e-02 3.66122449e-01
  2.51428571e-01 9.47346939e-01 1.01382488e-01 6.37959184e-01
  1.66931638e-01 5.35510204e-01 0.00000000e+00 0.00000000e+00
  1.02040816e-01 0.00000000e+00 1.45161290e-01 5.63673469e-01
  1.48571429e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 3.33333333e-0

AnnData object with n_obs × n_vars = 97190 × 1000
    obs: 'method', 'sampleID', 'disease', 'dataset', 'sex', 'age', 'smoking_status', 'major_cell_type', 'cellID', 'histology'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'dataset_colors', 'disease_colors', 'hvg', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap', 'X_scanorama'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [11]:
# Save the endo object, now carrying obsm["X_scanorama"], to disk and
# display its summary.
endo_out_path = "2.BatchCorrection/endo_scanorama_unclustered.h5ad"
endo.write_h5ad(endo_out_path)
endo

AnnData object with n_obs × n_vars = 97190 × 1000
    obs: 'method', 'sampleID', 'disease', 'dataset', 'sex', 'age', 'smoking_status', 'major_cell_type', 'cellID', 'histology'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'dataset_colors', 'disease_colors', 'hvg', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap', 'X_scanorama'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'