### Tutorial 4: Use the stCAMBL model to train on the hippocampal dataset to obtain reconstructed features

In this tutorial, we show how to train the stCAMBL model on the hippocampal dataset to obtain reconstructed features, which prepares the data for the subsequent Celina analysis. The relevant data can be obtained from GitHub.

**Import the relevant Python analysis packages**

In [1]:
import os
# R_HOME must be exported BEFORE anything imports rpy2: rpy2 resolves and
# loads the R shared library while it is being imported, so assigning the
# variable afterwards has no effect on which R installation is used.
# Please change this path to your local R environment path
os.environ['R_HOME'] = '/data3/wkcui/env/anaconda3/envs/stCAMBL/lib/R'

import scanpy as sc
import pandas as pd
from sklearn import metrics
import numpy as np
import anndata as ad
import torch
import scipy
import scipy.sparse  # scipy.sparse.csr_matrix is used below; load the submodule explicitly
import matplotlib.pyplot as plt
import warnings
# Silence library warnings for the rest of the notebook (this also hides
# warnings emitted while the two packages below are imported).
warnings.filterwarnings('ignore')
import stCAMBL
import rpy2.robjects as ro

# Train on the fourth GPU when CUDA is available, otherwise on the CPU.
# Adjust the device index to match the GPUs present on your machine.
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')

**Read data and perform data preprocessing**

In [3]:
# Load the scRNA-seq reference .RData file into the embedded R session.
# The R snippets below read `scRNA_count_subset` and `sc_meta_in_subset`,
# which are presumably the objects defined by this file - confirm.
ro.r('load("/data3/yfchen/stCAMBL/data/hp/hippocampus_scRNA_reference.RData")')
# obtain scRNA-seq data and metadata
# `counts` is an R list holding the dense count matrix plus its row names
# (`genes`) and column names (`cells`).
# NOTE(review): `counts` and `meta` are not referenced again in the visible
# part of this notebook.
counts = ro.r("""
    list(
        matrix = as.matrix(scRNA_count_subset),
        genes = rownames(scRNA_count_subset),
        cells = colnames(scRNA_count_subset)
    )
""")
# `meta` is an R list with the cellType, sampleInfo and cellID columns of
# `sc_meta_in_subset`.
meta = ro.r("""
    list(
        cellType = sc_meta_in_subset$cellType,
        sampleInfo = sc_meta_in_subset$sampleInfo,
        cellID = sc_meta_in_subset$cellID
    )
""")
# Load the spatial dataset into the embedded R session; it provides `data_use`.
ro.r('load("/data3/yfchen/stCAMBL/data/hp/starmap_plus_data_use.RData")')


def _positions_in(available_ids, wanted_ids, source_name):
    """Return the position of every entry of `wanted_ids` inside `available_ids`.

    Indexing with the returned integer array both selects the wanted entries
    and puts them in the order of `wanted_ids`. A boolean membership mask
    would instead keep the order of `available_ids`, which silently mislabels
    rows whenever the two orders differ.

    Raises:
        ValueError: if any wanted id is absent from `available_ids`.
        pandas.errors.InvalidIndexError: if `available_ids` has duplicates.
    """
    positions = pd.Index(available_ids).get_indexer(wanted_ids)
    absent = positions < 0
    if absent.any():
        raise ValueError(
            f"{int(absent.sum())} spot id(s) are missing from {source_name}"
        )
    return positions


# obtain spatial transcriptomics data
data_use = ro.r('data_use')
# The row names of the cell-type proportion table define which spots are used
# and in which order; every other table is aligned to this order.
spot_ids = np.array(ro.r('rownames(data_use$celltype_proportion)'))

# get location_use
location_df = data_use.rx2('location_use')
location_use = np.column_stack((
    np.array(location_df.rx2('x')),
    np.array(location_df.rx2('y'))
))

# obtain expression matrix for selected spots (columns reordered to spot_ids)
all_colnames = np.array(ro.r('colnames(data_use$raw_matrix)'))
spot_indices = _positions_in(all_colnames, spot_ids, 'raw_matrix columns')
expr_use = np.array(data_use.rx2('raw_matrix'))[:, spot_indices]

# obtain cell type proportions for selected spots
# spot_ids are exactly the row names of this table, so every row is kept as is.
celltype_use = np.array(data_use.rx2('celltype_proportion'))

# obtain spatial coordinates for selected spots (rows reordered to spot_ids)
location_rows = np.array(ro.r('rownames(data_use$location_use)'))
location_use = location_use[
    _positions_in(location_rows, spot_ids, 'location_use rows'), :
]

# Assemble the AnnData container. The expression matrix is transposed so that
# observations follow `spot_ids`; variables are named after the row names of
# `raw_matrix`.
gene_names = np.array(data_use.rx2('raw_matrix').rownames)
spot_by_gene = scipy.sparse.csr_matrix(expr_use.T)
adata = ad.AnnData(
    X=spot_by_gene,
    obs=pd.DataFrame(index=spot_ids),
    var=pd.DataFrame(index=gene_names),
)

# Ground-truth labels are attached positionally.
# NOTE(review): this assumes the rows of the tab-separated file are in the
# same order as `spot_ids` - confirm against the file.
df_meta = pd.read_csv('/data3/yfchen/stCAMBL/data/hp/ground_truth_hp.csv', sep='\t')
ground_truth_labels = df_meta['layer_guess'].values
adata.obs['ground_truth'] = ground_truth_labels

# Per-spot coordinates and cell-type proportions.
adata.obsm['spatial'] = location_use
adata.obsm['celltype_proportion'] = celltype_use

# Data preprocessing
# Drop genes detected in fewer than 50 spots, then genes with fewer than 10
# total counts. The reconstruction step later reads var['n_counts'], which is
# expected to be written by the min_counts filter.
sc.pp.filter_genes(adata, min_cells=50)
sc.pp.filter_genes(adata, min_counts=10)
adata_X_ori = adata.X.copy()  # keep the raw (un-normalized) counts
sc.pp.normalize_total(adata, target_sum=1e6)
# NOTE(review): scanpy documents flavor="seurat_v3" as expecting raw counts,
# but it is applied here to normalized values. The order is kept as is because
# changing it would alter the selected genes and all downstream results.
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=2000)
# Compute the gene mask once and reuse it for both the raw and normalized data.
hvg_mask = adata.var['highly_variable'].to_numpy(dtype=bool)
adata_X_ori = adata_X_ori[:, hvg_mask]
# Subsetting returns a view; take a real copy so scaling acts on an
# independent object instead of relying on implicit view-to-copy conversion.
adata = adata[:, hvg_mask].copy()
sc.pp.scale(adata, max_value=10)
dataset = 'Hippocampus'

**Perform stCAMBL analysis**

In [4]:
from sklearn.decomposition import PCA
# Reduce the preprocessed expression matrix to 200 principal components with
# a fixed seed. The fitted `pca` object is kept because it is used again
# later to map reconstructed features back to gene space.
pca = PCA(n_components=200, random_state=42)
adata_X = pca.fit_transform(adata.X)

adata.obsm['X_pca'] = adata_X
# NOTE(review): the second argument (12) is presumably the number of spatial
# neighbours used to build the graph - confirm against stCAMBL.
graph_dict = stCAMBL.graph_construction(adata, 12)
model = stCAMBL.stCAMBL(adata.obsm['X_pca'], graph_dict, device=device)
# Begin to train the model
model.train_model(epochs=300, dataset=dataset)
# process() returns five values; only the first two are used. The first is
# stored as the embedding, the second (`defeat`) is later passed to
# pca.inverse_transform, so it is presumably features in PCA space - confirm.
mapgcl_feat, defeat, _, _, _ = model.process()
adata.obsm['emb'] = mapgcl_feat

100%|██████████| 300/300 [01:28<00:00,  3.38it/s]


**Reconstruct features to the original dimension**

In [5]:
def inverse_normalize_total(adata_norm, target_sum=1e6, cell_totals=None):
    """Rescale normalized values back towards count scale.

    Args:
        adata_norm: AnnData-like object exposing ``X`` (dense array or a
            sparse matrix with ``toarray``), ``var`` and ``copy()``.
        target_sum: The total that was used for normalization. Defaults to
            1e6, the value previously hard-coded here.
        cell_totals: Optional one-dimensional sequence with one original
            total per row of ``X``. When given, row ``i`` is multiplied by
            ``cell_totals[i] / target_sum``, which is the exact inverse of a
            per-observation total normalization.

    Returns:
        A copy of ``adata_norm`` whose ``X`` holds the rescaled dense matrix.
        The input object is not modified.

    Raises:
        KeyError: if ``cell_totals`` is None and ``var`` has no ``n_counts``.
        ValueError: if ``cell_totals`` does not have one entry per row.

    NOTE(review): with ``cell_totals=None`` (the original behaviour) each
    column is multiplied by ``var['n_counts'] / target_sum``, i.e. by a
    per-column total. scanpy's ``normalize_total`` scales per observation
    (row), so the default is not its mathematical inverse; pass
    ``cell_totals`` if an exact inverse is required.
    """
    X_norm = adata_norm.X.toarray() if hasattr(adata_norm.X, "toarray") else adata_norm.X

    if cell_totals is None:
        # Legacy path: one factor per column, broadcast across all rows.
        factors = adata_norm.var['n_counts'].values / target_sum
    else:
        totals = np.asarray(cell_totals, dtype=float).reshape(-1)
        if totals.shape[0] != X_norm.shape[0]:
            raise ValueError(
                f"cell_totals has {totals.shape[0]} entries but X has "
                f"{X_norm.shape[0]} rows"
            )
        # Column vector so that every row gets its own factor.
        factors = totals[:, None] / target_sum

    adata_original = adata_norm.copy()
    adata_original.X = X_norm * factors
    return adata_original

def inverse_scale(adata_scaled):
    """Undo a per-column standardization using the stored statistics.

    Reads ``var['mean']`` and ``var['std']`` from ``adata_scaled`` and
    computes ``X * std + mean`` column-wise on a dense version of ``X``.

    Returns:
        A copy of ``adata_scaled`` with ``X`` replaced by the de-standardized
        matrix; the input object is left untouched.
    """
    var_table = adata_scaled.var
    column_mean = var_table['mean'].values
    column_std = var_table['std'].values

    # Sparse matrices are densified before the arithmetic.
    matrix = adata_scaled.X
    if hasattr(matrix, "toarray"):
        matrix = matrix.toarray()
    restored_values = matrix * column_std + column_mean

    restored = adata_scaled.copy()
    restored.X = restored_values
    return restored

# Project the model output from PCA space back to the scaled-gene space and
# make it the working matrix of `adata`.
adata_rec = pca.inverse_transform(defeat)
adata.X = adata_rec

# Undo the preprocessing in reverse order: scaling first, then the total
# normalization.
# NOTE(review): inverse_normalize_total rescales by the per-column
# var['n_counts'] by default - confirm this is the intended inverse.
adata_inv = inverse_normalize_total(inverse_scale(adata))
adata.obsm['X_rec'] = adata_inv.X

# Please replace the save path
# Written as space-delimited text with six decimal places.
np.savetxt(
    f"/data3/yfchen/stCAMBL/hippocampus.txt",
    adata.obsm['X_rec'],
    delimiter=' ',
    fmt='%.6f',
)