In [1]:
import scanpy as sc



In [2]:
# Locations of the preprocessed lymph-node datasets, relative to this notebook.
SC_H5AD = '../../../scvi-tools-DestVI/data/sc_lymph_node_preprocessed.h5ad'
ST_H5AD = '../../../scvi-tools-DestVI/data/st_lymph_node_preprocessed.h5ad'

# Read the single-cell reference first, then the spatial transcriptomics data.
sc_data, st_data = (sc.read_h5ad(h5ad_path) for h5ad_path in (SC_H5AD, ST_H5AD))

print(sc_data)

AnnData object with n_obs × n_vars = 14989 × 1888
    obs: 'n_genes', 'cell_types', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'pred_cell_types', 'doublet_scores', 'doublet_predictions', 'MS', 'louvain_r0.5', 'louvain_r0.7', 'louvain_r1.0', 'leiden_r0.5', 'leiden_r0.7', 'leiden_r1.0', 'DC_A', 'DC_B', 'mono_1', 'mono_2', 'louvain_sub_0.1', 'louvain_sub_0.2', 'louvain_sub_0.3', 'louvain_sub', 'louvain_sub_1', 'louvain_sub_2', 'louvain_sub_3', 'SCANVI_pred_cell_types', 'SCVI_pred_cell_types', 'broad_cell_types'
    var: 'gene_ids-0', 'genome-0', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'n_counts', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'batch_colors', 'hvg', 'leiden', 'leiden_r1.0_colors', 'log1p', 'louvain', 'louvain_r0.5_colors', 'louvain_r0.7_colors', 'louvain_r1.0_colors', 'louvain_sub_0.2_colors', 'louvain_sub_0.3_colors', 'louvain_sub_1_colo

In [3]:
import scanpy as sc
import numpy as np

# Library-size normalisation and log1p are left disabled here.
# NOTE(review): sc_data.uns already contains a 'log1p' entry, which suggests the
# matrix was log-transformed upstream -- confirm before re-enabling these.
# sc.pp.normalize_total(sc_data, target_sum=1e4)
# sc.pp.log1p(sc_data)  # optional log-normalisation

# Work on a dense array: AnnData may store X as a sparse matrix.
X = sc_data.X
if not isinstance(X, np.ndarray):
    X = X.toarray()

# Global min-max scaling to [0, 1] (one min/max over the whole matrix,
# not per gene).
x_min, x_max = X.min(), X.max()
if x_max == x_min:
    # A constant matrix would make the denominator zero and silently fill X
    # with NaN/inf, which would only surface later as a NaN training loss.
    raise ValueError(
        "Cannot min-max scale sc_data.X: all entries are equal "
        f"(min == max == {x_min})."
    )
X = (X - x_min) / (x_max - x_min)

In [4]:
# sc_data.obs['cell_types'].unique

In [None]:
from models.vae import VAE
import torch
from torch.utils.data import DataLoader, TensorDataset

# Fix the RNG so weight initialisation and batch shuffling are repeatable
# across "Restart & Run All".
SEED = 0
torch.manual_seed(SEED)

# X is the single-cell expression matrix, min-max scaled in the cell above.
X_tensor = torch.tensor(X, dtype=torch.float32)

# X_tensor is assumed to have shape [num_cells, num_genes]; wrap it so the
# DataLoader can serve shuffled mini-batches.
dataset = TensorDataset(X_tensor)
data_loader = DataLoader(dataset, batch_size=128, shuffle=True)

vae = VAE(input_dim=X_tensor.shape[1], hidden_dim=128, latent_dim=15)
optimizer = torch.optim.Adam(vae.parameters(), lr=1e-3)
# Runs the training loop (prints one loss line per epoch).
vae.train_model(data_loader, optimizer, num_epochs=50)

# Final forward pass over all cells to extract the latent parameters.
# Switch to eval mode and disable autograd: this is inference, so any
# train-only layers must be inactive and no graph should be built for the
# full dataset.
vae.eval()
with torch.no_grad():
    recon_x, mean, log_var = vae(X_tensor)

epoch 0, loss 4717.75048828125
epoch 1, loss 4235.58740234375
epoch 2, loss 4027.042724609375
epoch 3, loss 4533.4560546875
epoch 4, loss 4492.40576171875
epoch 5, loss 3857.777587890625
epoch 6, loss 3884.084716796875
epoch 7, loss 3879.18505859375
epoch 8, loss 4586.4140625
epoch 9, loss 3762.984130859375
epoch 10, loss 4056.07275390625
epoch 11, loss 4031.45166015625
epoch 12, loss 3786.229736328125
epoch 13, loss 4058.63720703125
epoch 14, loss 3989.249267578125
epoch 15, loss 4102.36328125
epoch 16, loss 3780.335693359375
epoch 17, loss 5013.81982421875
epoch 18, loss 3890.435546875
epoch 19, loss 4228.0419921875
epoch 20, loss 3782.214599609375
epoch 21, loss 3725.918212890625
epoch 22, loss 3864.86083984375
epoch 23, loss 3812.39794921875
epoch 24, loss 3808.947021484375
epoch 25, loss 4134.3388671875
epoch 26, loss 4204.74072265625
epoch 27, loss 3802.994873046875
epoch 28, loss 3772.408447265625
epoch 29, loss 3784.59814453125
epoch 30, loss 3902.76220703125
epoch 31, loss 431

上一单元格中 `recon_x, mean, log_var = vae(X_tensor)` 这行代码的三个返回值含义如下：

- `recon_x`：VAE 重构后的表达矩阵（即用潜在变量还原出来的输入），形状和输入一样。
- `mean`：每个细胞在潜在空间的均值向量（编码器输出）。
- `log_var`：每个细胞在潜在空间的对数方差向量（编码器输出）。

**总结：**  
- `recon_x` 用于衡量重构效果（和原始输入对比）。
- `mean` 和 `log_var` 是潜在空间的参数，可用于后续特征提取或下游分析。

In [11]:
import pandas as pd
# Cell-type label of every cell, in the same row order as the matrix fed to the VAE.
cell_types = sc_data.obs['cell_types'].values
# Latent means as a NumPy array (detached from autograd and moved to the CPU).
mean_np = mean.detach().cpu().numpy()

# Average the latent mean vectors of all cells that share a cell-type label.
df = pd.DataFrame(mean_np)
df['cell_types'] = cell_types
# observed=True keeps only labels that actually occur; with a categorical
# column the default would emit all-NaN rows for unused categories.
type_latent_df = df.groupby('cell_types', observed=True).mean()
# Row i of type_latent belongs to type_names[i].
type_names = type_latent_df.index.to_numpy()
type_latent = type_latent_df.values  # shape: [num_types, latent_dim]

In [7]:
# Print the summary of the spatial AnnData object (dimensions and annotation keys).
print(st_data)

AnnData object with n_obs × n_vars = 1092 × 1888
    obs: 'in_tissue', 'array_row', 'array_col', 'batch', 'LN', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'n_counts', 'leiden', 'lymph_node'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells'
    uns: 'LN_colors', 'log1p'
    obsm: 'X_pca', 'X_umap', 'location', 'modules', 'spatial'
    layers: 'counts'


In [12]:
# 2. Prepare the spatial-transcriptomics (ST) expression matrix.
st_X = st_data.X
if not isinstance(st_X, np.ndarray):
    st_X = st_X.toarray()  # densify a sparse matrix

# Global min-max scaling to [0, 1].
# NOTE(review): the ST matrix is scaled with its own min/max, independently of
# the single-cell matrix -- confirm the two are meant to be on separate scales.
st_min, st_max = st_X.min(), st_X.max()
if st_max == st_min:
    # A constant matrix would make the denominator zero and yield NaN/inf.
    raise ValueError(
        "Cannot min-max scale st_data.X: all entries are equal "
        f"(min == max == {st_min})."
    )
st_X = (st_X - st_min) / (st_max - st_min)
st_tensor = torch.tensor(st_X, dtype=torch.float32)

# 3. Build the spatial adjacency matrix between spots.
import squidpy as sq

# Neighbour graph from the coordinates in obsm['spatial']; n_neighs can be
# tuned to the data.
sq.gr.spatial_neighbors(st_data, coord_type="generic", n_neighs=6, spatial_key="spatial")

# squidpy stores the graph in obsp['spatial_connectivities'] (sparse);
# convert it to a dense float tensor for the GCN.
# NOTE(review): whether GCN adds self-loops / normalises the adjacency
# internally is not visible here -- verify against models/gcn.py.
adjacency_matrix = st_data.obsp['spatial_connectivities'].toarray()
adj_tensor = torch.tensor(adjacency_matrix, dtype=torch.float32)
print(adjacency_matrix.shape)

# 4. Initialise and run the GCN.
from models.gcn import GCN

# type_latent is [num_types, latent_dim]. The output is meant to be a per-spot
# cell-type composition, so the output width must be the number of cell types
# (axis 0), not the latent dimension (axis 1).
n_cell_types = type_latent.shape[0]
gcn = GCN(input_dim=st_tensor.shape[1], hidden_dim=64, output_dim=n_cell_types)
# NOTE(review): the GCN is constructed and evaluated without any training step
# in this notebook, so the result below comes from its initial weights only.
gcn.eval()
with torch.no_grad():
    deconv_result = gcn(st_tensor, adj_tensor)

# 5. deconv_result holds, for every spatial spot, one score per cell type.

(1092, 1092)


In [9]:
# Dimensions of the GCN output tensor.
deconv_result.shape

torch.Size([1092, 15])

In [10]:
# Peek at the first five rows of the GCN output.
deconv_result[:5]

tensor([[0.0877, 0.0718, 0.0321, 0.0530, 0.0558, 0.0439, 0.0599, 0.0564, 0.0602,
         0.1136, 0.0587, 0.0918, 0.0687, 0.0307, 0.1156],
        [0.1150, 0.0725, 0.0332, 0.0523, 0.0474, 0.0367, 0.0631, 0.0444, 0.0634,
         0.1116, 0.0534, 0.1017, 0.0551, 0.0321, 0.1179],
        [0.1020, 0.0650, 0.0404, 0.0517, 0.0529, 0.0424, 0.0643, 0.0526, 0.0626,
         0.1145, 0.0582, 0.1034, 0.0548, 0.0354, 0.0998],
        [0.1061, 0.0682, 0.0411, 0.0567, 0.0498, 0.0411, 0.0709, 0.0496, 0.0664,
         0.1093, 0.0576, 0.0946, 0.0497, 0.0471, 0.0915],
        [0.1008, 0.0643, 0.0431, 0.0566, 0.0557, 0.0449, 0.0655, 0.0546, 0.0623,
         0.1017, 0.0571, 0.0937, 0.0578, 0.0404, 0.1015]])