In [2]:
import numpy as np
import pandas as pd
import squidpy as sq
import scanpy as sc

# 假设你已经有 simulated_adata 和 real_adata
simulated_adata = sc.read_h5ad("/Users/chen_yiru/Desktop/simulation/data/simulated/simulated_Sample_data_151676.h5ad")
real_adata = sc.read_h5ad("/Users/chen_yiru/Desktop/simulation/data/raw/Sample_data_151676.h5ad")

sc.pp.normalize_total(real_adata)
sc.pp.log1p(real_adata)
sc.pp.normalize_total(simulated_adata)
sc.pp.log1p(simulated_adata)


#计算highly variable genes
sc.pp.highly_variable_genes(real_adata, n_top_genes=2000)
sc.pp.highly_variable_genes(simulated_adata, n_top_genes=2000)

# 获取真实数据和模拟数据中的高度可变基因集
real_hvg_genes = set(real_adata.var[real_adata.var.highly_variable].index)
sim_hvg_genes = set(simulated_adata.var[simulated_adata.var.highly_variable].index)

# 取交集，确保在两者中都高度可变
common_genes = list(real_hvg_genes.intersection(sim_hvg_genes))

# 如果交集基因数量过少，可以考虑增加基因集的大小或改变选择方式
print(f"Number of common highly variable genes: {len(common_genes)}")

# 如果交集基因少于一定阈值，你可以调整策略，比如增加基因集大小或选择所有基因
if len(common_genes) < 100:
    print("Warning: The number of common highly variable genes is low. Consider adjusting the selection criteria.")
    
# 使用 Squidpy 计算真实数据的空间邻居
sq.gr.spatial_neighbors(real_adata)

# 计算真实数据的 Moran's I 只对交集基因进行操作
sq.gr.spatial_autocorr(
    real_adata,
    mode="moran",
    genes=common_genes,
    n_perms=100,
    n_jobs=1
)

# 获取真实数据的 Moran's I 结果
real_moran = real_adata.uns["moranI"][["I"]].copy()

# 对模拟数据做相同的处理
sq.gr.spatial_neighbors(simulated_adata)

# 计算模拟数据的 Moran's I 只对交集基因进行操作
sq.gr.spatial_autocorr(
    simulated_adata,
    mode="moran",
    genes=common_genes,
    n_perms=100,
    n_jobs=1
)

# 获取模拟数据的 Moran's I 结果
sim_moran = simulated_adata.uns["moranI"][["I"]].copy()

# 合并真实数据和模拟数据的 Moran's I 结果
moran_comparison = pd.DataFrame({
    'real_moran': real_moran['I'].values,
    'sim_moran': sim_moran['I'].values
}, index=real_moran.index)

# 计算 Moran's I 之间的差异
moran_comparison['diff'] = np.abs(moran_comparison['real_moran'] - moran_comparison['sim_moran'])

# 输出前几行查看结果
print(moran_comparison.head())

# 计算 Pearson 相关系数来评估整体相似度
pearson_corr = moran_comparison['real_moran'].corr(moran_comparison['sim_moran'])
print(f"Pearson correlation between real and simulated Moran's I: {pearson_corr:.4f}")


# 计算 RMSE
rmse = np.sqrt(np.mean((moran_comparison['real_moran'] - moran_comparison['sim_moran'])**2))

# 计算 MAE
mae = np.mean(np.abs(moran_comparison['real_moran'] - moran_comparison['sim_moran']))

print(f"Pearson correlation between real and simulated Moran's I: {pearson_corr:.4f}")
print(f"RMSE between real and simulated Moran's I: {rmse:.4f}")
print(f"MAE between real and simulated Moran's I: {mae:.4f}")

# 可视化部分也相应更新
plt.figure(figsize=(6, 6))
plt.scatter(moran_comparison['real_moran'], moran_comparison['sim_moran'], alpha=0.7)
plt.plot([min(moran_comparison['real_moran']), max(moran_comparison['real_moran'])],
         [min(moran_comparison['real_moran']), max(moran_comparison['real_moran'])],
         color='red', linestyle='--')
plt.xlabel("Real Moran's I")
plt.ylabel("Simulated Moran's I")
plt.title(f"Real vs Simulated Moran's I\nPearson correlation: {pearson_corr:.4f}\nRMSE: {rmse:.4f}, MAE: {mae:.4f}")
plt.show()


# [前面的代码保持不变，直到计算common_genes]

# 按Ground Truth分组计算局部Moran's I
domain_results = []
for domain in real_adata.obs["Ground Truth"].unique():
    # 获取该domain的细胞
    real_domain = real_adata[real_adata.obs["Ground Truth"] == domain].copy()
    sim_domain = simulated_adata[simulated_adata.obs["Ground Truth"] == domain].copy()
    
    if len(real_domain) < 2 or len(sim_domain) < 2:  # 跳过细胞数太少的domain
        continue
        
    # 计算空间邻居和Moran's I
    sq.gr.spatial_neighbors(real_domain)
    sq.gr.spatial_neighbors(sim_domain)
    
    sq.gr.spatial_autocorr(
        real_domain,
        mode="moran",
        genes=common_genes,
        n_perms=100,
        n_jobs=1
    )
    
    sq.gr.spatial_autocorr(
        sim_domain,
        mode="moran",
        genes=common_genes,
        n_perms=100,
        n_jobs=1
    )
    
    # 获取该domain的Moran's I结果
    real_domain_moran = real_domain.uns["moranI"][["I"]].copy()
    sim_domain_moran = sim_domain.uns["moranI"][["I"]].copy()
    
    # 计算该domain的评估指标
    domain_pearson = real_domain_moran['I'].corr(sim_domain_moran['I'])
    domain_rmse = np.sqrt(np.mean((real_domain_moran['I'] - sim_domain_moran['I'])**2))
    domain_mae = np.mean(np.abs(real_domain_moran['I'] - sim_domain_moran['I']))
    
    domain_results.append({
        'domain': domain,
        'pearson': domain_pearson,
        'rmse': domain_rmse,
        'mae': domain_mae,
        'n_cells': len(real_domain)
    })

# 转换为DataFrame并展示结果
domain_df = pd.DataFrame(domain_results)
print("\nDomain-specific results:")
print(domain_df)

# [保留原有的全局Moran's I计算和可视化代码]

# 添加按domain的boxplot可视化
plt.figure(figsize=(12, 6))
for domain in domain_df['domain']:
    real_domain = real_adata[real_adata.obs["Ground Truth"] == domain]
    sim_domain = simulated_adata[simulated_adata.obs["Ground Truth"] == domain]
    
    real_moran_values = real_domain.uns["moranI"]['I']
    sim_moran_values = sim_domain.uns["moranI"]['I']
    
    plt.boxplot([real_moran_values, sim_moran_values], 
                positions=[domain_df.index.get_loc(domain)*3, domain_df.index.get_loc(domain)*3+1],
                labels=['Real', 'Sim'])

plt.title("Moran's I Distribution by Domain")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


Number of common highly variable genes: 1602


  0%|          | 0/100 [00:00<?, ?/s]

  0%|          | 0/100 [00:00<?, ?/s]

         real_moran  sim_moran      diff
MBP        0.793287   0.726031  0.067256
SCGB2A2    0.733647   0.673637  0.060010
PLP1       0.697791   0.589404  0.108387
GFAP       0.602681   0.572352  0.030329
SCGB1D2    0.589269   0.521684  0.067585
Pearson correlation between real and simulated Moran's I: 0.9991
Pearson correlation between real and simulated Moran's I: 0.9991
RMSE between real and simulated Moran's I: 0.0054
MAE between real and simulated Moran's I: 0.0016


NameError: name 'plt' is not defined