In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
data_path = '../dataset/Chromium_FFPE_Human_Breast_Cancer_Chromium_FFPE_Human_Breast_Cancer_count_sample_filtered_feature_bc_matrix.h5'
sc_data = sc.read_10x_h5(data_path)
sc_data

sc_data.X = sc_data.X.toarray()
# Randomly sample 80,000 cells
n_cells = sc_data.shape[0]
sample_size = min(80000, n_cells)  # Ensure we don't try to sample more than available cells
random_indices = np.random.choice(n_cells, size=sample_size, replace=False)

# Create a new AnnData object with the sampled cells
sc_data = sc_data[random_indices, :]

# Print to confirm the shape
print(sc_data)
st_data_path = '../dataset/Xenium_breast_cancer_sample1_replicate1.h5ad'
st_data = sc.read_h5ad(st_data_path)
st_data
# 检查 sc_data.var_names 是否有重复项
duplicates = sc_data.var_names[sc_data.var_names.duplicated()]
print(f"重复的基因: {duplicates}")
# 删除 sc_data.var_names 中的重复基因
sc_data = sc_data[:, ~sc_data.var_names.duplicated()]
st_data.X = st_data.X.toarray()

# Define genes to hold out for validation
holdout_genes = ['POSTN', 'IL7R', 'ITGAX', 'ACTA2', 'KRT15', 'VWF', 'FASN', 'CEACAM6']

# Store the original expression values of holdout genes for later comparison
holdout_gene_data = {}
for gene in holdout_genes:
    if gene in st_data.var_names:
        gene_idx = st_data.var_names.get_loc(gene)
        holdout_gene_data[gene] = st_data.X[:, gene_idx].copy()
        print(f"Stored original expression for {gene}")
    else:
        print(f"Gene {gene} not found in spatial data")

# Create a copy of the spatial data without the holdout genes
st_data_holdout = st_data.copy()
st_data_holdout = st_data_holdout[:, ~st_data_holdout.var_names.isin(holdout_genes)]
print(f"Original gene count: {st_data.n_vars}, After holdout: {st_data_holdout.n_vars}")

import tangram as tg
tg.pp_adatas(sc_data, st_data_holdout, genes=None)
ad_map = tg.map_cells_to_space(sc_data, st_data_holdout)
ad_ge = tg.project_genes(ad_map, sc_data)

# Calculate Pearson correlation for holdout genes
correlations = {}
for gene in holdout_genes:
    if gene in ad_ge.var_names and gene in holdout_gene_data:
        imputed_values = ad_ge[:, gene].X.flatten()
        original_values = holdout_gene_data[gene]
        corr = np.corrcoef(imputed_values, original_values)[0, 1]
        correlations[gene] = corr
        print(f"Pearson correlation for {gene}: {corr:.4f}")

# Print average correlation
if correlations:
    avg_corr = np.mean(list(correlations.values()))
    print(f"Average correlation across holdout genes: {avg_corr:.4f}")

imputed_st_data_path = '../dataset/ad_ge_imputed_Xenium_breast_cancer_sample1_replicate1_hold_out.h5ad'
imputed_intermediate_st_data_path = '../dataset/ad_map_imputed_Xenium_breast_cancer_sample1_replicate1_hold_out.h5ad'
ad_ge.X = ad_ge.X.astype(np.float16)
ad_map.X = ad_map.X.astype(np.float16)
ad_ge.uns['hold_out_genes'] = holdout_genes
ad_ge.uns['hold_out_correlations'] = correlations
ad_ge.write_h5ad(imputed_st_data_path)
ad_map.write_h5ad(imputed_intermediate_st_data_path)

  utils.warn_names_duplicates("var")


  utils.warn_names_duplicates("var")


View of AnnData object with n_obs × n_vars = 30365 × 18082
    var: 'gene_ids', 'feature_types', 'genome'


重复的基因: Index(['TBCE', 'HSPA14', 'TMSB15B'], dtype='object')
Stored original expression for POSTN
Stored original expression for IL7R
Stored original expression for ITGAX
Stored original expression for ACTA2
Stored original expression for KRT15
Stored original expression for VWF
Stored original expression for FASN
Stored original expression for CEACAM6


Original gene count: 313, After holdout: 305


INFO:root:298 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.


INFO:root:298 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.


INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.


INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.


INFO:root:Allocate tensors for mapping.


INFO:root:Begin training with 298 genes and rna_count_based density_prior in cells mode...


INFO:root:Printing scores every 100 epochs.


Score: 0.310, KL reg: 0.222


Score: 0.708, KL reg: 0.037


Score: 0.759, KL reg: 0.017


Score: 0.778, KL reg: 0.011


Score: 0.789, KL reg: 0.008


Score: 0.796, KL reg: 0.006


Score: 0.801, KL reg: 0.005


Score: 0.805, KL reg: 0.004


Score: 0.808, KL reg: 0.004


Score: 0.810, KL reg: 0.003


INFO:root:Saving results..
