In [1]:
from SpaGE.main import SpaGE
import scanpy as sc
import numpy as np
import os
import random
import pandas as pd
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0" # Change to -1 if you want to use CPU!


# Input locations: scRNA-seq reference (10x h5) and Xenium spatial data (h5ad).
sc_data_path = '../dataset/Chromium_FFPE_Human_Breast_Cancer_Chromium_FFPE_Human_Breast_Cancer_count_sample_filtered_feature_bc_matrix.h5'
st_data_path = '../dataset/Xenium_breast_cancer_sample1_replicate1.h5ad'

# Load both datasets; the reference expression matrix is converted to dense.
sc_data = sc.read_10x_h5(sc_data_path)
sc_data.X = sc_data.X.toarray()
original_st_data = sc.read_h5ad(st_data_path)

# Report gene names that appear more than once in the reference ...
is_repeated_gene = sc_data.var_names.duplicated()
duplicates = sc_data.var_names[is_repeated_gene]
print(f"duplicated genes: {duplicates}")
# ... and drop the repeats, keeping the first occurrence of each name.
sc_data = sc_data[:, ~is_repeated_gene]

# Genes measured in both the reference and the spatial dataset; the
# benchmark below samples the genes to hide from this pool.
shared_genes = sc_data.var_names.intersection(original_st_data.var_names)
print(f"Number of shared genes: {len(shared_genes)}")

# Benchmark grid: fraction of shared genes to hide, and repeats per fraction.
num_runs = 5
discard_rates = [0.2, 0.4, 0.6, 0.8]

# Silence all Python warnings from here on (affects everything run below).
import warnings
warnings.filterwarnings('ignore')

# Loop through each discard rate and run 5 times with different random seeds
# Benchmark loop: for every discard rate, hide a random subset of the shared
# genes from the spatial data, impute all missing genes with SpaGE, and save
# the measured + imputed expression as one h5ad file per (rate, run).

# Create the output directory up front so that a missing folder cannot make a
# finished imputation run fail at write time.
output_dir = '/mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed'
os.makedirs(output_dir, exist_ok=True)

for discard_rate in discard_rates:
    for run_index in range(num_runs):
        # run_index is 0-based; show it 1-based so the message reads 1/5 .. 5/5.
        print(f"Processing discard rate: {discard_rate}, run: {run_index+1}/{num_runs}")

        # Inputs are reloaded from disk on every run (as before) — presumably
        # to guard against in-place modification by SpaGE; TODO confirm.
        sc_data_run = sc.read_10x_h5(sc_data_path)
        sc_data_run.X = sc_data_run.X.toarray()
        # Keep only the first occurrence of every duplicated gene name.
        sc_data_run = sc_data_run[:, ~sc_data_run.var_names.duplicated()]

        st_data = sc.read_h5ad(st_data_path)
        st_data.X = st_data.X.toarray()

        # Randomly select the shared genes to hide. The seed depends only on
        # run_index, so run k uses the same random stream at every rate.
        num_genes_to_discard = int(len(shared_genes) * discard_rate)
        random.seed(run_index)
        genes_to_discard = random.sample(list(shared_genes), num_genes_to_discard)

        # Set membership keeps the filter linear in the number of genes;
        # iterating st_data.var_names preserves the original gene order.
        discard_lookup = set(genes_to_discard)
        genes_to_keep = [gene for gene in st_data.var_names if gene not in discard_lookup]

        # Spatial dataset with the hidden genes removed.
        discarded_st_data = st_data[:, genes_to_keep].copy()

        print(f"Discarded {len(genes_to_discard)} genes out of {len(shared_genes)} shared genes")

        # SpaGE is called with cells-by-genes DataFrames.
        sc_df = pd.DataFrame(sc_data_run.X, index=sc_data_run.obs.index, columns=sc_data_run.var.index)
        st_df = pd.DataFrame(discarded_st_data.X, index=discarded_st_data.obs.index, columns=discarded_st_data.var.index)

        imputed_genes = SpaGE(Spatial_data=st_df, RNA_data=sc_df, n_pv=5)

        # Align the imputed rows with the spatial cells by position rather
        # than by rebuilding labels as str(i + 1), which only works when the
        # obs names happen to be "1".."N".
        # NOTE(review): assumes SpaGE returns one row per spatial cell in
        # input order — the previous offset-based relabelling relied on the
        # same assumption.
        if len(imputed_genes) != len(st_df):
            raise ValueError(
                f"SpaGE returned {len(imputed_genes)} rows for {len(st_df)} spatial cells"
            )
        imputed_genes.index = st_df.index

        # Warn when SpaGE also predicted genes that were actually measured.
        overlapping_genes = np.intersect1d(st_df.columns, imputed_genes.columns)
        if len(overlapping_genes) > 0:
            print(f"Warning: Found {len(overlapping_genes)} overlapping genes between original data and imputed genes.")
            print("These genes will be taken from the original data.")

        # Concatenate along columns (genes): measured genes first, imputed after.
        combined_df = pd.concat([st_df, imputed_genes], axis=1)

        # For duplicated columns keep the first occurrence, i.e. the measured one.
        if combined_df.columns.duplicated().any():
            combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]

        print(f"Original data shape: {st_df.shape}")
        print(f"Imputed genes shape: {imputed_genes.shape}")
        print(f"Combined data shape: {combined_df.shape}")

        # Sanity checks: same cells as the input, and exactly the union of genes.
        assert combined_df.shape[0] == st_df.shape[0], "Row count mismatch after concatenation"
        assert combined_df.shape[1] == len(set(st_df.columns) | set(imputed_genes.columns)), "Column count mismatch"
        # Half precision is kept from the original pipeline — presumably to
        # shrink the saved files; note that it reduces numeric precision.
        combined_df = combined_df.astype(np.float16)

        # obs/uns come from the spatial file already loaded for this run, so
        # the file does not need to be read a third time.
        imputed_st_data = sc.AnnData(X=combined_df,
                                     obs=st_data.obs,
                                     uns=st_data.uns
                                     )

        imputed_st_data.uns['discarded_genes'] = genes_to_discard
        # NOTE(review): the key says 'COVET_genes' although it stores the
        # SpaGE-imputed gene names; left unchanged because downstream readers
        # may depend on it.
        imputed_st_data.uns['COVET_genes'] = imputed_genes.columns.tolist()

        # Save the imputed dataset.
        output_path = f'{output_dir}/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_{discard_rate}_random_genes_discarded_{run_index}.h5ad'
        print(f"Run {run_index+1} of {num_runs} Saving to {output_path}")
        imputed_st_data.write_h5ad(output_path)


  utils.warn_names_duplicates("var")


  utils.warn_names_duplicates("var")


duplicated genes: Index(['TBCE', 'HSPA14', 'TMSB15B'], dtype='object')
Number of shared genes: 307
Processing discard rate: 0.2, run: 2/5


Discarded 61 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 252)
Imputed genes shape: (167780, 17833)
Combined data shape: (167780, 18085)


Run 1 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.2_random_genes_discarded_0.h5ad


Processing discard rate: 0.2, run: 3/5


Discarded 61 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 252)
Imputed genes shape: (167780, 17833)
Combined data shape: (167780, 18085)


Run 2 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.2_random_genes_discarded_1.h5ad


Processing discard rate: 0.2, run: 4/5


Discarded 61 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 252)
Imputed genes shape: (167780, 17833)
Combined data shape: (167780, 18085)


Run 3 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.2_random_genes_discarded_2.h5ad


Processing discard rate: 0.2, run: 5/5


Discarded 61 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 252)
Imputed genes shape: (167780, 17833)
Combined data shape: (167780, 18085)


Run 4 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.2_random_genes_discarded_3.h5ad


Processing discard rate: 0.2, run: 6/5


Discarded 61 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 252)
Imputed genes shape: (167780, 17833)
Combined data shape: (167780, 18085)


Run 5 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.2_random_genes_discarded_4.h5ad


Processing discard rate: 0.4, run: 2/5


Discarded 122 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 191)
Imputed genes shape: (167780, 17894)
Combined data shape: (167780, 18085)


Run 1 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.4_random_genes_discarded_0.h5ad


Processing discard rate: 0.4, run: 3/5


Discarded 122 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 191)
Imputed genes shape: (167780, 17894)
Combined data shape: (167780, 18085)


Run 2 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.4_random_genes_discarded_1.h5ad


Processing discard rate: 0.4, run: 4/5


Discarded 122 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 191)
Imputed genes shape: (167780, 17894)
Combined data shape: (167780, 18085)


Run 3 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.4_random_genes_discarded_2.h5ad


Processing discard rate: 0.4, run: 5/5


Discarded 122 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 191)
Imputed genes shape: (167780, 17894)
Combined data shape: (167780, 18085)


Run 4 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.4_random_genes_discarded_3.h5ad


Processing discard rate: 0.4, run: 6/5


Discarded 122 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 191)
Imputed genes shape: (167780, 17894)
Combined data shape: (167780, 18085)


Run 5 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.4_random_genes_discarded_4.h5ad


Processing discard rate: 0.6, run: 2/5


Discarded 184 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 129)
Imputed genes shape: (167780, 17956)
Combined data shape: (167780, 18085)


Run 1 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.6_random_genes_discarded_0.h5ad


Processing discard rate: 0.6, run: 3/5


Discarded 184 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 129)
Imputed genes shape: (167780, 17956)
Combined data shape: (167780, 18085)


Run 2 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.6_random_genes_discarded_1.h5ad


Processing discard rate: 0.6, run: 4/5


Discarded 184 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 129)
Imputed genes shape: (167780, 17956)
Combined data shape: (167780, 18085)


Run 3 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.6_random_genes_discarded_2.h5ad


Processing discard rate: 0.6, run: 5/5


Discarded 184 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 129)
Imputed genes shape: (167780, 17956)
Combined data shape: (167780, 18085)


Run 4 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.6_random_genes_discarded_3.h5ad


Processing discard rate: 0.6, run: 6/5


Discarded 184 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 129)
Imputed genes shape: (167780, 17956)
Combined data shape: (167780, 18085)


Run 5 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.6_random_genes_discarded_4.h5ad


Processing discard rate: 0.8, run: 2/5


Discarded 245 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 68)
Imputed genes shape: (167780, 18017)
Combined data shape: (167780, 18085)


Run 1 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.8_random_genes_discarded_0.h5ad


Processing discard rate: 0.8, run: 3/5


Discarded 245 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 68)
Imputed genes shape: (167780, 18017)
Combined data shape: (167780, 18085)


Run 2 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.8_random_genes_discarded_1.h5ad


Processing discard rate: 0.8, run: 4/5


Discarded 245 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 68)
Imputed genes shape: (167780, 18017)
Combined data shape: (167780, 18085)


Run 3 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.8_random_genes_discarded_2.h5ad


Processing discard rate: 0.8, run: 5/5


Discarded 245 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 68)
Imputed genes shape: (167780, 18017)
Combined data shape: (167780, 18085)


Run 4 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.8_random_genes_discarded_3.h5ad


Processing discard rate: 0.8, run: 6/5


Discarded 245 genes out of 307 shared genes


Genes to predict:  ['A1CF' 'A2M' 'A2ML1' ... 'ZYG11B' 'ZYX' 'ZZEF1']


Spatial_data_scaled NaN values:  0


RNA_data_scaled NaN values:  11720890


Spatial_data_scaled NaN values after filling:  0


RNA_data_scaled NaN values after filling:  0
Common_data NaN values:  0


Original data shape: (167780, 68)
Imputed genes shape: (167780, 18017)
Combined data shape: (167780, 18085)


Run 5 of 5 Saving to /mnt/data-node3/chuangyihan/ssl_in_scg_dataset/SpaGE_imputed/SpaGE_imputed_Xenium_breast_cancer_sample1_replicate1_0.8_random_genes_discarded_4.h5ad
