# Tutorial for simulated data
This tutorial provides step-by-step instructions for reproducing the simulation results presented in our paper. Each simulated dataset consists of two modalities: one representing the transcriptome and the other representing the proteome. Every dataset contains multiple ground-truth patterns—some shared between both modalities and others unique to a single modality. The simulated data are then used as input to our algorithm, which produces one of two types of output depending on user preference: domain clustering or interpretable dimension reduction. Each is demonstrated in its own section below.

# Domain clustering


In [None]:
# SpaMV results
import os
import sys
import anndata
from sklearn.metrics import adjusted_rand_score

# Resolve the directory the notebook is being run from.
current_dir = os.getcwd()
# Prepend the directory TWO levels above the working directory (its grandparent,
# not its parent) to sys.path so that the `SpaMV` package imported below can be
# found. NOTE(review): presumably this is the repository root -- confirm against
# the repository layout.
sys.path.insert(0, os.path.dirname(os.path.dirname(current_dir)))
from SpaMV.spamv import SpaMV
from SpaMV.utils import clr_normalize_each_cell, mclust
import scanpy as sc
import matplotlib.pyplot as plt

N_PCS = 50  # number of principal components kept for each modality
N_CLUSTERS = 10  # number of clusters requested from mclust
omics_names = ['Omics 1', 'Omics 2']  # identical for every dataset, so built once


def _pca_reduced(adata):
    """Run PCA on *adata* and return a new AnnData whose X is the PCA scores.

    The input's `obs` and `obsm` are carried over to the returned object, so
    anything stored there (e.g. the coordinates used by the spatial plot
    below) stays available after the reduction.
    """
    sc.pp.pca(adata, n_comps=N_PCS)
    return anndata.AnnData(adata.obsm['X_pca'], obs=adata.obs, obsm=adata.obsm)


for dataset in ['1_Simulation', '2_Simulation', '3_Simulation']:
    data_rna = sc.read_h5ad('Data/' + dataset + '/adata_RNA.h5ad')
    data_pro = sc.read_h5ad('Data/' + dataset + '/adata_ADT.h5ad')

    # Omics 1: total-count normalisation and log1p, then PCA.
    sc.pp.normalize_total(data_rna)
    sc.pp.log1p(data_rna)
    data_rna = _pca_reduced(data_rna)

    # Omics 2: normalisation with SpaMV's clr_normalize_each_cell, then PCA.
    data_pro = clr_normalize_each_cell(data_pro)
    data_pro = _pca_reduced(data_pro)

    # Optional experiment tracking (left disabled):
    # wandb.init(project=dataset)
    # wandb.login()
    model = SpaMV([data_rna, data_pro], interpretable=False, omics_names=omics_names)
    model.train(dataset)
    # wandb.finish()

    # Store the learned embedding and cluster on it.
    # NOTE(review): mclust presumably writes its labels to obs['SpaMV'] (that
    # column is read below) -- confirm against SpaMV.utils.mclust.
    data_rna.obsm['SpaMV'] = model.get_embedding()
    mclust(data_rna, n_clusters=N_CLUSTERS, key='SpaMV')

    # Agreement between obs['cluster'] (presumably the simulated ground-truth
    # labels shipped with the data -- TODO confirm) and the SpaMV clustering.
    ari = adjusted_rand_score(data_rna.obs['cluster'], data_rna.obs['SpaMV'])

    fig, axes = plt.subplots(1, 2, figsize=(8, 4))  # 1 row, 2 columns

    # Left panel: UMAP computed from the SpaMV embedding.
    sc.pp.neighbors(data_rna, use_rep='SpaMV')
    sc.tl.umap(data_rna)
    sc.pl.umap(data_rna, color='SpaMV', ax=axes[0], show=False, legend_loc='none', s=20, title='UMAP')

    # Right panel: the same labels drawn on the 'spatial' basis, with the ARI in the title.
    sc.pl.embedding(
        data_rna,
        color='SpaMV',
        basis='spatial',
        s=200,
        show=False,
        title='SpaMV on {}\nARI: {:.3f}'.format(dataset, ari),
        ax=axes[1],
    )
    plt.tight_layout()
    plt.show()

# Interpretable dimension reduction

In [None]:
# SpaMV results
import os
import sys
import numpy as np

# Resolve the directory the notebook is being run from.
current_dir = os.getcwd()
# Prepend the directory TWO levels above the working directory (its grandparent,
# not its parent) to sys.path so that the `SpaMV` package imported below can be
# found. NOTE(review): presumably this is the repository root -- confirm against
# the repository layout.
sys.path.insert(0, os.path.dirname(os.path.dirname(current_dir)))

import torch
from SpaMV.spamv import SpaMV
from SpaMV.utils import clr_normalize_each_cell, plot_embedding_results
import scanpy as sc

omics_names = ['Omics 1', 'Omics 2']


def softmax(x):
    """Return the softmax of the 1-D array-like *x*.

    The maximum of *x* is subtracted before exponentiating for numerical
    stability; this does not change the result.
    """
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()


for dataset in ['1_Simulation', '2_Simulation', '3_Simulation']:
    print('dataset:', dataset)

    # Omics 1: total-count normalisation and log1p. No PCA here: the
    # interpretable model is given the normalised feature matrices directly.
    data_rna = sc.read_h5ad('Data/' + dataset + '/adata_RNA.h5ad')
    sc.pp.normalize_total(data_rna)
    sc.pp.log1p(data_rna)

    # Omics 2: normalisation with SpaMV's clr_normalize_each_cell.
    data_pro = sc.read_h5ad('Data/' + dataset + '/adata_ADT.h5ad')
    data_pro = clr_normalize_each_cell(data_pro)

    # Optional experiment tracking (left disabled):
    # wandb.init(project=dataset + '_interpretable')
    # wandb.login()

    model = SpaMV([data_rna, data_pro], zs_dim=10, zp_dims=[10, 10], betas=[3, 3], interpretable=True, omics_names=omics_names)
    model.train(dataset)
    # wandb.finish()

    # NOTE(review): z is used as a pandas DataFrame (row-wise .apply) and w as
    # an indexable pair w[0], w[1] -- confirm the exact return types against
    # SpaMV.get_embedding_and_feature_by_topic.
    z, w = model.get_embedding_and_feature_by_topic(threshold=.1)

    # Normalise each row of z with a softmax so that its entries sum to 1.
    z = z.apply(softmax, axis=1)
    plot_embedding_results([data_rna, data_pro], omics_names, z, w, save=False, show=True, size=350)

    # Uncomment to save the embedding and per-omics feature-by-topic tables:
    # z.to_csv('../../Results/' + dataset + '/SpaMV_z.csv', index=False)
    # w[0].to_csv('../../Results/' + dataset + '/SpaMV_w0.csv')
    # w[1].to_csv('../../Results/' + dataset + '/SpaMV_w1.csv')