### R code, generate count data

In [None]:
library(splatter)
library("rhdf5")

# Simulate `n_sims` scRNA-seq count datasets with Splatter and write each one
# to its own HDF5 file (sim1.h5, sim2.h5, ...).
n_sims <- 10

# Dropout rate of every simulated dataset, indexed by simulation number.
all.dropout.rate <- numeric(n_sims)

# Group (cluster) proportions handed to splatSimulate(); one entry per group.
g_prob <- c(0.07560233, 0.07006369, 0.27388535, 0.06037109, 0.18637497, 0.19163667, 0.14206591)

# Per-group DE-factor columns read from rowData(sim): DEFacGroup1..DEFacGroupK,
# where K is the number of groups in `g_prob`.
marker_cols <- paste0("DEFacGroup", seq_along(g_prob))

for (i in seq_len(n_sims)) {
    file <- paste0('.../path/sim', i, ".h5")

    # The seed changes with `i`, so every dataset is a different but
    # reproducible draw.
    sim <- splatSimulate(group.prob = g_prob, nGenes=2500, batchCells=5000,
                     dropout.type="experiment", method='groups', de.downProb = 0.,
                     dropout.shape=-1, dropout.mid=2., de.facScale = 0.3, de.prob = 0.05,
                     seed = 0+i)

    # Counts after dropout, and the counts before dropout was applied; read
    # through the public accessors instead of the object's internal slots.
    counts <- counts(sim)
    truecounts <- assay(sim, "TrueCounts")

    # Extra zeros in the observed counts relative to the true counts, as a
    # fraction of the entries that were non-zero before dropout.
    dropout.rate <- (sum(counts==0)-sum(truecounts==0))/sum(truecounts>0)
    # Keep the value; previously it was computed and then discarded.
    all.dropout.rate[i] <- dropout.rate

    marker <- rowData(sim)[marker_cols]

    h5write(as.matrix(marker), file, "mark")
    h5write(as.matrix(counts), file, "X")
    h5write(as.matrix(truecounts), file, "X_true")
    # Group is converted with as.integer(); the Python step subtracts 1 from
    # these values to obtain 0-based labels.
    h5write( as.integer(colData(sim)$Group), file, "Y")
}

### Python code: assign count data to the spatial spots

In [1]:
import numpy as np
import pandas as pd
import h5py
import scanpy as sc
import seaborn as sns
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import os
import pdb

In [2]:
# Load the per-spot table (one row per spot) from CSV.
neighbor_file = '.../path/neighbor.csv'
df = pd.read_csv(filepath_or_buffer=neighbor_file)

In [3]:
# Column guide:
#   y       cluster label of the spot
#   N1..N*  indices of the neighbors of spot i (-1 means no such neighbor)
df.head(2)

Unnamed: 0,y,N1,N2,N3,N4,N5,N6
0,2,393,1051,3516,2106,480,2311
1,0,685,1604,2316,-1,-1,-1


In [6]:
# Number of spots per cluster label (ordered by sorted label value) and the
# total number of spots.
y_values, y_count_gt = np.unique(df['y'], return_counts=True)
N = len(df)
print(y_count_gt)

[273 253 989 218 673 692 513]


In [None]:
folder = '.../path'
# A gene is kept as a marker if its DE factor exceeds this value in any group.
mark_level = 1.5
# Number of simulated datasets produced by the R step (sim1.h5 ... sim10.h5).
# The loop previously used range(1, 10) and therefore skipped sim10.
n_sims = 10
# One spot count per cluster label, so this is also the number of clusters.
n_groups = len(y_count_gt)

for i in range(1, n_sims + 1):
    data_file = '{}/sim{}.h5'.format(folder, i)
    # Context manager guarantees the file is closed even if a read fails.
    with h5py.File(data_file, 'r') as data_mat:
        # NOTE(review): the indexing below treats rows of X as cells and
        # columns as genes, and `mark` as (groups, genes) before transposing
        # -- confirm against how rhdf5 laid out the arrays.
        x = np.array(data_mat['X'])
        label = np.array(data_mat['Y'])
        marker = np.array(data_mat['mark'])

    # Labels are stored 1-based by the R step; make them 0-based to match df['y'].
    label = label - 1

    # For every cluster, sample exactly as many simulated cells as there are
    # spatial spots with that cluster label, and place them on those spots.
    idx = np.full(N, -1, dtype=int)
    for c in range(n_groups):
        sub = np.where(label == c)[0]
        # Re-seeded per class on purpose: this reproduces the exact cells
        # chosen by the original version of this script.
        np.random.seed(i)
        chosen = np.random.choice(sub, size=y_count_gt[c], replace=False)
        # `df` is the spot table that y_count_gt and N were derived from; the
        # previous name `df_c_nb` is not defined anywhere in this notebook.
        idx[(df['y'] == c).to_numpy()] = chosen

    # A remaining -1 would silently select the last cell, so fail loudly.
    if (idx < 0).any():
        raise ValueError('some spots were not assigned a simulated cell')

    x_sub = x[idx, :]
    label_sub = label[idx]

    # Keep only genes whose DE factor exceeds mark_level in at least one group.
    marker = marker.T
    is_marker = marker > mark_level
    mask = is_marker.any(axis=1)
    marker1 = is_marker[mask]

    x_marker = x_sub[:, mask]

    with h5py.File('{}/sim_data_{}.h5'.format(folder, i), 'w') as h5:
        h5.create_dataset('genes', data=x_marker)
        h5.create_dataset('rho', data=marker1.astype(int))
        h5.create_dataset('group', data=label_sub)
        # Neighbor indices: every column from 'N1' onward.
        h5.create_dataset('Z_neighbor_idx', data=df.loc[:, 'N1':].values)