In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import os

# Generate loom files

In [2]:
# Convert each simulated dataset (Sim1 ... Sim10) into a filtered loom file.
for seed in range(1, 11):
    base_dir = './Sim/Sim%d' % seed

    # Tab-separated inputs for this simulation run.
    count = pd.read_table('%s/counts.txt' % base_dir)
    cellinfo = pd.read_table('%s/cellinfo.txt' % base_dir)
    geneinfo = pd.read_table('%s/geneinfo.txt' % base_dir)

    # Keep only the cell annotations used downstream, under the column names
    # ('batch', 'celltype') that the later cells of this notebook query.
    obs_table = cellinfo[['Batch', 'Group', 'ExpLibSize']]
    obs_table = obs_table.rename(columns={'Batch': 'batch', 'Group': 'celltype'})

    # The raw values of the count table become the AnnData matrix.
    adata = sc.AnnData(count.values)
    adata.obs = obs_table
    adata.var = geneinfo
    adata.obs_names.name = 'CellID'
    adata.var_names.name = 'Gene'

    # Basic quality filtering: cells need >= 200 genes, genes need >= 3 cells.
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)

    loom_path = '%s/Sim%d_raw.loom' % (base_dir, seed)
    adata.write_loom(loom_path)

# Generate subsampled files for different scenarios

In [3]:
# Fractions of Batch1 cells kept in the batch-imbalance scenario (1/2, 1/4, 1/8).
batch1_prop_cells = [1 / denom for denom in (2, 4, 8)]
# Fractions of Group1 cells kept in the rare-cell-type scenario (1/2, 1/5, 1/10).
rare1_prop_cells = [1 / denom for denom in (2, 5, 10)]
# Numbers of cell types shared by both batches in the partial-overlap scenario.
common_cells = [5, 3, 1]

In [4]:
# Partial-overlap scenario: for each simulation keep `number` cell types in
# both batches and split the remaining ones (out of Group1..Group7) into a
# Batch1-only half and a Batch2-only half.
for seed in range(1, 11):
    base_dir = './Sim/Sim%d' % seed
    adata = sc.read_loom('%s/Sim%d_raw.loom' % (base_dir, seed))
    for number in common_cells:
        out_dir = '%s/common_%d' % (base_dir, number)
        # exist_ok keeps the cell re-runnable; a bare os.mkdir raises
        # FileExistsError as soon as the outputs of a previous run exist.
        os.makedirs(out_dir, exist_ok=True)

        # Cell types present in both batches: Group1 .. Group<number>.
        common_ct = ['Group%d' % i for i in range(1, number+1)]
        index0 = adata.obs_names[adata.obs.celltype.isin(common_ct)]

        specific_number = 7 - number
        # First group index that is exclusive to Batch2 (integer arithmetic,
        # no float round-trip).
        split = number + 1 + specific_number // 2

        # Get specific cell types for Batch1
        specific_ct = ['Group%d' % i for i in range(number+1, split)]
        index1 = adata.obs_names[adata.obs.celltype.isin(specific_ct) & (adata.obs.batch=='Batch1')]
        # Get specific cell types for Batch2
        specific_ct = ['Group%d' % i for i in range(split, 7+1)]
        index2 = adata.obs_names[adata.obs.celltype.isin(specific_ct) & (adata.obs.batch=='Batch2')]

        adata_sub = adata[index0.union(index1).union(index2)]
        # Sanity check: cell type x batch contingency table of the subset.
        print(pd.crosstab(adata_sub.obs.celltype, adata_sub.obs.batch))
        adata_sub.write_loom('%s/Sim%d_raw.loom' % (out_dir, seed))

batch     Batch1  Batch2
celltype                
Group1       229     247
Group2       241     263
Group3       227     224
Group4       227     243
Group5       232     232
Group6       194       0
Group7         0     223
batch     Batch1  Batch2
celltype                
Group1       229     247
Group2       241     263
Group3       227     224
Group4       227       0
Group5       232       0
Group6         0     201
Group7         0     223
batch     Batch1  Batch2
celltype                
Group1       229     247
Group2       241       0
Group3       227       0
Group4       227       0
Group5         0     232
Group6         0     201
Group7         0     223
batch     Batch1  Batch2
celltype                
Group1       246     209
Group2       244     225
Group3       237     220
Group4       221     223
Group5       210     232
Group6       209       0
Group7         0     217
batch     Batch1  Batch2
celltype                
Group1       246     209
Group2       244     225


In [5]:
# Rare-cell-type scenario: randomly drop Group1 cells so that only a fraction
# `number` of them remains; every other cell type is left untouched.
for seed in range(1,11):
    base_dir = './Sim/Sim%d' % seed
    adata = sc.read_loom('%s/Sim%d_raw.loom' % (base_dir, seed))
    # Local generator seeded with the simulation index so the dropped cells
    # are the same on every Restart & Run All (the global np.random state was
    # never seeded).
    rng = np.random.RandomState(seed)
    group1_cells = adata.obs_names[adata.obs.celltype == 'Group1']
    total = len(group1_cells)
    for number in rare1_prop_cells:
        out_dir = '%s/rare1_%.1f' % (base_dir, number)
        # exist_ok keeps the cell re-runnable; a bare os.mkdir raises
        # FileExistsError as soon as the outputs of a previous run exist.
        os.makedirs(out_dir, exist_ok=True)
        # Names of the Group1 cells to remove (a fraction 1 - number of them).
        obs_names = rng.choice(group1_cells, int(total*(1-number)), replace=False)
        adata_sub = adata[~adata.obs_names.isin(obs_names)]
        print(adata_sub.obs.celltype.value_counts())
        adata_sub.write_loom('%s/Sim%d_raw.loom' % (out_dir, seed))

Group2    504
Group4    470
Group5    464
Group7    458
Group3    451
Group6    395
Group1    238
Name: celltype, dtype: int64
Group2    504
Group4    470
Group5    464
Group7    458
Group3    451
Group6    395
Group1     96
Name: celltype, dtype: int64
Group2    504
Group4    470
Group5    464
Group7    458
Group3    451
Group6    395
Group1     48
Name: celltype, dtype: int64
Group2    469
Group3    457
Group6    446
Group4    444
Group5    442
Group7    430
Group1    228
Name: celltype, dtype: int64
Group2    469
Group3    457
Group6    446
Group4    444
Group5    442
Group7    430
Group1     91
Name: celltype, dtype: int64
Group2    469
Group3    457
Group6    446
Group4    444
Group5    442
Group7    430
Group1     46
Name: celltype, dtype: int64
Group7    434
Group4    431
Group6    427
Group2    422
Group3    419
Group5    414
Group1    213
Name: celltype, dtype: int64
Group7    434
Group4    431
Group6    427
Group2    422
Group3    419
Group5    414
Group1     85
Name: celltyp

In [6]:
# Batch-imbalance scenario: randomly drop Batch1 cells so that only a fraction
# `number` of them remains; Batch2 is left untouched.
for seed in range(1,11):
    base_dir = './Sim/Sim%d' % seed
    adata = sc.read_loom('%s/Sim%d_raw.loom' % (base_dir, seed))
    # Local generator seeded with the simulation index so the dropped cells
    # are the same on every Restart & Run All (the global np.random state was
    # never seeded).
    rng = np.random.RandomState(seed)
    batch1_cells = adata.obs_names[adata.obs.batch == 'Batch1']
    batch1_n = len(batch1_cells)
    for number in batch1_prop_cells:
        # Build the directory name once and use it for both mkdir and the
        # output path. Formatting the fraction with '%d' truncates 0.5, 0.25
        # and 0.125 all to 'batch1_0', which neither matches the
        # 'batch1_%.3f' directory the loom file is written into nor stays
        # unique across iterations.
        out_dir = '%s/batch1_%.3f' % (base_dir, number)
        # exist_ok keeps the cell re-runnable.
        os.makedirs(out_dir, exist_ok=True)
        # Names of the Batch1 cells to remove (a fraction 1 - number of them).
        obs_names = rng.choice(batch1_cells, int((1-number)*batch1_n), replace=False)
        adata_sub = adata[~adata.obs_names.isin(obs_names)]
        print(adata_sub.obs.batch.value_counts())
        adata_sub.write_loom('%s/Sim%d_raw.loom' % (out_dir, seed))

Batch2    1633
Batch1     793
Name: batch, dtype: int64
Batch2    1633
Batch1     397
Name: batch, dtype: int64
Batch2    1633
Batch1     199
Name: batch, dtype: int64
Batch2    1563
Batch1     790
Name: batch, dtype: int64
Batch2    1563
Batch1     395
Name: batch, dtype: int64
Batch2    1563
Batch1     198
Name: batch, dtype: int64
Batch2    1418
Batch1     777
Name: batch, dtype: int64
Batch2    1418
Batch1     389
Name: batch, dtype: int64
Batch2    1418
Batch1     195
Name: batch, dtype: int64
Batch2    1601
Batch1     815
Name: batch, dtype: int64
Batch2    1601
Batch1     408
Name: batch, dtype: int64
Batch2    1601
Batch1     204
Name: batch, dtype: int64
Batch2    1568
Batch1     822
Name: batch, dtype: int64
Batch2    1568
Batch1     411
Name: batch, dtype: int64
Batch2    1568
Batch1     206
Name: batch, dtype: int64
Batch2    1552
Batch1     791
Name: batch, dtype: int64
Batch2    1552
Batch1     396
Name: batch, dtype: int64
Batch2    1552
Batch1     198
Name: batch, dtype