In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
import anndata
from collections import Counter
np.random.seed(11)
import os

### Popescu et al. (2019)

In [None]:
# Directory holding every input and output file; the empty string makes all
# paths relative to the current working directory.
data_dir = ''

# Load the full dataset.
full_dataset_path = os.path.join(data_dir, 'fetal_liver_alladata_-Copy2.h5ad')
dat = sc.read_h5ad(full_dataset_path)

# Keep only the cells annotated with one of these five labels.
kept_labels = ['Early Erythroid', 'Mid Erythroid', 'Late Erythroid', 'MEMP', 'HSC_MPP']
keep_mask = dat.obs['cell.labels'].isin(kept_labels)
eryth = dat[keep_mask]

# Save the subset, then display how many cells carry each label.
eryth.write(os.path.join(data_dir, 'Eryth_subset.h5ad'))
Counter(eryth.obs['cell.labels'])

In [None]:
# Build the "abundant" subset: split `eryth` by cell label, downsample every
# label to the same number of cells, and concatenate the pieces again.
#
# Targets are paired with labels by position, following the order of
# `eryth.obs['cell.labels'].cat.categories`.
Ery_labels = list(eryth.obs['cell.labels'].cat.categories)
Ery_target_obs = [1000, 1000, 1000, 1000, 1000]

# zip() would silently truncate on a length mismatch, leaving some labels
# without a target, so fail loudly instead.
if len(Ery_labels) != len(Ery_target_obs):
    raise ValueError(
        f'Expected {len(Ery_target_obs)} cell labels but found {len(Ery_labels)}: {Ery_labels}'
    )

# With the default copy=False, sc.pp.subsample works in place and returns None.
# copy=True returns the downsampled object, so the results are collected
# explicitly and no loop variable rebinds the global `dat` loaded earlier.
Ery_adatas = [
    sc.pp.subsample(eryth[eryth.obs['cell.labels'].isin([label])], n_obs=n_cells, copy=True)
    for label, n_cells in zip(Ery_labels, Ery_target_obs)
]
ery_abund = Ery_adatas[0].concatenate(*Ery_adatas[1:])

In [None]:
# Build the "rare" subset: split `eryth` by cell label, downsample each label
# to its own target size (some labels are kept at very low counts), and
# concatenate the pieces again.
#
# Targets are paired with labels by position, following the order of
# `eryth.obs['cell.labels'].cat.categories`.
Ery_labels = list(eryth.obs['cell.labels'].cat.categories)
Ery_target_obs = [2200, 100, 17, 3, 1680]

# zip() would silently truncate on a length mismatch, leaving some labels
# without a target, so fail loudly instead.
if len(Ery_labels) != len(Ery_target_obs):
    raise ValueError(
        f'Expected {len(Ery_target_obs)} cell labels but found {len(Ery_labels)}: {Ery_labels}'
    )

# With the default copy=False, sc.pp.subsample works in place and returns None.
# copy=True returns the downsampled object, so the results are collected
# explicitly and no loop variable rebinds the global `dat` loaded earlier.
Ery_adatas = [
    sc.pp.subsample(eryth[eryth.obs['cell.labels'].isin([label])], n_obs=n_cells, copy=True)
    for label, n_cells in zip(Ery_labels, Ery_target_obs)
]
ery_rare = Ery_adatas[0].concatenate(*Ery_adatas[1:])

In [None]:
# Output file stem -> AnnData object, for the abundant and rare subsets.
datasets = dict(
    Eryth_Subset1=ery_abund,
    Eryth_Subset2=ery_rare,
)

In [None]:
# For every subset: remove the stored embeddings, drop metadata columns that
# are not needed, rename the label column to `labels_truth`, and write the
# result to '<name>_Raw.h5ad' in `data_dir`.
embedding_keys = ('X_fdg', 'X_tsne', 'X_umap')
unused_obs_columns = ['AnnatomicalPart', 'tissue', 'doublets', 'combined.labels', 'n_counts', 'nGene', 'nUMI']

for dataset, adata in datasets.items():
    for key in embedding_keys:
        del adata.obsm[key]
    obs_trimmed = adata.obs.drop(columns=unused_obs_columns)
    adata.obs = obs_trimmed.rename(columns={'cell.labels': 'labels_truth'})
    adata.var_names_make_unique()
    adata.write(os.path.join(data_dir, dataset + '_Raw.h5ad'))

### Wegmann et al. (2019)

In [None]:
# Read the two Cellsius subsets from `data_dir`.
cellsius_path_1 = os.path.join(data_dir, 'Cellsius_Subset1_Raw.h5ad')
cellsius_path_2 = os.path.join(data_dir, 'Cellsius_Subset2_Raw.h5ad')

c_sub1 = sc.read_h5ad(cellsius_path_1)
c_sub2 = sc.read_h5ad(cellsius_path_2)

In [None]:
# Output file stem -> AnnData object for the two Cellsius subsets.
datasets = dict(
    Cellsius_Subset1=c_sub1,
    Cellsius_Subset2=c_sub2,
)

In [None]:
# Clean the Cellsius datasets: drop the QC columns from obs and var, expose the
# cell line as `labels_truth`, and use gene symbols (var['symbol']) as variable
# names instead of the original gene IDs (Ensembl IDs, per the original note).
#
# The results are written to '<name>_Raw.h5ad', the same file names these
# datasets were read from, so on a re-run the QC columns are already gone.
# errors='ignore' makes drop skip missing columns instead of raising KeyError;
# rename already ignores a missing 'cell_line' column.
qc_obs_columns = [
    'log10_total_counts', 'total_counts', 'total_features', 'log10_total_features',
    'pct_dropout', 'exprs_feature_controls_MT', 'pct_exprs_feature_controls_MT',
    'counts_feature_controls_MT', 'pct_counts_feature_controls_MT',
    'n_detected_feature_controls_MT', 'n_detected_feature_controls',
    'counts_feature_controls', 'pct_counts_feature_controls',
    'pct_counts_top_50_features', 'pct_counts_top_100_features',
    'pct_counts_top_200_features', 'pct_counts_top_500_features',
    'pct_counts_top_50_endogenous_features', 'pct_counts_top_100_endogenous_features',
    'pct_counts_top_200_endogenous_features', 'pct_counts_top_500_endogenous_features',
    'counts_endogenous_features', 'log10_counts_feature_controls_MT',
    'log10_counts_feature_controls', 'log10_counts_endogenous_features',
]
qc_var_columns = [
    'mean_exprs', 'exprs_rank', 'n_cells_exprs', 'total_feature_exprs',
    'pct_total_exprs', 'pct_dropout', 'total_feature_counts',
    'log10_total_feature_counts', 'pct_total_counts', 'is_feature_control_MT',
    'is_feature_control',
]

for dataset, adata in datasets.items():
    adata.obs = (
        adata.obs
        .drop(columns=qc_obs_columns, errors='ignore')
        .rename(columns={'cell_line': 'labels_truth'})
    )
    adata.var = adata.var.drop(columns=qc_var_columns, errors='ignore')
    # Gene symbols can repeat, hence the make-unique step after the switch.
    adata.var_names = adata.var['symbol'].values.astype('str')
    adata.var_names_make_unique()
    adata.write(os.path.join(data_dir, dataset + '_Raw.h5ad'))

# Highly variable gene (HVG) dataset generation

In [None]:
# Reload the cleaned '<name>_Raw.h5ad' files written by the cleanup cells.
c_sub1 = sc.read_h5ad(os.path.join(data_dir, 'Cellsius_Subset1_Raw.h5ad'))
c_sub2 = sc.read_h5ad(os.path.join(data_dir, 'Cellsius_Subset2_Raw.h5ad'))

# The erythroid subsets are saved under the keys 'Eryth_Subset1' and
# 'Eryth_Subset2'. No visible cell writes the 'FetalLiver_Subset*_Raw.h5ad'
# files this cell used to read, so load the files this notebook produces.
ery_sub1 = sc.read_h5ad(os.path.join(data_dir, 'Eryth_Subset1_Raw.h5ad'))
ery_sub2 = sc.read_h5ad(os.path.join(data_dir, 'Eryth_Subset2_Raw.h5ad'))

In [None]:
# Output file stem -> AnnData object for the four datasets used in HVG selection.
datasets = dict(
    Discrete_Abundant=c_sub1,
    Discrete_Rare=c_sub2,
    Continuous_Abundant=ery_sub1,
    Continuous_Rare=ery_sub2,
)

In [None]:
# Keep the 500 most highly variable genes of each dataset and write the result
# to '<name>_HVG500.h5ad' in `data_dir`.
for dataset, adata in datasets.items():
    # NOTE(review): scanpy documents that the mean/dispersion cutoffs are
    # ignored when n_top_genes is given; they are kept here unchanged.
    sc.pp.highly_variable_genes(adata, n_top_genes=500, min_mean=0.0125, max_mean=3, min_disp=0.5)

    # Slicing returns a view. Copy it explicitly before editing .uns rather
    # than relying on anndata's implicit copy when a view is modified.
    dat_hvg = adata[:, adata.var.highly_variable].copy()
    del dat_hvg.uns['hvg']

    print(dat_hvg)
    dat_hvg.write(os.path.join(data_dir, dataset + '_HVG500.h5ad'))

In [None]:
# Keep the 2000 most highly variable genes of each dataset and write the result
# to '<name>_HVG2000.h5ad' in `data_dir`.
for dataset, adata in datasets.items():
    # NOTE(review): scanpy documents that the mean/dispersion cutoffs are
    # ignored when n_top_genes is given; they are kept here unchanged.
    sc.pp.highly_variable_genes(adata, n_top_genes=2000, min_mean=0.0125, max_mean=3, min_disp=0.5)

    # Slicing returns a view. Copy it explicitly before editing .uns rather
    # than relying on anndata's implicit copy when a view is modified.
    dat_hvg = adata[:, adata.var.highly_variable].copy()
    del dat_hvg.uns['hvg']

    print(dat_hvg)
    dat_hvg.write(os.path.join(data_dir, dataset + '_HVG2000.h5ad'))