# Generate manual splits for Jiang24 and Frangieh21

In [1]:
import scanpy as sc
import pandas as pd
from sklearn.model_selection import train_test_split

from perturbench.data.datasplitter import PerturbationDataSplitter

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

%load_ext autoreload
%autoreload 2

In [3]:
# Directory that holds the processed .h5ad inputs and receives the split CSVs.
# Deliberately has no trailing slash: the cells in this notebook build paths as
# f'{data_cache_dir}/<file>', so a trailing slash here would yield a doubled
# separator ('perturbench_data//<file>').
data_cache_dir = 'perturbench_data'

## Jiang24

In [12]:
# Open the processed Jiang24 dataset in backed, read-only mode; the cells that
# follow only use the cell-level metadata in `adata.obs`.
jiang24_h5ad_path = f'{data_cache_dir}/jiang24_processed.h5ad'
adata = sc.read_h5ad(jiang24_h5ad_path, backed='r')
adata

AnnData object with n_obs × n_vars = 1628476 × 15476 backed at 'perturbench_data/jiang24_processed.h5ad'
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample', 'bc1_well', 'bc2_well', 'bc3_well', 'percent.mito', 'cell_type', 'pathway', 'sample_ID', 'Batch_info', 'guide', 'gene', 'mixscale_score', 'treatment', 'condition', 'perturbation', 'ncounts', 'ngenes', 'perturbation_type', 'dataset', 'cov_merged', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes'
    var: 'n_cells', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'hvg', 'log1p', 'rank_genes_groups_cov'
    layers: 'counts'

### Create a manual split

In [14]:
# Covariate combinations to hold out: every pairing of a held-out cytokine with
# a held-out cell line. Each combination is an unordered set of the two values,
# listed cytokine-major (all cell lines for the first cytokine, then the next).
cell_lines_holdout = ['k562', 'mcf7', 'ht29', 'hap1']
cytokines_holdout = ['IFNG', 'INS', 'TGFB']
jiang24_heldout_covariates = [
    {cyto, line}
    for cyto in cytokines_holdout
    for line in cell_lines_holdout
]
jiang24_heldout_covariates

[{'IFNG', 'k562'},
 {'IFNG', 'mcf7'},
 {'IFNG', 'ht29'},
 {'IFNG', 'hap1'},
 {'INS', 'k562'},
 {'INS', 'mcf7'},
 {'INS', 'ht29'},
 {'INS', 'hap1'},
 {'TGFB', 'k562'},
 {'TGFB', 'mcf7'},
 {'TGFB', 'ht29'},
 {'TGFB', 'hap1'}]

In [13]:
# Build the splitter from a copy of the cell-level metadata (`adata.obs`).
# The key arguments name obs columns: 'condition' for the perturbation (with
# 'control' as its control value) and 'cell_type' + 'treatment' as covariates.
jiang24_obs = adata.obs.copy()
manual_splitter = PerturbationDataSplitter(
    jiang24_obs,
    covariate_keys=['cell_type', 'treatment'],
    perturbation_control_value='control',
    perturbation_key='condition',
)
manual_splitter

<perturbench.data.datasplitter.PerturbationDataSplitter at 0x7f1900a37a10>

Hold out up to 70% of perturbations in 3 cytokine treatments (IFNG, INS, TGFB) for 4 cell lines (k562, mcf7, ht29, hap1)

In [15]:
# Create the Jiang24 split, holding out perturbations only within the listed
# cytokine / cell-line combinations. The seed is fixed (presumably so the split
# is reproducible -- confirm against the splitter implementation).
jiang24_split = manual_splitter.split_covariates_manual(
    covariates_holdout=jiang24_heldout_covariates,
    # Maximum fraction of perturbations held out per covariate
    max_heldout_fraction_per_covariate=0.7,
    seed=0,
)

Split summary: 
                   train  val  test
('INS', 'mcf7')       15   16    16
('TNFA', 'a549')      56    1     1
('mcf7', 'IFNB')      62    1     1
('INS', 'hap1')       15   16    16
('TGFB', 'a549')      53    1     1
('TGFB', 'mcf7')      17   19    19
('IFNG', 'hap1')      19   22    21
('k562', 'INS')       15   16    16
('TGFB', 'hap1')      17   19    19
('INS', 'ht29')       15   16    16
('TNFA', 'bxpc3')     56    1     1
('IFNB', 'hap1')      62    1     1
('IFNG', 'a549')      60    1     1
('a549', 'IFNB')      62    1     1
('IFNG', 'ht29')      19   21    22
('IFNB', 'ht29')      62    1     1
('k562', 'TGFB')      17   19    19
('TNFA', 'mcf7')      56    1     1
('INS', 'a549')       45    1     1
('IFNB', 'bxpc3')     62    1     1
('IFNG', 'bxpc3')     60    1     1
('IFNG', 'k562')      19   22    21
('k562', 'IFNB')      62    1     1
('TGFB', 'ht29')      17   19    19
('TNFA', 'hap1')      56    1     1
('TGFB', 'bxpc3')     53    1     1
('TNFA', 'k5

In [16]:
# Persist the Jiang24 split next to the processed data, without a header row.
jiang24_split_path = f'{data_cache_dir}/jiang24_split.csv'
jiang24_split.to_csv(jiang24_split_path, header=False)

## Frangieh21

In [4]:
# Open the processed Frangieh21 dataset in backed, read-only mode, matching how
# the Jiang24 dataset is opened earlier in this notebook. Only the cell-level
# metadata in `adata.obs` is used to build the split, so the expression matrix
# does not need to be loaded into memory up front.
adata = sc.read_h5ad(f'{data_cache_dir}/frangieh21_processed.h5ad', backed='r')
adata

AnnData object with n_obs × n_vars = 218331 × 23712
    obs: 'library_preparation_protocol', 'perturbation_2', 'MOI', 'sgRNA', 'UMI_count', 'guide_id', 'umap_x', 'umap_y', 'perturbation', 'tissue_type', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'perturbation_type_2', 'nperts', 'ngenes', 'ncounts', 'percent_mito', 'percent_ribo', 'cell_type', 'treatment', 'condition', 'dataset'
    var: 'ensembl_id', 'ncounts', 'ncells'

In [5]:
# Number of cells per treatment; these values are the covariates available to
# hold out in the split below.
adata.obs['treatment'].value_counts()

treatment
IFNγ          87590
co-culture    73114
none          57627
Name: count, dtype: int64

In [6]:
# Build the splitter from a copy of the cell-level metadata (`adata.obs`).
# The key arguments name obs columns: 'condition' for the perturbation (with
# 'control' as its control value) and 'treatment' as the only covariate.
frangieh21_obs = adata.obs.copy()
manual_splitter = PerturbationDataSplitter(
    frangieh21_obs,
    covariate_keys=['treatment'],
    perturbation_control_value='control',
    perturbation_key='condition',
)
manual_splitter

<perturbench.data.datasplitter.PerturbationDataSplitter at 0x7fa22a1e5790>

Hold out up to 70% of perturbations in the co-culture treatment

In [7]:
# Create the Frangieh21 split; 'co-culture' is the only treatment designated as
# a holdout covariate. The seed is fixed (presumably so the split is
# reproducible -- confirm against the splitter implementation).
frangieh21_split = manual_splitter.split_covariates_manual(
    covariates_holdout=[{'co-culture'}],
    # Maximum fraction of perturbations held out per covariate
    max_heldout_fraction_per_covariate=0.7,
    seed=0,
)

Split summary: 
                 train  val  test
('IFNγ',)          249    1     1
('none',)          249    1     1
('co-culture',)     76   87    88


In [8]:
# Persist the Frangieh21 split next to the processed data, without a header row.
frangieh21_split_path = f'{data_cache_dir}/frangieh21_split.csv'
frangieh21_split.to_csv(frangieh21_split_path, header=False)