In [None]:
import scanpy as sc
import numpy as np
import os
import subprocess as sp
from scipy.sparse import csr_matrix

%load_ext autoreload
%autoreload 2

In [None]:
# Source file on Zenodo and the local cache location for the download.
data_url = 'https://zenodo.org/records/7041849/files/FrangiehIzar2021_RNA.h5ad?download=1'
data_cache_dir = '../perturbench_data' ## Change this to your local data directory

# exist_ok makes this safe to re-run and avoids a check-then-create race.
os.makedirs(data_cache_dir, exist_ok=True)

# Path of the cached raw download (read back by the loading cell).
tmp_data_dir = f'{data_cache_dir}/frangieh21_downloaded.h5ad'

if not os.path.exists(tmp_data_dir):
    # Download to a side file first: `wget -O` creates its output file even
    # when the transfer fails, and a truncated file at the final path would be
    # mistaken for a valid cache on every later run.
    partial_download = f'{tmp_data_dir}.part'
    try:
        # Argument list (no shell) so the '?' in the URL is never shell-parsed;
        # check=True raises CalledProcessError on a non-zero wget exit status.
        sp.run(['wget', data_url, '-O', partial_download], check=True)
    except BaseException:
        # Also covers an interrupted kernel: drop the partial file, then re-raise.
        if os.path.exists(partial_download):
            os.remove(partial_download)
        raise
    # Only a completed download is moved into the cache location.
    os.replace(partial_download, tmp_data_dir)

In [None]:
# Read the cached download from disk; the bare `adata` on the last line
# displays the object's summary as the cell output.
adata = sc.read_h5ad(tmp_data_dir)
adata

In [None]:
# Tally how many cells carry each value of the original `celltype` column.
adata.obs['celltype'].value_counts()

In [None]:
# Assign the same constant `cell_type` label to every cell; this column is
# later passed to preprocess as the covariate key.
adata.obs['cell_type'] = 'melanocyte'

In [None]:
# Tally the values of `perturbation_2`, which is remapped into `treatment` next.
adata.obs['perturbation_2'].value_counts()

In [None]:
# Renames applied to `perturbation_2` labels; any label not listed here is
# carried over unchanged.
treatment_map = {'Co-culture': 'co-culture', 'Control': 'none'}

adata.obs['treatment'] = [
    treatment_map.get(label, label) for label in adata.obs['perturbation_2']
]
adata.obs['treatment'].value_counts()

In [None]:
# Tally the values of `perturbation`, which becomes the `condition` column.
adata.obs['perturbation'].value_counts()

In [None]:
# Dataset-wide constants written to every row of obs.
# NOTE(review): confirm 'CRISPRi' is the right perturbation type for this dataset.
constant_annotations = {
    'perturbation_type': 'CRISPRi',
    'dataset': 'frangieh21',
}

# `condition` is an independent copy of the `perturbation` column.
adata.obs['condition'] = adata.obs['perturbation'].copy()
for obs_key, obs_value in constant_annotations.items():
    adata.obs[obs_key] = obs_value

In [None]:
from perturbench.analysis.preprocess import preprocess

# Work on an independent copy whose X is stored as a CSR sparse matrix before
# handing the object to preprocess.
adata = adata.copy()
adata.X = csr_matrix(adata.X)

# Column names preprocess should use for the perturbation label and covariates.
preprocess_options = dict(
    perturbation_key='condition',
    covariate_keys=['cell_type'],
)
adata = preprocess(adata, **preprocess_options).copy()

In [None]:
# obs columns that must be present before the processed file is written.
required_cols = [
    'condition',
    'cell_type',
    'treatment',
    'perturbation_type',
    'dataset',
    'ngenes',
    'ncounts',
]

for col in required_cols:
    # Fail with the offending column name rather than a bare AssertionError.
    assert col in adata.obs.columns, f'missing required obs column: {col}'
    # Series.isnull is an alias of Series.isna, so one check is enough; running
    # both printed every column with missing values twice.
    if adata.obs[col].isna().any():
        print(col)

In [None]:
# Preview the first rows of the per-variable annotation table.
adata.var.head()

In [None]:
# Save the processed object next to the raw download in the cache directory.
processed_path = f'{data_cache_dir}/frangieh21_processed.h5ad'
adata.write_h5ad(processed_path)

In [None]:
# Display the stored values of X.
# NOTE(review): assumes X is still a scipy sparse matrix after preprocess — confirm.
adata.X.data

In [None]:
# Display the repr of the expression matrix X.
adata.X
