In [0]:
%load_ext autoreload
%autoreload 2

import corescpy as cr
from corescpy.class_sc import Omics
from corescpy.processing.guide_rna import detect_guide_targets    
import re
import copy
import scanpy as sc
import copy
import os
from warnings import warn
import seaborn as sns
import matplotlib.pyplot as plt
import functools
import pandas as pd
import numpy as np

# ---------------------------------------------------------------------------
# Input locations (10x-style matrix folder plus protospacer calls file).
# ---------------------------------------------------------------------------
direc = "/home/elizabeth/elizabeth/corescpy/examples/data/"
file_path = {
    "directory": os.path.join(direc, "crispr-screening/HH-Hu-CR5"),
    "subdirectory_mtx": "filtered_feature_bc_matrix",
    "file_protospacer": "crispr_analysis/protospacer_calls_per_cell.csv",
}

# ---------------------------------------------------------------------------
# Construct the object with every gRNA threshold set to None and both removal
# flags set to False, i.e. no gRNA processing yet.
# ---------------------------------------------------------------------------
kws_grna_no_process = {
    "max_pct_control_drop": None,
    "min_n_target_control_drop": None,
    "min_pct_avg_n": None,
    "min_pct_dominant": None,
    "drop_multi_control": False,
    "remove_multi_transfected": False,
    "feature_split": "|",
    "guide_split": "-",
}

kws_init = {
    "assay": None,
    "assay_protein": None,
    "col_sample_id": None,
    "col_gene_symbols": "gene_symbols",
    "col_cell_type": "predicted_labels",
    "col_perturbed": "perturbation",
    "col_guide_rna": "feature_call",
    "col_num_umis": "num_umis",
    "col_condition": "target_gene_name",
    "key_control": "NT",
    "key_treatment": "KD",
}

# Named `self` on purpose so that method bodies can be pasted in unchanged.
self = cr.Crispr(
    file_path, kws_process_guide_rna=kws_grna_no_process, **kws_init)

# ---------------------------------------------------------------------------
# Arguments for the gRNA processing code that is stepped through below.
# See corescpy/processing/guide_rna.py for what each one controls.
# ---------------------------------------------------------------------------
# Column names
col_guide_rna = "feature_call"
col_num_umis = "num_umis"
col_guide_rna_new = self._columns["col_target_genes"]

# Delimiters and control labels
feature_split = "|"
guide_split = "-"
key_control_patterns = ["CTRL"]
key_control = "NT"

# Thresholds and flags
min_n_target_control_drop = 100
max_pct_control_drop = 75
min_pct_dominant = 80
min_pct_avg_n = 40
remove_multi_transfected = True
conserve_memory = False

# Same arguments bundled as keyword arguments (key order kept as before).
kws_process_guide_rna = {
    "col_guide_rna": col_guide_rna,
    "col_num_umis": col_num_umis,
    "guide_split": guide_split,
    "key_control_patterns": key_control_patterns,
    "key_control": key_control,
    "feature_split": feature_split,
    "min_n_target_control_drop": min_n_target_control_drop,
    "max_pct_control_drop": max_pct_control_drop,
    "min_pct_dominant": min_pct_dominant,
    "min_pct_avg_n": min_pct_avg_n,
    "remove_multi_transfected": remove_multi_transfected,
    "col_guide_rna_new": col_guide_rna_new,
    "conserve_memory": conserve_memory,
}

# ---------------------------------------------------------------------------
# Working copy: keep only the first three `.obs` columns and strip any
# "_original" fragment from their names.
# ---------------------------------------------------------------------------
ann = self.adata.copy()
ann.obs = ann.obs.iloc[:, :3]
ann.obs.columns = [re.sub("_original", "", name) for name in ann.obs.columns]

adata = ann.copy()

# Next: paste the function bodies from corescpy/processing/guide_rna.py into
# the following cells and run them line by line.

Downloading data from `https://omnipathdb.org/queries/enzsub?format=json`
Downloading data from `https://omnipathdb.org/queries/interactions?format=json`
Downloading data from `https://omnipathdb.org/queries/complexes?format=json`
Downloading data from `https://omnipathdb.org/queries/annotations?format=json`
Downloading data from `https://omnipathdb.org/queries/intercell?format=json`
Downloading data from `https://omnipathdb.org/about?format=text`




<<< INITIALIZING OMICS CLASS OBJECT >>>

Unused keyword arguments: {'kws_process_guide_rna': {'col_guide_rna': 'feature_call', 'col_num_umis': 'num_umis', 'key_control': 'NT', 'col_guide_rna_new': 'target_gene_name', 'max_pct_control_drop': None, 'min_n_target_control_drop': None, 'min_pct_avg_n': None, 'min_pct_dominant': None, 'drop_multi_control': False, 'remove_multi_transfected': False, 'feature_split': '|', 'guide_split': '-'}}.

col_gene_symbols="gene_symbols"
col_cell_type="leiden"
col_sample_id=None
col_batch=None
col_subject=None
col_condition="target_gene_name"
col_num_umis="num_umis"
key_control="NT"
key_treatment="KD"


<<< LOADING PROTOSPACER METADATA >>>


Cell Counts: Initial

2084


Gene Counts: Initial



<<< PERFORMING gRNA PROCESSING & FILTERING >>>

{'max_pct_control_drop': None, 'min_n_target_control_drop': None, 'min_pct_avg_n': None, 'min_pct_dominant': None, 'drop_multi_control': False, 'feature_split': '|', 'guide_split': '-'}


	*** Removing filtered-out ce

In [2]:
    # Snapshot the current object under the name the pasted code expects.
    adata = ann.copy()

    # Fresh working copies of the data and of the keyword arguments, so that
    # the originals stay untouched by anything run below.
    ann = adata.copy()
    kws_pga = copy.deepcopy(kws_process_guide_rna)
    print(f"\n\n<<< PERFORMING gRNA PROCESSING & FILTERING >>>\n\n{kws_pga}")



<<< PERFORMING gRNA PROCESSING & FILTERING >>>

{'col_guide_rna': 'feature_call', 'col_num_umis': 'num_umis', 'guide_split': '-', 'key_control_patterns': ['CTRL'], 'key_control': 'NT', 'feature_split': '|', 'min_n_target_control_drop': 100, 'max_pct_control_drop': 75, 'min_pct_dominant': 80, 'min_pct_avg_n': 40, 'remove_multi_transfected': True, 'col_guide_rna_new': 'target_gene_name', 'conserve_memory': False}


In [3]:
    ann = adata.copy()
    # Substitute fallback values for arguments that were left as None.
    guide_split = "$" if guide_split is None else guide_split
    key_control_patterns = (
        [np.nan] if key_control_patterns is None else key_control_patterns)
    # Guide names for each cell, copied so later edits leave `ann.obs` alone.
    guides = ann.obs[col_guide_rna].copy()

In [4]:
# Display the guide calls copied from `ann.obs[col_guide_rna]`.
guides

AAACCCATCGTGGCTG-1                          NEG_CTRL-4-1|NEG_CTRL-4-2
AAACCCATCTACCTTA-1          HIPK1-1|NEG_CTRL-1-1|HIPK1-2|NEG_CTRL-1-2
AAACGAAAGCCATTGT-1                     ZFP36-1|PAF1-1|GPX4-2|POLR2E-2
AAACGAACATGACGTT-1                                           IL10RB-2
AAACGAACATTGTGCA-1                                    RIPK2-1|SNX20-2
                                           ...                       
TTTCGATTCTCGTCGT-1                               RSBN1-1|NEG_CTRL-1-2
TTTGATCGTAGCGCTC-1                                    SP100-1|SP100-2
TTTGATCTCGCTCATC-1    CYLD-1|WSB1-1|DUSP1-1|HIPK1-2|POLR2E-2|IL10RB-2
TTTGGAGCATGAAGGC-1                                   FASLG-1|SP140L-2
TTTGTTGCACATTACG-1                            FASLG-1|HIPK1-2|SBNO2-2
Name: feature_call, Length: 1811, dtype: object

# No temporary substitute for `guide_split` yet; the next cell sets `grs` to
# "===" only when `guide_split` occurs inside a name in `ann.var_names`.
grs = None

In [7]:
    # Gene names in `ann.var_names` may themselves contain `guide_split`.
    # Inside the guide strings, temporarily swap the delimiter within those
    # names for the reserved token `grs`, so that a later split on
    # `guide_split` does not cut a gene name in half.
    if guide_split is not None:
        split_char = [guide_split in g for g in ann.var_names]
        if any(split_char):
            grs = "==="
            bad_symb = np.array(ann.var_names)[np.where(split_char)[0]]
            if grs in guide_split:
                raise ValueError(f"{grs} is a reserved name and cannot be "
                                 "contained within `guide_split`.")
            warn(f"`guide_split` ({guide_split}) found in at least "
                 f"one gene name ({', '.join(bad_symb)}). Using {grs}. "
                 "as temporary substitute. Will attempt to replace later, "
                 "but note that there are risks in having a `guide_split` "
                 "as a character also found in gene names.")
            # Map each affected gene name to its protected spelling.
            # * Literal `str.replace` is used instead of `re.sub`, because
            #   neither the gene names nor `guide_split` are regular
            #   expressions (".", "|", "$", "+" ... must match themselves).
            # * Longest names go first so that a name containing a shorter
            #   affected name is rewritten as a whole.
            protected = {
                str(symbol): str(symbol).replace(guide_split, grs)
                for symbol in sorted(bad_symb, key=len, reverse=True)}

            def _protect_gene_names(entry):
                """Replace every affected gene name found in `entry`.

                Entries that mention none of the affected names are returned
                unchanged (same object, so missing values stay missing).
                Otherwise a string is returned in which each occurrence of an
                affected name has `guide_split` replaced by `grs`.
                """
                text = str(entry)
                if not any(symbol in text for symbol in protected):
                    return entry
                # Handle all affected names, not only the first match, so
                # entries listing several such genes are fully protected.
                for symbol, substitute in protected.items():
                    text = text.replace(symbol, substitute)
                return text

            guides = guides.apply(_protect_gene_names)