In [1]:
import os
from pathlib import Path

import pandas as pd
import scanpy as sc
from tqdm import tqdm

In [2]:
# Root of the dataset tree. Set the JEPA_DATA_DIR environment variable to run this notebook
# on another machine; when it is unset the original hard-coded location is used.
data_dir = Path(os.environ.get('JEPA_DATA_DIR', '/Users/djemec/data/jepa/v0_3'))
# Sub-directory that holds the downloaded input files.
raw_data = data_dir / 'raw_data'

In [3]:
# Provenance of the single-cell file:
#   downloaded from https://plus.figshare.com/ndownloader/files/35773070
#   see also https://gwps.wi.mit.edu/data_download

# Raw single-cell AnnData file.
raw_h5 = raw_data.joinpath('K562_essential_raw_singlecell_01.h5ad')

# Two versions of the CRISPRi guide library table (v2 and v2.1).
crispr_lib_v2 = raw_data.joinpath('hcrispri_v2.csv')
crispr_lib_v2_1 = raw_data.joinpath('hcrispri_v2_1.csv')

In [4]:
# Load the raw single-cell AnnData object from disk.
adata = sc.read_h5ad(raw_h5)
# Show which per-cell annotation columns are available (sgID_AB is used below).
adata.obs.columns

Index(['gem_group', 'gene', 'gene_id', 'transcript', 'gene_transcript',
       'sgID_AB', 'mitopercent', 'UMI_count', 'z_gemgroup_UMI',
       'core_scale_factor', 'core_adjusted_UMI_count'],
      dtype='object')

In [5]:
# Read the v2 guide library and add `sgID_clean`: the sgID as a string with commas
# replaced by hyphens and surrounding whitespace removed.
v2_lib_df = pd.read_csv(crispr_lib_v2).assign(
    sgID_clean=lambda lib: (
        lib['sgID']
        .astype(str)
        .str.replace(',', '-', regex=False)
        .str.strip()
    )
)
v2_lib_df.head()

Unnamed: 0,sgID,gene,transcript,protospacer sequence,selection rank,predicted score,empirical score,off-target stringency,Sublibrary half,sgID_clean
0,A1BG_-_58858617.23-P1,A1BG,P1,GGAGACCCAGCGCTAACCAG,1.0,1.008816,,0,Top5,A1BG_-_58858617.23-P1
1,A1BG_-_58858788.23-P1,A1BG,P1,GGGGCACCCAGGAGCGGTAG,2.0,0.901176,,0,Top5,A1BG_-_58858788.23-P1
2,A1BG_+_58858964.23-P1,A1BG,P1,GCTCCGGGCGACGTGGAGTG,3.0,0.836188,,0,Top5,A1BG_+_58858964.23-P1
3,A1BG_-_58858630.23-P1,A1BG,P1,GAACCAGGGGTGCCCAAGGG,4.0,0.827551,,0,Top5,A1BG_-_58858630.23-P1
4,A1BG_+_58858549.23-P1,A1BG,P1,GGCGAGGAACCGCCCAGCAA,5.0,0.775395,,0,Top5,A1BG_+_58858549.23-P1


In [6]:
# Read the v2.1 guide library and add `sgID_clean`: the sgID as a string with commas
# replaced by hyphens and surrounding whitespace removed.
v21_lib_df = pd.read_csv(crispr_lib_v2_1).assign(
    sgID_clean=lambda lib: (
        lib['sgID']
        .astype(str)
        .str.replace(',', '-', regex=False)
        .str.strip()
    )
)
v21_lib_df.head()

Unnamed: 0,sgID,gene,transcript,protospacer sequence,selection rank,predicted score,empirical score,off-target stringency,Sublibrary half,sgID_clean
0,A1BG_+_58858964.23-P1,A1BG,P1,GCTCCGGGCGACGTGGAGTG,1,0.870837,,0,Top5,A1BG_+_58858964.23-P1
1,A1BG_-_58858788.23-P1,A1BG,P1,GGGGCACCCAGGAGCGGTAG,2,0.782793,,0,Top5,A1BG_-_58858788.23-P1
2,A1BG_-_58858991.23-P1,A1BG,P1,GTCCACGTCGCCCGGAGCTG,3,0.722963,,0,Top5,A1BG_-_58858991.23-P1
3,A1BG_-_58858950.23-P1,A1BG,P1,GGCAGCGCAGGACGGCATCT,4,0.706643,,0,Top5,A1BG_-_58858950.23-P1
4,A1BG_-_58858915.23-P1,A1BG,P1,GAGCAGCTCGAAGGTGACGT,5,0.700359,,0,Top5,A1BG_-_58858915.23-P1


In [7]:
# Lookup tables: cleaned guide ID -> protospacer sequence, one per library version.
# If a cleaned ID occurs more than once in a library, the last row wins.
id_to_seq_2 = {
    guide_id: protospacer
    for guide_id, protospacer in zip(
        v2_lib_df['sgID_clean'].str.strip(),
        v2_lib_df['protospacer sequence'].str.strip(),
    )
}
id_to_seq_21 = {
    guide_id: protospacer
    for guide_id, protospacer in zip(
        v21_lib_df['sgID_clean'].str.strip(),
        v21_lib_df['protospacer sequence'].str.strip(),
    )
}


In [8]:
def _lookup_guide_seq(gid):
    '''
    Resolve a single guide ID to its protospacer sequence.

    The v2 table (`id_to_seq_2`) is consulted first, then the v2.1 table (`id_to_seq_21`).
    If neither has the ID, the lookup is retried with commas replaced by hyphens, which is
    the same normalisation applied when `sgID_clean` was built. Returns 'NOT_FOUND' when
    every attempt misses.
    '''
    gid = gid.strip()
    seq = id_to_seq_2.get(gid, id_to_seq_21.get(gid))

    if seq is None:
        gid_alt = gid.replace(',', '-')
        seq = id_to_seq_2.get(gid_alt, id_to_seq_21.get(gid_alt, 'NOT_FOUND'))

    return seq


def get_sequences(dual_id_string):
    '''
    Map a dual-guide ID string to the protospacer sequences of its guides.

    Parameters
    ----------
    dual_id_string : str or missing value
        Either 'GuideA_ID|GuideB_ID' or a single guide ID without a pipe.

    Returns
    -------
    tuple
        A 2-tuple `(seq_a, seq_b)`. Each element is a sequence from the lookup tables or
        one of these sentinels:

        * `(None, None)`           - input is missing (NaN/None) or the empty string
        * `'NOT_FOUND'`            - the guide ID is in neither library table
        * `'NONE'` (seq_b only)    - the input held a single guide ID
        * `('MISSING', 'MISSING')` - the input had more than two pipe-separated parts
        * `('ERROR', 'ERROR')`     - the input is not a string and cannot be split

    Notes
    -----
    Only the split of the raw input is guarded. Anything else that goes wrong (for example
    the lookup tables not having been built yet) is raised instead of being reported as
    'ERROR', so a broken notebook state cannot masquerade as bad data.

    Values that went through `astype(str)` are never missing: a NaN arrives here as the
    string 'nan', is looked up like any other ID and comes back as 'NOT_FOUND'.
    '''
    if pd.isna(dual_id_string) or dual_id_string == '':
        return None, None

    try:
        # The IDs in sgID_AB are separated by a pipe '|'
        parts = dual_id_string.split('|')
    except (AttributeError, TypeError):
        # Non-string input (no .split, or bytes that reject a str separator).
        return 'ERROR', 'ERROR'

    if len(parts) == 2:
        id_a, id_b = parts
        return _lookup_guide_seq(id_a), _lookup_guide_seq(id_b)

    if len(parts) == 1:
        return _lookup_guide_seq(parts[0]), 'NONE'

    # More than two pipe-separated IDs: not a layout this function understands.
    return 'MISSING', 'MISSING'

In [9]:
# One sgID_AB label per cell, converted to plain strings.
dual_ids = adata.obs['sgID_AB'].astype(str).tolist()
# Number of distinct dual-guide labels across all cells.
len(set(dual_ids))

2273

In [10]:
# Accumulators for the per-cell sequences of guide A and guide B (two independent lists).
seq_a_list, seq_b_list = [], []

In [11]:
# Resolve every cell's dual-guide ID to its two protospacer sequences.
# Both lists are rebuilt from scratch here so that re-running this cell cannot append a
# second copy of the results, which would leave them longer than the number of cells.
seq_a_list, seq_b_list = [], []
for dual_id in tqdm(dual_ids):
    sa, sb = get_sequences(dual_id)
    seq_a_list.append(sa)
    seq_b_list.append(sb)

100%|██████████████████████████████████████████████████████████████| 310385/310385 [00:00<00:00, 1784640.78it/s]


In [12]:
# Attach the resolved sequences to the per-cell annotations, one column per guide.
for _col, _seqs in (('guide_seq_a', seq_a_list), ('guide_seq_b', seq_b_list)):
    adata.obs[_col] = _seqs

In [13]:
# Spot-check the first few cells. A bare last expression gives the notebook's rich table
# display instead of the plain-text print, which truncated long sgID_AB values.
adata.obs[['sgID_AB', 'guide_seq_a', 'guide_seq_b']].head(5)

                                                               sgID_AB  \
cell_barcode                                                             
AAACCCAAGAAATCCA-27  NAF1_+_164087918.23-P1P2|NAF1_-_164087674.23-P1P2   
AAACCCAAGAACTTCC-31  BUB1_-_111435363.23-P1P2|BUB1_-_111435372.23-P1P2   
AAACCCAAGAAGCCAC-34      UBL5_-_9938639.23-P1P2|UBL5_+_9938801.23-P1P2   
AAACCCAAGAATAGTC-43  C9orf16_+_130922603.23-P1P2|C9orf16_+_13092264...   
AAACCCAAGACAGCGT-28  TIMM9_-_58893843.23-P1P2|TIMM9_-_58893848.23-P1P2   

                              guide_seq_a           guide_seq_b  
cell_barcode                                                     
AAACCCAAGAAATCCA-27  GGAGCCGTGAGCTTGTCCAG  GCCGCGACGGCGTTCAGAAC  
AAACCCAAGAACTTCC-31  GGACAAGCGCCGGGCCTCAG  GCGGGCCTCAGCGGAACCCA  
AAACCCAAGAAGCCAC-34  GGGTGAGGAGCTGGTGGCGT  GCCCAGGGCCGCGAACCCCG  
AAACCCAAGAATAGTC-43  GGCCGGCGCCGGATGGAAGG  GGCCGCGCGACGATGGAACG  
AAACCCAAGACAGCGT-28  GGGGACGGTTGAGCCTTGGG  GGGTTGAGCCTTGGGAGGGA  


In [33]:
# Cells whose guide-A sequence is one of the failure sentinels.
a_mask = (
    adata.obs['guide_seq_a']
    .astype(str)
    .str.contains(r'NOT_FOUND|MISSING|ERROR', na=False)
)
# Number of distinct dual-guide IDs affected.
adata.obs.loc[a_mask, 'sgID_AB'].astype(str).nunique()


0

In [34]:
# Cells whose guide-B sequence is one of the failure sentinels.
b_mask = (
    adata.obs['guide_seq_b']
    .astype(str)
    .str.contains(r'NOT_FOUND|MISSING|ERROR', na=False)
)
# Number of distinct dual-guide IDs affected.
adata.obs.loc[b_mask, 'sgID_AB'].astype(str).nunique()

0

In [35]:
# The distinct dual-guide IDs whose guide-A lookup failed (empty when all were resolved).
set(adata.obs.loc[a_mask, 'sgID_AB'])

set()