## Dede et al. (2020) double perturbation screens

#### Multiplex enCas12a screens detect functional buffering among paralogs otherwise masked in monogenic Cas9 knockout screens
Merve Dede, Megan McLaughlin, Eiru Kim & Traver Hart

https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02173-2#Sec17

**Input**: Table S2 from paper

**Output**: Paralog pairs from screen annotated with SL status

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import os

def get_data_path(folders, fname):
    """Return the normalised path of a file below the third-party data directory.

    The base directory is read from the ``3RD_PARTY_DIR`` environment variable
    each time the function is called, so an unset variable raises ``KeyError``.

    Args:
        folders: Sequence of sub-folder names below the base directory.
        fname: Name of the file inside the last folder.

    Returns:
        The '/'-joined path, passed through ``os.path.normpath``.
    """
    return os.path.normpath('/'.join([os.environ['3RD_PARTY_DIR'], '/'.join(folders), fname]))


def get_local_data_path(folders, fname):
    """Return the normalised path of a file below ``../../local_data``.

    Args:
        folders: Sequence of sub-folder names below ``../../local_data``.
        fname: Name of the file inside the last folder.

    Returns:
        The '/'-joined path, passed through ``os.path.normpath``.
    """
    return os.path.normpath('/'.join(['../../local_data', '/'.join(folders), fname]))

# --- Inputs ---
# Supplementary Table 2 of the paper: zdLFC scores from the paralog screen.
file_table_s2 = get_data_path(folders=['GI_screens', 'dede_2020'], fname='Table_S2.txt')
# Our Ensembl-derived list of paralog pairs.
file_all_pairs = get_local_data_path(folders=['processed', 'ensembl93'], fname='all_pairs.csv')
# Bronze standard SL pairs.
file_depmap_pairs = get_local_data_path(folders=['results'], fname='bronze_standard_SL_pairs_24_09_20.csv')

# --- Output ---
# Screened paralog pairs annotated with SL status.
file_hart_pairs = get_local_data_path(folders=['processed', 'screen_pairs'], fname='hart_pairs.csv')

### Process Table S2 from paper
Add SL calls: a pair is called SL in a cell line if its zdLFC score in that cell line is < -3.

In [4]:
# Table S2 lists every screened pair with its score in each of the three cell lines.
# The first (unnamed) column holds the pair identifier in the form "GENE1_GENE2".
table_s2 = (pd.read_csv(file_table_s2, sep='\t')
              .rename(columns={'Unnamed: 0': 'pair'}))
print('N pairs:', table_s2.shape[0])

# Split the pair identifier into its two gene symbols
pair_genes = table_s2['pair'].map(lambda pair_id: pair_id.split('_'))
table_s2['A1'] = pair_genes.map(lambda genes: genes[0])
table_s2['A2'] = pair_genes.map(lambda genes: genes[1])
table_s2.head(2)

N pairs: 403


Unnamed: 0,pair,A549,HT29,OVCAR8,A1,A2
0,ABHD4_ABHD5,-0.609,-0.036,-0.046,ABHD4,ABHD5
1,ABL1_ABL2,0.088,0.887,0.08,ABL1,ABL2


#### Label SL gene pairs and merge with Ensembl paralogs

In [8]:
# A pair is called SL in a cell line if its score is below this threshold
SL_SCORE_THRESHOLD = -3
CELL_LINES = ['A549', 'HT29', 'OVCAR8']
sl_cols = [cell_line + '_SL' for cell_line in CELL_LINES]

# Per-cell-line SL calls, plus the number of cell lines in which each pair is SL
all_screen_pairs = table_s2.assign(**{sl_col: table_s2[cell_line] < SL_SCORE_THRESHOLD
                                      for cell_line, sl_col in zip(CELL_LINES, sl_cols)})
all_screen_pairs['n_SL'] = all_screen_pairs[sl_cols].sum(axis=1)

# Sort A1, A2 alphabetically within each row so that they can be matched against all_pairs.
# The sorted frame re-uses the original index: concat(axis=1) aligns on index labels,
# so a fresh RangeIndex would silently misalign rows if table_s2 were ever filtered/reindexed.
sorted_genes = pd.DataFrame(np.sort(all_screen_pairs[['A1', 'A2']].values, axis=1),
                            columns=['A1', 'A2'], index=all_screen_pairs.index)
all_screen_pairs = pd.concat([sorted_genes, all_screen_pairs.drop(columns=['A1', 'A2'])], axis=1)

# Merge with our Ensembl list of paralog pairs (inner join: keeps only screened pairs found in that list).
# Join keys are given explicitly so that an extra shared column can never change the join.
all_pairs = pd.read_csv(file_all_pairs, index_col=0)[['A1', 'A2', 'min_seq_id']]
screen_pairs = pd.merge(all_pairs, all_screen_pairs, on=['A1', 'A2'])

# Summary numbers
n_screen_pairs = screen_pairs.shape[0]
n_sl_2plus = screen_pairs[screen_pairs.n_SL >= 2].shape[0]
print('SL in 1+:', screen_pairs[screen_pairs.n_SL >= 1].shape[0])
print('SL in 2+:', n_sl_2plus)
print('SL in all 3:', screen_pairs[screen_pairs.n_SL == 3].shape[0])
print('%% SL 2+: %.2f' % (n_sl_2plus / n_screen_pairs * 100))

screen_pairs.head(2)

SL in 1+: 24
SL in 2+: 19
SL in all 3: 14
% SL 2+: 4.83


Unnamed: 0,A1,A2,min_seq_id,pair,A549,HT29,OVCAR8,A549_SL,HT29_SL,OVCAR8_SL,n_SL
0,SRSF4,SRSF5,0.346154,SRSF4_SRSF5,0.281,0.04,0.853,False,False,False,0
1,ENTPD4,ENTPD7,0.616883,ENTPD4_ENTPD7,0.995,0.503,0.278,False,False,False,0


In [9]:
# Write the annotated screen pairs to disk, leaving out the row index
screen_pairs.to_csv(file_hart_pairs, index=False)

#### Check which pairs are not in our Ensembl list of paralog pairs

In [10]:
# Right join keeps every screened pair; pairs absent from our Ensembl list
# end up with a missing min_seq_id.
all_pairs = pd.read_csv(file_all_pairs, index_col=0)[['A1', 'A2', 'min_seq_id']]
df = pd.merge(all_pairs, all_screen_pairs, how='right')
not_in_ensembl = df.min_seq_id.isna()
print('N overlap:', df[~not_in_ensembl].shape[0], '/', df.shape[0])
display(df[not_in_ensembl])

N overlap: 393 / 403


Unnamed: 0,A1,A2,min_seq_id,pair,A549,HT29,OVCAR8,A549_SL,HT29_SL,OVCAR8_SL,n_SL
393,DCUN1D1,DCUN1D2,,DCUN1D1_DCUN1D2,0.423,0.135,-0.408,False,False,False,0
394,HIST1H4H,HIST4H4,,HIST1H4H_HIST4H4,-0.062,0.184,0.797,False,False,False,0
395,HIST2H2BE,HIST2H2BF,,HIST2H2BE_HIST2H2BF,0.183,-0.535,0.855,False,False,False,0
396,PPP1CC,PPP2CB,,PPP1CC_PPP2CB,1.599,1.838,1.116,False,False,False,0
397,RHOC,RHOG,,RHOC_RHOG,-0.694,0.383,-0.441,False,False,False,0
398,RNF185,RNF5,,RNF185_RNF5,0.684,0.579,0.937,False,False,False,0
399,SEPT11,SEPT8,,SEPT11_SEPT8,0.706,-0.434,0.292,False,False,False,0
400,ZNF286A,ZNF286B,,ZNF286A_ZNF286B,1.264,0.563,-0.066,False,False,False,0
401,BCL2L1,MCL1,,BCL2L1_MCL1,0.935,-1.995,-1.615,False,False,False,0
402,BRCA1,PARP1,,BRCA1_PARP1,0.902,0.165,-0.609,False,False,False,0


### Overlap w/ our bronze standard set

In [11]:
# Load the bronze standard pairs and report how many of them are flagged SL
depmap_pairs = pd.read_csv(file_depmap_pairs)
n_bronze_sl = len(depmap_pairs[depmap_pairs['SL']])
print('N=', n_bronze_sl, '/', len(depmap_pairs))
depmap_pairs.iloc[:1]

N= 127 / 3637


Unnamed: 0,A1,A2,SL,ols_p,p_adj,nearly_SL,A1_ensembl,A2_ensembl
0,AAK1,BMP2K,False,0.083471,0.872559,False,ENSG00000115977,ENSG00000138756


In [13]:
# Considering all screened pairs that are also in the bronze standard set.
# Join keys are given explicitly so that an extra shared column can never change the join.
overlap = pd.merge(screen_pairs, depmap_pairs[['A1', 'A2', 'SL']], on=['A1', 'A2'])

# A pair counts as SL in the screen if it was called SL in at least one cell line
screen_sl = overlap.n_SL > 0
same_status = overlap.SL == screen_sl
n_same_status = overlap[same_status].shape[0]

print('Overlap w/ DepMap pairs:', overlap.shape[0])
print('Same status: %d, %.2f%% - SL: %d' %
      (n_same_status,
       n_same_status / overlap.shape[0]*100,
       overlap[overlap.SL & same_status].shape[0]))
print('SL DepMap only:', overlap[overlap.SL & ~screen_sl].shape[0])
# Label previously read "Thompson" (copy-paste from another screen); this is the Dede et al. screen
print('SL Dede only (1+):', overlap[~overlap.SL & screen_sl].shape[0])

# 2x2 contingency table (screen SL in 1+ cell lines vs. bronze standard SL) and Fisher's exact test
ctab = pd.crosstab(screen_sl, overlap.SL)
print('FET:', stats.fisher_exact(ctab))
ctab

Overlap w/ DepMap pairs: 156
Same status: 144, 92.31% - SL: 4
SL DepMap only: 10
SL Thompson only (1+): 2
FET: (28.0, 0.000567584436038887)


SL,False,True
n_SL,Unnamed: 1_level_1,Unnamed: 2_level_1
False,140,10
True,2,4


In [14]:
# Pairs that are SL in the DepMap bronze standard and in 1+ cell lines of the Hart GI screen
sl_in_both = overlap.SL & (overlap.n_SL > 0)
overlap.loc[sl_in_both, ['A1', 'A2', 'n_SL', 'A549_SL', 'HT29_SL', 'OVCAR8_SL']]

Unnamed: 0,A1,A2,n_SL,A549_SL,HT29_SL,OVCAR8_SL
14,ATP6V0A1,ATP6V0A2,1,False,True,False
36,ARFGEF1,ARFGEF2,2,False,True,True
51,HDAC1,HDAC2,2,False,True,True
106,CNOT7,CNOT8,3,True,True,True
