## Dede et al. (2020) double perturbation screens

#### Multiplex enCas12a screens detect functional buffering among paralogs otherwise masked in monogenic Cas9 knockout screens
Merve Dede, Megan McLaughlin, Eiru Kim & Traver Hart

https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02173-2#Sec17

**Input**: Table S2 from the paper

**Output**: Paralog pairs from the screen, annotated with synthetic lethal (SL) status

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import os

def get_data_path(folders, fname):
    """Return the normalised path of a file below the third-party data root.

    The root directory is read from the `3RD_PARTY_DIR` environment variable
    on every call; a KeyError is raised if that variable is not set.

    folders: sequence of sub-directory names below the root, outermost first.
    fname: name of the file inside the innermost folder.
    """
    return os.path.normpath('/'.join([os.environ['3RD_PARTY_DIR'], '/'.join(folders), fname]))


def get_local_data_path(folders, fname):
    """Return the normalised path of a file below '../../local_data'.

    folders: sequence of sub-directory names below local_data, outermost first.
    fname: name of the file inside the innermost folder.
    """
    return os.path.normpath('/'.join(['../../local_data', '/'.join(folders), fname]))

# Input: Supplementary Table 2 of the paper (table of zdLFC scores from the paralog screen).
file_table_s2 = get_data_path(['GI_screens', 'dede_2020'], 'Table_S2.txt')


def file_all_pairs(ens_v):
    """Return the path of the processed all_pairs.csv for one Ensembl version.

    ens_v: Ensembl version as a string (e.g. '93'); it is appended to 'ensembl'
           to form the name of the folder holding the file.
    """
    return get_local_data_path(['processed', 'ensembl'+ens_v], 'all_pairs.csv')


# Output: screened pairs annotated with SL calls
file_dede_pairs = get_local_data_path(['processed','screen_pairs'], 'dede_pairs.csv')

### Process Table S2 from paper
Call a pair synthetic lethal (SL) in a cell line when its score in that cell line is < -3.

In [2]:
# Table S2 holds one row per screened pair, with its score in each of the
# three screened cell lines. The unnamed first column carries the pair id.
table_s2 = (
    pd.read_csv(file_table_s2, sep='\t')
      .rename(columns={'Unnamed: 0': 'pair'})
)
print('N pairs:', table_s2.shape[0])

# A pair id is two gene names joined by '_'; split it once and keep each half
pair_genes = table_s2['pair'].apply(lambda pair_id: pair_id.split('_'))
table_s2['A1'] = pair_genes.apply(lambda genes: genes[0])
table_s2['A2'] = pair_genes.apply(lambda genes: genes[1])
table_s2.head(2)

N pairs: 403


Unnamed: 0,pair,A549,HT29,OVCAR8,A1,A2
0,ABHD4_ABHD5,-0.609,-0.036,-0.046,ABHD4,ABHD5
1,ABL1_ABL2,0.088,0.887,0.08,ABL1,ABL2


#### Label SL gene pairs

In [3]:
# A pair is called SL in a cell line when its score there is strictly below this
# threshold. The threshold and the cell-line names are defined once here so the
# three calls and the n_SL count cannot drift apart.
SL_SCORE_THRESHOLD = -3
SCREENED_CELL_LINES = ['A549', 'HT29', 'OVCAR8']

# One boolean '<cell line>_SL' column per screened cell line.
# Note: a missing (NaN) score compares False, i.e. it is counted as not SL.
sl_call_columns = [cell_line + '_SL' for cell_line in SCREENED_CELL_LINES]
sl_calls = {
    cell_line + '_SL': table_s2[cell_line] < SL_SCORE_THRESHOLD
    for cell_line in SCREENED_CELL_LINES
}
screen_hits = table_s2.assign(**sl_calls)

# n_SL = number of cell lines in which the pair was called SL
screen_hits['n_SL'] = screen_hits[sl_call_columns].sum(axis=1)
screen_hits.head(1)

Unnamed: 0,pair,A549,HT29,OVCAR8,A1,A2,A549_SL,HT29_SL,OVCAR8_SL,n_SL
0,ABHD4_ABHD5,-0.609,-0.036,-0.046,ABHD4,ABHD5,False,False,False,0


#### Merge with Ensembl paralogs

In [4]:
# Load the processed paralog pair lists for two Ensembl versions so the
# overlap of the screened pairs with each version can be checked.
# Only the two gene columns are kept.
paralog_gene_columns = ['A1', 'A2']
all_pairs_93 = pd.read_csv(file_all_pairs('93'))[paralog_gene_columns]
all_pairs_102 = pd.read_csv(file_all_pairs('102'))[paralog_gene_columns]

In [5]:
# Left-merge the screened pairs with each Ensembl paralog list. No keys are given,
# so pandas merges on the columns the two frames share (A1 and A2). The indicator
# column records whether a row was found in both frames or only in the left one.
screen_pairs = pd.merge(screen_hits, all_pairs_93, how='left', indicator='in_93')
screen_pairs = pd.merge(screen_pairs, all_pairs_102, how='left', indicator='in_102')

# Collapse each merge indicator to a boolean: True when the pair is in that Ensembl list
for ens_flag in ['in_93', 'in_102']:
    screen_pairs[ens_flag] = screen_pairs[ens_flag] == 'both'

# The left merges must not have added rows
assert len(screen_pairs) == len(screen_hits)
print('Overlap w/ Ens.93:', sum(screen_pairs['in_93']))
print('Overlap w/ Ens.102:', sum(screen_pairs['in_102']))


def _sorted_pair_label(row):
    """Join the row's two gene names with '_' after sorting them."""
    return '_'.join(sorted([row.A1, row.A2]))


# Replace the original pair id with an order-independent one, as the first column
screen_pairs = screen_pairs.drop(columns=['pair'])
pair_labels = screen_pairs.apply(_sorted_pair_label, axis=1)
screen_pairs.insert(0, 'sorted_gene_pair', pair_labels)

# Summary numbers: pairs called SL in at least 1, at least 2, and all 3 cell lines
n_screened = screen_pairs.shape[0]
n_sl_1plus = len(screen_pairs[screen_pairs.n_SL >= 1])
n_sl_2plus = len(screen_pairs[screen_pairs.n_SL >= 2])
n_sl_all3 = len(screen_pairs[screen_pairs.n_SL == 3])
print('SL in 1+:', n_sl_1plus)
print('SL in 2+:', n_sl_2plus)
print('SL in all 3:', n_sl_all3)
print('%% SL 2+: %.2f%%' % (n_sl_2plus / n_screened * 100))
screen_pairs.head(2)

Overlap w/ Ens.93: 393
Overlap w/ Ens.102: 396
SL in 1+: 24
SL in 2+: 19
SL in all 3: 14
% SL 2+: 4.71%


Unnamed: 0,sorted_gene_pair,A549,HT29,OVCAR8,A1,A2,A549_SL,HT29_SL,OVCAR8_SL,n_SL,in_93,in_102
0,ABHD4_ABHD5,-0.609,-0.036,-0.046,ABHD4,ABHD5,False,False,False,0,True,True
1,ABL1_ABL2,0.088,0.887,0.08,ABL1,ABL2,False,False,False,0,True,True


In [8]:
# Write the annotated pairs without the row index. The separate gene columns are
# dropped; sorted_gene_pair already identifies each pair.
(
    screen_pairs
    .drop(columns=['A1', 'A2'])
    .to_csv(file_dede_pairs, index=False)
)