## Thompson et al. (2020) double perturbation screens

https://www.nature.com/articles/s41467-021-21478-9

**Input**: CRISPR double perturbation screen data from Thompson et al.

- Table 1 = Gene pairs used in the screen and analysis results
- Table 3 = Gene pairs and associated gene symbols
- Table 5 = Statistically significant gene pairs after filtering; SL gene pairs in each of the three cell lines

**Output**: Paralog pairs from the screen, annotated with synthetic lethal (SL) status

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import os
import seaborn as sns
import matplotlib.pyplot as plt

def get_data_path(folders, fname):
    """Return the normalised path of a file below the third-party data directory.

    The root directory is read from the ``3RD_PARTY_DIR`` environment variable
    each time the function is called.

    Args:
        folders: Sequence of sub-directory names below the root, outermost first.
        fname: Name of the file inside the innermost folder.

    Returns:
        The path as a string, passed through ``os.path.normpath``.

    Raises:
        KeyError: If ``3RD_PARTY_DIR`` is not set in the environment.
    """
    return os.path.normpath(os.environ['3RD_PARTY_DIR'] + '/' + '/'.join(folders) + '/' + fname)


def get_local_data_path(folders, fname):
    """Return the normalised path of a file below ``../../local_data/``.

    Args:
        folders: Sequence of sub-directory names below ``local_data``, outermost first.
        fname: Name of the file inside the innermost folder.

    Returns:
        The (relative) path as a string, passed through ``os.path.normpath``.
    """
    return os.path.normpath('../../local_data/' + '/'.join(folders) + '/' + fname)

# Input: supplementary tables of the paper, stored below the third-party data directory
thompson_dir = ['GI_screens', 'thompson_2020']
file_tab_s1 = get_data_path(thompson_dir, 'Supplementary_Data_1.xlsx')
file_tab_s3 = get_data_path(thompson_dir, 'Supplementary_Data_3.xlsx')
file_tab_s5 = get_data_path(thompson_dir, 'Supplementary_Data_5.xlsx')


def file_paralog_pairs(ens_v):
    """Return the path of the processed paralog pair file for one Ensembl version.

    Args:
        ens_v: Ensembl version as a string (e.g. '93'); it is appended to
            'ensembl' to form the folder name.
    """
    return get_local_data_path(['processed', 'ensembl' + ens_v], 'all_pairs.csv')


# Output
file_thompson_pairs = get_local_data_path(['processed', 'screen_pairs'], 'thompson_pairs.csv')

### Load supplemental data tables

#### Table 1

In [2]:
# RRA = Robust Rank Aggregation
# Significance called at FDR < 0.1 for both statistics
# The first two rows of the sheet are skipped; the column pandas labelled
# 'Unnamed: 42' (i.e. one without a header) is discarded.
table1_raw = (
    pd.read_excel(file_tab_s1, sheet_name=0, skiprows=2)
    .drop(columns=['Unnamed: 42'])
)
table1_raw.head(1)

Unnamed: 0,Pair,GENE PAIR,Gene pair class,A375_D28_rra_fdr_low,A375_D28_t_fdr_low,A375_D14_rra_fdr_low,A375_D14_t_fdr_low,Passes A375 filter?,Mewo_D28_rra_fdr_low,Mewo_D28_t_fdr_low,...,MEWO_D14_Bagel_Gene1,MEWO_D14_Bagel_Gene2,RPE_D14_Bagel_Gene1,RPE_D14_Bagel_Gene2,A375_D28_Bagel_Gene1,A375_D28_Bagel_Gene2,MEWO_D28_Bagel_Gene1,MEWO_D28_Bagel_Gene2,RPE_D28_Bagel_Gene1,RPE_D28_Bagel_Gene2
0,1,AARS2_AARS,Paralogous_gene_pair,0.228786,1.0,0.999996,1.0,No,0.999996,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
# Clean up filter columns: uniform names, and 'Yes' -> True / anything else -> False
filter_renames = {'Passes the Mewo filter?': 'passes_Mewo_filter',
                  'Passes A375 filter?': 'passes_A375_filter',
                  'Passes RPE filter?': 'passes_RPE_filter'}
table1 = table1_raw.rename(columns=filter_renames)
for flag_col in filter_renames.values():
    table1[flag_col] = table1[flag_col] == 'Yes'

# Extract gene symbols: the first two '_'-separated tokens of the pair label
pair_tokens = [label.split('_') for label in table1['GENE PAIR']]
table1['gene1'] = [tokens[0] for tokens in pair_tokens]
table1['gene2'] = [tokens[1] for tokens in pair_tokens]
table1.head(1)

Unnamed: 0,Pair,GENE PAIR,Gene pair class,A375_D28_rra_fdr_low,A375_D28_t_fdr_low,A375_D14_rra_fdr_low,A375_D14_t_fdr_low,passes_A375_filter,Mewo_D28_rra_fdr_low,Mewo_D28_t_fdr_low,...,RPE_D14_Bagel_Gene1,RPE_D14_Bagel_Gene2,A375_D28_Bagel_Gene1,A375_D28_Bagel_Gene2,MEWO_D28_Bagel_Gene1,MEWO_D28_Bagel_Gene2,RPE_D28_Bagel_Gene1,RPE_D28_Bagel_Gene2,gene1,gene2
0,1,AARS2_AARS,Paralogous_gene_pair,0.228786,1.0,0.999996,1.0,False,0.999996,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,AARS2,AARS


In [4]:
# From paper: "identified between 177 and 201 candidate SL interactions per cell line"
def count_hits(cell_line, w_filter=False, pairs=None, fdr_threshold=0.1):
    """Count gene pairs called as hits at day 28 in one cell line.

    A pair is a hit when both its ``<cell_line>_D28_t_fdr_low`` and
    ``<cell_line>_D28_rra_fdr_low`` values are strictly below ``fdr_threshold``.

    Args:
        cell_line: Column prefix of the cell line, e.g. 'A375', 'Mewo' or 'RPE'.
        w_filter: If True, additionally require ``passes_<cell_line>_filter`` to be True.
        pairs: DataFrame holding the columns above. Defaults to the notebook-level
            ``table1`` so that existing calls keep working.
        fdr_threshold: Exclusive upper bound applied to both FDR columns.

    Returns:
        The number of rows satisfying the criteria, as an int.
    """
    if pairs is None:
        pairs = table1
    prefix = cell_line + '_D28_'
    is_hit = (pairs[prefix + 't_fdr_low'] < fdr_threshold) & (pairs[prefix + 'rra_fdr_low'] < fdr_threshold)
    if w_filter:
        is_hit = is_hit & pairs['passes_' + cell_line + '_filter']
    return int(is_hit.sum())

print('N pairs total:', len(table1))
for cell_line in ('A375', 'Mewo', 'RPE'):
    print('Unfiltered %s hits:' % cell_line, count_hits(cell_line))

# From paper: "refined our candidate list to 40–57 candidate SL interactions per cell line"
for cell_line in ('A375', 'Mewo', 'RPE'):
    print('Filtered %s hits:' % cell_line, count_hits(cell_line, w_filter=True))

N pairs total: 1192
Unfiltered A375 hits: 199
Unfiltered Mewo hits: 177
Unfiltered RPE hits: 201
Filtered A375 hits: 44
Filtered Mewo hits: 40
Filtered RPE hits: 57


#### Table 3

In [5]:
# Skip the first two rows of the sheet and discard every row with a missing value
table3_raw = (
    pd.read_excel(file_tab_s3, sheet_name=0, skiprows=2)
    .dropna()
)
print('N:', len(table3_raw))
table3_raw.head(2)

N: 2222


Unnamed: 0,GeneA_ENSEMBL_gene_ID,GeneASymbol,GeneA_ENSEMBL_gene_ID_dom_hits_count_Zero_ex_offtar,GeneB_ENSEMBL_gene_ID,GeneBSymbol,GeneB_ENSEMBL_gene_ID_dom_hits_count_Zero_ex_offtar,gene_pair_origin,pair_gRNA_with_protdom_hits,uniq_pair_id
0,ENSG00000012048,BRCA1,56.0,ENSG00000143799,PARP1,77.0,SynLethDB_gene_pair,1.0,ENSG00000012048_BRCA1_ENSG00000143799_PARP1
1,ENSG00000175054,ATR,132.0,ENSG00000136997,MYC,28.0,SynLethDB_gene_pair,1.0,ENSG00000175054_ATR_ENSG00000136997_MYC


In [6]:
# Check if any Table 1 genes are not in Table 3
# (gene1 is looked up among the GeneA symbols, gene2 among the GeneB symbols)
pair_cols = ['GENE PAIR', 'Gene pair class']
gene1_missing = ~table1['gene1'].isin(table3_raw.GeneASymbol)
gene2_missing = ~table1['gene2'].isin(table3_raw.GeneBSymbol)
display(table1.loc[gene1_missing, pair_cols])
display(table1.loc[gene2_missing, pair_cols])

Unnamed: 0,GENE PAIR,Gene pair class
639,Non_Tar_RND_control_Non_Tar_RND_control (these...,Control pair (non-targeting controls paired)
716,PLPP6_PLPP7,Paralogous_gene_pair
823,RP11-111K18.1_PSMA2,Paralogous_gene_pair


Unnamed: 0,GENE PAIR,Gene pair class
639,Non_Tar_RND_control_Non_Tar_RND_control (these...,Control pair (non-targeting controls paired)


In [7]:
# Keep only the symbol and Ensembl ID columns, under the names used in the rest of the notebook
table3_renames = {'GeneASymbol': 'gene1', 'GeneBSymbol': 'gene2',
                  'GeneA_ENSEMBL_gene_ID': 'A1_ensembl', 'GeneB_ENSEMBL_gene_ID': 'A2_ensembl'}
table3 = table3_raw.rename(columns=table3_renames)[list(table3_renames.values())]

# Manual curation: replacing PPAPDC2 with PLPP6 in Table 3, to match Table 1 (PPAPDC2 is the prev. symbol)
is_ppapdc2_pair = (table3.gene1 == 'PPAPDC2') & (table3.gene2 == 'PLPP7')
table3.loc[is_ppapdc2_pair, 'gene1'] = 'PLPP6'

# Manual curation: replace the Ensembl ID for SCO2
table3 = table3.replace('ENSG00000130489', 'ENSG00000284194')
table3.head(1)

Unnamed: 0,gene1,gene2,A1_ensembl,A2_ensembl
0,BRCA1,PARP1,ENSG00000012048,ENSG00000143799


#### Table 5

In [8]:
# Final (filtered) hits in each cell line
table5 = pd.read_excel(file_tab_s5, sheet_name=0, skiprows=3)
for cell_line in ('A375', 'Mewo', 'RPE'):
    # Count only the non-missing entries of the cell line's column
    print('%s hits:' % cell_line, table5[cell_line].notna().sum())
table5.head(1)

A375 hits: 44
Mewo hits: 40
RPE hits: 57


Unnamed: 0,A375,Mewo,RPE
0,AP2A2_AP2A1,ARID1A_ARID1B,ALAS1_ALAS2


#### Merge Table 1 with Table 3 IDs

In [9]:
# Merge with table 3 which has ensembl ids for the symbols
# (inner join on the columns the two frames share)
table3_ids = table3[['gene1', 'gene2', 'A1_ensembl', 'A2_ensembl']]
table1_merged = table1.merge(table3_ids)
# Expect 2 fewer pairs than in Table 1
assert len(table1_merged) == len(table1) - 2

# Note several paralog pairs are in the Miscellaneous category instead so don't filter on this!
n_paralog_class = (table1_merged['Gene pair class'] == 'Paralogous_gene_pair').sum()
print('N paralog pairs screened:', n_paralog_class, '/', len(table1_merged))

# Re-order columns to show gene ids near the start:
# first three columns, then the last four, then everything in between
all_cols = table1_merged.columns.to_list()
lead_cols, id_cols, stat_cols = all_cols[:3], all_cols[-4:], all_cols[3:-4]
table1_merged = table1_merged[lead_cols + id_cols + stat_cols]
table1_merged.head(1)

N paralog pairs screened: 644 / 1190


Unnamed: 0,Pair,GENE PAIR,Gene pair class,gene1,gene2,A1_ensembl,A2_ensembl,A375_D28_rra_fdr_low,A375_D28_t_fdr_low,A375_D14_rra_fdr_low,...,MEWO_D14_Bagel_Gene1,MEWO_D14_Bagel_Gene2,RPE_D14_Bagel_Gene1,RPE_D14_Bagel_Gene2,A375_D28_Bagel_Gene1,A375_D28_Bagel_Gene2,MEWO_D28_Bagel_Gene1,MEWO_D28_Bagel_Gene2,RPE_D28_Bagel_Gene1,RPE_D28_Bagel_Gene2
0,1,AARS2_AARS,Paralogous_gene_pair,AARS2,AARS,ENSG00000124608,ENSG00000090861,0.228786,1.0,0.999996,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Label SL gene pairs

In [10]:
# Column names derived from the three screened cell lines
cell_lines = ['A375', 'Mewo', 'RPE']
sl_cols = [line + '_SL' for line in cell_lines]
filter_cols = ['passes_' + line + '_filter' for line in cell_lines]

# Call hits based on table S5 list: a pair is SL in a cell line if its label
# appears in that cell line's column of Table 5
screen_hits = table1_merged.assign(**{line + '_SL': table1_merged['GENE PAIR'].isin(table5[line])
                                      for line in cell_lines})
# Sum hits across cell lines
screen_hits['n_SL'] = screen_hits[sl_cols].sum(axis=1)

# Label which pairs pass at least 1 of the 'individually essential' filters
# (vectorised OR over the three boolean columns instead of a row-wise apply)
screen_hits['passes_min_1_filter'] = screen_hits[filter_cols].any(axis=1)
# Sanity check: no pair is called SL without passing at least one filter
assert not (~screen_hits.passes_min_1_filter & (screen_hits.n_SL >= 1)).any()
print('Pairs passing at least 1 filter:', screen_hits.passes_min_1_filter.sum(), '/', len(screen_hits))

# Filter to relevant columns
screen_hits = screen_hits[['gene1', 'gene2', 'A1_ensembl', 'A2_ensembl'] + sl_cols +
                          ['n_SL', 'passes_min_1_filter'] + filter_cols + ['Gene pair class']]

print('N hits (1+ cell lines):', (screen_hits.n_SL >= 1).sum())
print('N pairs marked as paralogs:', (screen_hits['Gene pair class'] == 'Paralogous_gene_pair').sum())
screen_hits.head(1)

Pairs passing at least 1 filter: 880 / 1190
N hits (1+ cell lines): 105
N pairs marked as paralogs: 644


Unnamed: 0,gene1,gene2,A1_ensembl,A2_ensembl,A375_SL,Mewo_SL,RPE_SL,n_SL,passes_min_1_filter,passes_A375_filter,passes_Mewo_filter,passes_RPE_filter,Gene pair class
0,AARS2,AARS,ENSG00000124608,ENSG00000090861,False,False,False,0,False,False,False,False,Paralogous_gene_pair


### Merge with Ensembl paralog pairs & export

In [11]:
# Symmetric dataframes
# NOTE(review): presumably each pair is listed in both orientations (A1, A2) and (A2, A1) - confirm against all_pairs.csv
all_pairs_93 = pd.read_csv(file_paralog_pairs(ens_v='93'))
print('N=', len(all_pairs_93))
all_pairs_102 = pd.read_csv(file_paralog_pairs(ens_v='102'))
print('N=', len(all_pairs_102))

N= 73296
N= 179968


In [12]:
# Merge with Ensembl list of paralog pairs (both versions), on Ensembl ID
# (inner joins on the columns shared with screen_hits)
ensembl_cols = ['A1', 'A2', 'A1_ensembl', 'A2_ensembl']
overlap_93 = all_pairs_93[ensembl_cols].merge(screen_hits)
overlap_102 = all_pairs_102[ensembl_cols].merge(screen_hits)
print('Overlap w/ Ens.93:', len(overlap_93))
print('Overlap w/ Ens.102:', len(overlap_102))

# Combine both merges to get pairs in either Ensembl version;
# the merge indicator records which side(s) each row came from
screen_pairs = overlap_93.merge(overlap_102, how='outer', indicator='in_version')
screen_pairs = screen_pairs.assign(in_93=screen_pairs.in_version.isin(['both', 'left_only']),
                                   in_102=screen_pairs.in_version.isin(['both', 'right_only']))
screen_pairs = screen_pairs.drop(columns=['in_version', 'gene1', 'gene2', 'A1_ensembl', 'A2_ensembl'])

# Order-independent pair label: the two symbols sorted alphabetically and joined with '_'
sorted_names = ['_'.join(sorted(symbols)) for symbols in zip(screen_pairs.A1, screen_pairs.A2)]
screen_pairs.insert(0, 'sorted_gene_pair', sorted_names)
screen_pairs.head(1)

Overlap w/ Ens.93: 592
Overlap w/ Ens.102: 567


Unnamed: 0,sorted_gene_pair,A1,A2,A375_SL,Mewo_SL,RPE_SL,n_SL,passes_min_1_filter,passes_A375_filter,passes_Mewo_filter,passes_RPE_filter,Gene pair class,in_93,in_102
0,CASKIN1_CASKIN2,CASKIN1,CASKIN2,False,False,False,0,True,True,False,True,Paralogous_gene_pair,True,True


In [13]:
# Save unfiltered version, without the class label and the individual symbol columns
screen_pairs_export = screen_pairs.drop(columns=['Gene pair class', 'A1', 'A2'])
screen_pairs_export.to_csv(file_thompson_pairs, index=False)

### Check SLs in final dataset

In [14]:
# Filter out pairs that were individually essential in all three cell lines
# The authors exclude these from their candidate SL pairs
# (also restricted to pairs found in the Ensembl 93 paralog list)
keep_rows = screen_pairs.passes_min_1_filter & screen_pairs.in_93
screen_pairs_filt = screen_pairs.loc[keep_rows].reset_index(drop=True)
print('That pass 1+ filter in Ens93:', len(screen_pairs_filt))
screen_pairs_filt.head(2)

That pass 1+ filter in Ens93: 475


Unnamed: 0,sorted_gene_pair,A1,A2,A375_SL,Mewo_SL,RPE_SL,n_SL,passes_min_1_filter,passes_A375_filter,passes_Mewo_filter,passes_RPE_filter,Gene pair class,in_93,in_102
0,CASKIN1_CASKIN2,CASKIN1,CASKIN2,False,False,False,0,True,True,False,True,Paralogous_gene_pair,True,True
1,MIB1_MIB2,MIB2,MIB1,False,False,False,0,True,True,True,True,Paralogous_gene_pair,True,True


In [15]:
def _percent(numerator, denominator):
    """Return numerator / denominator as a percentage, or NaN when the denominator is 0."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator * 100


def print_SL_stats(pairs):
    """Print the number of SL pairs and the corresponding hit rate.

    For each of A375, Mewo and RPE the count of True values in ``<line>_SL`` is
    printed together with its percentage of the pairs whose
    ``passes_<line>_filter`` is True. A final line reports the pairs with
    ``n_SL >= 2`` as a percentage of the pairs with ``passes_min_1_filter``.

    A zero denominator (including an empty frame) is reported as ``nan``
    instead of raising ``ZeroDivisionError`` or emitting a NumPy warning.

    Args:
        pairs: DataFrame with the boolean ``*_SL`` and ``passes_*_filter``
            columns, plus ``n_SL`` and ``passes_min_1_filter``.
    """
    for cell_line in ('A375', 'Mewo', 'RPE'):
        n_sl = int(pairs[cell_line + '_SL'].sum())
        n_passing = int(pairs['passes_' + cell_line + '_filter'].sum())
        print('%s SL: %d -> %.2f%%' % (cell_line, n_sl, _percent(n_sl, n_passing)))

    n_multi = int((pairs.n_SL >= 2).sum())
    n_any_filter = int(pairs.passes_min_1_filter.sum())
    print('SL 2+: %d -> %.2f%%' % (n_multi, _percent(n_multi, n_any_filter)))

# Hit rates among the pairs that pass at least one filter and are in the Ensembl 93 list
print_SL_stats(pairs=screen_pairs_filt)

A375 SL: 38 -> 9.82%
Mewo SL: 26 -> 6.18%
RPE SL: 39 -> 9.01%
SL 2+: 23 -> 4.84%


In [16]:
# Check screen pairs that were not marked as paralogs
not_paralog_class = screen_pairs['Gene pair class'].ne('Paralogous_gene_pair')
screen_pairs.loc[not_paralog_class]

Unnamed: 0,sorted_gene_pair,A1,A2,A375_SL,Mewo_SL,RPE_SL,n_SL,passes_min_1_filter,passes_A375_filter,passes_Mewo_filter,passes_RPE_filter,Gene pair class,in_93,in_102
180,KDM6A_UTY,KDM6A,UTY,False,False,False,0,True,True,True,True,Miscellaneous_pair,True,True
181,KDM6B_UTY,KDM6B,UTY,False,False,False,0,True,True,True,True,Miscellaneous_pair,True,True
182,KDM6A_KDM6B,KDM6A,KDM6B,False,False,False,0,True,True,True,True,Miscellaneous_pair,True,True
226,EGFR_ERBB3,EGFR,ERBB3,False,False,False,0,True,True,True,True,SynLethDB_gene_pair,True,True
593,EGFR_FGFR2,EGFR,FGFR2,False,False,False,0,True,True,True,True,SynLethDB_gene_pair,False,True
