## Thompson et al (2020) double perturbation screens

**Input**: CRISPR double perturbation screen data from Thompson et al

**Output**: Paralog pairs from screen annotated with SL status

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import os

import seaborn as sns
import matplotlib.pyplot as plt

get_data_path = lambda folders, fname: os.path.normpath(os.environ['3RD_PARTY_DIR']+'/'+'/'.join(folders) +'/'+ fname)
get_local_data_path = lambda folders, fname: os.path.normpath('../../local_data/' +'/'.join(folders) +'/'+ fname)

# Input
file_thompson_gi_screen = get_data_path(['GI_screens', 'thompson_2020'], 'thompson_2020.xlsx')
file_gene_scores = get_local_data_path(['processed', 'depmap20Q2'], 'gene_scores_26_05_20.csv')
file_paralog_pairs_unfiltered = get_local_data_path(['processed', 'ensembl93'], 'all_pairs_unfiltered.csv')
file_paralog_pairs = get_local_data_path(['processed', 'ensembl93'], 'all_pairs.csv')
file_gene_id_map = get_local_data_path(['processed'], 'HGNC_gene_id_map.csv')

file_plos_tabS4 = get_local_data_path(['other'], 'closest_pairs_annotated_17_07_19.csv')
file_depmap_pairs = get_local_data_path(['results'], 'bronze_standard_SL_pairs_24_09_20.csv')

# Output
file_thompson_pairs = get_local_data_path(['processed', 'screen_pairs'], 'thompson_pairs.csv')

### Extract paralog pairs from the gene pairs screened

#### Load Tables from file
- Table 1 = results for all screened gene pairs
- Table 3 = gene id mappings
- Table 5 = SL gene pairs in each of the three cell lines

In [2]:
# RRA = Robust Rank Aggregation
# Significance called at FDR < 0.1 for both statistics
table1_raw = pd.read_excel(file_thompson_gi_screen, sheet_name=0, skiprows=2)
print(table1_raw.columns)
table1_raw[:1]

Index(['Pair', 'GENE PAIR', 'Gene pair class', 'A375_D28_rra_fdr_low',
       'A375_D28_t_fdr_low', 'A375_D14_rra_fdr_low', 'A375_D14_t_fdr_low',
       'Passes A375 filter?', 'Mewo_D28_rra_fdr_low', 'Mewo_D28_t_fdr_low',
       'Mewo_D14_rra_fdr_low', 'Mewo_D14_t_fdr_low', 'Passes the Mewo filter?',
       'RPE_D14_Mageck_gene1', 'RPE_D14_Mageck_gene2', 'A375_Mageck_D28_gene1',
       'A375_Mageck_D28_Gene2', 'MEWO_Mageck_D28_Gene1',
       'MEWO_Mageck_D28_Gene2', 'RPE_D28_Mageck_gene1', 'RPE_D28_Mageck_gene2',
       'A375_D14_Bagel_Gene1', 'A375_D14_Bagel_Gene2', 'MEWO_D14_Bagel_Gene1',
       'MEWO_D14_Bagel_Gene2', 'RPE_D14_Bagel_Gene1', 'RPE_D14_Bagel_Gene2',
       'A375_D28_Bagel_Gene1', 'A375_D28_Bagel_Gene2', 'MEWO_D28_Bagel_Gene1',
       'MEWO_D28_Bagel_Gene2', 'RPE_D28_Bagel_Gene1', 'RPE_D28_Bagel_Gene2'],
      dtype='object')


Unnamed: 0,Pair,GENE PAIR,Gene pair class,A375_D28_rra_fdr_low,A375_D28_t_fdr_low,A375_D14_rra_fdr_low,A375_D14_t_fdr_low,Passes A375 filter?,Mewo_D28_rra_fdr_low,Mewo_D28_t_fdr_low,...,MEWO_D14_Bagel_Gene1,MEWO_D14_Bagel_Gene2,RPE_D14_Bagel_Gene1,RPE_D14_Bagel_Gene2,A375_D28_Bagel_Gene1,A375_D28_Bagel_Gene2,MEWO_D28_Bagel_Gene1,MEWO_D28_Bagel_Gene2,RPE_D28_Bagel_Gene1,RPE_D28_Bagel_Gene2
0,1,AARS2_AARS,Paralogous_gene_pair,0.228786,1.0,0.999996,1.0,No,0.999996,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
# Check that I call significant hits correctly
# From Nicky's thesis, number of hits should be: A375=44, Mewo=40, RPE=58
print('A375 hits:', table1_raw[(table1_raw.A375_D28_t_fdr_low < 0.1) & (table1_raw.A375_D28_rra_fdr_low < 0.1) & 
                               (table1_raw['Passes A375 filter?']=='Yes')].shape[0])
print('Mewo hits:', table1_raw[(table1_raw.Mewo_D28_t_fdr_low < 0.1) & (table1_raw.Mewo_D28_rra_fdr_low < 0.1) & 
                               (table1_raw['Passes the Mewo filter?']=='Yes')].shape[0])

A375 hits: 44
Mewo hits: 40


In [4]:
# Gene id - symbol mappings - Table 3 in file
table3_raw = pd.read_excel(file_thompson_gi_screen, sheet_name=2, skiprows=2)
table3_raw[:1]

Unnamed: 0,GeneA_ENSEMBL_gene_ID,GeneASymbol,GeneA_ENSEMBL_gene_ID_dom_hits_count_Zero_ex_offtar,GeneB_ENSEMBL_gene_ID,GeneBSymbol,GeneB_ENSEMBL_gene_ID_dom_hits_count_Zero_ex_offtar,gene_pair_origin,pair_gRNA_with_protdom_hits,uniq_pair_id
0,ENSG00000012048,BRCA1,56.0,ENSG00000143799,PARP1,77.0,SynLethDB_gene_pair,1.0,ENSG00000012048_BRCA1_ENSG00000143799_PARP1


In [5]:
table3 = table3_raw.rename(columns={'GeneASymbol':'A1', 'GeneBSymbol':'A2',
                                    'GeneA_ENSEMBL_gene_ID':'A1_ensembl',
                                    'GeneB_ENSEMBL_gene_ID':'A2_ensembl'})[['A1','A2','A1_ensembl','A2_ensembl']]

# Manual curation: replacing PPAPDC2 with PLPP6 to match Table 1 (PPAPDC2 is the prev. symbol)
table3.loc[(table3.A1=='PPAPDC2') & (table3.A2=='PLPP7'), 'A1'] = 'PLPP6'
table3[:1]

Unnamed: 0,A1,A2,A1_ensembl,A2_ensembl
0,BRCA1,PARP1,ENSG00000012048,ENSG00000143799


In [6]:
# SL pairs - Table 5 in file
table5 = pd.read_excel(file_thompson_gi_screen, sheet_name=4, skiprows=3)
table5[:2]

Unnamed: 0,A375,Mewo,RPE
0,AP2A2_AP2A1,ARID1A_ARID1B,ALAS1_ALAS2
1,ARID4B_ARID4A,EAF1_EAF2,ALKBH4_ATR


#### Table 1 transformations: derive RPE filter, extract gene symbols from pairs, merge with Table 3 IDs

In [7]:
# Note several paralog pairs are in the Miscellaneous category instead so don't filter!
print('N paralogs:', table1_raw[table1_raw['Gene pair class']=='Paralogous_gene_pair'].shape[0], '/', table1_raw.shape[0])

# Clean up filter
table1 = table1_raw.rename(columns={'Passes A375 filter?':'passes_A375_filter', 
                                    'Passes the Mewo filter?':'passes_Mewo_filter'})
table1.passes_A375_filter = table1.passes_A375_filter.apply(lambda x: x=='Yes')
table1.passes_Mewo_filter = table1.passes_Mewo_filter.apply(lambda x: x=='Yes')

# Derive RPE filter from provided BAGEL and Mageck scores (Day 14)
table1 = table1.assign(passes_rpe_filter = ~((table1['RPE_D14_Bagel_Gene1']==1) | (table1['RPE_D14_Bagel_Gene2']==1) |
                                             (table1['RPE_D14_Mageck_gene1']<0.1) | (table1['RPE_D14_Mageck_gene2']<0.1)))

# Extract gene symbols
table1['A1'] = table1['GENE PAIR'].apply(lambda x: x.split('_')[0])
table1['A2'] = table1['GENE PAIR'].apply(lambda x: x.split('_')[1])

# Merge with table 3 which has ensembl ids for the symbols
table1 = pd.merge(table1, table3[['A1','A2','A1_ensembl','A2_ensembl']], how='left')
# Re-order columns to show gene ids near the start
cols = table1.columns.to_list()
table1 = table1[cols[:3] + cols[-4:] + cols[3:-4]]

display('Ens id not in table 3:')
display(table1.loc[table1.A1_ensembl.isna()][['GENE PAIR', 'A1', 'A2']])
# RP11-111K18.1 is also called PSMA2, novel readthrough gene - excluding
table1 = table1[~((table1.A1=='Non') & (table1.A2=='Tar'))].reset_index(drop=True)

table1[:1]

N paralogs: 645 / 1192


'Ens id not in table 3:'

Unnamed: 0,GENE PAIR,A1,A2
639,Non_Tar_RND_control_Non_Tar_RND_control (these...,Non,Tar
824,RP11-111K18.1_PSMA2,RP11-111K18.1,PSMA2


Unnamed: 0,Pair,GENE PAIR,Gene pair class,A1,A2,A1_ensembl,A2_ensembl,A375_D28_rra_fdr_low,A375_D28_t_fdr_low,A375_D14_rra_fdr_low,...,MEWO_D14_Bagel_Gene2,RPE_D14_Bagel_Gene1,RPE_D14_Bagel_Gene2,A375_D28_Bagel_Gene1,A375_D28_Bagel_Gene2,MEWO_D28_Bagel_Gene1,MEWO_D28_Bagel_Gene2,RPE_D28_Bagel_Gene1,RPE_D28_Bagel_Gene2,passes_rpe_filter
0,1,AARS2_AARS,Paralogous_gene_pair,AARS2,AARS,ENSG00000124608,ENSG00000090861,0.228786,1.0,0.999996,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,False


#### Label SL gene pairs

In [8]:
screen_hits = table1.assign(A375_SL= table1['GENE PAIR'].isin(table5['A375']),
                            Mewo_SL= table1['GENE PAIR'].isin(table5['Mewo']),
                            RPE_SL= table1['GENE PAIR'].isin(table5['RPE']))
screen_hits['n_SL'] = screen_hits[['A375_SL','Mewo_SL','RPE_SL']].sum(axis=1)

# Label which pairs pass at least 1 of the 'individually essential' filters
screen_hits['passes_min_1_filter'] = screen_hits.apply(
                                        lambda x: x.passes_A375_filter | x.passes_Mewo_filter | x.passes_rpe_filter, axis=1)

# Compute a score for each gene in each cell line (where available)
screen_hits['A375_score'] = screen_hits[['A375_D28_t_fdr_low','A375_D28_rra_fdr_low']].min(axis=1)
screen_hits['Mewo_score'] = screen_hits[['Mewo_D28_t_fdr_low','Mewo_D28_rra_fdr_low']].min(axis=1)

screen_hits = screen_hits[['A1_ensembl', 'A2_ensembl', 'A1','A2', 'GENE PAIR', 'A375_SL', 'Mewo_SL','RPE_SL', 'n_SL', 
                           'passes_min_1_filter', 'passes_A375_filter', 'passes_Mewo_filter', 'passes_rpe_filter', 
                           'Gene pair class', 'A375_score', 'Mewo_score']]
print('N=', screen_hits.shape[0])
screen_hits[:1]

N= 1191


Unnamed: 0,A1_ensembl,A2_ensembl,A1,A2,GENE PAIR,A375_SL,Mewo_SL,RPE_SL,n_SL,passes_min_1_filter,passes_A375_filter,passes_Mewo_filter,passes_rpe_filter,Gene pair class,A375_score,Mewo_score
0,ENSG00000124608,ENSG00000090861,AARS2,AARS,AARS2_AARS,False,False,False,0,False,False,False,False,Paralogous_gene_pair,0.228786,0.999996


#### Merge table 1 with Ensembl paralog pairs
Ensembl list is symmetric so only have to do 1 merge

Also filter out pairs where one of the genes was individually essential in all three cell lines screened

In [9]:
paralog_pairs = pd.read_csv(file_paralog_pairs)[['A1','A2','A1_ensembl','A2_ensembl','min_seq_id','max_seq_id']]
print('N=', paralog_pairs.shape[0])
paralog_pairs[:1]

N= 73296


Unnamed: 0,A1,A2,A1_ensembl,A2_ensembl,min_seq_id,max_seq_id
0,TEKT2,TEKT4,ENSG00000092850,ENSG00000163060,0.289655,0.293023


In [10]:
print('N paralogs:', table1[table1['Gene pair class']=='Paralogous_gene_pair'].shape[0])

N paralogs: 645


In [11]:
# Do any pairs match on symbol but not ensembl ids?
df = pd.merge(paralog_pairs[['A1_ensembl','A2_ensembl','A1','A2']], 
              screen_hits[['A1_ensembl','A2_ensembl','A1','A2']], on=['A1','A2'])
df[(df.A1_ensembl_x != df.A1_ensembl_y) | (df.A2_ensembl_x != df.A2_ensembl_y)]

Unnamed: 0,A1_ensembl_x,A2_ensembl_x,A1,A2,A1_ensembl_y,A2_ensembl_y
409,ENSG00000133028,ENSG00000284194,SCO1,SCO2,ENSG00000133028,ENSG00000130489


In [12]:
# Merge data frames on Ensembl ID
# Replace the Ensembl ID for SCO2 (this is differen in my data)
overlap = pd.merge(paralog_pairs, screen_hits.replace('ENSG00000130489', 'ENSG00000284194').drop(columns=['A1','A2']))

# Sort on A1, A2 (so order is same as in bronze standard) - A1 and A2 come from my paralog pairs file here
screen_pairs = pd.concat([pd.DataFrame(np.sort(overlap[['A1','A2']]), columns=['A1','A2']),
                          overlap.drop(columns=['A1','A2','A1_ensembl','A2_ensembl'])], axis=1)
# Need to merge ensembl ids back after sorting to get the correct symbol-ensembl id mappings
screen_pairs = pd.merge(screen_pairs, paralog_pairs[['A1','A2','A1_ensembl','A2_ensembl']])
assert(screen_pairs.shape[0] == overlap.shape[0])
print('Paralog pairs in Ens93 w/ 20%+ seq id:', screen_pairs.shape[0])

# Filter out pairs that were individually essential in all three cell lines
# The authors exclude these from their candidate SL pairs
screen_pairs = screen_pairs[screen_pairs.passes_min_1_filter].reset_index(0)
print('That pass 1+ filter:', screen_pairs[screen_pairs.passes_min_1_filter].shape[0])

screen_pairs[:1]

Paralog pairs in Ens93 w/ 20%+ seq id: 592
That pass 1+ filter: 541


Unnamed: 0,index,A1,A2,min_seq_id,max_seq_id,GENE PAIR,A375_SL,Mewo_SL,RPE_SL,n_SL,passes_min_1_filter,passes_A375_filter,passes_Mewo_filter,passes_rpe_filter,Gene pair class,A375_score,Mewo_score,A1_ensembl,A2_ensembl
0,0,CASKIN1,CASKIN2,0.396226,0.471714,CASKIN1_CASKIN2,False,False,False,0,True,True,False,True,Paralogous_gene_pair,0.999996,1.33e-10,ENSG00000167971,ENSG00000177303


#### Some summary numbers for the pairs in this screen

In [13]:
print('A375 SL: %d -> %.2f%%' % (sum(screen_pairs.A375_SL), sum(screen_pairs.A375_SL)/sum(screen_pairs.passes_A375_filter)*100))
print('Mewo SL: %d -> %.2f%%' % (sum(screen_pairs.Mewo_SL), sum(screen_pairs.Mewo_SL)/sum(screen_pairs.passes_Mewo_filter)*100))
print('RPE SL: %d -> %.2f%%' % (sum(screen_pairs.RPE_SL), sum(screen_pairs.RPE_SL)/sum(screen_pairs.passes_rpe_filter)*100))

print('SL 2+ N=%d -> %.2f%%' % (screen_pairs[screen_pairs.n_SL >= 2].shape[0],
                              screen_pairs[screen_pairs.n_SL >= 2].shape[0]/sum(screen_pairs.passes_min_1_filter)*100))
screen_pairs[:1]

A375 SL: 38 -> 9.82%
Mewo SL: 26 -> 6.18%
RPE SL: 39 -> 7.59%
SL 2+ N=23 -> 4.25%


Unnamed: 0,index,A1,A2,min_seq_id,max_seq_id,GENE PAIR,A375_SL,Mewo_SL,RPE_SL,n_SL,passes_min_1_filter,passes_A375_filter,passes_Mewo_filter,passes_rpe_filter,Gene pair class,A375_score,Mewo_score,A1_ensembl,A2_ensembl
0,0,CASKIN1,CASKIN2,0.396226,0.471714,CASKIN1_CASKIN2,False,False,False,0,True,True,False,True,Paralogous_gene_pair,0.999996,1.33e-10,ENSG00000167971,ENSG00000177303


In [14]:
# Check screen pairs that were not marked as paralogs
screen_pairs[screen_pairs['Gene pair class'] != 'Paralogous_gene_pair']

Unnamed: 0,index,A1,A2,min_seq_id,max_seq_id,GENE PAIR,A375_SL,Mewo_SL,RPE_SL,n_SL,passes_min_1_filter,passes_A375_filter,passes_Mewo_filter,passes_rpe_filter,Gene pair class,A375_score,Mewo_score,A1_ensembl,A2_ensembl
168,180,KDM6A,UTY,0.834488,0.8601,KDM6A_UTY,False,False,False,0,True,True,True,True,Miscellaneous_pair,0.999996,0.999996,ENSG00000147050,ENSG00000183878
169,181,KDM6B,UTY,0.258026,0.300554,KDM6B_UTY,False,False,False,0,True,True,True,True,Miscellaneous_pair,0.999996,0.999996,ENSG00000132510,ENSG00000183878
170,182,KDM6A,KDM6B,0.265161,0.318344,KDM6A_KDM6B,False,False,False,0,True,True,True,True,Miscellaneous_pair,0.999996,0.999996,ENSG00000147050,ENSG00000132510
211,226,EGFR,ERBB3,0.375559,0.416529,EGFR_ERBB3,False,False,False,0,True,True,True,True,SynLethDB_gene_pair,0.999996,0.999996,ENSG00000146648,ENSG00000065361


In [15]:
screen_pairs.drop(columns=['min_seq_id','max_seq_id']).to_csv(file_thompson_pairs, index=0)

### Overlap w/ our bronze standard set

In [16]:
depmap_pairs = pd.read_csv(file_depmap_pairs)
print('N=', depmap_pairs[depmap_pairs.SL].shape[0], '/', depmap_pairs.shape[0])
depmap_pairs[:1]

N= 127 / 3637


Unnamed: 0,A1,A2,SL,A1_ensembl,A2_ensembl,A1_entrez,A2_entrez
0,AAK1,BMP2K,False,ENSG00000115977,ENSG00000138756,22848,55589


In [17]:
overlap = pd.merge(screen_pairs[screen_pairs.passes_min_1_filter], depmap_pairs[['A1','A2','SL']])
print('Overlap w/ DepMap pairs:', overlap.shape[0])
print('Same status: %d, %.2f%% - SL: %d' % 
      (overlap[overlap.SL == (overlap.n_SL>0)].shape[0], 
       overlap[overlap.SL == (overlap.n_SL>0)].shape[0] / overlap.shape[0]*100,
       overlap[overlap.SL & (overlap.SL == (overlap.n_SL>0))].shape[0]))
print('SL DepMap only:', overlap[(overlap.SL) & (overlap.n_SL<1)].shape[0])
print('SL Thompson only (1+):', overlap[(~overlap.SL) & (overlap.n_SL>0)].shape[0])
ctab = pd.crosstab(overlap.n_SL>0, overlap.SL)
print('FET:', stats.fisher_exact(ctab))
ctab

Overlap w/ DepMap pairs: 159
Same status: 125, 78.62% - SL: 14
SL DepMap only: 19
SL Thompson only (1+): 15
FET: (5.4526315789473685, 0.00019557261557545704)


SL,False,True
n_SL,Unnamed: 1_level_1,Unnamed: 2_level_1
False,111,19
True,15,14


In [18]:
# Pairs SL in DepMap and SL in 2+ cell lines in screen
overlap[overlap.SL & (overlap.n_SL>=2)][['A1','A2','n_SL','A375_SL','Mewo_SL','RPE_SL']]

Unnamed: 0,A1,A2,n_SL,A375_SL,Mewo_SL,RPE_SL
3,TTC7A,TTC7B,3,True,True,True
5,CNOT7,CNOT8,3,True,True,True
20,CHM,CHML,2,False,True,True
89,UAP1,UAP1L1,2,True,True,False
102,SMARCA2,SMARCA4,2,True,False,True
107,FAM50A,FAM50B,3,True,True,True
135,EAF1,EAF2,2,True,True,False
