## CERES post-processing

**Input:**
CERES output file (a score for each gene in each cell line)

**Output:**
Cleaned up and filtered gene scores file to use for the rest of the analysis

For the output, use the same format as the table from DepMap.  
* Genes as column headings
* Cell lines as rows

Post-processing steps:
1. Filter out genes that were not targeted by enough guides (less than 3).  
2. Drop genes not in DepMap results.  
3. Rescale scores to the reference essentials / non-essentials.
4. Update genes (columns) to Entrez IDs.

In [1]:
import pandas as pd
import numpy as np
import os
import re
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt
import sklearn.decomposition

get_data_path = lambda folders, fname: os.path.normpath(os.environ['3RD_PARTY_DIR']+'/'+'/'.join(folders) +'/'+fname)
get_local_data_path = lambda folders, fname: os.path.normpath('../local_data/' +'/'.join(folders) +'/'+ fname)

# Input from running CERES / sgRNA mapping
file_ceres_unscaled = lambda v, d: get_local_data_path(['processed', v], 'ceres_gene_unscaled_'+d+'.csv')
file_guides_per_gene = lambda v, d: get_local_data_path(['processed', v], 'guides_per_gene_'+d+'.csv')

# Inputs from DepMap
# 20Q2 positive controls: intersection of Hart (2015) and Blomen (2014)
file_ref_essentials = get_data_path(['depmap', '20Q2'], 'common_essentials.csv')
file_ref_nonessentials = get_data_path(['depmap', '20Q2'], 'nonessentials.csv')
file_depmap_scores = lambda v: get_data_path(['depmap', v], ('Achilles_' if v=='20Q2' else '')+'gene_effect_unscaled.csv')

file_id_map = get_local_data_path(['processed'], 'HGNC_gene_id_map.csv')

# OUTPUT
file_gene_scores = lambda v, d: get_local_data_path(['processed', v], 'gene_scores_'+d+'.csv')
file_table_s2 = get_local_data_path(['supplemental_files'], 'Table_S2.csv')

### Filter & normalize CERES output

In [2]:
def load_ceres_scores(fname):
    scores_raw = pd.read_csv(fname, index_col=0)
    scores = scores_raw.T
    scores = scores.dropna(axis=1, how='all') # Drop columns (genes) where all values are NaN
    scores.index = scores.index.str.replace('.','-', regex=False) # The unify with CCLE file identifiers
    print('Num genes:', scores.shape[1])
    print('Num cell lines:', scores.shape[0])
    return scores

In [3]:
scores = load_ceres_scores(file_ceres_unscaled('depmap20Q2','16_04_21'))
scores[:2]

Num genes: 17059
Num cell lines: 769


Unnamed: 0,SHOC2,NDUFA12,SDAD1,FAM98A,ZNF253,HIST1H2BF,SYNE2,BATF2,MYSM1,EIF2B1,...,OR2L3,LCE1A,GOLGA6B,NUTM2B,ARL1,IFNA5,SRSF10,STEAP1B,MTRNR2L4,UQCRH
ACH-001382,0.199794,-0.357807,-0.782414,0.213824,-0.229356,0.467195,0.474444,-0.649686,-0.052942,-0.249298,...,0.696107,0.244203,-0.787002,0.533137,0.734482,0.566447,-0.923637,0.263324,1.273179,-1.514391
ACH-000250,-0.537444,0.418329,-1.733716,0.739527,-0.301952,0.325419,0.546188,-0.324785,0.661721,-3.411731,...,1.248,-0.356356,-0.473046,1.274702,-0.115109,0.603122,-2.195696,0.537134,1.700918,-3.864593


In [4]:
sanger_scores = load_ceres_scores(file_ceres_unscaled('sanger','16_04_21'))
sanger_scores[:2]

Num genes: 16667
Num cell lines: 318


Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZUP1,ZW10,ZYG11B,ZYX,ZZEF1,ZZZ3,ZXDC,ZYG11A,ZWILCH,ZWINT
ACH-000219,0.384011,0.704012,1.219393,0.936236,0.577844,0.522497,0.978978,0.734357,0.888314,1.041226,...,0.898198,0.480057,0.967753,0.49803,0.758928,0.136288,0.947427,0.893774,-0.137148,-1.456578
ACH-000997,0.866759,0.520705,1.218575,1.047627,0.774713,0.392238,0.737939,1.059298,0.79234,0.972492,...,1.049635,0.617007,0.85544,0.87184,0.666326,0.63019,0.803924,0.910231,-0.078818,-1.73749


#### 1. Drop genes targetted by too few guides (< 3)

In [5]:
def drop_genes_w_too_few_guides(scores, fname):
    # Drop genes that were targetted by too few guides (less than 3)
    # Some genes that I expect to be included might not be there if there was no copy number data for them.
    guides_per_gene = pd.read_csv(fname, index_col=0)
    guides_per_gene = guides_per_gene.rename(columns={'ccds_symbol':'symbol'})
    print('Genes in guide-gene map:', guides_per_gene.symbol.nunique())
    display(guides_per_gene[:1])

    print('Genes in CERES output that are not in my guide-per-gene map:')
    print(scores.loc[:, ~scores.columns.isin(guides_per_gene.symbol)].columns.values)
    # Check if any of the missing ones are protein-coding
    id_map = pd.read_csv(file_id_map).dropna(subset=['entrez_id']).astype({'entrez_id':'int'})
    print('Protein-coding:', id_map[id_map.symbol.isin(scores.loc[:,~scores.columns.isin(guides_per_gene.symbol)].columns) &
                                     id_map.locus_type=='gene with protein product'].symbol.values)

    # Check if there are genes that were not mapped by CERES
    print('Genes in my guide-per-gene map that are not in CERES output:',
          guides_per_gene[~guides_per_gene.symbol.isin(scores.columns)].symbol.values)

    # Filter guides per gene down to genes that are in CERES output (additional genes dropped are due to NaN values)
    guides_per_gene = guides_per_gene[guides_per_gene.symbol.isin(scores.columns)].reset_index(drop=True)
    
    # Genes without any guides were already filtered out
    print('Genes w/ only 2 guides:', sum(guides_per_gene.guides_per_gene == 2))
    print('Genes w/ only 1 guide:', sum(guides_per_gene.guides_per_gene == 1))
    scores_filtered = scores.loc[:, scores.columns.isin(guides_per_gene[guides_per_gene.guides_per_gene >= 3].symbol)]
    print('Num genes after filtering for too few guides:', scores_filtered.shape[1],'/', guides_per_gene.shape[0])
    
    return scores_filtered, guides_per_gene

In [6]:
scores_1, guide_gene_map = drop_genes_w_too_few_guides(scores, file_guides_per_gene('depmap20Q2','16_04_21'))

Genes in guide-gene map: 17048


Unnamed: 0,symbol,entrez_id,guides_per_gene
0,A1BG,1,4


Genes in CERES output that are not in my guide-per-gene map:
['ZNF286B' 'LOC102723996' 'CCDC144NL' 'TCP10' 'AKAP2' 'UGT2A2' 'PALM2'
 'TCP10L2' 'PCDHA13' 'GDF5OS' 'FAM86C1' 'C9orf66']
Protein-coding: []
Genes in my guide-per-gene map that are not in CERES output: ['ICOSLG']
Genes w/ only 2 guides: 363
Genes w/ only 1 guide: 243
Num genes after filtering for too few guides: 16441 / 17047


In [7]:
sanger_scores_1, sanger_guide_gene_map = drop_genes_w_too_few_guides(sanger_scores, file_guides_per_gene('sanger','16_04_21'))

Genes in guide-gene map: 16659


Unnamed: 0,symbol,entrez_id,guides_per_gene
0,A1BG,1,5


Genes in CERES output that are not in my guide-per-gene map:
['AKAP2' 'CCDC144NL' 'GDF5OS' 'PALM2' 'PCDHB4' 'TCP10L2' 'UGT2A2'
 'ZNF286B']
Protein-coding: []
Genes in my guide-per-gene map that are not in CERES output: []
Genes w/ only 2 guides: 202
Genes w/ only 1 guide: 189
Num genes after filtering for too few guides: 16268 / 16659


#### 2. Drop genes not in original DepMap gene scores

In [8]:
# Load DepMap scores
depmap_scores_raw = pd.read_csv(file_depmap_scores('20Q2'), index_col=0)
get_gene_symbol = lambda x: re.search('([\w-]+)\s\(\w+\)', x).group(1)
depmap_scores = depmap_scores_raw.rename(columns=get_gene_symbol)

In [9]:
# Load DepMap Sanger scores
depmap_sanger_scores_raw = pd.read_csv(file_depmap_scores('sanger'), index_col=0)
depmap_sanger_scores = depmap_sanger_scores_raw.rename(columns=get_gene_symbol)

In [10]:
def filter_non_depmap_genes(scores, depmap_scores):
    scores_filtered = scores.loc[:, scores.columns.isin(depmap_scores.columns)]
    print('Dropped genes:', scores.loc[:, ~scores.columns.isin(depmap_scores.columns)].columns.values)
    print('After dropping genes not in DepMap:', scores_filtered.shape[1], '/', scores.shape[1])
    return scores_filtered

In [11]:
# Drop genes that weren't in original DepMap gene score file (likely removed for a QC reason)
scores_2 = filter_non_depmap_genes(scores_1, depmap_scores)

Dropped genes: ['TRAPPC2B' 'ANKRD20A1' 'PALM2-AKAP2']
After dropping genes not in DepMap: 16438 / 16441


In [12]:
# Drop genes that weren't in original DepMap gene score file (likely removed for a QC reason)
sanger_scores_2 = filter_non_depmap_genes(sanger_scores_1, depmap_sanger_scores)

Dropped genes: ['PALM2-AKAP2']
After dropping genes not in DepMap: 16267 / 16268


#### 3. Normalize CERES gene scores to reference essential/non-essential genes from DepMap

In [13]:
# Get the reference essential and non-essential set of genes to scale scores (downloaded from DepMap)
get_gene_name = lambda x: re.search('([\w-]+)\s\(\w+\)', x).group(1)
get_gene_id = lambda x: re.search('[\w-]+\s\((\w+)\)', x).group(1)

ref_essential = pd.read_csv(file_ref_essentials)
ref_essential = ref_essential.assign(symbol = ref_essential.gene.apply(get_gene_name),
                                     entrez_id = ref_essential.gene.apply(get_gene_id))

ref_non_essential = pd.read_csv(file_ref_nonessentials)
ref_non_essential = ref_non_essential.assign(symbol = ref_non_essential.gene.apply(get_gene_name),
                                             entrez_id = ref_non_essential.gene.apply(get_gene_id))
ref_essential[:1]

Unnamed: 0,gene,symbol,entrez_id
0,AAMP (14),AAMP,14


In [15]:
# Normalize CERES gene scores, per cell line, according to reference essential and non-essential genes
# scale_to_essentials function is equivalent to scale_to_essentials in CERES package

def scale_to_essentials(cell_line):
    # (gene_score - median(non essentials) / median(non essentials) - median(essentials))
    return ((cell_line - cell_line[cell_line.index.isin(ref_non_essential.symbol)].median()) / 
             (cell_line[cell_line.index.isin(ref_non_essential.symbol)].median() -
              cell_line[cell_line.index.isin(ref_essential.symbol)].median()))

def scale_scores_to_ref_essential_genes(scores):
    print('N ref essentials in my data:', ref_essential[ref_essential.symbol.isin(scores.columns)].shape[0])
    print('N ref non-essentials in my data:', ref_non_essential[ref_non_essential.symbol.isin(scores.columns)].shape[0])
    # Normalize per cell line
    scores_scaled = scores.apply(lambda line: scale_to_essentials(line), axis=1)
    # Verify normalization
    assert(scores_scaled.loc[:,scores_scaled.columns.isin(ref_essential.symbol)].median(axis=1).median() == -1)
    assert(scores_scaled.loc[:,scores_scaled.columns.isin(ref_non_essential.symbol)].median(axis=1).median() == 0)
    # Order index and columns
    scores_scaled = scores_scaled.reindex(sorted(scores_scaled.columns), axis=1).sort_index()
    assert(scores_scaled.shape[1]==scores.shape[1])
    return scores_scaled

In [16]:
scores_scaled = scale_scores_to_ref_essential_genes(scores_2)
scores_scaled[:1]

N ref essentials in my data: 1196
N ref non-essentials in my data: 651


Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
ACH-000004,0.152968,0.037153,-0.244197,-0.025637,-0.019579,-0.20799,0.309648,-0.443816,0.225697,0.144659,...,-0.239963,-0.198247,-0.132045,-0.460931,0.154466,0.169978,-0.477521,0.266874,0.10614,-0.216784


In [17]:
sanger_scores_scaled = scale_scores_to_ref_essential_genes(sanger_scores_2)
sanger_scores_scaled[:1]

N ref essentials in my data: 1181
N ref non-essentials in my data: 585


Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
ACH-000001,-0.402295,-0.229914,0.08239,-0.145481,-0.263255,0.079608,-0.27345,-0.189875,0.000872,0.111094,...,-0.008028,-0.262682,-0.200832,-0.315102,0.141383,-0.269534,-0.022532,0.134045,0.132889,-0.536921


#### 4. Create df with Entrez gene IDs

In [21]:
# Also round to 4 significant digits to reduce the file size
def set_cols_to_entrez_ids(scores, guide_gene_map):
    final_scores = pd.merge(guide_gene_map[['symbol', 'entrez_id']], 
                            scores.T.reset_index().rename(columns={'index':'symbol'}))
    final_scores = final_scores.drop(columns=['symbol']).astype({'entrez_id':'str'}).set_index('entrez_id').T
    print('Final num genes:', final_scores.shape[1], ', cell lines:', final_scores.shape[0])
    final_scores = final_scores.round(decimals=4)
    return final_scores

In [25]:
final_scores = set_cols_to_entrez_ids(scores_scaled, guide_gene_map)
print('N. genes dropped:',  depmap_scores.loc[:, ~depmap_scores.columns.isin(scores_scaled.columns)].shape[1], 
      '/', depmap_scores.shape[1])
final_scores[:1]

Final num genes: 16438 , cell lines: 769
N. genes dropped: 1681 / 18119


entrez_id,1,29974,2,144568,127550,53947,51146,8086,65985,13,...,221302,9183,55055,11130,79364,440590,79699,7791,23140,26009
ACH-000004,0.153,0.0372,-0.2442,-0.0256,-0.0196,-0.208,0.3096,-0.4438,0.2257,0.1447,...,-0.24,-0.1982,-0.132,-0.4609,0.1545,0.17,-0.4775,0.2669,0.1061,-0.2168


In [28]:
# Export
final_scores.to_csv(file_gene_scores('depmap20Q2', '16_04_21'))

In [30]:
# Also export scores as table S25
final_scores.to_csv(file_table_s2)

In [26]:
final_sanger_scores = set_cols_to_entrez_ids(sanger_scores_scaled, sanger_guide_gene_map)
print('N. genes dropped:',  
      depmap_sanger_scores.loc[:, ~depmap_sanger_scores.columns.isin(sanger_scores_scaled.columns)].shape[1], 
      '/', depmap_sanger_scores.shape[1])
final_sanger_scores[:1]

Final num genes: 16267 , cell lines: 318
N. genes dropped: 1532 / 17799


entrez_id,1,29974,2,144568,127550,53947,51146,8086,65985,13,...,221302,9183,55055,11130,79364,440590,79699,7791,23140,26009
ACH-000001,-0.4023,-0.2299,0.0824,-0.1455,-0.2633,0.0796,-0.2734,-0.1899,0.0009,0.1111,...,-0.008,-0.2627,-0.2008,-0.3151,0.1414,-0.2695,-0.0225,0.134,0.1329,-0.5369


In [29]:
# Export
final_sanger_scores.to_csv(file_gene_scores('sanger', '16_04_21'))

### Precision-recall analysis with the DepMap reference essentials

In [31]:
def compute_AUC_for_cell_line(scores, true_values):
    precision, recall, _ = precision_recall_curve(true_values, -scores)
    return auc(recall, precision)

def compute_AP_for_cell_line(scores, true_values):
    return average_precision_score(true_values, -scores)

def compute_metrics(scores, essentials, nonessentials):
    scores = scores.T.reset_index().rename(columns={'index':'symbol'})
    # Reduce scores down to essential and non-essential genes
    scores = scores[scores.symbol.isin(essentials.symbol) | scores.symbol.isin(nonessentials.symbol)]
    true_values = scores.symbol.apply(lambda x: 1 if x in essentials.symbol.values else 0).values
    scores = scores.set_index('symbol')
    aucs = scores.apply(lambda x: compute_AUC_for_cell_line(x, true_values))
    aps = scores.apply(lambda x: compute_AP_for_cell_line(x, true_values))
    return aucs, aps

def compute_metrics_for_orig_and_reprocessed(orig_scores, reprocessed_scores):
    # Reduce scores down to overlapping genes and cell lines
    orig_overlap = orig_scores.loc[orig_scores.index.isin(reprocessed_scores.index), 
                                   orig_scores.columns.isin(reprocessed_scores.columns)]
    # Drop cell lines that have NA scores in DepMap and then scale to essentials
    orig_overlap = orig_overlap.dropna(axis=0)
    orig_overlap = orig_overlap.apply(lambda line: scale_to_essentials(line), axis=1)
    
    reprocessed_overlap = reprocessed_scores.loc[reprocessed_scores.index.isin(orig_overlap.index),
                                                 reprocessed_scores.columns.isin(orig_overlap.columns)]
    reprocessed_overlap = reprocessed_overlap.apply(lambda line: scale_to_essentials(line), axis=1)
    print(reprocessed_overlap.shape, '==', orig_overlap.shape)
    
    orig_auc, orig_ap = compute_metrics(orig_overlap, ref_essential, ref_non_essential)
    print('Mean AUC for original gene scores: %.6f, AP: %.6f' % (orig_auc.mean(), orig_auc.mean()))
    
    reprocessed_auc, reprocessed_ap = compute_metrics(reprocessed_overlap, ref_essential, ref_non_essential)
    print('Mean AUC for reprocessed gene scores: mean: %.6f, AP: %.6f' % (reprocessed_auc.mean(), reprocessed_ap.mean()))

In [32]:
compute_metrics_for_orig_and_reprocessed(depmap_scores, scores_scaled)

(757, 16438) == (757, 16438)
Mean AUC for original gene scores: 0.988319, AP: 0.988319
Mean AUC for reprocessed gene scores: mean: 0.988115, AP: 0.988119


In [33]:
compute_metrics_for_orig_and_reprocessed(depmap_sanger_scores, sanger_scores_scaled)

(318, 16267) == (318, 16267)
Mean AUC for original gene scores: 0.984366, AP: 0.984366
Mean AUC for reprocessed gene scores: mean: 0.984972, AP: 0.984978
