## Filter DepMap Expression Data

Filter the DepMap gene expression file down to the cell lines that were also screened (i.e. that have gene scores), and keep only the genes that are listed as paralogs.

In [1]:
import pandas as pd
import os
import re

def get_data_path(folders, fname):
    """Build a normalised path to a file under the third-party data directory.

    The root directory is read from the ``3RD_PARTY_DIR`` environment variable
    at call time (a ``KeyError`` is raised if it is not set).

    Parameters
    ----------
    folders : iterable of str
        Sub-folder names, outermost first, joined with ``/`` below the root.
    fname : str
        File name appended after the folders.

    Returns
    -------
    str
        The path after ``os.path.normpath``.
    """
    root = os.environ['3RD_PARTY_DIR']
    return os.path.normpath('/'.join([root, '/'.join(folders), fname]))


def get_local_data_path(folders, fname):
    """Build a normalised path to a file under the relative ``../data`` directory.

    Parameters
    ----------
    folders : iterable of str
        Sub-folder names, outermost first, joined with ``/`` below ``../data``.
    fname : str
        File name appended after the folders.

    Returns
    -------
    str
        The path after ``os.path.normpath``.
    """
    return os.path.normpath('/'.join(['../data', '/'.join(folders), fname]))

# Input files: gene scores and paralog summary live under ../data,
# the expression matrix lives under the third-party data directory.
file_gene_scores = get_local_data_path(folders=['processed', 'depmap19Q1'], fname='gene_scores_11_07_19.csv')
file_gene_expr = get_data_path(folders=['depmap', '19Q1'], fname='CCLE_depMap_19Q1_TPM.csv')
file_paralogs = get_local_data_path(folders=['processed', 'ensembl93'], fname='paralog_summary.csv')

# Output file: the filtered expression matrix written by the last cell.
file_filtered_expr = get_local_data_path(folders=['processed', 'depmap19Q1'], fname='filtered_gene_expression.csv')

In [2]:
# Load the raw expression matrix; the first CSV column becomes the row index.
all_expr_raw = pd.read_csv(filepath_or_buffer=file_gene_expr, index_col=0)

In [3]:
# Report the dimensions of the raw matrix, then show its first row.
print(all_expr_raw.shape)
all_expr_raw.head(1)

(1165, 57820)


Unnamed: 0,TSPAN6 (ENSG00000000003),TNMD (ENSG00000000005),DPM1 (ENSG00000000419),SCYL3 (ENSG00000000457),C1orf112 (ENSG00000000460),FGR (ENSG00000000938),CFH (ENSG00000000971),FUCA2 (ENSG00000001036),GCLC (ENSG00000001084),NFYA (ENSG00000001167),...,RP11-309M23.1 (ENSGR0000237531),AMDP1 (ENSGR0000237801),BX649553.1 (ENSGR0000263835),BX649553.2 (ENSGR0000263980),BX649553.3 (ENSGR0000264510),BX649553.4 (ENSGR0000264819),RN7SL355P (ENSGR0000265350),MIR3690 (ENSGR0000265658),AL732314.1 (ENSGR0000266731),AJ271736.10 (ENSGR0000270726)
ACH-000956,2.650765,0.0,6.216843,3.427606,4.672991,0.014355,0.111031,5.803744,6.900867,5.287251,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Extract ensembl ids for column headings
def get_gene_id(x):
    """Return the identifier inside the parentheses of a column heading.

    Headings are expected to look like ``'TSPAN6 (ENSG00000000003)'``: a name
    made of word characters/hyphens, one whitespace character, then the
    identifier in parentheses. Only the parenthesised identifier is returned.

    Parameters
    ----------
    x : str
        Column heading to parse.

    Returns
    -------
    str
        The identifier found between the parentheses.

    Raises
    ------
    ValueError
        If the heading does not contain a ``name (identifier)`` pair.
    """
    # Raw string: '\w', '\s' and '\(' are regex escapes, not Python string
    # escapes, and trigger invalid-escape warnings in a plain string literal.
    match = re.search(r'[\w-]+\s\((\w+)\)', x)
    if match is None:
        raise ValueError('Could not extract a gene id from column heading: %r' % (x,))
    return match.group(1)
# Relabel every column with the id extracted from its heading (returns a new
# frame; all_expr_raw is left unchanged), then show the first row.
all_expr = all_expr_raw.rename(columns=get_gene_id)
all_expr.head(1)

Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,...,ENSGR0000237531,ENSGR0000237801,ENSGR0000263835,ENSGR0000263980,ENSGR0000264510,ENSGR0000264819,ENSGR0000265350,ENSGR0000265658,ENSGR0000266731,ENSGR0000270726
ACH-000956,2.650765,0.0,6.216843,3.427606,4.672991,0.014355,0.111031,5.803744,6.900867,5.287251,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Load the gene scores and re-index the rows by the 'cell_line' column.
all_scores = (
    pd.read_csv(file_gene_scores, index_col=0)
    .set_index('cell_line')
)

In [6]:
# Show the first row of the gene-score table.
all_scores.head(1)

Unnamed: 0_level_0,8036,55967,55153,25940,23224,116071,114803,1967,23014,29957,...,128344,55249,84304,5869,57818,5273,84952,11046,90594,400
cell_line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACH-000601,-0.117838,-0.083374,-0.75716,-0.07252,-0.012776,-0.119112,0.053607,-1.092377,-0.234541,-0.018627,...,-0.18303,-0.096821,-0.092369,0.004617,0.010656,-0.008256,0.003173,-0.062839,-0.016976,-0.154559


In [7]:
# Load the paralog summary (first CSV column is the row index) and show its first row.
paralogs = pd.read_csv(filepath_or_buffer=file_paralogs, index_col=0)
paralogs.head(1)

Unnamed: 0,ensembl_id,symbol,entrez_id,percent_matched_in_paralog,num_paralogs,WGD
0,ENSG00000000003,TSPAN6,7105,0.583673,8,WGD


In [8]:
# Keep only the cell lines (rows) that also appear in the gene-score table and
# the genes (columns) whose id is listed in the paralog table, i.e. drop
# non-overlapping cell lines and singleton genes.
has_score = all_expr.index.isin(all_scores.index)
is_paralog = all_expr.columns.isin(paralogs.ensembl_id)
expr = all_expr.loc[has_score, is_paralog]

# Shape before and after filtering, one per line.
print(all_expr.shape, expr.shape, sep='\n')

(1165, 57820)
(554, 11857)


In [9]:
# Write the filtered expression matrix to the output path defined in the first cell.
expr.to_csv(path_or_buf=file_filtered_expr)