## Pre-process DepMap files used in analysis

Note: CERES requires the replicate map and copy number files to be .tsv instead of .csv

Run this script before: `align_guides.R`

Figshare for DepMap 19Q1: https://figshare.com/articles/DepMap_Achilles_19Q1_Public/7655150

In [2]:
import os
import pandas as pd

get_data_path = lambda folders, fname: os.path.normpath(os.environ['3RD_PARTY_DIR']+'/'+'/'.join(folders) +'/'+ fname)
get_local_data_path = lambda folders, fname: os.path.normpath('../data/' +'/'.join(folders) +'/'+ fname)

# Input
file_logfold_changes = get_data_path(['depmap', '19Q1'], 'logfold_change.csv')
file_replicate_map = get_data_path(['depmap', '19Q1'], 'replicate_map.csv')
file_copy_number = get_data_path(['depmap', '19Q1'], 'copy_number.csv')

# Output
file_sgrna_sequences = get_local_data_path(['processed','depmap19Q1'], 'sgrna_sequences.csv')
file_replicate_map_out = get_data_path(['depmap', '19Q1'], 'replicate_map.tsv')
file_copy_number_out = get_data_path(['depmap', '19Q1'], 'copy_number.tsv')

In [3]:
# Extract sgrna sequences from logfold_changes file, for R align_guides script
sequences_raw = pd.read_csv(file_logfold_changes, usecols=[0])
display(sequences_raw[:2])

Unnamed: 0.1,Unnamed: 0
0,AAAAAAATCCAGCAATGCAG
1,AAAAAACCCGTAGATAGCCT


In [5]:
sequences = sequences_raw.rename(columns={'Unnamed: 0':'sgrna'})
print('N sgrnas:', sequences.shape[0])
display(sequences[:1])
sequences.to_csv(file_sgrna_sequences, index=0)

N sgrnas: 73839


Unnamed: 0,sgrna
0,AAAAAAATCCAGCAATGCAG


In [10]:
# Column names should be strictly: `Replicate`, `CellLine`, pDNA_batch
rep_map = pd.read_csv(file_replicate_map)
rep_map = rep_map.rename(columns={'replicate_ID':'Replicate', 'DepMap_ID':'CellLine'})
rep_map.to_csv(file_replicate_map_out, sep='\t', index=0)
display(rep_map[:2])

Unnamed: 0,Replicate,Broad_ID,pDNA_batch
0,MIA Paca-2-311Cas9 Rep A p6_batch0,ACH-000601,0
1,MIA Paca-2-311Cas9 Rep B p6_batch0,ACH-000601,0


In [11]:
copynum = pd.read_csv(file_copy_number)
copynum = copynum.rename(columns={'DepMap_ID':'CellLine'})
copynum.to_csv(file_copy_number_out, sep='\t', index=0)
display(copynum[:2])

Unnamed: 0,Broad_ID,Chromosome,Start,End,Num_Probes,Segment_Mean
0,ACH-000004,1,12412,356492,28,0.743498
1,ACH-000004,1,356493,3222548,736,1.461753
