## Pre-process DepMap files used in analysis

Note: CERES requires the replicate map and copy number files to be .tsv instead of .csv

Run this notebook first, then run `align_guides.R`, which uses the sgRNA sequences file written here.

In [3]:
import os
import pandas as pd
from pathlib import Path

def get_data_path(folders, fname):
    """Build a normalised path to a file under the third-party data directory.

    The root is read from the ``3RD_PARTY_DIR`` environment variable each time
    the function is called.

    Parameters
    ----------
    folders : iterable of str
        Sub-folder names below the root, outermost first.
    fname : str
        File name inside the last folder.

    Returns
    -------
    str
        ``<3RD_PARTY_DIR>/<folders...>/<fname>`` passed through ``os.path.normpath``.

    Raises
    ------
    KeyError
        If ``3RD_PARTY_DIR`` is not set in the environment.
    """
    return os.path.normpath('/'.join([os.environ['3RD_PARTY_DIR'], '/'.join(folders), fname]))


def get_local_data_path(folders, fname):
    """Build a normalised path to a file under ``../local_data``.

    The path is relative to the current working directory.

    Parameters
    ----------
    folders : iterable of str
        Sub-folder names below ``../local_data``, outermost first.
    fname : str
        File name inside the last folder.

    Returns
    -------
    str
        ``../local_data/<folders...>/<fname>`` passed through ``os.path.normpath``.
    """
    return os.path.normpath('/'.join(['../local_data', '/'.join(folders), fname]))

# Input: DepMap 20Q2 files, read from the third-party data directory
file_logfold_changes = get_data_path(folders=['depmap', '20Q2'], fname='Achilles_logfold_change.csv')
file_replicate_map = get_data_path(folders=['depmap', '20Q2'], fname='Achilles_replicate_map.csv')
file_copy_number = get_data_path(folders=['depmap', '20Q2'], fname='CCLE_segment_cn.csv')

# Output: the sgRNA sequences go to the local data directory; the .tsv files
# are written into the same third-party folder as the .csv files they come from
file_sgrna_sequences = get_local_data_path(folders=['processed','depmap20Q2'], fname='sgrna_sequences.csv')
file_replicate_map_out = get_data_path(folders=['depmap', '20Q2'], fname='Achilles_replicate_map.tsv')
file_copy_number_out = get_data_path(folders=['depmap', '20Q2'], fname='CCLE_segment_cn.tsv')

In [4]:
# The R align_guides script needs the sgRNA sequences; only the first column
# of the logfold_changes file is read here.
sequences_raw = pd.read_csv(file_logfold_changes, usecols=[0])
display(sequences_raw.head(2))

Unnamed: 0,Construct Barcode
0,AAAAAAATCCAGCAATGCAG
1,AAAAAACCCGTAGATAGCCT


In [5]:
# Rename the sequence column to `sgrna` and write it out for the R script.
sequences = sequences_raw.rename(columns={'Construct Barcode':'sgrna'})
print('N sgrnas:', sequences.shape[0])

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a no-op when it is already there).
Path(file_sgrna_sequences).parent.mkdir(parents=True, exist_ok=True)
sequences.to_csv(file_sgrna_sequences, index=False)

N sgrnas: 74687


In [14]:
# The output columns must be named exactly `Replicate`, `CellLine`, `pDNA_batch`.
# Keep only replicates that pass QC, rename the ID columns, and drop the QC flag.
rep_map = (
    pd.read_csv(file_replicate_map)
    .loc[lambda df: df['passes_QC'].eq(True)]
    .reset_index(drop=True)
    .rename(columns={'replicate_ID': 'Replicate', 'DepMap_ID': 'CellLine'})
    .drop(columns=['passes_QC'])
)

# Tab-separated, without the row index
rep_map.to_csv(file_replicate_map_out, sep='\t', index=False)
display(rep_map.head(2))

Unnamed: 0,Replicate,CellLine,pDNA_batch
0,PACADD188-311cas9_RepB_p6_batch3,ACH-001382,3
1,KMRC20-311Cas9_RepA_p6_batch3,ACH-000250,3


In [12]:
# Load the segment-level copy number table and rename `DepMap_ID` to `CellLine`.
copynum = (
    pd.read_csv(file_copy_number)
    .rename(columns={'DepMap_ID': 'CellLine'})
)

# Tab-separated, without the row index
copynum.to_csv(file_copy_number_out, sep='\t', index=False)
display(copynum.head(2))

Unnamed: 0,CellLine,Chromosome,Start,End,Num_Probes,Segment_Mean,Source
0,ACH-000001,1,1,1969745,286,2.546065,Sanger WES
1,ACH-000001,1,1969746,6354345,365,2.175759,Sanger WES
