In [159]:
import pandas as pd
import re
from collections import defaultdict
import csv

In [160]:
# Load the cell-line annotation sheet (Table S1E) into a DataFrame.
# skiprows drops sheet rows 0, 1, 3 and 1005 before parsing -- presumably
# banner/sub-header rows and a trailing summary row; TODO confirm against
# the workbook.
cell_line_details = pd.read_excel(
    'genotype_input/TableS1E.xlsx',
    skiprows=(0, 1, 3, 1005)
)

In [161]:
# Two lookups keyed by the integer COSMIC identifier of each cell line:
#   tissues -> value of the 'GDSC Tissue descriptor 2' column
#   names   -> value of the 'Sample Name' column, coerced to str
tissues = {}
names = {}
for row_label, row in cell_line_details.iterrows():
    key = int(row['COSMIC identifier'])
    # Read both fields before storing either, so a failure leaves both
    # dictionaries untouched for this row.
    descriptor, label = row['GDSC\nTissue\ndescriptor 2'], str(row['Sample Name'])
    tissues[key], names[key] = descriptor, label

In [162]:
# Matches any single character that is not an ASCII letter or digit.
regex = re.compile('[^a-zA-Z0-9]')


def convert_name(name):
    """Return `name` with every non-alphanumeric character removed, upper-cased.

    Example: 'NTERA-2 cl.D1' -> 'NTERA2CLD1'.
    """
    alphanumeric_only = regex.sub('', name)
    return alphanumeric_only.upper()

Read and process the mapping from GDSC to CCLE data

In [163]:
# Load the GDSC -> CCLE conversion sheet (Table S4E); the first eight sheet
# rows (0-7) are skipped before parsing.
conversion_file = pd.read_excel(
    'genotype_input/TableS4E.xlsx',
    skiprows=range(8)
)

In [164]:
# COSMIC id (int) -> CCLE name, one entry per row of the conversion sheet.
# If an id appears on several rows, the last row wins.
mapping = {
    int(row['GDSC1000 cosmic id']): row['CCLE name']
    for row_label, row in conversion_file.iterrows()
}

Identify candidates to manually rename

In [165]:
# List every mapped cell line whose CCLE name prefix (the text before the
# first underscore) differs from the normalised GDSC sample name. These are
# the candidates for manual renaming.
for cosmic_id, ccle_name in mapping.items():
    # Normalise once via the shared helper instead of repeating the regex
    # substitution inline for both the comparison and the report.
    normalised = convert_name(names[cosmic_id])
    if ccle_name.split('_', 1)[0] != normalised:
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        print("%s %s" % (ccle_name, normalised))

PL45_PANCREAS PL4
TE617T_SOFT_TISSUE TE6
NCIH322_LUNG NCIH322M
SW1353_BONE SW13
LU99_LUNG LU99A
KNS81_CENTRAL_NERVOUS_SYSTEM KNS81FD
WM793_SKIN WM793B
NIHOVCAR3_OVARY OVCAR3
786O_KIDNEY 7860
SKNBE2_AUTONOMIC_GANGLIA SKN
CMK86_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE CMK
COLO320_LARGE_INTESTINE COLO320HSR
HEYA8_OVARY HEY
KMRC2_KIDNEY KMRC20
HEL9217_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE HEL
LC1SQSF_LUNG LC1SQ


Identify the mapping between GDSC and CCLE tissue types

In [166]:
# GDSC tissue descriptor -> set of CCLE tissue suffixes observed for it.
# The CCLE tissue is everything after the first underscore of the CCLE name.
tissue_type_map = defaultdict(set)
for cosmic_id, ccle_name in mapping.items():
    gdsc_descriptor = tissues[cosmic_id]
    ccle_suffix = ccle_name.split('_', 1)[1]
    tissue_type_map[gdsc_descriptor].add(ccle_suffix)

In [167]:
# Show the GDSC tissue descriptors that were matched to more than one CCLE
# tissue; these need a manual decision.
for gdsc_descriptor, ccle_suffixes in tissue_type_map.items():
    if len(ccle_suffixes) > 1:
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        print("%s %s" % (gdsc_descriptor, ccle_suffixes))

kidney set([u'SOFT_TISSUE', u'KIDNEY'])
oesophagus set([u'OESOPHAGUS', u'SOFT_TISSUE'])
rhabdomyosarcoma set([u'SOFT_TISSUE', u'BONE'])
head and neck set([u'UPPER_AERODIGESTIVE_TRACT', u'SALIVARY_GLAND'])
uterus set([u'AUTONOMIC_GANGLIA', u'SOFT_TISSUE'])


In [168]:
# Manual resolution of the ambiguous GDSC tissue descriptors: each entry is
# replaced by a one-element list holding the chosen label.
# NOTE(review): 'HEADNECK' is a custom label rather than one of the CCLE
# suffixes collected for 'head and neck' -- presumably intentional; confirm.
for gdsc_label, chosen_label in (
    ('rhabdomyosarcoma', u'SOFT_TISSUE'),
    ('kidney', u'KIDNEY'),
    ('oesophagus', u'OESOPHAGUS'),
    ('head and neck', u'HEADNECK'),
    ('uterus', u'SOFT_TISSUE'),
):
    tissue_type_map[gdsc_label] = [chosen_label]

In [169]:
# Further hand-assigned labels, each stored as a one-element list.
for gdsc_label, chosen_label in (
    ('Lung_other', u'LUNG'),
    ('skin_other', u'SKIN'),
    ('hairy_cell_leukaemia', u'HAEMATOPOIETIC_AND_LYMPHOID_TISSUE'),
    ('leukemia', u'HAEMATOPOIETIC_AND_LYMPHOID_TISSUE'),
    ('cervix', u'CERVIX'),
):
    tissue_type_map[gdsc_label] = [chosen_label]


In [170]:
# Group sample names by resolved tissue label. Cell lines whose GDSC tissue
# descriptor has no entry in tissue_type_map go into 'OTHER'.
groups = defaultdict(set)
for cosmic_id, sample_name in names.items():
    gdsc_descriptor = tissues[cosmic_id]
    if gdsc_descriptor in tissue_type_map:
        # Entries are sets or one-element lists; take the first element.
        group_label = list(tissue_type_map[gdsc_descriptor])[0]
    else:
        group_label = 'OTHER'
    groups[group_label].add(sample_name)

# Report the number of distinct sample names per group.
for group_label, members in groups.items():
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("%s %s" % (group_label, len(members)))

THYROID 16
SOFT_TISSUE 23
HAEMATOPOIETIC_AND_LYMPHOID_TISSUE 173
BILIARY_TRACT 5
PANCREAS 32
CERVIX 15
CENTRAL_NERVOUS_SYSTEM 57
BONE 41
LARGE_INTESTINE 51
AUTONOMIC_GANGLIA 32
PLEURA 21
URINARY_TRACT 19
OTHER 8
LUNG 178
BREAST 52
SKIN 58
OVARY 43
PROSTATE 8
KIDNEY 33
STOMACH 29
ENDOMETRIUM 11
HEADNECK 44
OESOPHAGUS 35
LIVER 17


In [171]:
# (sample name, GDSC tissue descriptor) for every cell line whose descriptor
# has no entry in tissue_type_map.
[
    (names[cid], descriptor)
    for cid, descriptor in tissues.items()
    if descriptor not in tissue_type_map
]

[('SW962', u'urogenital_system_other'),
 ('SW954', u'urogenital_system_other'),
 ('NEC8', u'testis'),
 ('NCCIT', u'testis'),
 ('NTERA-2 cl.D1', u'testis'),
 ('JAR', u'urogenital_system_other'),
 ('JEG-3', u'urogenital_system_other'),
 ('HUTU-80', u'digestive_system_other')]

In [174]:
converted = set()        # every generated "<NAME>_<TISSUE>" string
converted_names = {}     # COSMIC id -> generated "<NAME>_<TISSUE>" string

# Report mapped cell lines whose normalised GDSC sample name differs from the
# CCLE name prefix (the text before the first underscore).
for cosmic_id, ccle_name in mapping.items():
    normalised = convert_name(names[cosmic_id])
    if normalised != ccle_name.split('_')[0]:
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        print("%s %s %s" % (cosmic_id, normalised, ccle_name))

# Build "<normalised sample name>_<tissue label>" for every cell line; the
# tissue label falls back to "OTHER" when the GDSC descriptor is unmapped.
for cosmic_id, sample_name in names.items():
    gdsc_descriptor = tissues[cosmic_id]
    if gdsc_descriptor in tissue_type_map:
        # Entries are sets or one-element lists; take the first element.
        tissue_label = list(tissue_type_map[gdsc_descriptor])[0]
    else:
        tissue_label = "OTHER"
    # Format once and reuse for both containers.
    converted_name = "%s_%s" % (convert_name(sample_name), tissue_label)
    converted.add(converted_name)
    converted_names[cosmic_id] = converted_name

1298533 PL4 PL45_PANCREAS
946355 TE6 TE617T_SOFT_TISSUE
905967 NCIH322M NCIH322_LUNG
909744 SW13 SW1353_BONE
907796 LU99A LU99_LUNG
924188 KNS81FD KNS81_CENTRAL_NERVOUS_SYSTEM
1299081 WM793B WM793_SKIN
905933 OVCAR3 NIHOVCAR3_OVARY
905947 7860 786O_KIDNEY
1240215 SKN SKNBE2_AUTONOMIC_GANGLIA
910566 CMK CMK86_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
910569 COLO320HSR COLO320_LARGE_INTESTINE
1479988 HEY HEYA8_OVARY
1298169 KMRC20 KMRC2_KIDNEY
907053 HEL HEL9217_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
1298223 LC1SQ LC1SQSF_LUNG


In [175]:
# Write one "<COSMIC id><TAB><converted name>" line per cell line.
with open("COSMIC_ID_TO_CANCERGD.txt", "w") as out_file:
    for cosmic_id, converted_name in converted_names.items():
        line = "%s\t%s\n" % (cosmic_id, converted_name)
        out_file.write(line)