In [127]:
import numpy as np
import pandas as pd
from collections import Counter
from collections import defaultdict
from matplotlib import pyplot as plt
import random
%matplotlib inline

In [2]:
DAT_DIR='/cellar/users/jlz014/Data/'

In [90]:
# load columns and index for TCGA matrix
columns=[]
with open('TCGAmatrix.realign.gene.hgnc.columns.txt') as fh:
    columns=fh.read().splitlines()
index=[]
with open('TCGAmatrix.realign.gene.hgnc.index.txt') as fh:
    index=fh.read().splitlines()

In [4]:
# load mapping table for ENSG/ENST/gene name (from Biomart)
gene_map = pd.read_table('/cellar/users/jlz014/Data/ensg_enst_gene_mapping.txt',index_col=2)
gene_map.head()

Unnamed: 0_level_0,Gene stable ID,Transcript stable ID
Gene name,Unnamed: 1_level_1,Unnamed: 2_level_1
RF00100,ENSG00000276626,ENST00000612820
RNU4-59P,ENSG00000201317,ENST00000364447
SNORD114-2,ENSG00000200823,ENST00000363953
MIR1249,ENSG00000221598,ENST00000408671
RF00019,ENSG00000199595,ENST00000362725


In [5]:
# check intersection of genes in mapping table and genes in TCGA expression table
print(len(index))
print(gene_map.shape[0])
print(len(set(index)&set(gene_map.index)))

27105
224501
26548


In [47]:
# convert columns from UUID to TCGA barcodes
legacy_uuid_barcode_map = pd.read_table('/cellar/users/andreabc/GDC_barcodes/LEGACY_uuid_barcode_map.txt')

In [48]:
legacy_uuid_barcode_map.head()

Unnamed: 0,file_id,file_name,barcode,sample_barcode,disease,type,submitter_uuid,data_format
0,c5b52dfa-c074-4fc6-9e9f-eac6b15513d1,unc.edu.db848332-4ee8-4f22-b6c9-b2634bd35918.1...,TCGA-FV-A3R3,TCGA-FV-A3R3-01A,TCGA-LIHC,file,,TXT
1,a4013547-e7fb-49af-8ecc-08a318b5c576,unc.edu.52101d8f-f20a-45d8-aa74-720e5d830a5c.2...,TCGA-3A-A9I7,TCGA-3A-A9I7-01A,TCGA-PAAD,file,,TXT
2,0af879ec-4755-4536-9172-c7cc89b52393,unc.edu.f4addef3-be6d-40d6-8bb9-6ca5cf74173c.1...,TCGA-BA-5558,TCGA-BA-5558-01A,TCGA-HNSC,file,,TXT
3,ac17693f-a9ca-47bb-ba29-854703aab236,TCGA-61-2003-01A-01T-0841-07.gene.tcga_level3....,TCGA-61-2003,TCGA-61-2003-01A,TCGA-OV,file,,TXT
4,ac470569-b313-41e2-999b-26d24bfefb33,URGED_p_TCGA_Pop_May2011_HT_HG-U133A_96-HTA_D0...,TCGA-76-6191,TCGA-76-6191-01A,TCGA-GBM,file,,TXT


In [53]:
# subset to column names
matched_ids = legacy_uuid_barcode_map.loc[(legacy_uuid_barcode_map['disease']=='TCGA-BRCA')  & legacy_uuid_barcode_map['submitter_uuid'].isin(columns)] 
matched_ids.head()

Unnamed: 0,file_id,file_name,barcode,sample_barcode,disease,type,submitter_uuid,data_format
120102,ab68528c-cf90-425b-bf92-33d5e4c54345,UNCID_2189320.B55B6630-7CF5-4BCA-B76F-E1352710...,TCGA-C8-A3M7,TCGA-C8-A3M7-01A,TCGA-BRCA,file,6f032bc9-cb50-42e2-a137-98ab4dca1a9b,FASTQ
120178,e2ab2cad-67a8-47de-a775-8dd9461f1f41,UNCID_2203856.27e70f84-80c5-4f6c-9d22-83c5a6e6...,TCGA-A2-A1FZ,TCGA-A2-A1FZ-01A,TCGA-BRCA,file,6e78c8be-1ef0-49ef-919d-086f0a64cf94,FASTQ
120195,0c5866d5-41e9-4a62-a84e-5ad63879e249,UNCID_2363831.11782f87-4d58-4897-9a61-7a4605c0...,TCGA-LD-A74U,TCGA-LD-A74U-01A,TCGA-BRCA,file,15ec93ae-15a2-4ab8-b64c-fc064b418006,FASTQ
120238,2bb0e784-10fc-4978-b5f3-bfc197438175,UNCID_2210783.ad46ed65-3cc2-4956-9d90-3e1edca2...,TCGA-B6-A0RE,TCGA-B6-A0RE-01A,TCGA-BRCA,file,9da91059-89ed-406f-b11b-e67a113e87bc,FASTQ
120277,babcc02f-9aa4-468f-8829-e605afea8c7c,UNCID_2209414.0e932148-a592-42e2-b1df-4fd599db...,TCGA-E2-A10E,TCGA-E2-A10E-01A,TCGA-BRCA,file,489e5fe4-7672-47c3-8f9e-51e955998804,FASTQ


In [128]:
# generate lookup dict with submitter_uuid as key and retaining only primary tumor samples
pre_lut=defaultdict(list)
for index,row in matched_ids.iterrows():
    if row['sample_barcode'][-3:]=='01A':
        pre_lut[row['barcode']].append(row['submitter_uuid'])
# find duplicates
duplicates=[(k,v) for k,v in pre_lut.items() if len(v)>1]
barcodes_to_keep = set(pre_lut.keys())-set([x[0] for x in duplicates])

# remove duplicates
lut = {v[0]:k for k,v in pre_lut.items() if k not in [x[0] for x in duplicates]}

In [92]:
# load matrix
mtx = np.load('TCGAmatrix.realign.gene.hgnc.npy')
breast_expr = pd.DataFrame(mtx, index=index, columns=columns)

In [94]:
breast_expr.head()

Unnamed: 0,db1e45ec-729e-4a48-ae4a-85dc7765cb1d,01c9c486-321f-4ebc-ade7-bbe6ea5c4a6e,ee4b9ebd-ead7-484d-a524-bb15a123bffb,f8b38707-616e-4e58-ab4e-e99e17850527,b4d48f51-be26-493d-8ae4-ed6217c57aed,56f705c4-26cb-467b-ac76-fb2241a2c7c2,b81df2b9-68ba-4b79-b0ce-30df40f0d358,19696dea-1352-4719-814a-b579403e20c9,7f7995d5-21f0-4a37-aca7-df3044174b8d,08c5ec86-d483-4f59-b961-c8c508e80333,...,f3b57849-0302-47b0-9e3b-7cbe9de07056,1c61982a-51be-4bb7-b74c-9c274a8101cc,f26e73ea-9282-4e81-a70d-9fad49c91109,a4ba0c52-371f-4049-ab36-76b830d0249a,ba0abfb3-8209-4d9e-9e87-106ec25e580e,e3bf46e6-af7c-440b-9a78-7b8b960921d1,b7b5179c-ef0c-4315-a0f6-e3f6794c22ce,3db4a51a-ef24-410a-814f-8def2a081b3a,05256017-6ed2-4253-a836-76a4194d9ba1,d28daf21-2ef9-4ee5-881c-52721835f26e
TSPAN6,25.394545,24.211422,15.65807,35.84742,32.340122,39.623505,56.24168,21.211855,32.212696,38.636696,...,12.336794,15.574802,2.302933,43.122551,48.425709,10.53377,16.181438,42.021618,29.758554,43.309277
TNMD,0.083524,0.0,1.32271,0.0,0.0,0.487686,0.465565,0.059166,0.106606,0.036371,...,0.058302,0.45572,0.0,0.071057,0.0,0.0,0.025136,0.068346,0.014627,0.071812
DPM1,35.967682,51.821732,21.38369,51.342186,19.762459,77.882195,19.19788,36.605034,24.989246,45.750732,...,53.274059,32.631763,41.376137,45.640671,27.804981,77.150291,10.395482,80.177917,44.421211,31.199669
SCYL3,4.387471,8.87837,3.028743,3.856329,5.038463,7.260881,3.433751,5.65752,3.793709,4.615981,...,5.517496,26.336496,10.584804,8.724298,4.301095,2.334554,1.09852,4.436597,8.62135,6.078245
C1orf112,2.522929,16.315493,1.103992,6.608015,3.582534,15.91647,2.240807,7.759534,2.54024,7.125426,...,1.942347,2.259234,12.162114,2.416817,1.225858,4.466543,2.143343,6.410424,5.573026,3.624989


In [99]:
# subset matrix for TCGA-BRCA columns
breast_uuids = lut.keys()
breast_mapped_idx = [columns.index(x) for x in breast_uuids]
breast_mtx = mtx[:, breast_mapped_idx]
new_cols = [c for c in columns if c in lut.keys()]

sub_breast_expr = breast_expr[new_cols]
sub_breast_expr.rename(lut,axis='columns',inplace=True)
sub_breast_expr.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Unnamed: 0,TCGA-GM-A2DO,TCGA-OL-A6VQ,TCGA-E9-A295,TCGA-A2-A0YL,TCGA-AQ-A54O,TCGA-BH-A0E7,TCGA-B6-A0I5,TCGA-E2-A15K,TCGA-C8-A133,TCGA-AO-A12F,...,TCGA-OL-A5RV,TCGA-AR-A1AX,TCGA-D8-A27N,TCGA-A8-A08C,TCGA-E2-A150,TCGA-LL-A8F5,TCGA-AQ-A1H3,TCGA-A2-A1FW,TCGA-B6-A0X5,TCGA-S3-AA17
TSPAN6,5.582386,47.24754,40.451157,52.29504,26.852585,19.906492,29.754513,4.058737,16.073622,62.805061,...,54.058868,13.817858,15.780485,43.901165,48.945454,8.75868,33.806385,11.960064,2.302933,29.758554
TNMD,0.184989,0.458678,0.530185,0.32301,0.017987,0.034819,0.028226,0.372139,0.124101,1.785896,...,0.388434,0.059786,1.011637,1.261265,0.04073,0.0,2.32855,1.168567,0.0,0.014627
DPM1,24.160618,40.233929,41.576668,39.247639,67.875374,47.461025,21.458618,42.44104,31.42771,62.010185,...,43.513496,40.749054,40.796822,30.094984,75.593971,30.650616,44.815239,53.111034,41.376137,44.421211
SCYL3,4.812248,21.97636,10.609933,17.628784,6.27027,14.589837,6.268253,19.447031,4.57503,11.581996,...,21.850689,13.740495,16.930092,22.844444,10.384742,2.426898,32.798695,10.641923,10.584804,8.62135
C1orf112,4.153088,8.105289,5.267715,11.751167,3.867262,5.346801,2.420327,17.211245,1.417945,10.370278,...,6.662964,8.678933,10.266292,7.405015,11.389722,3.796204,11.655321,6.609096,12.162114,5.573026


In [102]:
# load ENSG/ENST/chr/pos table from UCSC
pos_map=pd.read_table(DAT_DIR+'ensg_enst_hg19_table-ucsc.txt',index_col=6)
pos_map['chrom']=pos_map['chrom'].apply(lambda x: x[3:] if len(x[3:])<3 else x[3])
pos_map.head()

Unnamed: 0_level_0,#name,chrom,txStart,txEnd,cdsStart,cdsEnd
name2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000118473,ENST00000237247,1,66999065,67210057,67000041,67208778
ENSG00000118473,ENST00000371039,1,66999274,67210768,67000041,67208778
ENSG00000118473,ENST00000424320,1,66999297,67145425,67000041,67145425
ENSG00000118473,ENST00000371035,1,66999822,67208882,67000041,67208778
ENSG00000118473,ENST00000468286,1,66999838,67142779,67142779,67142779


In [105]:
# intersect between breast mtx genes and Biomart mapping table
print(len(set(sub_breast_expr.index)&set(gene_map.index)))
# intersect between Biomart mapping table and UCSC mapping table
print(len(set(gene_map['Gene stable ID'])&set(pos_map.index)))
print(gene_map.shape)
print(pos_map.shape)

26548
54443
(224501, 2)
(204940, 6)


In [106]:
# dicts for mapping
gene_ensg = dict(zip(gene_map.index, gene_map['Gene stable ID']))
ensg_chr = dict(zip(pos_map.index, pos_map['chrom']))
ensg_start = dict(zip(pos_map.index, pos_map['txStart']))
ensg_end = dict(zip(pos_map.index, pos_map['txEnd']))

In [109]:
# map chr,start,end to new cols of breast_mtx_df
sub_breast_expr['gene']=sub_breast_expr.index
sub_breast_expr['gene']=sub_breast_expr['gene'].map(gene_ensg)
sub_breast_expr['chr']=sub_breast_expr["gene"].map(ensg_chr)
sub_breast_expr['start']=sub_breast_expr['gene'].map(ensg_start)
sub_breast_expr['end']=sub_breast_expr['gene'].map(ensg_end)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [115]:
sub_breast_expr=sub_breast_expr.dropna()
sub_breast_expr.head()

Unnamed: 0,TCGA-GM-A2DO,TCGA-OL-A6VQ,TCGA-E9-A295,TCGA-A2-A0YL,TCGA-AQ-A54O,TCGA-BH-A0E7,TCGA-B6-A0I5,TCGA-E2-A15K,TCGA-C8-A133,TCGA-AO-A12F,...,TCGA-E2-A150,TCGA-LL-A8F5,TCGA-AQ-A1H3,TCGA-A2-A1FW,TCGA-B6-A0X5,TCGA-S3-AA17,gene,chr,start,end
TSPAN6,5.582386,47.24754,40.451157,52.29504,26.852585,19.906492,29.754513,4.058737,16.073622,62.805061,...,48.945454,8.75868,33.806385,11.960064,2.302933,29.758554,ENSG00000000003,X,99888438.0,99894988.0
TNMD,0.184989,0.458678,0.530185,0.32301,0.017987,0.034819,0.028226,0.372139,0.124101,1.785896,...,0.04073,0.0,2.32855,1.168567,0.0,0.014627,ENSG00000000005,X,99848620.0,99852528.0
DPM1,24.160618,40.233929,41.576668,39.247639,67.875374,47.461025,21.458618,42.44104,31.42771,62.010185,...,75.593971,30.650616,44.815239,53.111034,41.376137,44.421211,ENSG00000000419,20,49552684.0,49575069.0
SCYL3,4.812248,21.97636,10.609933,17.628784,6.27027,14.589837,6.268253,19.447031,4.57503,11.581996,...,10.384742,2.426898,32.798695,10.641923,10.584804,8.62135,ENSG00000000457,1,169828259.0,169863093.0
C1orf112,4.153088,8.105289,5.267715,11.751167,3.867262,5.346801,2.420327,17.211245,1.417945,10.370278,...,11.389722,3.796204,11.655321,6.609096,12.162114,5.573026,ENSG00000000460,1,169764549.0,169823221.0


In [116]:
# move chr,start,end to front of file (easier parsing for FUSION)
colnames=list(sub_breast_expr)
new_order=colnames[-4:]+colnames[:-4]
sub_breast_expr = sub_breast_expr[new_order]
sub_breast_expr['start']=sub_breast_expr['start'].apply(lambda x: int(x))
sub_breast_expr['end']=sub_breast_expr['end'].apply(lambda x: int(x))
sub_breast_expr.head()

Unnamed: 0,gene,chr,start,end,TCGA-GM-A2DO,TCGA-OL-A6VQ,TCGA-E9-A295,TCGA-A2-A0YL,TCGA-AQ-A54O,TCGA-BH-A0E7,...,TCGA-OL-A5RV,TCGA-AR-A1AX,TCGA-D8-A27N,TCGA-A8-A08C,TCGA-E2-A150,TCGA-LL-A8F5,TCGA-AQ-A1H3,TCGA-A2-A1FW,TCGA-B6-A0X5,TCGA-S3-AA17
TSPAN6,ENSG00000000003,X,99888438,99894988,5.582386,47.24754,40.451157,52.29504,26.852585,19.906492,...,54.058868,13.817858,15.780485,43.901165,48.945454,8.75868,33.806385,11.960064,2.302933,29.758554
TNMD,ENSG00000000005,X,99848620,99852528,0.184989,0.458678,0.530185,0.32301,0.017987,0.034819,...,0.388434,0.059786,1.011637,1.261265,0.04073,0.0,2.32855,1.168567,0.0,0.014627
DPM1,ENSG00000000419,20,49552684,49575069,24.160618,40.233929,41.576668,39.247639,67.875374,47.461025,...,43.513496,40.749054,40.796822,30.094984,75.593971,30.650616,44.815239,53.111034,41.376137,44.421211
SCYL3,ENSG00000000457,1,169828259,169863093,4.812248,21.97636,10.609933,17.628784,6.27027,14.589837,...,21.850689,13.740495,16.930092,22.844444,10.384742,2.426898,32.798695,10.641923,10.584804,8.62135
C1orf112,ENSG00000000460,1,169764549,169823221,4.153088,8.105289,5.267715,11.751167,3.867262,5.346801,...,6.662964,8.678933,10.266292,7.405015,11.389722,3.796204,11.655321,6.609096,12.162114,5.573026


In [117]:
sub_breast_expr.shape

(24966, 1071)

In [133]:
sub_breast_expr.to_csv('TCGA-BRCA_expression_matrix_ENSG_RPKM.txt', sep='\t',index=False)

# Get test and train patients

In [134]:
# separate into train/test data
brca_patients=list(sub_breast_expr)[4:]
train_patients=random.sample(brca_patients, int(len(brca_patients)*.8))
test_patients=list(set(brca_patients)-set(train_patients))
train_expr=sub_breast_expr[['gene','chr','start','end']+train_patients]
test_expr=sub_breast_expr[['gene','chr','start','end']+test_patients]

# write test,train to file and don't touch
open('tcga-brca_train_patients.txt','w').write('\n'.join(train_patients))
open('tcga-brca_test_patients.txt','w').write('\n'.join(test_patients))
train_expr.to_csv('TCGA-BRCA_train_matrix.txt',sep='\t',index=False)
test_expr.to_csv('TCGA-BRCA_test_matrix.txt',sep='\t',index=False)

#  Load TCGA-BRCA expression data (chrs1-22)

In [113]:
ensg_expr=pd.read_table('TCGA-BRCA_expression_matrix_ENSG_RPKM.txt')

  interactivity=interactivity, compiler=compiler, result=result)


In [121]:
# check for negative values
neg=ensg_expr.copy()
neg=neg.iloc[:,4:].applymap(lambda x: 1 if float(x)<0 else 0)

In [122]:
neg.values.sum()

0