In [1]:
from __future__ import print_function
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
from scipy.stats import zscore
import seaborn as sns
import sys,os
from mapper import expand, parse_mapping_table, apply_mappers
%matplotlib  inline

In [2]:
gene_id = "ENTREZID"
raw_data_dir = "data/" 
preprocessed_data_dir =  "preprocessed/exprs/"
root_dir = "data/"

### PDX

In [3]:
exprs  = pd.read_excel(raw_data_dir+"PDX/nm.3954-S2.xlsx","RNAseq_fpkm")
exprs.set_index("Sample",inplace=True,drop=True)
print(exprs.shape)
exprs.head() 

(22665, 399)


Unnamed: 0_level_0,X-1004,X-1008,X-1027,X-1095,X-1119,X-1156,X-1167,X-1169,X-1172,X-1173,...,X-5713,X-5717,X-5727,X-5739,X-5808,X-5959,X-5974,X-5975,X-6030,X-6047
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,2.75,8.97,0.14,0.13,0.08,17.52,0.18,0.41,0.08,0.0,...,21.24,0.46,18.7,0.0,8.3,14.41,5.09,6.35,0.09,3.22
A1BG-AS1,2.48,3.25,0.0,0.0,0.0,6.52,0.0,0.0,0.0,0.0,...,15.21,0.09,3.42,0.0,8.3,8.36,3.71,4.52,0.0,3.24
A1CF,0.02,0.03,1.3,2.83,2.87,0.72,3.41,0.01,1.84,2.96,...,0.01,0.45,0.0,0.01,0.0,0.0,0.02,0.03,4.63,0.02
A2LD1,4.87,0.81,6.45,4.94,11.07,0.87,3.28,0.32,0.61,7.1,...,1.33,2.51,2.67,3.13,0.44,2.96,0.0,1.99,2.57,1.5
A2M,0.01,71.17,0.0,3.69,0.0,58.16,0.0,0.02,94.12,0.02,...,463.32,0.04,33.35,0.0,254.97,41.52,2.32,0.0,0.0,0.16


 ### Mapping of gene symbols to EntrezID using current gene_info file prowided by NCBI:

In [4]:
hgnc_file = raw_data_dir+"custom.txt"

hgnc = pd.read_csv(hgnc_file, sep ="\t",index_col=0)#
#print(hgnc.shape, len(set(hgnc.index.values)))
approved = hgnc.loc[hgnc["Status"] == "Approved",:]
hgnc_prev =  expand(approved[["Previous symbols","NCBI Gene ID"]],column="Previous symbols",sep=", ") 
hgnc_prev = parse_mapping_table(hgnc_prev, "Previous symbols","NCBI Gene ID")

1800 rows with both Previous symbols and NCBI Gene ID empty
Ok: no duplicated pairs detected
28125 rows with empty Previous symbols were excluded
193 Previous symbols ids mapped to no NCBI Gene ID
84 Previous symbols mapped to multiple NCBI Gene ID
5042 different Previous symbols mapped to the same NCBI Gene ID
9515 Previous symbols can be mapped directly to NCBI Gene ID


In [5]:
hgnc_syn =  expand(approved[["Alias names","NCBI Gene ID"]],column="Alias names",sep=", ") 
hgnc_syn = parse_mapping_table(hgnc_syn, "Alias names","NCBI Gene ID")

1887 rows with both Alias names and NCBI Gene ID empty
16 duplicated pairs dropped
33429 rows with empty Alias names were excluded
120 Alias names ids mapped to no NCBI Gene ID
264 Alias names mapped to multiple NCBI Gene ID
5252 different Alias names mapped to the same NCBI Gene ID
4345 Alias names can be mapped directly to NCBI Gene ID


In [6]:
NCBI = pd.read_csv(raw_data_dir+"Homo_sapiens.gene_info",sep = "\t")
NCBI = NCBI[["#tax_id","GeneID","Symbol","Synonyms","type_of_gene"]]
NCBI = NCBI.loc[NCBI["#tax_id"] == 9606]
NCBI = NCBI.loc[NCBI["type_of_gene"] != "unknown"]
ncbi_symbols = parse_mapping_table(NCBI, "Symbol","GeneID")

Ok: no empty rows detected
Ok: no duplicated pairs detected
Ok: All Symbol rows are not empty.
Ok: All Symbol are mapped to GeneID
9 Symbol mapped to multiple GeneID
Ok: All GeneID are unique
60205 Symbol can be mapped directly to GeneID


In [7]:
ncbi_synonyms = expand(NCBI[["Synonyms","GeneID"]],column="Synonyms",sep="|") 
ncbi_synonyms = parse_mapping_table(ncbi_synonyms, "Synonyms","GeneID")

Ok: no empty rows detected
Ok: no duplicated pairs detected
Ok: All Synonyms rows are not empty.
Ok: All Synonyms are mapped to GeneID
3258 Synonyms mapped to multiple GeneID
50734 different Synonyms mapped to the same GeneID
10915 Synonyms can be mapped directly to GeneID


In [8]:
exprs = apply_mappers(exprs, ncbi_symbols, ncbi_synonyms, verbose = True,handle_duplicates = "sum")[0]
exprs.head(5)

Mapped: 22437 
	directly via main_mapper 19905 
	via alternative mapper 522 
	via one of multiple synonyms in alternative mapper 1270 
	LOC 740 
Unmapped: 228 
	recognized symbols without Entrez ID 0 
	multiple query_ids map to the same target_id 0 
	query_ids map to multiple target_ids in the main mapper 0 
	query_ids map to multiple target_ids in the alternative mapper 71 
	LOC not found in Entrez 32 
	Not found at all: 125


Unnamed: 0_level_0,X-1004,X-1008,X-1027,X-1095,X-1119,X-1156,X-1167,X-1169,X-1172,X-1173,...,X-5713,X-5717,X-5727,X-5739,X-5808,X-5959,X-5974,X-5975,X-6030,X-6047
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.75,8.97,0.14,0.13,0.08,17.52,0.18,0.41,0.08,0.0,...,21.24,0.46,18.7,0.0,8.3,14.41,5.09,6.35,0.09,3.22
2,0.01,71.17,0.0,3.69,0.0,58.16,0.0,0.02,94.12,0.02,...,463.32,0.04,33.35,0.0,254.97,41.52,2.32,0.0,0.0,0.16
3,0.0,0.01,0.0,0.0,0.0,0.1,0.0,0.0,0.02,0.0,...,0.03,0.1,0.01,0.02,0.01,0.0,0.0,0.01,0.0,0.02
9,1.47,0.74,5.64,1.98,9.98,3.71,12.41,2.57,1.38,4.77,...,1.19,1.93,0.93,1.27,1.99,3.34,0.27,1.64,11.12,1.6
10,0.0,0.0,0.99,3.06,9.65,0.0,2.31,0.0,0.11,4.48,...,0.02,0.0,0.0,0.0,0.02,0.0,0.03,0.16,1.92,0.04


### FPKM to TPM conversion

In [9]:
##  FPKM convert to log2(TPM+1)
sum_fpkm = exprs.apply(sum,axis=0)
sum_fpkm.head()

X-1004    387779.59
X-1008    488681.10
X-1027    722773.26
X-1095    448767.63
X-1119    608770.40
dtype: float64

In [10]:
tpm = exprs / sum_fpkm *1000000 +1
tpm.head()

Unnamed: 0_level_0,X-1004,X-1008,X-1027,X-1095,X-1119,X-1156,X-1167,X-1169,X-1172,X-1173,...,X-5713,X-5717,X-5727,X-5739,X-5808,X-5959,X-5974,X-5975,X-6030,X-6047
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,8.091657,19.355529,1.193698,1.289682,1.131412,33.704426,1.364492,1.780126,1.165758,1.0,...,44.878131,1.790526,35.007766,1.0,17.000994,28.579781,11.788326,12.300361,1.170096,7.065444
2,1.025788,146.636899,1.0,9.222518,1.0,109.566747,1.0,1.038055,196.01479,1.03045,...,958.138214,1.068741,61.650214,1.0,492.538964,80.466516,5.917272,1.0,1.0,1.301389
3,1.0,1.020463,1.0,1.0,1.0,1.186669,1.0,1.0,1.04144,1.0,...,1.061975,1.171853,1.018186,1.041251,1.019278,1.0,1.0,1.017796,1.0,1.037674
9,4.790813,2.51428,8.803277,5.412083,17.393701,7.925424,26.129695,5.890058,3.859333,8.262344,...,3.458332,4.316771,2.691295,3.619447,4.836383,7.392538,1.572269,3.918518,22.016334,4.013885
10,1.0,1.0,2.369724,7.818674,16.851625,1.0,5.677647,1.0,1.227918,7.820818,...,1.041317,1.0,1.0,1.0,1.038557,1.0,1.063585,1.284734,4.628719,1.075347


In [11]:

tpm = tpm.applymap(np.log2)
tpm.to_csv(preprocessed_data_dir + "PDX.FPKM2TPMplus1log2.Expr.tsv",sep="\t")
print(tpm.shape)
tpm.head

(22401, 399)


<bound method NDFrame.head of              X-1004    X-1008    X-1027    X-1095    X-1119    X-1156  \
Sample                                                                  
1          3.016435  4.274674  0.255438  0.367016  0.178125  5.074866   
2          0.036732  7.196104  0.000000  3.205161  0.000000  6.775666   
3          0.000000  0.029224  0.000000  0.000000  0.000000  0.246918   
9          2.260271  1.330145  3.138041  2.436184  4.120493  2.986488   
10         0.000000  0.000000  1.244719  2.966924  4.074816  0.000000   
...             ...       ...       ...       ...       ...       ...   
102724473  0.000000  0.085954  0.000000  0.000000  0.000000  0.078611   
103091865  2.830198  4.360533  1.112493  1.521952  0.811982  0.725385   
105375355  0.600038  0.430687  0.472380  2.448015  1.454815  0.496383   
109623460  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
109731405  3.655870  3.916398  3.247177  3.430599  4.691364  2.434399   

             X-1167 

### TCGA

In [12]:
tcga_tmp_dir = raw_data_dir + "TCGA_seq/"

In [13]:
f_ext = ".rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt"
for fpath in os.listdir(tcga_tmp_dir):
    if not fpath.endswith(".tar.gz") :
        cohort = fpath.split(".")[0]
        print(cohort)
        fname = cohort + f_ext
        print(tcga_tmp_dir+fpath)
        exprs = pd.read_csv(tcga_tmp_dir+fpath,sep="\t",index_col=0)
        # drop "gene_id" and keep only "scaled_estimate" columns
        try:
            exprs = pd.read_csv(tcga_tmp_dir+fpath,sep="\t",index_col=0)
            # drop "gene_id" and keep only "scaled_estimate" columns
            exprs = exprs.loc[:,exprs.T.loc[exprs.T["gene_id"]=="scaled_estimate",:].index]
            exprs = exprs.iloc[1:,]
            exprs.rename(index = lambda x :  int(x.split("|")[1]),
                         columns = lambda x : x.replace(".1",""),inplace = True)
            exprs.index.name = "ENTREZID"
            # convert scaled_extimates to log2(TPM+1)
            exprs = exprs.applymap(lambda x : np.log2(float(x)*1000000+1))
            exprs = exprs.sort_index()
            exprs.to_csv(preprocessed_data_dir +"TCGA-"+cohort+"_exprs.RSEMscaled_est2TPMplus1log2.tsv",sep  ="\t")
            print(cohort,exprs.shape)
        except:
            print(cohort,"No expression data.")

ACC
data/TCGA_seq/ACC.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


ACC (20531, 79)
BLCA
data/TCGA_seq/BLCA.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


BLCA (20531, 427)
BRCA
data/TCGA_seq/BRCA.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


BRCA (20531, 1212)
CESC
data/TCGA_seq/CESC.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


CESC (20531, 309)
CHOL
data/TCGA_seq/CHOL.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


CHOL (20531, 45)
COAD
data/TCGA_seq/COAD.rnaseqv2__illuminaga_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


COAD (20531, 191)
COADREAD
data/TCGA_seq/COADREAD.rnaseqv2__illuminaga_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


COADREAD (20531, 263)
ESCA
data/TCGA_seq/ESCA.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


ESCA (20531, 196)
GBM
data/TCGA_seq/GBM.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


GBM (20531, 171)
GBMLGG
data/TCGA_seq/GBMLGG.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


GBMLGG (20531, 701)
HNSC
data/TCGA_seq/HNSC.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


HNSC (20531, 566)
KICH
data/TCGA_seq/KICH.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


KICH (20531, 91)
KIPAN
data/TCGA_seq/KIPAN.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


KIPAN (20531, 1020)
KIRC
data/TCGA_seq/KIRC.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


KIRC (20531, 606)
KIRP
data/TCGA_seq/KIRP.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


KIRP (20531, 323)
LAML
data/TCGA_seq/LAML.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


LAML (20531, 173)
LGG
data/TCGA_seq/LGG.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


LGG (20531, 530)
LIHC
data/TCGA_seq/LIHC.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


LIHC (20531, 423)
LUAD
data/TCGA_seq/LUAD.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


LUAD (20531, 576)
LUSC
data/TCGA_seq/LUSC.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


LUSC (20531, 552)
MESO
data/TCGA_seq/MESO.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


MESO (20531, 87)
OV
data/TCGA_seq/OV.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


OV (20531, 307)
PAAD
data/TCGA_seq/PAAD.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


PAAD (20531, 183)
PCPG
data/TCGA_seq/PCPG.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


PCPG (20531, 187)
PRAD
data/TCGA_seq/PRAD.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


PRAD (20531, 550)
READ
data/TCGA_seq/READ.rnaseqv2__illuminaga_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


READ (20531, 72)
SARC
data/TCGA_seq/SARC.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


SARC (20531, 265)
SKCM
data/TCGA_seq/SKCM.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


SKCM (20531, 473)
STAD
data/TCGA_seq/STAD.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


STAD (20531, 450)
STES
data/TCGA_seq/STES.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


STES (20531, 646)
TGCT
data/TCGA_seq/TGCT.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


TGCT (20531, 156)
THCA
data/TCGA_seq/THCA.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


THCA (20531, 568)
THYM
data/TCGA_seq/THYM.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


THYM (20531, 122)
UCEC
data/TCGA_seq/UCEC.rnaseqv2__illuminaga_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


UCEC (20531, 381)
UCS
data/TCGA_seq/UCS.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


UCS (20531, 57)
UVM
data/TCGA_seq/UVM.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


  interactivity=interactivity, compiler=compiler, result=result)


UVM (20531, 80)


In [14]:
##  FPKM convert to log2(TPM+1)
sum_fpkm = exprs.apply(sum,axis=0)
sum_fpkm.head()

TCGA-RZ-AB0B-01A-11R-A405-07    53158.188717
TCGA-V3-A9ZX-01A-11R-A405-07    53227.417609
TCGA-V3-A9ZY-01A-11R-A405-07    42076.508745
TCGA-V4-A9E5-01A-11R-A405-07    41459.897618
TCGA-V4-A9E7-01A-11R-A405-07    46592.806687
dtype: float64

In [15]:
tpm = exprs / sum_fpkm *1000000 +1
tpm.head()

Unnamed: 0_level_0,TCGA-RZ-AB0B-01A-11R-A405-07,TCGA-V3-A9ZX-01A-11R-A405-07,TCGA-V3-A9ZY-01A-11R-A405-07,TCGA-V4-A9E5-01A-11R-A405-07,TCGA-V4-A9E7-01A-11R-A405-07,TCGA-V4-A9E8-01A-11R-A405-07,TCGA-V4-A9E9-01A-11R-A405-07,TCGA-V4-A9EA-01A-11R-A405-07,TCGA-V4-A9EC-01A-11R-A405-07,TCGA-V4-A9ED-01A-11R-A405-07,...,TCGA-WC-A885-01A-11R-A405-07,TCGA-WC-A888-01A-11R-A405-07,TCGA-WC-A88A-01A-11R-A405-07,TCGA-WC-AA9A-01A-11R-A405-07,TCGA-WC-AA9E-01A-11R-A405-07,TCGA-YZ-A980-01A-11R-A405-07,TCGA-YZ-A982-01A-11R-A405-07,TCGA-YZ-A983-01A-11R-A405-07,TCGA-YZ-A984-01A-11R-A405-07,TCGA-YZ-A985-01A-11R-A405-07
ENTREZID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,57.338242,68.695426,107.34446,76.762849,60.605496,78.924186,90.447568,86.848087,103.82067,63.680926,...,93.874433,70.061003,59.681281,50.254368,76.6464,54.388235,59.010027,91.742235,32.147755,34.144815
2,197.900442,201.391953,237.514589,235.004967,205.47438,194.940901,247.930725,251.431148,196.396207,228.49169,...,207.30597,216.638642,194.284084,198.32001,211.724076,165.727596,183.295199,215.327757,204.329778,225.372707
9,51.496292,29.846832,25.730741,31.863587,18.305232,40.307134,31.763156,15.679087,28.727742,31.895415,...,17.485826,36.280252,20.514909,53.829497,20.770005,30.162363,15.280548,27.19823,30.479879,29.089414
10,1.804006,1.0,1.832833,1.0,1.0,1.0,1.0,1.0,1.0,6.602092,...,1.0,1.0,1.0,2.079162,2.148258,2.267579,1.0,1.0,2.160605,3.991786
12,94.566432,116.521495,34.653783,104.584201,25.058621,41.627616,17.935722,41.72927,79.561263,121.308063,...,25.792118,78.760457,49.042052,15.887044,29.685352,161.124243,81.569903,20.049252,194.879795,25.934106


In [16]:

tpm = tpm.applymap(np.log2)
tpm.to_csv(preprocessed_data_dir + "PDX.FPKM2TPMplus1log2.Expr.tsv",sep="\t")
print(tpm.shape)
tpm.head()

(20531, 80)


Unnamed: 0_level_0,TCGA-RZ-AB0B-01A-11R-A405-07,TCGA-V3-A9ZX-01A-11R-A405-07,TCGA-V3-A9ZY-01A-11R-A405-07,TCGA-V4-A9E5-01A-11R-A405-07,TCGA-V4-A9E7-01A-11R-A405-07,TCGA-V4-A9E8-01A-11R-A405-07,TCGA-V4-A9E9-01A-11R-A405-07,TCGA-V4-A9EA-01A-11R-A405-07,TCGA-V4-A9EC-01A-11R-A405-07,TCGA-V4-A9ED-01A-11R-A405-07,...,TCGA-WC-A885-01A-11R-A405-07,TCGA-WC-A888-01A-11R-A405-07,TCGA-WC-A88A-01A-11R-A405-07,TCGA-WC-AA9A-01A-11R-A405-07,TCGA-WC-AA9E-01A-11R-A405-07,TCGA-YZ-A980-01A-11R-A405-07,TCGA-YZ-A982-01A-11R-A405-07,TCGA-YZ-A983-01A-11R-A405-07,TCGA-YZ-A984-01A-11R-A405-07,TCGA-YZ-A985-01A-11R-A405-07
ENTREZID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.841426,6.102142,6.746104,6.262336,5.921377,6.302396,6.49901,6.440422,6.69795,5.992789,...,6.55266,6.13054,5.899207,5.651177,6.260146,5.765223,5.882888,6.519514,5.006646,5.093595
2,7.628631,7.653862,7.891872,7.876547,7.682815,7.606893,7.953793,7.97402,7.617623,7.835998,...,7.695618,7.759147,7.602024,7.631686,7.726042,7.37267,7.518025,7.75039,7.674756,7.816169
9,5.686397,4.899506,4.685421,4.993837,4.194184,5.332963,4.989282,3.97077,4.844373,4.995277,...,4.128114,5.181113,4.358601,5.750325,4.37643,4.914678,3.933624,4.765441,4.929785,4.862422
10,0.851204,0.0,0.874075,0.0,0.0,0.0,0.0,0.0,0.0,2.722923,...,0.0,0.0,0.0,1.056002,1.103167,1.181153,0.0,0.0,1.111435,1.997034
12,6.563256,6.864452,5.114941,6.708521,4.647235,5.379469,4.164764,5.382988,6.313994,6.922532,...,4.688858,6.2994,5.615947,3.989779,4.891679,7.33203,6.349965,4.325477,7.606441,4.696779
