## Notebook to process the miRNA samples and match them with the corresponding RNA-Seq gene expression

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sys

In [2]:
mirna_raw = pd.read_pickle("../data/miRNA_raw_no_brca.pkl")
mirna_raw_brca_train = pd.read_csv("../data/miRNA/miRNA_raw_data_TCGA_BRCA_CirielloSet_grch38.csv", sep=";")
mirna_raw_brca_test = pd.read_csv("../data/miRNA/miRNA_raw_data_TCGA_BRCA_PanCATestSet_grch38.csv", sep=";")

mirna_raw_brca_train = mirna_raw_brca_train.head(817)

In [3]:
mirna_raw.head(5)

Unnamed: 0,hsa-mir-1302-2,hsa-mir-6859-2,hsa-mir-6723,hsa-mir-200b,hsa-mir-200a,hsa-mir-429,hsa-mir-6726,hsa-mir-6727,hsa-mir-6808,hsa-mir-4251,...,hsa-mir-718,hsa-mir-6858,hsa-mir-664b,hsa-mir-1184-1,hsa-mir-1184-2,hsa-mir-1184-3,hsa-mir-3690-2,hsa-mir-6089-2,gdc_id,tcga_id
0,0,0,2,5785,7441,1101,1,0,0,0,...,0,0,4,0,0,0,0,0,9970426e-551e-4061-8f22-9e60f356da38-meq,TCGA-AM-5820
0,0,0,0,10,7,0,4,0,0,0,...,0,0,36,0,0,0,0,0,b31597ce-0d01-4c67-98ec-d6aabf68ab44-meq,TCGA-HT-7468
0,0,0,0,7011,4092,715,1,0,0,0,...,0,0,28,0,0,0,0,0,84aaa177-8747-492e-9bba-acda1a986976-meq,TCGA-J4-A6G1
0,0,0,0,37,45,3,1,0,1,0,...,0,0,47,0,0,0,0,0,0af91fd0-52b0-4001-b838-7dc11199628d-meq,TCGA-HT-A4DV
0,0,0,0,12519,9910,1812,0,0,1,0,...,0,0,12,0,0,0,0,0,5ffa5d3e-de27-47e1-908c-8dfb41cf475a-meq,TCGA-L5-A8NR


In [4]:
mirna_all_brca = mirna_raw_brca_train.append(mirna_raw_brca_test, sort=True)

In [5]:
mirna_all_brca.head(5)

Unnamed: 0,TCGA ID,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,...,hsa-mir-941-5,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b
0,TCGA-A1-A0SB,38052.0,38123.0,38425.0,211210.0,24521.0,3190.0,1396.0,6307.0,6541.0,...,0.0,20.0,1.0,42.0,5.0,0.0,108.0,144.0,9380.0,134478.0
1,TCGA-A1-A0SD,66099.0,65545.0,65807.0,200725.0,10654.0,2104.0,5803.0,18967.0,19066.0,...,0.0,12.0,1.0,7.0,9.0,0.0,79.0,198.0,4445.0,198312.0
2,TCGA-A1-A0SE,20084.0,19857.0,20121.0,105277.0,10004.0,994.0,1737.0,4722.0,4596.0,...,0.0,6.0,0.0,13.0,10.0,0.0,77.0,98.0,3373.0,80274.0
3,TCGA-A1-A0SF,51958.0,52013.0,52333.0,152030.0,28994.0,2182.0,4550.0,15369.0,15688.0,...,0.0,40.0,0.0,9.0,14.0,0.0,349.0,307.0,14897.0,286445.0
4,TCGA-A1-A0SH,10068.0,10131.0,10213.0,57725.0,9082.0,798.0,1104.0,1864.0,1842.0,...,0.0,2.0,0.0,4.0,0.0,0.0,15.0,52.0,2157.0,55433.0


In [6]:
mirna_all_brca = mirna_all_brca.rename(columns={"TCGA ID": "tcga_id"})
mirna_all_tcga = mirna_all_brca.append(mirna_raw, sort=True)

In [7]:
mirna_all_tcga_meta = mirna_all_tcga['tcga_id']

In [8]:
mirna_all_tcga.drop(['gdc_id', 'tcga_id'], inplace=True, axis=1)

In [36]:
mirna_all_tcga.shape

(10251, 1881)

In [9]:
non_zero_genes = mirna_all_tcga.loc[:, (mirna_all_tcga != 0).any(axis=0)].columns

In [10]:
non_zero_genes

Index(['hsa-let-7a-1', 'hsa-let-7a-2', 'hsa-let-7a-3', 'hsa-let-7b',
       'hsa-let-7c', 'hsa-let-7d', 'hsa-let-7e', 'hsa-let-7f-1',
       'hsa-let-7f-2', 'hsa-let-7g',
       ...
       'hsa-mir-941-1', 'hsa-mir-942', 'hsa-mir-943', 'hsa-mir-944',
       'hsa-mir-95', 'hsa-mir-9500', 'hsa-mir-96', 'hsa-mir-98', 'hsa-mir-99a',
       'hsa-mir-99b'],
      dtype='object', length=1771)

In [11]:
mirna_all_tcga["tcga_id"] = mirna_all_tcga_meta

In [12]:
mirna_all_tcga.to_pickle("../data/miRNA/miRNA_raw_all_tcga.pkl")

### We now have the non_zero genes accross all miRNA samples

In [13]:
mirna_raw.head(5)

Unnamed: 0,hsa-mir-1302-2,hsa-mir-6859-2,hsa-mir-6723,hsa-mir-200b,hsa-mir-200a,hsa-mir-429,hsa-mir-6726,hsa-mir-6727,hsa-mir-6808,hsa-mir-4251,...,hsa-mir-718,hsa-mir-6858,hsa-mir-664b,hsa-mir-1184-1,hsa-mir-1184-2,hsa-mir-1184-3,hsa-mir-3690-2,hsa-mir-6089-2,gdc_id,tcga_id
0,0,0,2,5785,7441,1101,1,0,0,0,...,0,0,4,0,0,0,0,0,9970426e-551e-4061-8f22-9e60f356da38-meq,TCGA-AM-5820
0,0,0,0,10,7,0,4,0,0,0,...,0,0,36,0,0,0,0,0,b31597ce-0d01-4c67-98ec-d6aabf68ab44-meq,TCGA-HT-7468
0,0,0,0,7011,4092,715,1,0,0,0,...,0,0,28,0,0,0,0,0,84aaa177-8747-492e-9bba-acda1a986976-meq,TCGA-J4-A6G1
0,0,0,0,37,45,3,1,0,1,0,...,0,0,47,0,0,0,0,0,0af91fd0-52b0-4001-b838-7dc11199628d-meq,TCGA-HT-A4DV
0,0,0,0,12519,9910,1812,0,0,1,0,...,0,0,12,0,0,0,0,0,5ffa5d3e-de27-47e1-908c-8dfb41cf475a-meq,TCGA-L5-A8NR


In [14]:
mirna_raw_meta = mirna_raw[['gdc_id', 'tcga_id']]

In [15]:
mirna_raw.drop(['gdc_id', 'tcga_id'], inplace=True, axis=1)

#### Drop miRNA genes that are always 0

In [16]:
non_zero_genes

Index(['hsa-let-7a-1', 'hsa-let-7a-2', 'hsa-let-7a-3', 'hsa-let-7b',
       'hsa-let-7c', 'hsa-let-7d', 'hsa-let-7e', 'hsa-let-7f-1',
       'hsa-let-7f-2', 'hsa-let-7g',
       ...
       'hsa-mir-941-1', 'hsa-mir-942', 'hsa-mir-943', 'hsa-mir-944',
       'hsa-mir-95', 'hsa-mir-9500', 'hsa-mir-96', 'hsa-mir-98', 'hsa-mir-99a',
       'hsa-mir-99b'],
      dtype='object', length=1771)

In [17]:
mirna_raw_non_zero = mirna_raw[non_zero_genes]

In [18]:
mirna_raw_non_zero.head(5)

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-941-1,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b
0,41510,41598,41271,27723,2622,2471,4450,39782,40644,1715,...,0,29,1,1,23,0,65,335,297,59641
0,88534,88711,88784,109807,43870,1456,4801,23829,24357,3082,...,0,9,0,0,94,0,42,298,17175,68479
0,114338,113785,114377,86901,66497,2029,10750,48069,49084,4681,...,0,20,0,20,0,0,103,302,16822,125913
0,34439,34791,34437,97326,31373,2335,4889,6897,7080,1400,...,0,12,1,0,32,0,17,262,5362,184622
0,72644,72815,72419,29549,5288,2955,19944,78307,80975,7392,...,0,57,0,1,165,0,49,554,956,70597


##### Compute TPM and log(x+1) norm

In [19]:
mirna_raw_1000000 = mirna_raw_non_zero*1000*1000
mirna_raw_norm = (mirna_raw_1000000).div(mirna_raw_non_zero.sum(axis=1), axis=0)

In [20]:
mirna_raw_norm = mirna_raw_norm.astype('float64')

In [21]:
mirna_raw_log_norm = mirna_raw_norm.apply(lambda x: np.log2(x+1))

In [22]:
mirna_raw_final = mirna_raw_log_norm
mirna_raw_final_meta = mirna_raw_final

In [23]:
mirna_raw_final_meta["tcga_id"] = mirna_raw_meta['tcga_id'].values

In [24]:
mirna_raw_final_meta["gdc_id"] = mirna_raw_meta['gdc_id'].values

In [25]:
mirna_raw_final_meta.head(5)

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b,tcga_id,gdc_id
0,12.985925,12.98898,12.977595,12.403638,9.003841,8.91844,9.765823,12.92459,12.955512,8.392867,...,0.257505,0.257505,2.457969,0.0,3.776277,6.054465,5.883541,13.508716,TCGA-AM-5820,9970426e-551e-4061-8f22-9e60f356da38-meq
0,12.997484,13.000365,13.001552,13.308116,11.984667,7.081847,8.795733,11.10445,11.136054,8.158074,...,0.0,0.0,3.275238,0.0,2.286583,4.834021,10.632295,12.626966,TCGA-HT-7468,b31597ce-0d01-4c67-98ec-d6aabf68ab44-meq
0,13.820647,13.813653,13.821139,13.424819,13.038775,8.009764,10.410712,12.570659,12.6008,9.212637,...,0.0,1.81995,0.0,0.0,3.810757,5.293275,11.056342,13.959761,TCGA-J4-A6G1,84aaa177-8747-492e-9bba-acda1a986976-meq
0,12.840541,12.85521,12.840458,14.339196,12.706041,8.960694,10.025298,10.521324,10.559079,8.224631,...,0.278521,0.0,2.966137,0.0,2.207942,5.827641,10.158408,15.262841,TCGA-HT-A4DV,0af91fd0-52b0-4001-b838-7dc11199628d-meq
0,13.637945,13.641337,13.63347,12.340376,9.859337,9.020998,11.773356,13.746235,13.794567,10.342137,...,0.0,0.233235,4.904588,0.0,3.26272,6.617784,7.398718,13.596712,TCGA-L5-A8NR,5ffa5d3e-de27-47e1-908c-8dfb41cf475a-meq


In [26]:
mirna_raw_final_meta.to_pickle("../data/miRNA_no_brca_filtered_scaled_meta.pkl")
mirna_raw_final.to_pickle("../data/miRNA_no_brca_filtered_scaled.pkl")

## Now we need to match the miRNA with the gene expression data

In [52]:
mirna_raw_final = pd.read_pickle("../data/miRNA_no_brca_filtered_scaled.pkl")

In [53]:
tcga_no_brca = pd.read_pickle("../data/tcga_raw_no_labelled_brca_log_row_normalized_meta.pkl")

In [54]:
tcga_no_brca.shape

(9287, 19039)

In [55]:
mirna_raw_final.shape

(9215, 1773)

In [56]:
mirna_raw_final.head()

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b,tcga_id,gdc_id
0,12.985925,12.98898,12.977595,12.403638,9.003841,8.91844,9.765823,12.92459,12.955512,8.392867,...,0.257505,0.257505,2.457969,0.0,3.776277,6.054465,5.883541,13.508716,TCGA-AM-5820,9970426e-551e-4061-8f22-9e60f356da38-meq
0,12.997484,13.000365,13.001552,13.308116,11.984667,7.081847,8.795733,11.10445,11.136054,8.158074,...,0.0,0.0,3.275238,0.0,2.286583,4.834021,10.632295,12.626966,TCGA-HT-7468,b31597ce-0d01-4c67-98ec-d6aabf68ab44-meq
0,13.820647,13.813653,13.821139,13.424819,13.038775,8.009764,10.410712,12.570659,12.6008,9.212637,...,0.0,1.81995,0.0,0.0,3.810757,5.293275,11.056342,13.959761,TCGA-J4-A6G1,84aaa177-8747-492e-9bba-acda1a986976-meq
0,12.840541,12.85521,12.840458,14.339196,12.706041,8.960694,10.025298,10.521324,10.559079,8.224631,...,0.278521,0.0,2.966137,0.0,2.207942,5.827641,10.158408,15.262841,TCGA-HT-A4DV,0af91fd0-52b0-4001-b838-7dc11199628d-meq
0,13.637945,13.641337,13.63347,12.340376,9.859337,9.020998,11.773356,13.746235,13.794567,10.342137,...,0.0,0.233235,4.904588,0.0,3.26272,6.617784,7.398718,13.596712,TCGA-L5-A8NR,5ffa5d3e-de27-47e1-908c-8dfb41cf475a-meq


In [57]:
tcga_mirna_rna_meta = pd.merge(mirna_raw_final, tcga_no_brca, on="tcga_id")

In [58]:
tcga_mirna_rna_meta.dtypes[tcga_mirna_rna_meta.dtypes!='float64']

tcga_id        object
gdc_id         object
sample_id      object
cancer_type    object
dtype: object

In [59]:
tcga_mirna_rna_meta.drop(['sample_id', 'gdc_id'], axis=1, inplace=True)

In [60]:
tcga_mirna_rna = tcga_mirna_rna_meta.drop(['tcga_id', 'gdc_id', 'sample_id', 'cancer_type'], axis=1)

KeyError: "['gdc_id' 'sample_id'] not found in axis"

In [None]:
tcga_mirna_rna_meta['cancer_type'].value_counts()

In [None]:
tcga_mirna_rna.shape

In [61]:
tcga_mirna_rna_meta.shape

(9003, 20809)

In [62]:
tcga_mirna_rna

NameError: name 'tcga_mirna_rna' is not defined

In [63]:
#tcga_mirna_rna.to_pickle("../data/hybrids/tcga_mirna_rna_filtered_scaled.pkl")
tcga_mirna_rna_meta.to_pickle("../data/hybrids/tcga_mirna_rna_filtered_scaled_meta.pkl")

### Now merge with RNA only to get the cancer type

In [44]:
cancer_type_table = tcga_no_brca[["cancer_type", 'tcga_id']]

In [46]:
mirna_with_meta = pd.merge(mirna_raw_final, cancer_type_table, on="tcga_id")

In [48]:
mirna_with_meta.shape

(9003, 1774)

In [49]:
mirna_raw_final.shape

(9215, 1773)

In [51]:
mirna_with_meta.to_pickle("../data/miRNA_no_brca_filtered_scaled_cancer_type.pkl")

## Do the same for the BRCA train and test data

In [38]:
tcga_brca_train = pd.read_pickle("../data/tcga_brca_raw_19036_row_log_norm_train.pkl")
tcga_brca_test = pd.read_pickle("../data/tcga_brca_raw_19036_row_log_norm_test.pkl")

In [46]:
tcga_brca_train.head(5)

Unnamed: 0,tcga_id,Ciriello_subtype,sample_id,cancer_type,A1BG,A1CF,A2M,A2ML1,A2MP1,A3GALT2,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
0,TCGA-A1-A0SK,Basal,57DABE72-0950-4301-BE03-262F3916DBB1,Breast Invasive Carcinoma,2.510456,0.02814,7.715284,1.461094,0.372293,0.225355,...,5.903705,6.487882,3.120146,4.879934,5.45841,3.337981,6.150583,4.955257,6.523552,6.331
1,TCGA-A2-A04P,Basal,D06D5373-6A1B-474C-9279-10FCB29F59BA,Breast Invasive Carcinoma,2.657545,0.131735,8.697799,6.334411,0.71429,0.028052,...,4.201788,5.843262,1.598004,3.150624,4.589208,3.475111,5.037776,7.627667,5.085828,5.15905
2,TCGA-A2-A0CM,Basal,64D0958D-EF53-43EC-8167-390D957485C4,Breast Invasive Carcinoma,2.133521,0.031171,8.740598,5.763921,0.411263,0.0,...,5.664298,6.181542,2.797198,4.219109,5.251486,4.131694,5.156545,7.888728,5.839178,6.057595
3,TCGA-A2-A0D2,Basal,004E0C7A-A9F7-43AD-ACAC-52FB44F567FC,Breast Invasive Carcinoma,1.217436,0.012358,10.919038,0.135197,0.92836,0.068827,...,5.634331,6.951186,2.450123,4.331003,4.86536,3.480749,5.938206,9.086016,5.295436,5.737644
4,TCGA-A2-A0ST,Basal,118D1959-5B7B-4D2C-9E9F-970613F0E133,Breast Invasive Carcinoma,2.757098,0.006323,9.567252,5.861877,1.191594,0.006037,...,4.54137,4.86959,3.484438,4.865189,4.580591,2.871138,5.308459,7.627094,6.263564,5.998159


In [43]:
mirna_brca_train = pd.read_csv("../data/miRNA_filtered_norm_scaled_meta_train.csv")
mirna_brca_test = pd.read_csv("../data/miRNA_filtered_norm_scaled_meta_test.csv")

In [44]:
mirna_brca_train.head(5)

Unnamed: 0,TCGA_ID,Ciriello_subtype,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,...,hsa-mir-941-1,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b
0,TCGA-A1-A0SB,Normal,14.03037,14.033059,14.044442,16.502933,13.396463,10.454968,9.264031,11.437857,...,0.0,3.291905,0.525771,4.282995,1.677401,0.0,5.599551,6.007131,12.010321,15.851634
1,TCGA-A1-A0SD,LumA,14.03039,14.018248,14.024003,15.632852,11.397605,9.059588,10.521528,12.229467,...,0.0,2.013493,0.325539,1.470888,1.712899,0.0,4.392142,5.675821,10.137212,15.615404
2,TCGA-A1-A0SE,LumA,13.034042,13.017645,13.036697,15.423974,12.028745,8.700684,9.50448,10.946024,...,0.0,1.809602,0.0,2.684479,2.371771,0.0,5.051083,5.389651,10.460951,15.032799
3,TCGA-A1-A0SF,LumA,12.784891,12.786417,12.795264,14.333694,11.943461,8.215924,9.273612,11.028055,...,0.0,2.685329,0.0,1.152029,1.536697,0.0,5.596833,5.415917,10.983075,15.247564
4,TCGA-A1-A0SH,LumA,12.962609,12.971607,12.983236,15.481879,12.813933,9.307462,9.775111,10.530101,...,0.0,1.370444,0.0,2.060394,0.0,0.0,3.688321,5.399958,10.740593,15.423429


In [65]:
mirna_brca_test.head(5)

Unnamed: 0,tcga_id,expert_PAM50_subtypes,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,...,hsa-mir-941-1,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b
0,TCGA-3C-AAAU,LumA,13.514627,13.504685,13.530334,14.562495,7.978139,8.680035,10.444366,10.798507,...,0.0,2.694435,0.266281,1.548775,2.366092,0.0,6.932103,5.724983,6.932103,15.496556
1,TCGA-3C-AALI,Her2,13.328474,13.330931,13.339006,14.510022,9.554742,9.018468,9.092649,11.673093,...,0.0,3.200443,0.0,0.727173,1.10228,0.0,5.933862,6.429633,7.887388,13.627101
2,TCGA-3C-AALJ,LumB,13.432159,13.418758,13.441694,13.415155,9.243957,9.297123,11.387266,12.511639,...,0.0,2.214673,0.0,0.845298,1.310029,0.0,6.635774,6.876964,7.574399,15.012233
3,TCGA-3C-AALK,LumA,13.550832,13.560693,13.567933,14.662512,11.497175,8.396544,10.350582,11.639559,...,0.0,2.153877,0.0,2.153877,1.402156,0.0,6.230695,5.41722,10.01507,14.552775
4,TCGA-4H-AAAK,LumA,13.831441,13.830614,13.839637,14.433812,11.662663,8.475744,10.70667,12.007394,...,0.0,1.543579,0.0,1.149743,0.903448,0.0,4.405742,5.134419,10.04546,14.63713


In [48]:
mirna_brca_train = mirna_brca_train.rename(columns={"TCGA_ID": "tcga_id"})
mirna_brca_test = mirna_brca_test.rename(columns={"TCGA_ID": "tcga_id"})

In [58]:
tcga_brca_train.shape

(817, 19040)

In [57]:
tcga_brca_test.shape

(236, 19040)

In [55]:
tcga_brca_mirna_rna_meta_train = pd.merge(mirna_brca_train, tcga_brca_train, on="tcga_id")
tcga_brca_mirna_rna_meta_train.shape

(816, 20812)

In [56]:
tcga_brca_mirna_rna_meta_test = pd.merge(mirna_brca_test, tcga_brca_test, on="tcga_id")
tcga_brca_mirna_rna_meta_test.shape

(219, 20812)

In [88]:
tcga_brca_mirna_rna_meta_train.dtypes[tcga_brca_mirna_rna_meta_train.dtypes != 'float64']

tcga_id             object
Ciriello_subtype    object
cancer_type         object
dtype: object

In [89]:
tcga_brca_mirna_rna_meta_test.dtypes[tcga_brca_mirna_rna_meta_test.dtypes != 'float64']

tcga_id                  object
expert_PAM50_subtypes    object
cancer_type              object
dtype: object

In [81]:
tcga_brca_mirna_rna_meta_train.drop(['Ciriello_subtype_y', 'sample_id', 'cancer_type'], axis=1, inplace=True)

In [83]:
tcga_brca_mirna_rna_meta_test.drop(['subtype', 'sample_id', 'cancer_type'], axis=1, inplace=True)

In [85]:
tcga_brca_mirna_rna_meta_train = tcga_brca_mirna_rna_meta_train.rename(columns={"Ciriello_subtype_x": "Ciriello_subtype"})

In [92]:
tcga_brca_mirna_rna_meta_train.to_pickle("../data/hybrids/tcga_brca_mirna_rna_meta_train.pkl")
tcga_brca_mirna_rna_meta_test.to_pickle("../data/hybrids/tcga_brca_mirna_rna_meta_test.pkl")

In [63]:
pd.read_pickle("../data/tcga_raw_19036.pkl").head(5)

Unnamed: 0,tcga_id,sample_id,cancer_type,A1BG,A1CF,A2M,A2ML1,A2MP1,A3GALT2,A4GALT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
0,TCGA-DD-AAVP-01A-11,3DFF72D2-F292-497E-ACE3-6FAA9C884205,Liver Hepatocellular Carcinoma,1454762,316636,641105,0,630,50,12380,...,9345,19795,3579,8192,46912,621,53209,302428,29863,44717
1,TCGA-KK-A7B2-01A-12,B1E54366-42B9-463C-8615-B34D52BD14DC,Prostate Adenocarcinoma,28886,192,2361023,10757,1538,10,238345,...,26355,21156,11269,55211,100123,6864,90816,1009426,156109,108371
2,TCGA-DC-6158-01A-11,473713F7-EB41-4F20-A37F-ACD209E3CB75,Rectum Adenocarcinoma,4322,154254,2025563,104,1421,160,129069,...,150203,252841,50985,184511,119044,458,203733,1319876,348923,253112
3,TCGA-DD-A4NP-01A-11,11F18F54-9B33-4C33-BDF9-0F093F4F3336,Liver Hepatocellular Carcinoma,23043947,983141,15232711,242,15854,2,69336,...,25389,21800,6888,39201,126221,6045,315638,83030,165945,175660
4,TCGA-HQ-A5ND-01A-11,136B7576-1108-4FA3-8254-6069F0CA879A,Bladder Urothelial Carcinoma,3273,270,719684,163497,296,0,196089,...,221120,633127,13853,44655,140114,4662,205856,1290436,131672,211360


In [64]:
pd.read_pickle("../data/tcga_raw_counts.pkl").head(5)

Unnamed: 0,tcga_id,sample_id,cancer_type,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
0,TCGA-DD-AAVP,3DFF72D2-F292-497E-ACE3-6FAA9C884205,Liver Hepatocellular Carcinoma,1454762,316636,641105,0,12380,0,122018,...,9345,19795,3579,8192,46912,621,53209,302428,29863,44717
1,TCGA-KK-A7B2,B1E54366-42B9-463C-8615-B34D52BD14DC,Prostate Adenocarcinoma,28886,192,2361023,10757,238345,0,210986,...,26355,21156,11269,55211,100123,6864,90816,1009426,156109,108371
2,TCGA-DC-6158,473713F7-EB41-4F20-A37F-ACD209E3CB75,Rectum Adenocarcinoma,4322,154254,2025563,104,129069,132,120345,...,150203,252841,50985,184511,119044,458,203733,1319876,348923,253112
3,TCGA-DD-A4NP,11F18F54-9B33-4C33-BDF9-0F093F4F3336,Liver Hepatocellular Carcinoma,23043947,983141,15232711,242,69336,0,220953,...,25389,21800,6888,39201,126221,6045,315638,83030,165945,175660
4,TCGA-HQ-A5ND,136B7576-1108-4FA3-8254-6069F0CA879A,Bladder Urothelial Carcinoma,3273,270,719684,163497,196089,0,173340,...,221120,633127,13853,44655,140114,4662,205856,1290436,131672,211360


In [12]:
miRNA_no_brca = pd.read_pickle("../data/miRNA_no_brca_filtered_scaled.pkl")

In [13]:
miRNA_no_brca.dtypes[miRNA_no_brca.dtypes!='float64']

tcga_id    object
gdc_id     object
dtype: object

In [14]:
miRNA_no_brca.shape

(9215, 1773)