In [1]:
import numpy as np
import pandas as pd

# 1. Import raw data
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code on load. Only use it on .npy files from a trusted source;
# TODO confirm whether pickling is actually required for these arrays.
rna_seq_features = np.load('Data/RNAseq_selected_features.npy', allow_pickle=True)
dna_meth_features = np.load('Data/DNAMethylation_selected_features_indices.npy', allow_pickle=True)

rna_seq_samples = pd.read_csv('Data/cancer_data_BRCA_RNASeq2GeneNorm-20160128.csv', index_col=0)
dna_meth_samples = pd.read_csv('Data/preprocessed_methylation_df.csv', index_col=0)

# 2. Keep only selected features
# RNA-seq features are selected by row label (first column of the loaded array);
# DNA-methylation features are selected by row position.
rna_seq_features = list(rna_seq_features[:, 0])
dna_meth_features = list(dna_meth_features)

rna_seq_samples = rna_seq_samples.loc[rna_seq_features, :]
dna_meth_samples = dna_meth_samples.iloc[dna_meth_features]


# 3. Keep only common samples
def _with_short_barcodes(frame):
    """Return a copy of `frame` whose column labels are cut to their first three
    '-'-separated fields, keeping only the first column for each resulting label.

    Several original columns can collapse onto the same short label. Left in place,
    a later `.loc[:, labels]` would return every matching column, so the two
    matrices would stop lining up column-for-column.
    NOTE(review): "keep first" is an arbitrary tie-break -- confirm which column
    should win when several share a short barcode.
    """
    renamed = frame.copy()
    renamed.columns = ['-'.join(s.split('-')[:3]) for s in renamed.columns]
    return renamed.loc[:, ~renamed.columns.duplicated()]


rna_seq_samples = _with_short_barcodes(rna_seq_samples)
dna_meth_samples = _with_short_barcodes(dna_meth_samples)

# Build the intersection in RNA-seq column order instead of via list(set & set):
# set iteration order for strings changes between interpreter runs, which made the
# exported column order non-reproducible.
dna_meth_sample_set = set(dna_meth_samples.columns)
common_samples = [s for s in rna_seq_samples.columns if s in dna_meth_sample_set]

rna_seq_samples = rna_seq_samples.loc[:, common_samples]
dna_meth_samples = dna_meth_samples.loc[:, common_samples]

# Both matrices must now describe exactly the same samples in the same order.
assert list(rna_seq_samples.columns) == list(dna_meth_samples.columns), \
    "RNA-seq and DNA-methylation sample columns are not aligned"

# 4. Export data
rna_seq_samples.to_csv('Data/RNAseq_final_set.csv')
dna_meth_samples.to_csv('Data/DNAMethylation_final_set.csv')


In [2]:
# Show the RNA-seq frame built in the first cell: one row per selected feature,
# one column per shortened sample barcode (pandas elides the middle rows/columns).
rna_seq_samples

Unnamed: 0,TCGA-BH-A0H0,TCGA-D8-A27N,TCGA-LL-A5YM,TCGA-LL-A7SZ,TCGA-AN-A0XS,TCGA-BH-A0B6,TCGA-D8-A1JA,TCGA-B6-A1KI,TCGA-E9-A1R4,TCGA-BH-A1FD,...,TCGA-A1-A0SN,TCGA-BH-A0E2,TCGA-E2-A1AZ,TCGA-EW-A1OY,TCGA-OL-A6VO,TCGA-E2-A56Z,TCGA-AO-A0JF,TCGA-E9-A1RG,TCGA-AN-A0XV,TCGA-E2-A1LS
CPB1,125.1430,128.4449,885890.7146,0.0000,1.9616,8027.4365,24.1062,11.4437,0.3974,14.2111,...,2.8865,16258.6342,3.7509,2025.8398,0.6716,6812.8945,30.7456,43.9002,175583.6820,233.7337
MGP,1567.6512,3081.0229,6279.4548,7666.8990,18331.4344,7639.0635,53168.4724,216803.1103,7622.7909,3661.5429,...,14933.6113,6209.1837,49977.4944,11639.7933,5663.1968,53055.4554,28596.4643,138027.7264,92822.5244,196973.4735
RPL19,14765.2560,29852.8511,60537.5111,26533.7979,14768.1967,18478.9252,2683.3694,15090.3756,8063.9200,12796.4037,...,11497.1135,17388.5400,9966.2416,35022.2222,18617.8643,23674.4815,12329.3620,30980.5915,16448.3961,17870.8709
CD24,1135.9752,18176.2807,2122.0227,6022.2997,26591.7119,31594.9631,52324.2145,34016.7254,15817.0626,5683.0046,...,4653.6241,34293.5636,11225.8065,8949.3540,68580.2552,13929.2155,17584.9347,934.8429,20882.1478,30300.8008
SCGB2A2,45664.8052,2630.8036,5854.5408,1173.5192,2212.3525,71767.1272,2630.5525,408739.1432,1058.3125,47860.2088,...,72153.3034,655.8085,3.7509,5797.9328,0.0000,72315.5996,10618.3705,36.0444,37817.6430,44531.0310
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SNRPD3,2835.2042,1838.1263,3975.1675,6181.9512,1770.4018,1349.2923,1273.0986,1449.1080,2129.6324,2418.4310,...,1112.1616,2241.5934,814.4186,2392.1085,2754.3284,1396.8350,1293.7356,2721.2431,2019.5676,2121.8569
MAP2K2,843.4367,1984.2754,6340.5936,5101.0453,1971.0763,2589.4187,417.1181,1519.9531,1426.3175,1371.5197,...,2072.4824,2691.1303,646.6617,2129.1990,1726.6622,2528.8548,1310.1460,1782.8096,1201.5342,1008.0080
C2orf18,1675.5702,1760.1589,2673.7995,1634.8432,1974.9995,2092.9213,1657.6381,1916.3732,4060.7725,3226.5081,...,1553.5600,1636.9702,1453.1133,2060.4651,1259.9060,966.6366,1658.3397,1276.3401,1732.9149,1395.3954
CYP27A1,1603.7139,1737.9790,870.7171,2459.2334,419.1275,3670.2645,74.2145,582.1596,892.5910,193.7355,...,1622.5144,625.9812,563.3908,501.2920,1546.3398,968.8909,1155.2652,593.3457,1118.2008,3652.1522


In [3]:
# Show the DNA-methylation frame built in the first cell; its columns are the same
# shortened sample barcodes as the RNA-seq frame.
# NOTE(review): the rendered values contain many exact 0.000000 entries -- possibly
# missing values filled with 0 during upstream preprocessing; confirm before modelling.
dna_meth_samples

Unnamed: 0,TCGA-BH-A0H0,TCGA-D8-A27N,TCGA-LL-A5YM,TCGA-LL-A7SZ,TCGA-AN-A0XS,TCGA-BH-A0B6,TCGA-D8-A1JA,TCGA-B6-A1KI,TCGA-E9-A1R4,TCGA-BH-A1FD,...,TCGA-A1-A0SN,TCGA-BH-A0E2,TCGA-E2-A1AZ,TCGA-EW-A1OY,TCGA-OL-A6VO,TCGA-E2-A56Z,TCGA-AO-A0JF,TCGA-E9-A1RG,TCGA-AN-A0XV,TCGA-E2-A1LS
NXN,0.625618,0.000000,0.801325,0.000000,0.792865,0.771031,0.823434,0.000000,0.902930,0.943024,...,0.912414,0.734681,0.000000,0.944300,0.000000,0.328079,0.695514,0.000000,0.000000,0.000000
BANP,0.913625,0.000000,0.891060,0.000000,0.000000,0.932134,0.931170,0.000000,0.936202,0.000000,...,0.000000,0.516428,0.520174,0.000000,0.000000,0.577548,0.891016,0.934841,0.913465,0.906320
PTPRN2,0.915133,0.000000,0.923741,0.896107,0.894649,0.944777,0.930539,0.000000,0.953458,0.000000,...,0.936901,0.935761,0.000000,0.702129,0.560057,0.922797,0.907006,0.000000,0.916492,0.663665
LOC440839,0.954632,0.934048,0.000000,0.881661,0.906415,0.946045,0.919359,0.428766,0.925509,0.000000,...,0.000000,0.907546,0.928556,0.899696,0.000000,0.932191,0.926661,0.673325,0.400575,0.000000
ITPA,0.904684,0.950256,0.931436,0.910865,0.932702,0.957019,0.874087,0.942341,0.919266,0.000000,...,0.000000,0.000000,0.000000,0.941202,0.920627,0.926096,0.938976,0.000000,0.950283,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RPS6KA2,0.674485,0.691943,0.842481,0.673097,0.620703,0.499460,0.620668,0.294710,0.757263,0.721129,...,0.635435,0.520976,0.159086,0.137904,0.197333,0.163109,0.737490,0.131961,0.838040,0.167781
ZFP42,0.872685,0.593676,0.812660,0.556774,0.248504,0.358983,0.763960,0.309454,0.303336,0.619524,...,0.631436,0.884706,0.773699,0.165929,0.707770,0.936940,0.499224,0.140556,0.750225,0.914694
SPATA7,0.000000,0.884550,0.706127,0.537166,0.608376,0.785792,0.717970,0.810404,0.871897,0.748068,...,0.812433,0.553183,0.832266,0.847575,0.574946,0.492242,0.000000,0.905835,0.638728,0.610759
LHX8,0.909960,0.620328,0.710112,0.625541,0.567445,0.487492,0.021686,0.222240,0.552236,0.717198,...,0.484251,0.709281,0.724084,0.943036,0.504202,0.890285,0.407176,0.731575,0.734013,0.744070
