In [1]:
import os, sys
import numpy as np

# Put the directory two levels above the working directory on sys.path (once),
# ahead of the `packages` import that follows.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

from packages.tcgahandler import LayerDataset

from IPython.display import display
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Turn off pandas' chained-assignment warning for the whole notebook (pandas default is 'warn').
pd.set_option("mode.chained_assignment", None)

# Data root, relative to the notebook's working directory; passed to LayerDataset below.
DATA_DIR = "../../data/"

# TCGA-LGG

In [2]:
# Choose the TCGA project and the omics layer, then build the dataset handler for them.
project, layer = "TCGA-LGG", "mirna"
dataset = LayerDataset(DATA_DIR, project, layer)


In [3]:
# Tally of column-name prefixes in the layer (printed below as a Counter).
col_types = dataset.get_types_of_columns()
# Small preview of the layer (n_rows=5), used only for display.
head = dataset.get_layer(n_rows=5)
# Fetched with an empty column-type string; only its row count is used below.
# NOTE(review): presumably this returns just the miRNA_ID column - confirm in LayerDataset.
ids = dataset.get_layer_by_column_type("")

# The count of the most common column type is reported as the number of patients.
# NOTE(review): this is really the number of sample columns per type - confirm one sample per patient.
print(f"Number of miRNAs: {ids.shape[0]}\nNumber of patients: {col_types.most_common()[0][1]}")
print(f"Types of columns: {col_types}")
display(head)
# Full layer kept in `t`; the NaN check in a later cell reads it.
t = dataset.get_layer()

Number of miRNAs: 1881
Number of patients: 530
Types of columns: Counter({'read_count_': 530, 'reads_per_million_miRNA_mapped_': 530, 'cross-mapped_': 530, 'miRNA_ID': 1})


Unnamed: 0_level_0,read_count_TCGA-WY-A859-01A-12R-A36C-13,reads_per_million_miRNA_mapped_TCGA-WY-A859-01A-12R-A36C-13,cross-mapped_TCGA-WY-A859-01A-12R-A36C-13,read_count_TCGA-DB-5279-01A-03R-1762-13,reads_per_million_miRNA_mapped_TCGA-DB-5279-01A-03R-1762-13,cross-mapped_TCGA-DB-5279-01A-03R-1762-13,read_count_TCGA-HT-7855-01A-11R-2401-13,reads_per_million_miRNA_mapped_TCGA-HT-7855-01A-11R-2401-13,cross-mapped_TCGA-HT-7855-01A-11R-2401-13,read_count_TCGA-DB-A64Q-01A-11R-A29V-13,...,cross-mapped_TCGA-QH-A65V-01A-11R-A29V-13,read_count_TCGA-P5-A5F1-01A-11R-A28I-13,reads_per_million_miRNA_mapped_TCGA-P5-A5F1-01A-11R-A28I-13,cross-mapped_TCGA-P5-A5F1-01A-11R-A28I-13,read_count_TCGA-S9-A6TY-01A-12R-A32L-13,reads_per_million_miRNA_mapped_TCGA-S9-A6TY-01A-12R-A32L-13,cross-mapped_TCGA-S9-A6TY-01A-12R-A32L-13,read_count_TCGA-R8-A6YH-01A-21R-A360-13,reads_per_million_miRNA_mapped_TCGA-R8-A6YH-01A-21R-A360-13,cross-mapped_TCGA-R8-A6YH-01A-21R-A360-13
miRNA_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
hsa-let-7a-1,47988,6603.1396,N,208393,12768.458,N,91156,8017.9585,N,41083,...,N,26378,5024.652,N,59026,5707.1597,N,90792,11082.219,N
hsa-let-7a-2,47732,6567.914,Y,207446,12710.434,Y,91403,8039.684,Y,40972,...,Y,26250,5000.2695,Y,59314,5735.006,Y,90038,10990.185,Y
hsa-let-7a-3,48024,6608.0933,N,209405,12830.464,N,92288,8117.5273,N,41148,...,N,26557,5058.749,N,59462,5749.316,N,91013,11109.194,N
hsa-let-7b,44799,6164.334,N,398927,24442.676,N,100243,8817.238,N,61183,...,N,33391,6360.533,N,80852,7817.4917,N,115569,14106.54,N
hsa-let-7c,49266,6778.992,Y,123244,7551.289,Y,66068,5811.252,Y,27192,...,Y,28290,5388.862,Y,24036,2324.0146,Y,131428,16042.315,Y


Cross-Mapped Analysis

In [4]:
# Per-sample cross-mapping flags ("Y" marks a cross-mapped miRNA in that sample).
cross_df = dataset.get_layer_by_column_type("cross-mapped_")

# Count, for every miRNA, the samples flagged "Y" and rank miRNAs by that count.
tcm = (
    (cross_df == "Y")
    .sum(axis=1)
    .to_frame(name="number of cross mappings")
    .sort_values(by="number of cross mappings", ascending=False)
)

# IDs of miRNAs flagged as cross-mapped in more than 50 samples; dropped in the processing cells.
cross_mapped_remove = tcm.loc[tcm["number of cross mappings"] > 50].index

Check for the presence of NaNs

In [5]:
# `t` is the full layer loaded in the overview cell; prints True if any value in it is NaN.
print(f"Are there any NaNs?\n{t.isna().any().any()}")

Are there any NaNs?
False


Sparsity analysis - identify miRNAs (rows) whose read count is 0 in a high percentage (>= 95%) of samples; they are removed in the processing cells below

In [6]:
counts_df = dataset.get_layer_by_column_type("read_count_")

# Boolean mask of zero read counts, shared by the global and the per-miRNA statistics.
is_zero = counts_df == 0
n_rows, n_cols = counts_df.shape
print("fraction of zeros in whole dataset: ", is_zero.sum().sum() / (n_rows * n_cols))

# Fraction (0-1, despite the printed label) of samples in which each miRNA has zero reads.
zero_count = is_zero.sum(axis="columns") / n_cols
print("\npercentage of 0s by gene:\n", zero_count.sort_values(ascending=False).head(5))

# miRNAs with zero reads in at least 95% of samples; their IDs are dropped in the processing cells.
all_zero = zero_count >= 0.95
index_remove_zero = counts_df.loc[all_zero].index


fraction of zeros in whole dataset:  0.6457514569729068

percentage of 0s by gene:
 miRNA_ID
hsa-mir-7154      1.0
hsa-mir-5186      1.0
hsa-mir-6089-1    1.0
hsa-mir-8052      1.0
hsa-mir-378e      1.0
dtype: float64


Process and save datasets

In [7]:
# Re-read the column-type tally and list the distinct column-name prefixes it contains.
col_types = dataset.get_types_of_columns()
print("Types of columns:\n", list(col_types))

Types of columns:
 ['miRNA_ID', 'read_count_', 'reads_per_million_miRNA_mapped_', 'cross-mapped_']


Raw Counts Dataset

In [8]:
# Build the filtered raw-count matrix and hand it to the dataset handler.
type_of_column = "read_count_"

# miRNAs in rows, samples in columns at this point.
df = dataset.get_layer_by_column_type(type_of_column)
print(df.shape)

# drop miRNAs flagged as cross-mapped in too many samples
df = df.drop(index=cross_mapped_remove)
print(df.shape)

# drop miRNAs with a high fraction of zero counts; errors="ignore" because an ID
# that is also in cross_mapped_remove is already gone and would otherwise raise KeyError
df = df.drop(index=index_remove_zero, errors="ignore")
print(df.shape)

# transpose to have samples in rows and miRNAs in columns
df = df.T
display(df.head())

# NOTE(review): presumably this persists the frame as the project's "counts" data - confirm in LayerDataset.
dataset.set_raw_data(data_type="counts", df=df)

(1881, 530)
(1830, 530)
(1101, 530)


miRNA_ID,hsa-let-7a-1,hsa-let-7a-3,hsa-let-7b,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,hsa-let-7i,hsa-mir-1-1,...,hsa-mir-937,hsa-mir-939,hsa-mir-940,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-96,hsa-mir-98,hsa-mir-99b
TCGA-WY-A859-01A-12R-A36C-13,47988,48024,44799,3858,13864,26333,27072,5127,2328,73,...,3,2,63,7,0,0,138,101,963,157711
TCGA-DB-5279-01A-03R-1762-13,208393,209405,398927,3752,9696,63301,64044,4786,4682,25,...,20,4,105,34,6,2,73,122,974,208684
TCGA-HT-7855-01A-11R-2401-13,91156,92288,100243,4193,13050,16925,17241,5284,2962,5,...,14,3,19,10,0,0,28,40,421,261817
TCGA-DB-A64Q-01A-11R-A29V-13,41083,41148,61183,1182,2172,8377,8653,1120,1625,7,...,15,0,2,1,0,0,22,87,234,42247
TCGA-HT-7604-01A-11R-2089-13,88879,89663,163263,2652,10064,14298,14304,3203,2138,0,...,28,5,30,4,0,0,43,121,410,212061


Reads Per Million (RPM) Dataset

In [9]:
# Build the filtered reads-per-million (RPM) matrix and hand it to the dataset handler.
type_of_column = "reads_per_million_miRNA_mapped_"

# miRNAs in rows, samples in columns at this point.
df = dataset.get_layer_by_column_type(type_of_column)
print(df.shape)

# drop miRNAs flagged as cross-mapped in too many samples
df = df.drop(index=cross_mapped_remove)
print(df.shape)

# drop miRNAs with a high fraction of zero raw counts (not only all-zero ones);
# errors="ignore" because an ID that is also in cross_mapped_remove is already
# gone and would otherwise raise KeyError
df = df.drop(index=index_remove_zero, errors="ignore")
print(df.shape)

# transpose to have samples in rows and miRNAs in columns
df = df.T
display(df.head())

# NOTE(review): presumably this persists the frame as the project's "rpm" data - confirm in LayerDataset.
dataset.set_raw_data(data_type="rpm", df=df)

(1881, 530)
(1830, 530)
(1101, 530)


miRNA_ID,hsa-let-7a-1,hsa-let-7a-3,hsa-let-7b,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,hsa-let-7i,hsa-mir-1-1,...,hsa-mir-937,hsa-mir-939,hsa-mir-940,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-96,hsa-mir-98,hsa-mir-99b
TCGA-WY-A859-01A-12R-A36C-13,6603.1396,6608.0933,6164.334,530.86005,1907.6837,3623.4158,3725.102,705.47424,320.33237,10.044786,...,0.412799,0.2752,8.668788,0.963199,0.0,0.0,18.988773,13.897581,132.50862,21701.004
TCGA-DB-5279-01A-03R-1762-13,12768.458,12830.464,24442.676,229.88898,594.0841,3878.5186,3924.043,293.24323,286.87106,1.531776,...,1.225421,0.245084,6.43346,2.083216,0.367626,0.122542,4.472787,7.475068,59.678,12786.287
TCGA-HT-7855-01A-11R-2401-13,8017.9585,8117.5273,8817.238,368.8106,1147.8604,1488.7001,1516.495,464.7735,260.5335,0.439793,...,1.231421,0.263876,1.671214,0.879586,0.0,0.0,2.462842,3.518346,37.03059,23029.068
TCGA-DB-A64Q-01A-11R-A29V-13,5334.945,5343.3853,7945.085,153.49182,282.05096,1087.8181,1123.6588,145.44064,211.01878,0.909004,...,1.947866,0.0,0.259715,0.129858,0.0,0.0,2.85687,11.297621,30.386705,5486.099
TCGA-HT-7604-01A-11R-2089-13,13372.635,13490.595,24564.37,399.01697,1514.2183,2151.2612,2152.1638,481.9198,321.6811,0.0,...,4.212849,0.752294,4.513767,0.601836,0.0,0.0,6.469732,18.205524,61.688145,31906.46


# TCGA-COAD

In [10]:
# Choose the TCGA project and the omics layer, then build the dataset handler for them.
project, layer = "TCGA-COAD", "mirna"
dataset = LayerDataset(DATA_DIR, project, layer)


In [11]:
# Tally of column-name prefixes in the layer (printed below as a Counter).
col_types = dataset.get_types_of_columns()
# Small preview of the layer (n_rows=5), used only for display.
head = dataset.get_layer(n_rows=5)
# Fetched with an empty column-type string; only its row count is used below.
# NOTE(review): presumably this returns just the miRNA_ID column - confirm in LayerDataset.
ids = dataset.get_layer_by_column_type("")

# The count of the most common column type is reported as the number of patients.
# NOTE(review): this is really the number of sample columns per type - confirm one sample per patient.
print(f"Number of miRNAs: {ids.shape[0]}\nNumber of patients: {col_types.most_common()[0][1]}")
print(f"Types of columns: {col_types}")
display(head)
# Full layer kept in `t`; the NaN check in a later cell reads it.
t = dataset.get_layer()

Number of miRNAs: 1881
Number of patients: 465
Types of columns: Counter({'read_count_': 465, 'reads_per_million_miRNA_mapped_': 465, 'cross-mapped_': 465, 'miRNA_ID': 1})


Unnamed: 0_level_0,read_count_TCGA-A6-6141-01A-11H-1773-13,reads_per_million_miRNA_mapped_TCGA-A6-6141-01A-11H-1773-13,cross-mapped_TCGA-A6-6141-01A-11H-1773-13,read_count_TCGA-CM-6678-01A-11H-1838-13,reads_per_million_miRNA_mapped_TCGA-CM-6678-01A-11H-1838-13,cross-mapped_TCGA-CM-6678-01A-11H-1838-13,read_count_TCGA-D5-6922-01A-11H-1927-13,reads_per_million_miRNA_mapped_TCGA-D5-6922-01A-11H-1927-13,cross-mapped_TCGA-D5-6922-01A-11H-1927-13,read_count_TCGA-CM-5341-01A-01T-1409-13,...,cross-mapped_TCGA-A6-6649-01A-11H-1773-13,read_count_TCGA-AA-3494-01A-01T-1409-13,reads_per_million_miRNA_mapped_TCGA-AA-3494-01A-01T-1409-13,cross-mapped_TCGA-AA-3494-01A-01T-1409-13,read_count_TCGA-5M-AATA-01A-31H-A41D-13,reads_per_million_miRNA_mapped_TCGA-5M-AATA-01A-31H-A41D-13,cross-mapped_TCGA-5M-AATA-01A-31H-A41D-13,read_count_TCGA-5M-AAT5-01A-21H-A41D-13,reads_per_million_miRNA_mapped_TCGA-5M-AAT5-01A-21H-A41D-13,cross-mapped_TCGA-5M-AAT5-01A-21H-A41D-13
miRNA_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
hsa-let-7a-1,42021,8316.662,N,42764,8307.485,N,54512,8824.276,N,26848,...,N,9169,5031.5864,N,18795,6052.144,N,12225,5043.9473,N
hsa-let-7a-2,41969,8306.37,Y,42359,8228.809,N,54342,8796.757,Y,26558,...,Y,9203,5050.2446,N,18469,5947.1694,N,12239,5049.7236,N
hsa-let-7a-3,41950,8302.61,N,42462,8248.817,N,54331,8794.977,N,27098,...,N,9314,5111.1567,N,18927,6094.649,N,12406,5118.6265,N
hsa-let-7b,53873,10662.372,N,43892,8526.614,N,49460,8006.4697,N,39949,...,N,14815,8129.8896,N,23221,7477.3525,N,11576,4776.175,N
hsa-let-7c,7408,1466.1677,Y,2701,524.70575,N,5093,824.443,Y,2603,...,Y,279,153.10423,N,1074,345.8368,N,516,212.8979,N


Cross-Mapped Analysis

In [12]:
# Per-sample cross-mapping flags ("Y" marks a cross-mapped miRNA in that sample).
cross_df = dataset.get_layer_by_column_type("cross-mapped_")

# Count, for every miRNA, the samples flagged "Y" and rank miRNAs by that count.
tcm = (
    (cross_df == "Y")
    .sum(axis=1)
    .to_frame(name="number of cross mappings")
    .sort_values(by="number of cross mappings", ascending=False)
)

# IDs of miRNAs flagged as cross-mapped in more than 50 samples; dropped in the processing cells.
cross_mapped_remove = tcm.loc[tcm["number of cross mappings"] > 50].index

Check for the presence of NaNs

In [13]:
# `t` is the full layer loaded in the overview cell; prints True if any value in it is NaN.
print(f"Are there any NaNs?\n{t.isna().any().any()}")

Are there any NaNs?
False


Sparsity analysis - identify miRNAs (rows) whose read count is 0 in a high percentage (>= 95%) of samples; they are removed in the processing cells below

In [14]:
counts_df = dataset.get_layer_by_column_type("read_count_")

# Boolean mask of zero read counts, shared by the global and the per-miRNA statistics.
is_zero = counts_df == 0
n_rows, n_cols = counts_df.shape
print("fraction of zeros in whole dataset: ", is_zero.sum().sum() / (n_rows * n_cols))

# Fraction (0-1, despite the printed label) of samples in which each miRNA has zero reads.
zero_count = is_zero.sum(axis="columns") / n_cols
print("\npercentage of 0s by gene:\n", zero_count.sort_values(ascending=False).head(5))

# miRNAs with zero reads in at least 95% of samples; their IDs are dropped in the processing cells.
all_zero = zero_count >= 0.95
index_remove_zero = counts_df.loc[all_zero].index


fraction of zeros in whole dataset:  0.6905180840664712

percentage of 0s by gene:
 miRNA_ID
hsa-mir-4500    1.0
hsa-mir-8057    1.0
hsa-mir-8053    1.0
hsa-mir-8054    1.0
hsa-mir-4255    1.0
dtype: float64


Process and save datasets

In [15]:
# Re-read the column-type tally and list the distinct column-name prefixes it contains.
col_types = dataset.get_types_of_columns()
print("Types of columns:\n", list(col_types))

Types of columns:
 ['miRNA_ID', 'read_count_', 'reads_per_million_miRNA_mapped_', 'cross-mapped_']


Raw Counts Dataset

In [16]:
# Build the filtered raw-count matrix and hand it to the dataset handler.
type_of_column = "read_count_"

# miRNAs in rows, samples in columns at this point.
df = dataset.get_layer_by_column_type(type_of_column)
print(df.shape)

# drop miRNAs flagged as cross-mapped in too many samples
df = df.drop(index=cross_mapped_remove)
print(df.shape)

# drop miRNAs with a high fraction of zero counts; errors="ignore" because an ID
# that is also in cross_mapped_remove is already gone and would otherwise raise KeyError
df = df.drop(index=index_remove_zero, errors="ignore")
print(df.shape)

# transpose to have samples in rows and miRNAs in columns
df = df.T
display(df.head())

# NOTE(review): presumably this persists the frame as the project's "counts" data - confirm in LayerDataset.
dataset.set_raw_data(data_type="counts", df=df)

(1881, 465)
(1833, 465)
(1061, 465)


miRNA_ID,hsa-let-7a-1,hsa-let-7a-3,hsa-let-7b,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,hsa-let-7i,hsa-mir-1-1,...,hsa-mir-938,hsa-mir-939,hsa-mir-940,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-96,hsa-mir-98,hsa-mir-99b
TCGA-A6-6141-01A-11H-1773-13,42021,41950,53873,4111,3866,25845,26164,4861,3546,94,...,0,10,29,46,0,4,64,89,260,69395
TCGA-CM-6678-01A-11H-1838-13,42764,42462,43892,1713,5150,43202,43482,3632,2190,142,...,0,6,12,27,0,2,102,127,271,60982
TCGA-D5-6922-01A-11H-1927-13,54512,54331,49460,2933,8286,32702,33070,2904,1242,413,...,0,10,8,84,0,1,51,43,183,77394
TCGA-CM-5341-01A-01T-1409-13,26848,27098,39949,1758,1285,10873,11075,1937,1912,15,...,0,4,3,12,0,0,52,61,223,42602
TCGA-DM-A0X9-01A-11H-A154-13,27128,26995,19479,2305,1393,21524,21718,3205,1211,44,...,0,6,15,95,0,0,68,48,229,30600


Reads Per Million (RPM) Dataset

In [17]:
# Build the filtered reads-per-million (RPM) matrix and hand it to the dataset handler.
type_of_column = "reads_per_million_miRNA_mapped_"

# miRNAs in rows, samples in columns at this point.
df = dataset.get_layer_by_column_type(type_of_column)
print(df.shape)

# drop miRNAs flagged as cross-mapped in too many samples
df = df.drop(index=cross_mapped_remove)
print(df.shape)

# drop miRNAs with a high fraction of zero raw counts (not only all-zero ones);
# errors="ignore" because an ID that is also in cross_mapped_remove is already
# gone and would otherwise raise KeyError
df = df.drop(index=index_remove_zero, errors="ignore")
print(df.shape)

# transpose to have samples in rows and miRNAs in columns
df = df.T
display(df.head())

# NOTE(review): presumably this persists the frame as the project's "rpm" data - confirm in LayerDataset.
dataset.set_raw_data(data_type="rpm", df=df)

(1881, 465)
(1833, 465)
(1061, 465)


miRNA_ID,hsa-let-7a-1,hsa-let-7a-3,hsa-let-7b,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,hsa-let-7i,hsa-mir-1-1,...,hsa-mir-938,hsa-mir-939,hsa-mir-940,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-96,hsa-mir-98,hsa-mir-99b
TCGA-A6-6141-01A-11H-1773-13,8316.662,8302.61,10662.372,813.636,765.14636,5115.1597,5178.2954,962.0736,701.813,18.60418,...,0.0,1.979168,5.739587,9.104173,0.0,0.791667,12.666676,17.614595,51.45837,13734.437
TCGA-CM-6678-01A-11H-1838-13,8307.485,8248.817,8526.614,332.7734,1000.4571,8392.572,8446.966,705.56506,425.4371,27.585419,...,0.0,1.165581,2.331162,5.245115,0.0,0.388527,19.814878,24.671467,52.645412,11846.578
TCGA-D5-6922-01A-11H-1927-13,8824.276,8794.977,8006.4697,474.78723,1341.3185,5293.7236,5353.295,470.09277,201.05208,66.855484,...,0.0,1.618777,1.295021,13.597725,0.0,0.161878,8.255762,6.96074,29.623615,12528.361
TCGA-CM-5341-01A-01T-1409-13,8310.05,8387.43,12365.099,544.1399,397.7359,3365.4338,3427.9573,599.5443,591.8063,4.642832,...,0.0,1.238088,0.928566,3.714265,0.0,0.0,16.09515,18.88085,69.02343,13186.261
TCGA-DM-A0X9-01A-11H-A154-13,8591.553,8549.432,6169.082,730.0033,441.169,6816.742,6878.183,1015.0371,383.52884,13.934987,...,0.0,1.900225,4.750564,30.086905,0.0,0.0,21.535889,15.201804,72.525276,9691.15
