In [1]:
import os, sys
import numpy as np

# Put the directory two levels above the working directory on the import path
# (presumably so the project-local `packages` import resolves -- confirm).
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from packages.tcgahandler import LayerDataset

import re
import math
from IPython.display import display
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Turn off pandas' chained-assignment check (its default setting is 'warn').
pd.set_option("mode.chained_assignment", None)

# Data folder (relative path) and the layer name handed to LayerDataset below.
DATA_DIR = "../../data/"
layer = "mrna"

# TCGA-LGG

In [2]:
# Select the TCGA project to process and build its dataset handle for the
# configured layer.
# NOTE(review): LayerDataset comes from packages.tcgahandler; presumably it
# locates the project's files under DATA_DIR -- confirm against its implementation.
project = "TCGA-LGG"
dataset = LayerDataset(DATA_DIR, project, layer)

In [3]:
# Overview of the loaded layer: counts per column type, a 5-row preview, and
# the frame returned for the empty column-type prefix.
col_types = dataset.get_types_of_columns()
head = dataset.get_layer(n_rows=5)
ids = dataset.get_layer_by_column_type("")

# The mRNA layer is indexed by gene_id, so its rows are genes (the previous
# "miRNAs" label was wrong for this layer). Every value column is one TCGA
# sample barcode, and one patient can contribute several samples, so the
# column count is reported as samples rather than patients.
# NOTE(review): the row count is taken as-is; it may include non-gene summary
# rows that later cells remove with dropna() -- confirm.
print(f"Number of genes: {ids.shape[0]}\nNumber of samples: {col_types.most_common()[0][1]}")
print(f"Types of columns: {col_types}")
display(head)

Number of miRNAs: 60664
Number of patients: 534
Types of columns: Counter({'unstranded_': 534, 'stranded_first_': 534, 'stranded_second_': 534, 'tpm_unstranded_': 534, 'fpkm_unstranded_': 534, 'fpkm_uq_unstranded_': 534, 'gene_id': 1, 'gene_name': 1, 'gene_type': 1})


Unnamed: 0_level_0,gene_name,gene_type,unstranded_TCGA-WY-A859-01A-12R-A36H-07,unstranded_TCGA-QH-A86X-01A-11R-A36H-07,unstranded_TCGA-DU-6402-01A-11R-1708-07,unstranded_TCGA-HT-7611-01A-11R-2403-07,unstranded_TCGA-DB-A4XA-01A-11R-A26U-07,unstranded_TCGA-P5-A5EY-01A-11R-A27Q-07,unstranded_TCGA-HW-8320-01A-11R-2404-07,unstranded_TCGA-DU-6392-01A-11R-1708-07,...,unstranded_TCGA-S9-A7J1-01A-21R-A34R-07,unstranded_TCGA-S9-A7J2-01A-11R-A34F-07,unstranded_TCGA-S9-A89Z-01A-11R-A36H-07,unstranded_TCGA-TM-A7CF-01A-11R-A32Q-07,unstranded_TCGA-TM-A7CF-02A-11R-A32Q-07,unstranded_TCGA-TM-A84B-01A-11R-A36H-07,unstranded_TCGA-TQ-A7RP-01A-21R-A34F-07,unstranded_TCGA-VM-A8CD-01A-11R-A36H-07,unstranded_TCGA-WY-A85A-01A-21R-A36H-07,unstranded_TCGA-WY-A85C-01A-11R-A36H-07
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003.15,TSPAN6,protein_coding,2884,4390,4926,6025,1764,2377,4267,3274,...,2623,3236,3505,1695,2153,7529,1801,2041,2412,4629
ENSG00000000005.6,TNMD,protein_coding,12,14,4,8,5,8,1,3,...,5,15,4,6,11,4,11,3,3,17
ENSG00000000419.13,DPM1,protein_coding,1489,1472,1435,1272,739,649,1200,811,...,708,976,1052,714,846,1356,1134,1274,758,1174
ENSG00000000457.14,SCYL3,protein_coding,634,886,474,789,379,331,786,1051,...,567,602,390,369,431,833,515,622,515,733
ENSG00000000460.17,C1orf112,protein_coding,214,223,347,321,96,79,332,326,...,154,184,218,96,183,542,191,219,186,229


Check for the presence of NaNs

In [4]:
# Raw-count matrix without the two annotation columns.
t = (
    dataset.get_layer_by_column_type('unstranded_')
    .drop(columns=["gene_name", "gene_type"])
)

# True as soon as a single cell of the matrix is missing.
has_nan = t.isna().any().any()
print(f"Are there any NaNs?\n{has_nan}")

Are there any NaNs?
False


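Sparsity analysis - report the overall fraction of 0s and identify the genes (rows) whose counts are 0 in every sample (100% zeros); these genes are removed in the processing steps below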

In [5]:
# Boolean mask of the zero entries in the count matrix.
zero_mask = t == 0
n_cols = t.shape[1]

# Share of zero cells over the whole matrix.
print("fraction of zeros in whole dataset: ", zero_mask.sum().sum() / t.size)

# Per-row share of zero cells (despite the name, this is a fraction, not a count).
zero_count = zero_mask.sum(axis=1) / n_cols
print(zero_count.sort_values(ascending=False).head(5))

# Row labels whose values are zero in every column; later cells drop them.
all_zero = zero_count == 1
index_remove_zero = t.loc[all_zero].index

fraction of zeros in whole dataset:  0.44243088719543666
gene_id
ENSG00000199744.1          1.0
ENSG00000277108.1          1.0
ENSG00000235521.1          1.0
ENSG00000277119.1          1.0
ENSG00000277120.5_PAR_Y    1.0
dtype: float64


Process and save datasets

In [6]:
# Re-read the column-type counter and show only its distinct keys.
col_types = dataset.get_types_of_columns()
type_names = list(col_types)
print("Types of columns:\n", type_names)

Types of columns:
 ['gene_id', 'gene_name', 'gene_type', 'unstranded_', 'stranded_first_', 'stranded_second_', 'tpm_unstranded_', 'fpkm_unstranded_', 'fpkm_uq_unstranded_']


Raw Counts Dataset

In [7]:
type_of_column = "unstranded_"

# Fetch the raw-count columns (rows are gene_id entries, plus the
# gene_name / gene_type annotation columns).
df = dataset.get_layer_by_column_type(type_of_column)
print(df.shape)

# Drop every row that contains a NaN in any column (4 rows in the recorded run).
df = df.dropna()
print(df.shape)

# Drop the annotation columns so only per-sample values remain.
df = df.drop(columns=["gene_name", "gene_type"])
print(df.shape)

# Drop genes whose raw counts are zero in every sample. `index_remove_zero`
# was computed on the frame *before* dropna(), so it may name rows that were
# already removed above; errors="ignore" prevents a KeyError in that case.
df = df.drop(index=index_remove_zero, errors="ignore")
print(df.shape)

# Transpose: samples become rows and genes become columns.
df = df.T

display(df.head())
# NOTE(review): presumably stores/persists the processed counts matrix --
# confirm against LayerDataset.set_raw_data.
dataset.set_raw_data(data_type="counts", df=df)

(60664, 536)
(60660, 536)
(60660, 534)
(57582, 534)


gene_id,ENSG00000000003.15,ENSG00000000005.6,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,...,ENSG00000288658.1,ENSG00000288659.1,ENSG00000288660.1,ENSG00000288662.1,ENSG00000288663.1,ENSG00000288667.1,ENSG00000288669.1,ENSG00000288670.1,ENSG00000288674.1,ENSG00000288675.1
TCGA-WY-A859-01A-12R-A36H-07,2884,12,1489,634,214,301,277,411,3686,1331,...,383,2,1,0,48,0,0,1050,15,94
TCGA-QH-A86X-01A-11R-A36H-07,4390,14,1472,886,223,539,379,1067,8869,3116,...,37,0,1,1,84,0,0,1188,7,87
TCGA-DU-6402-01A-11R-1708-07,4926,4,1435,474,347,1093,2941,4823,3108,1251,...,6,0,0,0,27,0,0,398,6,51
TCGA-HT-7611-01A-11R-2403-07,6025,8,1272,789,321,262,513,342,7912,2051,...,17,0,0,0,18,0,0,851,4,50
TCGA-DB-A4XA-01A-11R-A26U-07,1764,5,739,379,96,468,300,371,1376,1184,...,21,0,0,0,27,0,0,707,6,96


Transcripts Per Million (TPM) Dataset

In [8]:
type_of_column = "tpm_unstranded_"

# Fetch the TPM columns (rows are gene_id entries, plus the
# gene_name / gene_type annotation columns).
df = dataset.get_layer_by_column_type(type_of_column)
print(df.shape)

# Drop every row that contains a NaN in any column (4 rows in the recorded run).
df = df.dropna()
print(df.shape)

# Drop the annotation columns so only per-sample values remain.
df = df.drop(columns=["gene_name", "gene_type"])
print(df.shape)

# Drop the genes flagged as all-zero in the raw counts, so the TPM matrix
# keeps the same genes as the counts matrix. `index_remove_zero` was computed
# on the frame *before* dropna(), so it may name rows that were already
# removed above; errors="ignore" prevents a KeyError in that case.
df = df.drop(index=index_remove_zero, errors="ignore")
print(df.shape)

# Transpose: samples become rows and genes become columns.
df = df.T

display(df.head())
# NOTE(review): presumably stores/persists the processed TPM matrix --
# confirm against LayerDataset.set_raw_data.
dataset.set_raw_data(data_type="tpm", df=df)

(60664, 536)
(60660, 536)
(60660, 534)
(57582, 534)


gene_id,ENSG00000000003.15,ENSG00000000005.6,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,...,ENSG00000288658.1,ENSG00000288659.1,ENSG00000288660.1,ENSG00000288662.1,ENSG00000288663.1,ENSG00000288667.1,ENSG00000288669.1,ENSG00000288670.1,ENSG00000288674.1,ENSG00000288675.1
TCGA-WY-A859-01A-12R-A36H-07,34.0816,0.4358,66.1279,4.9375,1.9215,4.7708,1.8607,7.807,22.9269,18.7213,...,6.9196,0.1829,0.0737,0.0,0.4515,0.0,0.0,31.1479,0.0836,2.9993
TCGA-QH-A86X-01A-11R-A36H-07,44.4717,0.4358,56.0393,5.9149,1.7164,7.3233,2.1824,17.374,47.289,37.5708,...,0.573,0.0,0.0632,0.7658,0.6773,0.0,0.0,30.21,0.0334,2.3796
TCGA-DU-6402-01A-11R-1708-07,63.517,0.1585,69.5365,4.0278,3.3996,18.9023,21.5556,99.9606,21.0932,19.1994,...,0.1183,0.0,0.0,0.0,0.2771,0.0,0.0,12.8823,0.0365,1.7755
TCGA-HT-7611-01A-11R-2403-07,80.4729,0.3284,63.8477,6.9449,3.2576,4.6935,3.8948,7.3423,55.6218,32.6056,...,0.3471,0.0,0.0,0.0,0.1914,0.0,0.0,28.5323,0.0252,1.8031
TCGA-DB-A4XA-01A-11R-A26U-07,31.015,0.2702,48.8297,4.3915,1.2825,11.0362,2.9982,10.4849,12.7338,24.7776,...,0.5645,0.0,0.0,0.0,0.3778,0.0,0.0,31.2038,0.0497,4.5573


# TCGA-COAD

In [9]:
# Switch to the TCGA-COAD project; `project` and `dataset` are rebound, so the
# cells that follow operate on COAD instead of the previous project.
# NOTE(review): LayerDataset comes from packages.tcgahandler; presumably it
# locates the project's files under DATA_DIR -- confirm against its implementation.
project = "TCGA-COAD"
dataset = LayerDataset(DATA_DIR, project, layer)

In [10]:
# Overview of the loaded layer: counts per column type, a 5-row preview, and
# the frame returned for the empty column-type prefix.
col_types = dataset.get_types_of_columns()
head = dataset.get_layer(n_rows=5)
ids = dataset.get_layer_by_column_type("")

# The mRNA layer is indexed by gene_id, so its rows are genes (the previous
# "miRNAs" label was wrong for this layer). Every value column is one TCGA
# sample barcode, and one patient can contribute several samples, so the
# column count is reported as samples rather than patients.
# NOTE(review): the row count is taken as-is; it may include non-gene summary
# rows that later cells remove with dropna() -- confirm.
print(f"Number of genes: {ids.shape[0]}\nNumber of samples: {col_types.most_common()[0][1]}")
print(f"Types of columns: {col_types}")
display(head)

Number of miRNAs: 60664
Number of patients: 524
Types of columns: Counter({'unstranded_': 524, 'stranded_first_': 524, 'stranded_second_': 524, 'tpm_unstranded_': 524, 'fpkm_unstranded_': 524, 'fpkm_uq_unstranded_': 524, 'gene_id': 1, 'gene_name': 1, 'gene_type': 1})


Unnamed: 0_level_0,gene_name,gene_type,unstranded_TCGA-A6-6141-01A-11R-1774-07,unstranded_TCGA-G4-6309-01A-21R-1839-07,unstranded_TCGA-AZ-4682-01B-01R-A32Z-07,unstranded_TCGA-QL-A97D-01A-12R-A41B-07,unstranded_TCGA-AD-5900-01A-11R-1653-07,unstranded_TCGA-AD-6963-01A-11R-1928-07,unstranded_TCGA-DM-A1D4-01A-21R-A155-07,unstranded_TCGA-CK-4948-01B-11R-1653-07,...,unstranded_TCGA-DM-A1D8-01A-11R-A155-07,unstranded_TCGA-DM-A1HB-01A-21R-A180-07,unstranded_TCGA-DM-A28F-01A-11R-A32Y-07,unstranded_TCGA-F4-6855-01A-11R-1928-07,unstranded_TCGA-G4-6306-01A-11R-1774-07,unstranded_TCGA-G4-6314-01A-11R-1723-07,unstranded_TCGA-G4-6586-01A-11R-1774-07,unstranded_TCGA-NH-A8F7-01A-11R-A41B-07,unstranded_TCGA-NH-A8F7-06A-31R-A41B-07,unstranded_TCGA-QG-A5Z2-01A-11R-A28H-07
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003.15,TSPAN6,protein_coding,4442,1033,13510,5351,1791,7331,6596,9680,...,10358,2446,19176,5400,6536,8172,2405,10413,15299,3559
ENSG00000000005.6,TNMD,protein_coding,22,2,326,181,0,10,36,270,...,1,0,291,16,29,36,3,91,26,30
ENSG00000000419.13,DPM1,protein_coding,1016,1500,6092,3549,1469,6461,3717,3640,...,4143,2394,4622,1412,2653,2255,1462,3815,5139,1902
ENSG00000000457.14,SCYL3,protein_coding,540,329,813,666,450,839,953,825,...,881,672,951,712,202,594,439,647,614,1131
ENSG00000000460.17,C1orf112,protein_coding,218,399,883,580,276,751,564,719,...,570,368,858,291,115,504,288,362,561,504


Check for the presence of NaNs

In [11]:
# Raw-count matrix without the two annotation columns.
t = (
    dataset.get_layer_by_column_type('unstranded_')
    .drop(columns=["gene_name", "gene_type"])
)

# True as soon as a single cell of the matrix is missing.
has_nan = t.isna().any().any()
print(f"Are there any NaNs?\n{has_nan}")

Are there any NaNs?
False


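Sparsity analysis - report the overall fraction of 0s and identify the genes (rows) whose counts are 0 in every sample (100% zeros); these genes are removed in the processing steps below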

In [12]:
# Boolean mask of the zero entries in the count matrix.
zero_mask = t == 0
n_cols = t.shape[1]

# Share of zero cells over the whole matrix.
print("fraction of zeros in whole dataset: ", zero_mask.sum().sum() / t.size)

# Per-row share of zero cells (despite the name, this is a fraction, not a count).
zero_count = zero_mask.sum(axis=1) / n_cols
print(zero_count.sort_values(ascending=False).head(5))

# Row labels whose values are zero in every column; later cells drop them.
all_zero = zero_count == 1
index_remove_zero = t.loc[all_zero].index

fraction of zeros in whole dataset:  0.5083016399680684
gene_id
ENSG00000252656.1    1.0
ENSG00000281426.1    1.0
ENSG00000272821.1    1.0
ENSG00000272139.1    1.0
ENSG00000223362.1    1.0
dtype: float64


Process and save datasets

In [13]:
# Re-read the column-type counter and show only its distinct keys.
col_types = dataset.get_types_of_columns()
type_names = list(col_types)
print("Types of columns:\n", type_names)

Types of columns:
 ['gene_id', 'gene_name', 'gene_type', 'unstranded_', 'stranded_first_', 'stranded_second_', 'tpm_unstranded_', 'fpkm_unstranded_', 'fpkm_uq_unstranded_']


Raw Counts Dataset

In [14]:
type_of_column = "unstranded_"

# Fetch the raw-count columns (rows are gene_id entries, plus the
# gene_name / gene_type annotation columns).
df = dataset.get_layer_by_column_type(type_of_column)
print(df.shape)

# Drop every row that contains a NaN in any column (4 rows in the recorded run).
df = df.dropna()
print(df.shape)

# Drop the annotation columns so only per-sample values remain.
df = df.drop(columns=["gene_name", "gene_type"])
print(df.shape)

# Drop genes whose raw counts are zero in every sample. `index_remove_zero`
# was computed on the frame *before* dropna(), so it may name rows that were
# already removed above; errors="ignore" prevents a KeyError in that case.
df = df.drop(index=index_remove_zero, errors="ignore")
print(df.shape)

# Transpose: samples become rows and genes become columns.
df = df.T

display(df.head())
# NOTE(review): presumably stores/persists the processed counts matrix --
# confirm against LayerDataset.set_raw_data.
dataset.set_raw_data(data_type="counts", df=df)

(60664, 526)
(60660, 526)
(60660, 524)
(57622, 524)


gene_id,ENSG00000000003.15,ENSG00000000005.6,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,...,ENSG00000288658.1,ENSG00000288659.1,ENSG00000288660.1,ENSG00000288662.1,ENSG00000288663.1,ENSG00000288667.1,ENSG00000288669.1,ENSG00000288670.1,ENSG00000288674.1,ENSG00000288675.1
TCGA-A6-6141-01A-11R-1774-07,4442,22,1016,540,218,152,383,2108,1335,896,...,7,0,2,0,15,0,0,89,8,18
TCGA-G4-6309-01A-21R-1839-07,1033,2,1500,329,399,91,278,4419,1592,1156,...,1,0,1,0,18,0,0,107,1,13
TCGA-AZ-4682-01B-01R-A32Z-07,13510,326,6092,813,883,114,594,6515,3346,1739,...,4,0,4,0,20,0,0,206,6,16
TCGA-QL-A97D-01A-12R-A41B-07,5351,181,3549,666,580,131,494,1520,1859,1888,...,1,0,2,0,46,0,0,240,4,12
TCGA-AD-5900-01A-11R-1653-07,1791,0,1469,450,276,426,956,1914,1728,968,...,6,0,0,0,9,0,0,169,4,14


Transcripts Per Million (TPM) Dataset

In [15]:
type_of_column = "tpm_unstranded_"

# Fetch the TPM columns (rows are gene_id entries, plus the
# gene_name / gene_type annotation columns).
df = dataset.get_layer_by_column_type(type_of_column)
print(df.shape)

# Drop every row that contains a NaN in any column (4 rows in the recorded run).
df = df.dropna()
print(df.shape)

# Drop the annotation columns so only per-sample values remain.
df = df.drop(columns=["gene_name", "gene_type"])
print(df.shape)

# Drop the genes flagged as all-zero in the raw counts, so the TPM matrix
# keeps the same genes as the counts matrix. `index_remove_zero` was computed
# on the frame *before* dropna(), so it may name rows that were already
# removed above; errors="ignore" prevents a KeyError in that case.
df = df.drop(index=index_remove_zero, errors="ignore")
print(df.shape)

# Transpose: samples become rows and genes become columns.
df = df.T

display(df.head())
# NOTE(review): presumably stores/persists the processed TPM matrix --
# confirm against LayerDataset.set_raw_data.
dataset.set_raw_data(data_type="tpm", df=df)

(60664, 526)
(60660, 526)
(60660, 524)
(57622, 524)


gene_id,ENSG00000000003.15,ENSG00000000005.6,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,...,ENSG00000288658.1,ENSG00000288659.1,ENSG00000288660.1,ENSG00000288662.1,ENSG00000288663.1,ENSG00000288667.1,ENSG00000288669.1,ENSG00000288670.1,ENSG00000288674.1,ENSG00000288675.1
TCGA-A6-6141-01A-11R-1774-07,77.1112,1.1737,66.2824,6.1777,2.8754,3.539,3.7793,58.8201,12.1979,18.5132,...,0.1858,0.0,0.2166,0.0,0.2073,0.0,0.0,3.8783,0.0655,0.8437
TCGA-G4-6309-01A-21R-1839-07,21.272,0.1266,116.0821,4.4648,6.2428,2.5133,3.254,146.2676,17.2551,28.3335,...,0.0315,0.0,0.1285,0.0,0.295,0.0,0.0,5.531,0.0097,0.7228
TCGA-AZ-4682-01B-01R-A32Z-07,165.5622,12.2775,280.5638,6.5659,8.2218,1.8737,4.1377,128.3324,21.5823,25.3653,...,0.0749,0.0,0.3058,0.0,0.1951,0.0,0.0,6.3371,0.0347,0.5294
TCGA-QL-A97D-01A-12R-A41B-07,79.2385,8.237,197.5027,6.4994,6.5257,2.6018,4.1581,36.1794,14.4893,33.2765,...,0.0226,0.0,0.1848,0.0,0.5422,0.0,0.0,8.9213,0.0279,0.4798
TCGA-AD-5900-01A-11R-1653-07,34.0663,0.0,105.0067,5.6408,3.9888,10.8677,10.3361,58.5177,17.2997,21.9149,...,0.1745,0.0,0.0,0.0,0.1363,0.0,0.0,8.0692,0.0359,0.719
