In [1]:
import os, sys
import numpy as np

# Make the directory two levels above the working directory importable, so that
# the local `packages` package (imported right below) can be resolved.
module_path = os.path.abspath('../..')
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

from packages.tcgahandler import LayerDataset

from IPython.display import display
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Disable pandas' chained-assignment check for the whole notebook.
# NOTE(review): this silences the warning globally and can hide genuine
# copy-vs-view bugs; confirm it is still needed and prefer `.loc` assignments.
pd.options.mode.chained_assignment = None  # default='warn'

# Relative path to the data directory; passed as the first argument to LayerDataset.
DATA_DIR = "../../data/"

# TCGA-LGG

In [2]:
# Pick the TCGA project and the data layer, then open the corresponding dataset.
project, layer = "TCGA-LGG", "protein"
dataset = LayerDataset(DATA_DIR, project, layer)

In [3]:
# Overview of the layer: how many columns of each type there are, a 5-row
# preview, and the table restricted to the '' (sample) column type.
col_types = dataset.get_types_of_columns()
head = dataset.get_layer(n_rows=5)
ids = dataset.get_layer_by_column_type("")

# Sample columns are the ones typed '' (the same key used to build `ids`).
# Look the count up by that key instead of relying on it being the most common
# type, which would silently report the wrong number for a tiny cohort.
print(f"Number of proteins: {ids.shape[0]}\nNumber of patients: {col_types['']}")
print(f"Types of columns: {col_types}")
display(head)

Number of proteins: 487
Number of patients: 435
Types of columns: Counter({'': 435, 'AGID': 1, 'lab_id': 1, 'catalog_number': 1, 'set_id': 1, 'peptide_target': 1})


Unnamed: 0_level_0,lab_id,catalog_number,set_id,peptide_target,TCGA-DB-5279-01A,TCGA-HT-7855-01A,TCGA-QH-A6CW-01A,TCGA-DB-A64Q-01A,TCGA-HT-7604-01A,TCGA-FG-7637-01A,...,TCGA-HT-7880-01A,TCGA-QH-A6CX-01A,TCGA-F6-A8O4-01A,TCGA-HT-7470-01A,TCGA-TQ-A7RG-01A,TCGA-P5-A731-01A,TCGA-QH-A65V-01A,TCGA-P5-A5F1-01A,TCGA-S9-A6TY-01A,TCGA-R8-A6YH-01A
AGID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AGID00100,882,sc-628,Old,1433BETA,0.063553,0.097345,0.3255,-0.095779,-0.041091,0.27307,...,0.32448,0.092804,0.62854,0.19147,0.055679,0.50066,0.48557,0.3011,0.23886,0.23749
AGID00111,913,sc-23957,Old,1433EPSILON,0.28431,0.73337,0.095182,0.11432,0.12982,0.34437,...,0.25907,0.077194,0.2288,0.26427,0.064463,0.28235,0.21431,0.46626,0.44848,0.10152
AGID00101,883,sc-1019,Old,1433ZETA,-0.40822,-0.54587,0.04258,-0.34079,-0.1315,-0.22034,...,-0.020128,-0.40286,-0.12014,0.1951,-0.4396,-0.29249,-0.30212,0.26825,0.54965,-0.41721
AGID00001,2,9452,Old,4EBP1,-0.68649,-0.46193,-0.47662,-0.53777,0.024208,-0.24296,...,-0.94528,-0.72787,-0.36614,-0.84887,-0.67266,-0.62856,-0.63387,-0.12012,-0.30425,-0.52234
AGID00002,3,9456,Old,4EBP1_pS65,-0.25424,-0.28065,-0.19106,0.014253,-0.13592,0.17197,...,0.16286,-0.3867,-0.49195,0.34179,-0.18518,-0.12082,-0.011695,-0.18069,0.09742,-0.25235


Check for the presence of NaNs

In [4]:
# Build the numeric matrix used by the checks below: sample columns only,
# with the four annotation columns removed.
t = (
    dataset
    .get_layer_by_column_type('')
    .drop(columns=["lab_id", "catalog_number","set_id", "peptide_target"])
)

# First .any() reduces per column, the second reduces to a single flag.
print(f"Are there any NaNs?\n{t.isna().any().any()}")

Are there any NaNs?
True


Sparsity analysis - identify and remove columns with a high percentage (>95%) of zeros

In [5]:
# Mask of exact zeros, computed once. NaN cells compare unequal to 0, so they
# count as non-zero while still contributing to the denominators below.
zero_mask = (t == 0)
n_rows, n_cols = t.shape
print("fraction of zeros in whole dataset: ", zero_mask.sum().sum() / (n_rows * n_cols))

# Share of zero entries per row (summing across the columns of each row),
# shown for the five rows with the highest share.
zero_count = zero_mask.sum(axis="columns") / n_cols
print(zero_count.sort_values(ascending=False).head(5))

fraction of zeros in whole dataset:  0.0
AGID
AGID00100    0.0
AGID02155    0.0
AGID00245    0.0
AGID02199    0.0
AGID00323    0.0
dtype: float64


Process and save datasets

In [6]:
# List the distinct column types present in the layer
# (iterating the mapping yields its keys).
col_types = dataset.get_types_of_columns()
print("Types of columns:\n", list(col_types))

Types of columns:
 ['AGID', 'lab_id', 'catalog_number', 'set_id', 'peptide_target', '']


Relative Expression Dataset

In [7]:
type_of_column = ""

# Load the sample columns and strip the four annotation columns, which leaves
# a proteins x samples matrix.
df = (
    dataset
    .get_layer_by_column_type(type_of_column)
    .drop(columns=["lab_id", "catalog_number","set_id", "peptide_target"])
)
print(df.shape)

# Keep only the proteins (rows) that have a value for every sample.
df = df.dropna(axis=0, how="any")
print(df.shape)

# Flip to samples x proteins before handing the frame over.
df = df.T

display(df.head())
# Register the cleaned matrix on the dataset under the "rppa" data type.
# NOTE(review): presumably this also persists it ("Process and save") - confirm in LayerDataset.
dataset.set_raw_data(data_type="rppa", df=df)

(487, 435)
(456, 435)


AGID,AGID00100,AGID00111,AGID00101,AGID00001,AGID00002,AGID00003,AGID00443,AGID00120,AGID00004,AGID00005,...,AGID00349,AGID02137,AGID00088,AGID00089,AGID00504,AGID00095,AGID02217,AGID02210,AGID00326,AGID00432
TCGA-DB-5279-01A,0.063553,0.28431,-0.40822,-0.68649,-0.25424,0.44096,-0.40288,-0.44502,-0.010774,0.43273,...,-0.525998,-0.44341,-0.59079,-2.477,-0.15133,0.073364,-0.841587,-0.883834,-1.475134,0.65832
TCGA-HT-7855-01A,0.097345,0.73337,-0.54587,-0.46193,-0.28065,0.83087,-0.3485,-0.57879,-0.15299,-0.7307,...,-0.42958,-0.21012,-0.20853,-0.83595,0.23712,0.01625,-0.167003,-0.186865,-1.418856,0.43177
TCGA-QH-A6CW-01A,0.3255,0.095182,0.04258,-0.47662,-0.19106,0.77002,-0.17413,-0.2948,-0.19183,-0.52805,...,0.022217,-0.33641,-0.27488,-0.76129,-0.53427,0.10893,-0.346167,-0.613352,-1.517299,1.191739
TCGA-DB-A64Q-01A,-0.095779,0.11432,-0.34079,-0.53777,0.014253,0.93708,-0.25359,0.12298,0.013886,-0.029776,...,0.306443,-0.3716,-0.37835,-1.1467,-0.5665,0.057059,-0.724308,-0.45713,-1.451234,0.738193
TCGA-HT-7604-01A,-0.041091,0.12982,-0.1315,0.024208,-0.13592,1.2621,-0.35806,0.56085,0.58171,0.52778,...,-0.247401,-0.15366,-0.41184,-0.97738,-0.31331,0.18063,0.062216,-0.050716,-1.581383,0.880428


# TCGA-COAD

In [8]:
# Switch to the TCGA-COAD project (same layer) and open its dataset.
# From here on `dataset` refers to COAD, no longer to LGG.
project, layer = "TCGA-COAD", "protein"
dataset = LayerDataset(DATA_DIR, project, layer)

In [9]:
# Overview of the layer: how many columns of each type there are, a 5-row
# preview, and the table restricted to the '' (sample) column type.
col_types = dataset.get_types_of_columns()
head = dataset.get_layer(n_rows=5)
ids = dataset.get_layer_by_column_type("")

# Sample columns are the ones typed '' (the same key used to build `ids`).
# Look the count up by that key instead of relying on it being the most common
# type, which would silently report the wrong number for a tiny cohort.
print(f"Number of proteins: {ids.shape[0]}\nNumber of patients: {col_types['']}")
print(f"Types of columns: {col_types}")
display(head)

Number of proteins: 727
Number of patients: 363
Types of columns: Counter({'': 363, 'AGID': 1, 'lab_id': 1, 'catalog_number': 1, 'set_id': 1, 'peptide_target': 1})


Unnamed: 0_level_0,lab_id,catalog_number,set_id,peptide_target,TCGA-A6-6141-01A,TCGA-CM-6678-01A,TCGA-D5-6922-01A,TCGA-CM-5341-01A,TCGA-DM-A0X9-01A,TCGA-AA-3502-01A,...,TCGA-AA-3664-01A,TCGA-4T-AA8H-01A,TCGA-DM-A1DB-01A,TCGA-AA-A00J-01A,TCGA-AD-6548-01A,TCGA-G4-6588-01A,TCGA-SS-A7HO-01A,TCGA-A6-6649-01A,TCGA-5M-AATA-01A,TCGA-5M-AAT5-01A
AGID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AGID00100,882,sc-628,Old,1433BETA,-0.25023,0.077985,0.1971,0.10278,-0.12771,-0.073956,...,-0.26081,-0.13222,-0.19462,0.13954,0.14213,0.25254,0.07723,-0.12765,0.002181,0.094542
AGID00111,913,sc-23957,Old,1433EPSILON,-0.17142,0.3394,-0.11991,-0.28875,-0.14024,-0.24868,...,0.059625,0.058415,-0.15329,-0.18585,-0.028797,0.12731,-0.013323,-0.13385,-0.15004,-0.23604
AGID00101,883,sc-1019,Old,1433ZETA,-0.094394,0.69463,0.061087,0.98698,0.10853,0.073882,...,0.50133,0.55963,-0.15779,-0.27101,0.45927,0.62917,-0.084323,0.3182,0.51082,0.053267
AGID00001,2,9452,Old,4EBP1,-0.23947,-0.092303,-0.47642,0.2285,0.73142,0.27165,...,-0.077359,0.78824,-0.059245,-0.20477,0.12496,-0.58954,0.87159,0.15541,0.68438,0.1696
AGID00002,3,9456,Old,4EBP1_pS65,-0.34213,-0.32331,0.61476,0.06715,-0.35289,-0.012491,...,-0.22126,-0.26887,-0.19196,0.49149,0.15907,-0.36609,-0.67702,1.0113,-0.50767,-0.18622


Check for the presence of NaNs

In [10]:
# Build the numeric matrix used by the checks below: sample columns only,
# with the four annotation columns removed.
t = (
    dataset
    .get_layer_by_column_type('')
    .drop(columns=["lab_id", "catalog_number","set_id", "peptide_target"])
)

# First .any() reduces per column, the second reduces to a single flag.
print(f"Are there any NaNs?\n{t.isna().any().any()}")

Are there any NaNs?
True


Sparsity analysis - identify and remove columns with a high percentage (>95%) of zeros

In [11]:
# Mask of exact zeros, computed once. NaN cells compare unequal to 0, so they
# count as non-zero while still contributing to the denominators below.
zero_mask = (t == 0)
n_rows, n_cols = t.shape
print("fraction of zeros in whole dataset: ", zero_mask.sum().sum() / (n_rows * n_cols))

# Share of zero entries per row (summing across the columns of each row),
# shown for the five rows with the highest share.
zero_count = zero_mask.sum(axis="columns") / n_cols
print(zero_count.sort_values(ascending=False).head(5))

fraction of zeros in whole dataset:  0.0
AGID
AGID00100    0.0
AGID02137    0.0
AGID00089    0.0
AGID00504    0.0
AGID00095    0.0
dtype: float64


Process and save datasets

In [12]:
# List the distinct column types present in the layer
# (iterating the mapping yields its keys).
col_types = dataset.get_types_of_columns()
print("Types of columns:\n", list(col_types))

Types of columns:
 ['AGID', 'lab_id', 'catalog_number', 'set_id', 'peptide_target', '']


Relative Expression Dataset

In [13]:
type_of_column = ""

# Missing-value policy for this cohort, as fractions of required non-NaN values.
# NOTE(review): the TCGA-LGG cell only applies the final "any NaN" row filter;
# confirm that the extra coverage filters here are intentional.
min_protein_coverage = 0.8  # a protein (row) must be measured in >= 80% of samples
min_sample_coverage = 0.5   # a sample (column) must have >= 50% of the kept proteins

# Load the sample columns and strip the four annotation columns, which leaves
# a proteins x samples matrix.
df = dataset.get_layer_by_column_type(type_of_column)
df = df.drop(columns=["lab_id", "catalog_number","set_id", "peptide_target"])
print(df.shape)

# `thresh` is documented as an integer count of non-NaN values. Rounding the
# fractional cut-off up keeps exactly the rows/columns the float comparison
# kept, while passing a value of the documented type.
# 1) drop sparsely measured proteins (rows)
protein_thresh = int(np.ceil(min_protein_coverage * df.shape[1]))
df = df.dropna(axis="index", thresh=protein_thresh)
# 2) drop sparsely measured samples (columns), relative to the proteins kept in step 1
sample_thresh = int(np.ceil(min_sample_coverage * df.shape[0]))
df = df.dropna(axis="columns", thresh=sample_thresh)
# 3) drop every protein (row) that still has a missing value
df = df.dropna(axis="index", how="any")
print(df.shape)

# transpose dataframe to samples x proteins
df = df.T

display(df.head())
# Register the cleaned matrix on the dataset under the "rppa" data type.
# NOTE(review): presumably this also persists it ("Process and save") - confirm in LayerDataset.
dataset.set_raw_data(data_type="rppa", df=df)

(727, 363)
(455, 351)


AGID,AGID00100,AGID00111,AGID00101,AGID00001,AGID00002,AGID00003,AGID00443,AGID00120,AGID00004,AGID00005,...,AGID00349,AGID02137,AGID00088,AGID00089,AGID00504,AGID00095,AGID02217,AGID02210,AGID00326,AGID00432
TCGA-A6-6141-01A,-0.25023,-0.17142,-0.094394,-0.23947,-0.34213,-0.22751,-0.12147,0.010664,0.70318,0.41453,...,-0.051767,-0.41372,-0.31924,-0.42726,0.1755,0.022338,0.111084,-0.283443,0.126924,0.288428
TCGA-CM-6678-01A,0.077985,0.3394,0.69463,-0.092303,-0.32331,-0.16053,-0.35774,-1.0817,0.19673,0.14765,...,-1.072271,0.12978,0.57359,0.54724,0.3662,-0.063297,0.305689,0.110589,-1.656316,0.088431
TCGA-D5-6922-01A,0.1971,-0.11991,0.061087,-0.47642,0.61476,-0.61644,-0.30777,-0.75514,0.42579,-0.087877,...,-0.201523,-0.10977,-0.018871,-0.024563,0.23147,-0.14068,0.192026,0.312169,-1.031938,-0.110418
TCGA-CM-5341-01A,0.10278,-0.28875,0.98698,0.2285,0.06715,-0.479,-0.15629,-0.58233,-0.4805,-0.37074,...,-0.51583,-0.5054,-0.26351,0.14978,1.4802,-0.018243,0.056647,0.483704,-0.813626,-0.224691
TCGA-DM-A0X9-01A,-0.12771,-0.14024,0.10853,0.73142,-0.35289,-0.88487,-0.20602,-0.29411,-0.26123,0.63974,...,-0.284889,0.021421,-0.2481,-0.30953,0.34952,-0.083643,0.484082,0.459417,-0.832135,-0.353984
