# Ingest gene expression data from TCGA, TARGET and GTEX

Ingest from xena, calculate a tumor/normal feature, and export into an hdf5 file formatted for machine learning

In [7]:
import numpy as np
import pandas as pd
import h5py
from collections import defaultdict

!pip3 install -q tables

In [8]:
%%time
# Convert expression tsv into dataframe loading directly as float32 vs. pandas float64 default
# Source: https://toil.xenahubs.net/download/TcgaTargetGtex_rsem_gene_tpm
converters = defaultdict(str)
converters["sample"] = str
expression = pd.read_csv("/data/scratch/rcurrie/xena/TcgaTargetGtex_rsem_gene_tpm", 
                         sep="\t", index_col=0, converters=converters, dtype=np.float32).T
expression.to_hdf("/data/scratch/rcurrie/xena/TcgaTargetGtex_rsem_gene_tpm.hd5", "expression", 
                  mode="w", format="fixed")

CPU times: user 18min 55s, sys: 16.9 s, total: 19min 12s
Wall time: 19min 13s


In [9]:
%%time
# Read back hdf using pandas - conveniance for later iterations so we can skip above tsv import
X = pd.read_hdf("/data/scratch/rcurrie/xena/TcgaTargetGtex_rsem_gene_tpm.hd5", "expression").sort_index(axis=0)

CPU times: user 1.87 s, sys: 4.33 s, total: 6.2 s
Wall time: 6.2 s


In [21]:
X.head()

sample,ENSG00000242268.2,ENSG00000259041.1,ENSG00000270112.3,ENSG00000167578.16,ENSG00000278814.1,ENSG00000078237.5,ENSG00000269416.5,ENSG00000263642.1,ENSG00000146083.11,ENSG00000158486.13,...,ENSG00000009694.13,ENSG00000238244.3,ENSG00000216352.1,ENSG00000123685.8,ENSG00000267117.1,ENSG00000273233.1,ENSG00000105063.18,ENSG00000231119.2,ENSG00000280861.1,ENSG00000181518.3
GTEX-1117F-0226-SM-5GZZ7,-9.9658,-9.9658,-4.2934,5.119,-9.9658,0.8488,-9.9658,-9.9658,5.1498,-9.9658,...,2.4571,-1.4305,-9.9658,2.8178,-0.1031,-9.9658,5.1631,-3.3076,-9.9658,-9.9658
GTEX-1117F-0426-SM-5EGHI,-9.9658,-9.9658,0.0014,4.1277,-9.9658,0.688,-9.9658,-9.9658,3.483,-9.9658,...,-0.9132,-9.9658,-9.9658,-0.9406,-9.9658,-1.5105,4.1764,-5.0116,-9.9658,-9.9658
GTEX-1117F-0526-SM-5EGHJ,-9.9658,-9.9658,-9.9658,4.4067,-9.9658,0.044,-9.9658,-9.9658,4.3841,-9.9658,...,1.5165,-9.9658,-9.9658,1.7141,-1.1488,-9.9658,4.8768,-9.9658,-9.9658,-9.9658
GTEX-1117F-0626-SM-5N9CS,-1.2481,-9.9658,-5.5735,5.686,-9.9658,1.3679,-9.9658,-9.9658,5.0644,-1.8836,...,1.5998,-9.9658,-9.9658,3.9356,-1.1488,-1.0559,4.8694,-1.9379,-9.9658,-9.9658
GTEX-1117F-0726-SM-5GIEN,-3.816,-9.9658,0.3573,4.0357,-9.9658,-0.4325,-5.5735,-9.9658,3.9421,-3.458,...,-3.1714,-9.9658,-9.9658,0.6608,-1.1811,-9.9658,3.6816,-2.6349,-9.9658,-9.9658


In [46]:
# Convert from ensembl to hugo gene names
# http://www.genenames.org/cgi-bin/statistics
ensembl = pd.read_table("/data/reference/hgnc_complete_set.txt", 
                        usecols=["symbol", "ensembl_gene_id", "gene_family"],
                        index_col=2)
ensembl.head()
# ensembl.loc["ENSG00000121410"].symbol

Unnamed: 0_level_0,symbol,gene_family
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000121410,A1BG,Immunoglobulin like domain containing
ENSG00000268895,A1BG-AS1,
ENSG00000148584,A1CF,RNA binding motif containing
ENSG00000175899,A2M,"C3 and PZP like, alpha-2-macroglobulin domain ..."
ENSG00000245105,A2M-AS1,


In [37]:
# Read in the clinical saved from https://toil.xenahubs.net/download/TcgaTargetGTEX_phenotype.txt
Y = pd.read_table("/data/scratch/rcurrie/xena/TcgaTargetGTEX_phenotype.txt",
                  header=0, names=["detailed_category", "primary_site", "sample_type", "gender", "study"],
                  sep="\t", encoding="ISO-8859-1", index_col=0, dtype="str").sort_index(axis=0)

# Add column for tumor/normal
Y["tumor_normal"] = Y.apply(
    lambda row: "Normal" if row["sample_type"] in ["Cell Line", "Normal Tissue", "Solid Tissue Normal"]
    else "Tumor", axis=1)

Y.head()

Unnamed: 0,detailed_category,primary_site,sample_type,gender,study,tumor_normal
GTEX-1117F-0226-SM-5GZZ7,Adipose - Subcutaneous,Adipose Tissue,Normal Tissue,Female,GTEX,Normal
GTEX-1117F-0426-SM-5EGHI,Muscle - Skeletal,Muscle,Normal Tissue,Female,GTEX,Normal
GTEX-1117F-0526-SM-5EGHJ,Artery - Tibial,Blood Vessel,Normal Tissue,Female,GTEX,Normal
GTEX-1117F-0626-SM-5N9CS,Artery - Coronary,Blood Vessel,Normal Tissue,Female,GTEX,Normal
GTEX-1117F-0726-SM-5GIEN,Heart - Atrial Appendage,Heart,Normal Tissue,Female,GTEX,Normal


In [38]:
Y.describe()

Unnamed: 0,detailed_category,primary_site,sample_type,gender,study,tumor_normal
count,19130,19126,19131,18972,19131,19131
unique,93,46,17,2,3,2
top,Breast Invasive Carcinoma,Brain,Primary Tumor,Male,TCGA,Tumor
freq,1212,1846,9185,10456,10535,10531


In [39]:
# Label to use for stratified folds to make sure we have equal representation in
# our training and validation sets
label_for_classes = "primary_site"
label_to_predict = "tumor_normal"
output_file = "/data/scratch/rcurrie/tumor_normal.h5"

In [40]:
# Remove rows where the label is null or the sample is missing
Y_not_null = Y[pd.notnull(Y[label_for_classes])]
intersection = X.index.intersection(Y_not_null.index)
X_clean = X[X.index.isin(intersection)]
Y_clean = Y[Y.index.isin(intersection)]
print(intersection.shape[0], "samples with non-null labels")

19126 samples with non-null labels


In [41]:
# Make sure the label and example samples are in the same order
assert(X_clean.index.equals(Y_clean.index))

In [42]:
# Convert classes into numbers
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(Y_not_null[label_for_classes].values)
classes = list(encoder.classes_)
classes = encoder.transform(Y_clean[label_for_classes])
print("Total classes for stratification:", len(set(classes)))

Total classes for stratification: 46


In [43]:
# Convert prediction target into one-hot
# http://stackoverflow.com/questions/31947140/sklearn-labelbinarizer-returns-vector-when-there-are-2-classes
# so we get [0,1] and [1,0] instead of [0] and [1]
from sklearn.preprocessing import LabelBinarizer
import numpy as np

class OneHotLabelBinarizer(LabelBinarizer):
    def transform(self, y):
        Y = super().transform(y)
        if self.y_type_ == 'binary':
            return np.hstack((Y, 1-Y))
        else:
            return Y

    def inverse_transform(self, Y, threshold=None):
        if self.y_type_ == 'binary':
            return super().inverse_transform(Y[:, 0], threshold)
        else:
            return super().inverse_transform(Y, threshold)

encoder = OneHotLabelBinarizer()
y_one_hot = encoder.fit_transform(Y_clean[label_to_predict])
labels = encoder.classes_
y_one_hot

array([[0, 1],
       [0, 1],
       [0, 1],
       ..., 
       [1, 0],
       [0, 1],
       [1, 0]])

In [45]:
"""
Write to an h5 file for training
X: examples as a numpy floating point array
y: one hot array to predict
classes: integer array of classes for stratification
features: string array of the features in X
labels: array of strings coresponding to the one-hot labels
"""
with h5py.File(output_file, "w") as f:
    f.create_dataset('X', X_clean.shape, dtype='f')[:] = X_clean.values
    f.create_dataset('y', y_one_hot.shape, dtype='i')[:] = y_one_hot

    f.create_dataset('classes', classes.shape, dtype='i')[:] = classes
    f.create_dataset('features', X_clean.columns.shape, 'S10', 
                     [l.encode("ascii", "ignore") for l in X_clean.columns.values])
    f.create_dataset('labels', (len(labels), 1), 'S10', 
                     [l.encode("ascii", "ignore") for l in labels])