# Description

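This notebook reads the entire GTEx v8 gene expression data set, removes the gene symbol column, and saves three versions of the data: the prepared values on the original scale, the log2(x + 1)-transformed values, and the log2-transformed values standardized per gene to have mean 0 and standard deviation 1.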

# Modules

In [1]:
# reload imported modules if changed
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from IPython.display import display

# from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from pvae import conf

# Settings and paths

In [2]:
# Ensure the processed-data directory exists (creating missing parents and
# tolerating an already existing directory), then show its location.
output_dir = conf.data.GTEX_PROCESSED_DIR
output_dir.mkdir(exist_ok=True, parents=True)
display(output_dir)

PosixPath('/home/prashant/Documents/milton_lab/pvae/base/input/gtex_v8/processed')

In [3]:
# INPUT_FILES and OUTPUT_FILES must be provided if running with pytask
# Both dictionaries map a logical name to a file inside
# conf.data.GTEX_PROCESSED_DIR.
INPUT_FILES = {
    # Full GTEx v8 expression table read by this notebook.
    "gtex_gene_expr_data_feather": conf.data.GTEX_PROCESSED_DIR
    / "gtex_v8_data-full.ftr",
}

OUTPUT_FILES = {
    # Prepared data on the original scale (gene_symbol column removed).
    "gtex_gene_expr_data_feather_prep": conf.data.GTEX_PROCESSED_DIR
    / "gtex_v8_data-full-prep.ftr",
    # log2(x + 1) of the prepared data.
    "gtex_gene_expr_data_feather_log": conf.data.GTEX_PROCESSED_DIR
    / "gtex_v8_data-full-log.ftr",
    # log2(x + 1) data standardized with StandardScaler.
    "gtex_gene_expr_data_feather_std": conf.data.GTEX_PROCESSED_DIR
    / "gtex_v8_data-full-std.ftr",
}

# Load data

In [4]:
# Read the full GTEx v8 expression table from its Feather file.
input_path = INPUT_FILES["gtex_gene_expr_data_feather"]
full_dataset = pd.read_feather(input_path)

In [5]:
# Dimensions of the raw table as (rows, columns).
full_dataset.shape

(56200, 17384)

In [6]:
# Preview the raw layout: one row per gene, with the gene_ens_id and
# gene_symbol columns followed by one column per GTEx sample.
full_dataset.head()

Unnamed: 0,gene_ens_id,gene_symbol,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
0,ENSG00000223972.5,DDX11L1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.03629,0.0,0.0,0.0,0.0,0.0,0.0,0.01965,0.02522
1,ENSG00000227232.5,WASH7P,8.764,3.861,7.349,11.07,3.306,5.389,11.99,16.95,...,1.606,2.268,5.386,2.31,2.456,4.023,1.922,2.857,0.8696,2.167
2,ENSG00000278267.1,MIR6859-1,0.0,0.0,1.004,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ENSG00000243485.5,MIR1302-2HG,0.07187,0.0,0.0,0.06761,0.0,0.0,0.0,0.0,...,0.0,0.0,0.06073,0.0,0.08464,0.1435,0.0,0.05216,0.0,0.0
4,ENSG00000237613.2,FAM138A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03904,...,0.02429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Prepare data

In [7]:
# Fail fast if the raw table contains any missing value: the log2 transform
# and the standardization applied later would otherwise carry NaNs silently
# into the saved files.
has_missing_values = full_dataset.isna().any(axis=None)
assert not has_missing_values, "GTEx input data contains missing values"

# Keep showing the result of the check as the cell output.
has_missing_values

np.False_

In [8]:
# Use the Ensembl gene ID as the row index and clear the index name, so the
# axis is unnamed once the frame is transposed or reset.
data = (
    full_dataset
    .set_index("gene_ens_id")
    .rename_axis(None, axis="index")
)

In [9]:
# Remove the gene_symbol annotation column by name rather than by position,
# so that only the per-sample expression columns remain, then transpose so
# rows are samples and columns are genes.
# Dropping by name does not depend on column order, and re-running this cell
# raises a KeyError instead of silently discarding the first gene column.
data = data.drop(columns="gene_symbol").T

In [10]:
# Dimensions after transposing: (samples, genes).
data.shape

(17382, 56200)

In [11]:
# Preview the prepared layout: one row per GTEx sample, one column per gene.
data.head()

Unnamed: 0,ENSG00000223972.5,ENSG00000227232.5,ENSG00000278267.1,ENSG00000243485.5,ENSG00000237613.2,ENSG00000268020.3,ENSG00000240361.1,ENSG00000186092.4,ENSG00000238009.6,ENSG00000233750.3,...,ENSG00000198886.2,ENSG00000210176.1,ENSG00000210184.1,ENSG00000210191.1,ENSG00000198786.2,ENSG00000198695.2,ENSG00000210194.1,ENSG00000198727.2,ENSG00000210195.2,ENSG00000210196.2
GTEX-1117F-0226-SM-5GZZ7,0.0,8.764,0.0,0.07187,0.0,0.0,0.06621,0.0,0.0,0.03595,...,12400.0,0.0,0.0,0.0,2928.0,3799.0,16.24,6938.0,0.943,0.0
GTEX-1117F-0426-SM-5EGHI,0.0,3.861,0.0,0.0,0.0,0.056,0.05004,0.1025,0.04574,0.01359,...,34030.0,0.0,0.0,0.0,10400.0,14750.0,44.31,26310.0,6.414,6.226
GTEX-1117F-0526-SM-5EGHJ,0.0,7.349,1.004,0.0,0.0,0.0,0.0,0.07434,0.09953,0.0,...,13820.0,0.9891,0.0,0.0,4471.0,6728.0,23.74,8455.0,1.034,1.004
GTEX-1117F-0626-SM-5N9CS,0.0,11.07,0.0,0.06761,0.0,0.0,0.0,0.0,0.0,0.0,...,11990.0,0.0,0.0,0.0,1237.0,817.3,0.0,8799.0,0.0,0.0
GTEX-1117F-0726-SM-5GIEN,0.0,3.306,0.0,0.0,0.0,0.0,0.0,0.04233,0.07556,0.0,...,69350.0,0.5631,0.0,0.5473,7780.0,7051.0,2.252,29170.0,0.0,2.857


# Save original data

In [12]:
# Return to the genes-as-rows layout for saving. reset_index() moves the gene
# IDs out of the (unnamed) index into a regular column called "index".
data_output = data.transpose().reset_index()
data_output.head()

Unnamed: 0,index,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
0,ENSG00000223972.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.03629,0.0,0.0,0.0,0.0,0.0,0.0,0.01965,0.02522
1,ENSG00000227232.5,8.764,3.861,7.349,11.07,3.306,5.389,11.99,16.95,10.04,...,1.606,2.268,5.386,2.31,2.456,4.023,1.922,2.857,0.8696,2.167
2,ENSG00000278267.1,0.0,0.0,1.004,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ENSG00000243485.5,0.07187,0.0,0.0,0.06761,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.06073,0.0,0.08464,0.1435,0.0,0.05216,0.0,0.0
4,ENSG00000237613.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03904,0.0,...,0.02429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Write the prepared, original-scale data to its Feather file.
prep_path = OUTPUT_FILES["gtex_gene_expr_data_feather_prep"]
data_output.to_feather(prep_path)

# Save log2(data + 1)

In [14]:
# Apply log2(x + 1) to the genes-as-rows table; the +1 keeps zero expression
# values at 0 after the transform. The gene IDs end up in an "index" column.
data_log = (
    data.transpose()
    .add(1)
    .pipe(np.log2)
    .reset_index()
)
data_log.head()

Unnamed: 0,index,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
0,ENSG00000223972.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.051428,0.0,0.0,0.0,0.0,0.0,0.0,0.028074,0.035934
1,ENSG00000227232.5,3.287472,2.281253,3.061603,3.593354,2.106348,2.67559,3.69933,4.165912,3.464668,...,1.381837,1.708408,2.674913,1.726831,1.789103,2.328549,1.546956,1.947479,0.90273,1.663117
2,ENSG00000278267.1,0.0,0.0,1.002883,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ENSG00000243485.5,0.10013,0.0,0.0,0.094385,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.085057,0.0,0.117216,0.193456,0.0,0.073354,0.0,0.0
4,ENSG00000237613.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055251,0.0,...,0.034624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Write the log2(x + 1) data to its Feather file.
log_path = OUTPUT_FILES["gtex_gene_expr_data_feather_log"]
data_log.to_feather(log_path)

# Save log2-transformed, z-scaled data

In [16]:
# Standardize the features (genes): put samples back on the rows so that each
# gene column is scaled independently, and ask the scaler to return a pandas
# DataFrame instead of a NumPy array.
scaler = StandardScaler()
scaler.set_output(transform="pandas")
data_scaled = scaler.fit_transform(data_log.set_index("index").transpose())

In [17]:
# Preview the standardized values: one row per sample, one column per gene.
data_scaled.head()

Unnamed: 0,ENSG00000223972.5,ENSG00000227232.5,ENSG00000278267.1,ENSG00000243485.5,ENSG00000237613.2,ENSG00000268020.3,ENSG00000240361.1,ENSG00000186092.4,ENSG00000238009.6,ENSG00000233750.3,...,ENSG00000198886.2,ENSG00000210176.1,ENSG00000210184.1,ENSG00000210191.1,ENSG00000198786.2,ENSG00000198695.2,ENSG00000210194.1,ENSG00000198727.2,ENSG00000210195.2,ENSG00000210196.2
GTEX-1117F-0226-SM-5GZZ7,-0.338351,1.446143,-0.170328,1.112595,-0.414894,-0.619364,0.115692,-0.617925,-0.459297,-0.240765,...,-1.132009,-0.906613,-0.792974,-0.771997,-0.656424,-0.294425,0.011071,-1.369166,-0.120123,-1.218601
GTEX-1117F-0426-SM-5EGHI,-0.338351,0.146659,-0.170328,-0.475818,-0.414894,0.223272,-0.07837,0.216348,-0.043118,-0.357425,...,0.273846,-0.906613,-0.792974,-0.771997,0.928498,1.170515,0.821533,0.500253,1.373591,0.914563
GTEX-1117F-0526-SM-5EGHJ,-0.338351,1.154444,7.806741,-0.475818,-0.414894,-0.619364,-0.698425,-0.004863,0.423619,-0.429593,...,-0.981031,-0.120193,-0.792974,-0.771997,-0.127165,0.322767,0.314011,-1.091847,-0.069068,-0.468806
GTEX-1117F-0626-SM-5N9CS,-0.338351,1.841174,-0.170328,1.021456,-0.414894,-0.619364,-0.698425,-0.617925,-0.459297,-0.429593,...,-1.178831,-0.906613,-0.792974,-0.771997,-1.73347,-1.952937,-2.376993,-1.035918,-0.861029,-1.218601
GTEX-1117F-0726-SM-5GIEN,-0.338351,-0.079222,-0.170328,-0.475818,-0.414894,-0.619364,-0.698425,-0.26347,0.218517,-0.429593,...,1.265273,-0.395809,-0.792974,-0.324998,0.565527,0.373407,-1.387902,0.644984,-0.861029,0.237412


In [18]:
# Sanity-check the scaling on a few genes (columns 1 to 9): means should be ~0.
# The reported std is slightly above 1 because describe() uses the sample
# standard deviation (ddof=1), whereas StandardScaler divides by the
# population standard deviation (ddof=0).
data_scaled.iloc[:, 1:10].describe()

Unnamed: 0,ENSG00000227232.5,ENSG00000278267.1,ENSG00000243485.5,ENSG00000237613.2,ENSG00000268020.3,ENSG00000240361.1,ENSG00000186092.4,ENSG00000238009.6,ENSG00000233750.3
count,17382.0,17382.0,17382.0,17382.0,17382.0,17382.0,17382.0,17382.0,17382.0
mean,-1.7737e-15,-1.818052e-16,1.085722e-15,-1.071006e-16,2.097863e-15,-1.581368e-15,-2.052079e-15,1.365328e-16,-1.737727e-15
std,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029
min,-2.799471,-0.1703276,-0.4758178,-0.4148941,-0.6193641,-0.698425,-0.6179249,-0.4592973,-0.4295925
25%,-0.7445109,-0.1703276,-0.4758178,-0.4148941,-0.6193641,-0.698425,-0.6179249,-0.4592973,-0.4295925
50%,0.01217664,-0.1703276,-0.4758178,-0.4148941,-0.3050157,-0.1874367,-0.2245567,-0.2720171,-0.2898106
75%,0.7127909,-0.1703276,0.3438724,-0.4148941,0.3018597,0.2979726,0.1791415,0.002042197,-0.05982614
max,3.799565,14.99735,16.68885,19.67335,18.61204,17.45251,12.43482,15.9302,11.83146


In [19]:
# Return to the genes-as-rows layout for saving; the gene IDs move into a
# regular "index" column. Note that this reuses the data_output name.
data_output = data_scaled.transpose().reset_index()
data_output.head()

Unnamed: 0,index,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
0,ENSG00000223972.5,-0.338351,-0.338351,-0.338351,-0.338351,-0.338351,-0.338351,-0.338351,-0.338351,-0.338351,...,-0.338351,0.561293,-0.338351,-0.338351,-0.338351,-0.338351,-0.338351,-0.338351,0.152758,0.290247
1,ENSG00000227232.5,1.446143,0.146659,1.154444,1.841174,-0.079222,0.655926,1.978037,2.580606,1.674983,...,-1.014894,-0.593143,0.655051,-0.569351,-0.488929,0.20774,-0.801651,-0.284394,-1.633638,-0.651635
2,ENSG00000278267.1,-0.170328,-0.170328,7.806741,-0.170328,-0.170328,-0.170328,-0.170328,-0.170328,-0.170328,...,-0.170328,-0.170328,-0.170328,-0.170328,-0.170328,-0.170328,-0.170328,-0.170328,-0.170328,-0.170328
3,ENSG00000243485.5,1.112595,-0.475818,-0.475818,1.021456,-0.475818,-0.475818,-0.475818,-0.475818,-0.475818,...,-0.475818,-0.475818,0.873493,-0.475818,1.383645,2.593081,-0.475818,0.687837,-0.475818,-0.475818
4,ENSG00000237613.2,-0.414894,-0.414894,-0.414894,-0.414894,-0.414894,-0.414894,-0.414894,1.595137,-0.414894,...,0.844731,-0.414894,-0.414894,-0.414894,-0.414894,-0.414894,-0.414894,-0.414894,-0.414894,-0.414894


In [20]:
# Write the log2 z-scaled data to its Feather file.
std_path = OUTPUT_FILES["gtex_gene_expr_data_feather_std"]
data_output.to_feather(std_path)