# Description

It reads the entire Mice data set, performs some preprocessing and then standardizes the data to have mean 0 and std 1.

# Modules

In [1]:
# reload imported modules if changed
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from IPython.display import display

# from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from pvae import conf

# Settings and paths

In [2]:
conf.data.MICE_PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
display(conf.data.MICE_PROCESSED_DIR)

PosixPath('/home/prashant/Documents/milton_lab/pvae/base/input/mice_data/processed')

In [3]:
# INPUT_FILES and OUTPUT_FILES must be provided if running with pytask
INPUT_FILES = {
    "mice_gene_expr_data_feather": conf.data.MICE_PROCESSED_DIR
    / "no_scrna_rpkm.ftr",
}

OUTPUT_FILES = {
    "mice_gene_expr_data_feather_prep": conf.data.MICE_PROCESSED_DIR
    / "mice_data-full-prep.ftr",
    "mice_gene_expr_data_feather_log": conf.data.MICE_PROCESSED_DIR
    / "mice_data-full-log.ftr",
    "mice_gene_expr_data_feather_std": conf.data.MICE_PROCESSED_DIR
    / "mice_data-full-std.ftr",
}

# Load data

In [4]:
full_dataset = pd.read_feather(INPUT_FILES["mice_gene_expr_data_feather"])

In [5]:
full_dataset.shape

(10849, 190112)

In [6]:
full_dataset.head()

Unnamed: 0,index,SRR1557112,SRR1557113,SRR1557114,SRR1557115,SRR1557116,SRR1557117,SRR1557118,SRR1557119,SRR1557120,...,ERR1082767,ERR1082768,ERR1082769,ERR1082770,SRR5445512,SRR5445513,SRR5445514,SRR5445515,SRR5445516,SRR5445517
0,Xkr4,-0.335025,-0.323103,-0.263053,-0.241244,-0.181879,-0.338608,-0.34317,-0.347328,-0.348079,...,0.401471,0.441549,0.105787,0.159327,-0.345772,-0.34925,-0.34925,-0.34925,-0.34925,-0.344023
1,Sox17,-0.123598,-0.182498,-0.188383,-0.129148,1.963323,-0.203247,-0.203247,-0.202696,-0.203247,...,-0.038926,-0.052286,-0.031002,-0.030753,-0.191517,-0.203247,-0.197212,-0.19485,-0.191935,-0.203247
2,Mrpl15,3.482403,6.487705,3.042429,3.694001,2.108774,-0.373438,0.177227,-0.121857,1.226623,...,-0.551519,-0.603537,-0.590422,-0.588911,0.882692,0.652247,0.967914,0.412631,0.397471,0.45826
3,Lypla1,3.206998,5.01725,4.331312,4.547355,4.99124,-0.81386,-0.554094,-0.731604,-0.371866,...,-0.199951,-0.317052,-0.207565,-0.313447,0.59683,0.624814,0.849883,0.131341,0.17032,0.233833
4,Tcea1,3.921904,8.486859,5.057495,6.88478,4.994615,0.277471,1.280629,0.843965,1.616705,...,0.24652,0.386121,0.234953,0.112505,1.139192,1.139515,1.191143,0.031718,0.248511,0.233001


# Prepare data

In [7]:
full_dataset.isna().any(axis=None)

np.False_

In [8]:
data = full_dataset.set_index("index").rename_axis(index=None)

In [9]:
data = data.iloc[:, 1:].T

In [10]:
data.shape

(190110, 10849)

In [11]:
data.head()

Unnamed: 0,Xkr4,Sox17,Mrpl15,Lypla1,Tcea1,Rgs20,Atp6v1h,St18,Pcmtd1,Vcpip1,...,Ofd1,Trappc2,Rab9,Egfl6,Tmsb4x,Tlr8,Tlr7,Msl3,Amelx,Uty
SRR1557113,-0.323103,-0.182498,6.487705,5.01725,8.486859,0.208984,3.915403,-0.257123,1.991735,0.966572,...,-0.726326,-0.556579,-0.736511,-0.153961,-0.443546,-0.184273,-0.232621,-0.445884,-0.014402,-0.459125
SRR1557114,-0.263053,-0.188383,3.042429,4.331312,5.057495,0.454319,2.438414,-0.258343,1.919939,0.687475,...,-0.726326,-0.497728,-0.60054,-0.153961,-0.443565,-0.184273,-0.232621,-0.447351,-0.014402,-0.528039
SRR1557115,-0.241244,-0.129148,3.694001,4.547355,6.88478,0.446912,3.224474,-0.260245,1.491456,1.121649,...,-0.710569,-0.559632,-0.736511,-0.153961,-0.409422,-0.184273,-0.232621,-0.447351,-0.014402,-0.535765
SRR1557116,-0.181879,1.963323,2.108774,4.99124,4.994615,0.188691,3.565857,-0.259763,1.28974,1.543496,...,-0.726326,-0.555743,-0.732866,-0.153961,-0.331431,-0.184273,0.238683,-0.445215,-0.014402,-0.535765
SRR1557117,-0.338608,-0.203247,-0.373438,-0.81386,0.277471,0.264561,-0.749493,-0.250712,0.538078,1.556082,...,0.072192,0.085772,-0.731728,-0.153961,-0.368832,-0.184273,-0.228539,-0.434594,-0.014402,-0.535765


# Save original data

In [12]:
data_output = data.T.reset_index()
# print(data_output.drop('index', axis=1).min(axis=0).min())
display(data_output.isna().any(axis=None))
data_output.head()

np.False_

Unnamed: 0,index,SRR1557113,SRR1557114,SRR1557115,SRR1557116,SRR1557117,SRR1557118,SRR1557119,SRR1557120,SRR1557121,...,ERR1082767,ERR1082768,ERR1082769,ERR1082770,SRR5445512,SRR5445513,SRR5445514,SRR5445515,SRR5445516,SRR5445517
0,Xkr4,-0.323103,-0.263053,-0.241244,-0.181879,-0.338608,-0.34317,-0.347328,-0.348079,-0.342558,...,0.401471,0.441549,0.105787,0.159327,-0.345772,-0.34925,-0.34925,-0.34925,-0.34925,-0.344023
1,Sox17,-0.182498,-0.188383,-0.129148,1.963323,-0.203247,-0.203247,-0.202696,-0.203247,-0.203247,...,-0.038926,-0.052286,-0.031002,-0.030753,-0.191517,-0.203247,-0.197212,-0.19485,-0.191935,-0.203247
2,Mrpl15,6.487705,3.042429,3.694001,2.108774,-0.373438,0.177227,-0.121857,1.226623,0.247997,...,-0.551519,-0.603537,-0.590422,-0.588911,0.882692,0.652247,0.967914,0.412631,0.397471,0.45826
3,Lypla1,5.01725,4.331312,4.547355,4.99124,-0.81386,-0.554094,-0.731604,-0.371866,-0.723954,...,-0.199951,-0.317052,-0.207565,-0.313447,0.59683,0.624814,0.849883,0.131341,0.17032,0.233833
4,Tcea1,8.486859,5.057495,6.88478,4.994615,0.277471,1.280629,0.843965,1.616705,0.315265,...,0.24652,0.386121,0.234953,0.112505,1.139192,1.139515,1.191143,0.031718,0.248511,0.233001


In [13]:
data_output.to_feather(OUTPUT_FILES["mice_gene_expr_data_feather_prep"])

# Save log2(data + 1)

In [14]:
data_log = np.log2(data.T + 3).reset_index()
display(data_log.isna().any(axis=None))
data_log.head()

np.False_

Unnamed: 0,index,SRR1557113,SRR1557114,SRR1557115,SRR1557116,SRR1557117,SRR1557118,SRR1557119,SRR1557120,SRR1557121,...,ERR1082767,ERR1082768,ERR1082769,ERR1082770,SRR5445512,SRR5445513,SRR5445514,SRR5445515,SRR5445516,SRR5445517
0,Xkr4,1.420562,1.452568,1.464018,1.494733,1.412181,1.409706,1.407446,1.407038,1.410038,...,1.766159,1.783058,1.634959,1.659617,1.408292,1.406401,1.406401,1.406401,1.406401,1.409243
1,Sox17,1.494417,1.4914,1.521479,2.311306,1.483753,1.483753,1.484037,1.483753,1.483753,...,1.56612,1.559597,1.569976,1.570097,1.489791,1.483753,1.486863,1.488078,1.489576,1.483753
2,Mrpl15,3.246059,2.595129,2.742869,2.352977,1.393176,1.667768,1.525138,2.079505,1.69955,...,1.291887,1.260907,1.26878,1.269685,1.957057,1.868784,1.988381,1.770884,1.764461,1.790046
3,Lypla1,3.003107,2.874071,2.915971,2.998419,1.128386,1.290369,1.181672,1.394039,1.18653,...,1.485452,1.423819,1.481524,1.425756,1.846726,1.857907,1.944815,1.646781,1.664628,1.693245
4,Tcea1,3.521912,3.010331,3.305209,2.999029,1.712583,2.097823,1.942595,2.206863,1.729124,...,1.698894,1.759633,1.693745,1.638076,2.049349,2.049462,2.067344,1.600136,1.699778,1.692874


In [15]:
data_log.to_feather(OUTPUT_FILES["mice_gene_expr_data_feather_log"])

# Save data log2 z-scaled

In [16]:
# Standardize the features
scaler = StandardScaler().set_output(transform="pandas")
data_scaled = scaler.fit_transform(data_log.set_index("index").T)

In [17]:
data_scaled.head()

Unnamed: 0,Xkr4,Sox17,Mrpl15,Lypla1,Tcea1,Rgs20,Atp6v1h,St18,Pcmtd1,Vcpip1,...,Ofd1,Trappc2,Rab9,Egfl6,Tmsb4x,Tlr8,Tlr7,Msl3,Amelx,Uty
SRR1557113,-0.39099,-0.345431,4.592125,3.72955,5.069136,0.451765,2.937064,-0.374919,1.815806,1.060769,...,-1.204524,-0.956177,-1.461516,-0.286453,-0.714651,-0.277938,-0.344157,-1.240779,-0.023286,-0.632972
SRR1557114,-0.289721,-0.360187,2.841639,3.40194,3.762489,0.8161,2.12801,-0.377349,1.76783,0.815972,...,-1.204524,-0.832683,-1.139406,-0.286453,-0.71469,-0.277938,-0.344157,-1.245425,-0.023286,-0.757327
SRR1557115,-0.253493,-0.21305,3.238942,3.508319,4.515646,0.805485,2.582608,-0.38114,1.466078,1.189445,...,-1.171806,-0.962665,-1.461516,-0.286453,-0.645912,-0.277938,-0.344157,-1.245425,-0.023286,-0.771482
SRR1557116,-0.156307,3.650533,2.190443,3.717647,3.73362,0.420393,2.762405,-0.38018,1.313906,1.516387,...,-1.204524,-0.954403,-1.452633,-0.286453,-0.492153,-0.277938,0.558075,-1.238663,-0.023286,-0.771482
SRR1557117,-0.417507,-0.397595,-0.390661,-1.030177,0.447865,0.536683,-0.843113,-0.362165,0.675947,1.525669,...,0.221405,0.254921,-1.449862,-0.286453,-0.56532,-0.277938,-0.3357,-1.205119,-0.023286,-0.771482


In [18]:
data_scaled.iloc[:, 1:10].describe()

Unnamed: 0,Sox17,Mrpl15,Lypla1,Tcea1,Rgs20,Atp6v1h,St18,Pcmtd1,Vcpip1
count,190110.0,190110.0,190110.0,190110.0,190110.0,190110.0,190110.0,190110.0,190110.0
mean,-5.681586e-12,1.919508e-14,-2.183318e-15,-1.504642e-14,3.975476e-12,1.544282e-14,3.717534e-12,1.162119e-14,1.37033e-15
std,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003
min,-0.3975948,-1.642469,-1.660871,-1.906256,-0.5007137,-2.189457,-0.3838606,-1.947943,-2.220685
25%,-0.3975948,-0.6781113,-0.7442601,-0.6883282,-0.5007137,-0.7091525,-0.3838606,-0.7427968,-0.6829319
50%,-0.2909079,-0.224399,-0.1773213,-0.1765235,-0.4702396,-0.1002198,-0.3717166,-0.05996216,-0.04997092
75%,-0.02586385,0.3675102,0.5094777,0.5438189,-0.003082116,0.6791981,-0.1442185,0.640531,0.6129238
max,27.52322,8.007564,7.563845,11.56426,15.10557,9.070656,22.7982,6.872076,7.862947


In [19]:
data_output = data_scaled.T.reset_index()
display(data_output.isna().any(axis=None))
data_output.head()

np.False_

Unnamed: 0,index,SRR1557113,SRR1557114,SRR1557115,SRR1557116,SRR1557117,SRR1557118,SRR1557119,SRR1557120,SRR1557121,...,ERR1082767,ERR1082768,ERR1082769,ERR1082770,SRR5445512,SRR5445513,SRR5445514,SRR5445515,SRR5445516,SRR5445517
0,Xkr4,-0.39099,-0.289721,-0.253493,-0.156307,-0.417507,-0.425338,-0.432488,-0.433781,-0.424286,...,0.702495,0.755966,0.287374,0.365393,-0.429811,-0.435796,-0.435796,-0.435796,-0.435796,-0.426804
1,Sox17,-0.345431,-0.360187,-0.21305,3.650533,-0.397595,-0.397595,-0.396205,-0.397595,-0.397595,...,0.005321,-0.026592,0.024181,0.024773,-0.368057,-0.397595,-0.382382,-0.376439,-0.369108,-0.397595
2,Mrpl15,4.592125,2.841639,3.238942,2.190443,-0.390661,0.347775,-0.035786,1.455021,0.433243,...,-0.663047,-0.746359,-0.725185,-0.722752,1.125733,0.888349,1.209969,0.625076,0.607802,0.676606
3,Lypla1,3.72955,3.40194,3.508319,3.717647,-1.030177,-0.618919,-0.894889,-0.355712,-0.882556,...,-0.123622,-0.280102,-0.133596,-0.275185,0.793615,0.822002,1.042652,0.285974,0.331288,0.403943
4,Tcea1,5.069136,3.762489,4.515646,3.73362,0.447865,1.43182,1.035347,1.710324,0.490113,...,0.412901,0.568038,0.399749,0.257564,1.308011,1.308299,1.353972,0.160659,0.41516,0.397526


In [20]:
data_output.to_feather(OUTPUT_FILES["mice_gene_expr_data_feather_std"])