# Description

It reads the entire Mice data set, performs some preprocessing and then standardizes the data to have mean 0 and std 1.

# Modules

In [3]:
# reload imported modules if changed
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from IPython.display import display

# from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from pvae import conf

# Settings and paths

In [4]:
conf.data.MICE_PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
display(conf.data.MICE_PROCESSED_DIR)

PosixPath('/home/prashant/Documents/milton_lab/pvae/base/input/mice_data/processed')

In [5]:
# INPUT_FILES and OUTPUT_FILES must be provided if running with pytask
INPUT_FILES = {
    "mice_gene_expr_data_feather": conf.data.MICE_PROCESSED_DIR
    / "no_scrna_rpkm.ftr",
}

OUTPUT_FILES = {
    "mice_gene_expr_data_feather_prep": conf.data.MICE_PROCESSED_DIR
    / "mice_data-full-prep.ftr",
    "mice_gene_expr_data_feather_log": conf.data.MICE_PROCESSED_DIR
    / "mice_data-full-log.ftr",
    "mice_gene_expr_data_feather_std": conf.data.MICE_PROCESSED_DIR
    / "mice_data-full-std.ftr",
}

# Load data

In [6]:
full_dataset = pd.read_feather(INPUT_FILES["mice_gene_expr_data_feather"])

In [7]:
full_dataset.shape

(10849, 190112)

In [8]:
full_dataset.head()

Unnamed: 0,index,SRR1557112,SRR1557113,SRR1557114,SRR1557115,SRR1557116,SRR1557117,SRR1557118,SRR1557119,SRR1557120,...,ERR1082767,ERR1082768,ERR1082769,ERR1082770,SRR5445512,SRR5445513,SRR5445514,SRR5445515,SRR5445516,SRR5445517
0,Xkr4,-0.335025,-0.323103,-0.263053,-0.241244,-0.181879,-0.338608,-0.34317,-0.347328,-0.348079,...,0.401471,0.441549,0.105787,0.159327,-0.345772,-0.34925,-0.34925,-0.34925,-0.34925,-0.344023
1,Sox17,-0.123598,-0.182498,-0.188383,-0.129148,1.963323,-0.203247,-0.203247,-0.202696,-0.203247,...,-0.038926,-0.052286,-0.031002,-0.030753,-0.191517,-0.203247,-0.197212,-0.19485,-0.191935,-0.203247
2,Mrpl15,3.482403,6.487705,3.042429,3.694001,2.108774,-0.373438,0.177227,-0.121857,1.226623,...,-0.551519,-0.603537,-0.590422,-0.588911,0.882692,0.652247,0.967914,0.412631,0.397471,0.45826
3,Lypla1,3.206998,5.01725,4.331312,4.547355,4.99124,-0.81386,-0.554094,-0.731604,-0.371866,...,-0.199951,-0.317052,-0.207565,-0.313447,0.59683,0.624814,0.849883,0.131341,0.17032,0.233833
4,Tcea1,3.921904,8.486859,5.057495,6.88478,4.994615,0.277471,1.280629,0.843965,1.616705,...,0.24652,0.386121,0.234953,0.112505,1.139192,1.139515,1.191143,0.031718,0.248511,0.233001


# Prepare data

In [9]:
full_dataset.isna().any(axis=None)

np.False_

In [10]:
data = full_dataset.set_index("index").rename_axis(index=None)

In [11]:
data = data.iloc[:, 1:].T

In [12]:
data.shape

(190110, 10849)

In [13]:
data.head()

Unnamed: 0,Xkr4,Sox17,Mrpl15,Lypla1,Tcea1,Rgs20,Atp6v1h,St18,Pcmtd1,Vcpip1,...,Ofd1,Trappc2,Rab9,Egfl6,Tmsb4x,Tlr8,Tlr7,Msl3,Amelx,Uty
SRR1557113,-0.323103,-0.182498,6.487705,5.01725,8.486859,0.208984,3.915403,-0.257123,1.991735,0.966572,...,-0.726326,-0.556579,-0.736511,-0.153961,-0.443546,-0.184273,-0.232621,-0.445884,-0.014402,-0.459125
SRR1557114,-0.263053,-0.188383,3.042429,4.331312,5.057495,0.454319,2.438414,-0.258343,1.919939,0.687475,...,-0.726326,-0.497728,-0.60054,-0.153961,-0.443565,-0.184273,-0.232621,-0.447351,-0.014402,-0.528039
SRR1557115,-0.241244,-0.129148,3.694001,4.547355,6.88478,0.446912,3.224474,-0.260245,1.491456,1.121649,...,-0.710569,-0.559632,-0.736511,-0.153961,-0.409422,-0.184273,-0.232621,-0.447351,-0.014402,-0.535765
SRR1557116,-0.181879,1.963323,2.108774,4.99124,4.994615,0.188691,3.565857,-0.259763,1.28974,1.543496,...,-0.726326,-0.555743,-0.732866,-0.153961,-0.331431,-0.184273,0.238683,-0.445215,-0.014402,-0.535765
SRR1557117,-0.338608,-0.203247,-0.373438,-0.81386,0.277471,0.264561,-0.749493,-0.250712,0.538078,1.556082,...,0.072192,0.085772,-0.731728,-0.153961,-0.368832,-0.184273,-0.228539,-0.434594,-0.014402,-0.535765


# Save original data

In [15]:
data_output = data.T.reset_index()
# print(data_output.drop('index', axis=1).min(axis=0).min())
data_output.head()

-2.1603627172017297


Unnamed: 0,index,SRR1557113,SRR1557114,SRR1557115,SRR1557116,SRR1557117,SRR1557118,SRR1557119,SRR1557120,SRR1557121,...,ERR1082767,ERR1082768,ERR1082769,ERR1082770,SRR5445512,SRR5445513,SRR5445514,SRR5445515,SRR5445516,SRR5445517
0,Xkr4,-0.323103,-0.263053,-0.241244,-0.181879,-0.338608,-0.34317,-0.347328,-0.348079,-0.342558,...,0.401471,0.441549,0.105787,0.159327,-0.345772,-0.34925,-0.34925,-0.34925,-0.34925,-0.344023
1,Sox17,-0.182498,-0.188383,-0.129148,1.963323,-0.203247,-0.203247,-0.202696,-0.203247,-0.203247,...,-0.038926,-0.052286,-0.031002,-0.030753,-0.191517,-0.203247,-0.197212,-0.19485,-0.191935,-0.203247
2,Mrpl15,6.487705,3.042429,3.694001,2.108774,-0.373438,0.177227,-0.121857,1.226623,0.247997,...,-0.551519,-0.603537,-0.590422,-0.588911,0.882692,0.652247,0.967914,0.412631,0.397471,0.45826
3,Lypla1,5.01725,4.331312,4.547355,4.99124,-0.81386,-0.554094,-0.731604,-0.371866,-0.723954,...,-0.199951,-0.317052,-0.207565,-0.313447,0.59683,0.624814,0.849883,0.131341,0.17032,0.233833
4,Tcea1,8.486859,5.057495,6.88478,4.994615,0.277471,1.280629,0.843965,1.616705,0.315265,...,0.24652,0.386121,0.234953,0.112505,1.139192,1.139515,1.191143,0.031718,0.248511,0.233001


In [13]:
data_output.to_feather(OUTPUT_FILES["mice_gene_expr_data_feather_prep"])

# Save log2(data + 1)

In [14]:
data_log = np.log2(data.T + 3).reset_index()
data_log.head()

  result = func(self.values, **kwargs)


Unnamed: 0,index,SRR1557113,SRR1557114,SRR1557115,SRR1557116,SRR1557117,SRR1557118,SRR1557119,SRR1557120,SRR1557121,...,ERR1082767,ERR1082768,ERR1082769,ERR1082770,SRR5445512,SRR5445513,SRR5445514,SRR5445515,SRR5445516,SRR5445517
0,Xkr4,-0.562991,-0.440367,-0.398293,-0.289615,-0.596423,-0.606408,-0.61557,-0.617232,-0.605064,...,0.486942,0.52762,0.145074,0.213288,-0.612134,-0.619824,-0.619824,-0.619824,-0.619824,-0.608283
1,Sox17,-0.290706,-0.301129,-0.1995,1.567216,-0.327796,-0.327796,-0.326798,-0.327796,-0.327796,...,-0.057281,-0.077476,-0.045435,-0.045064,-0.30671,-0.327796,-0.316909,-0.312671,-0.307457,-0.327796
2,Mrpl15,2.904524,2.015223,2.230818,1.636346,-0.67447,0.235393,-0.187472,1.154857,0.319614,...,-1.156881,-1.334741,-1.28779,-1.282476,0.912797,0.724429,0.976667,0.498385,0.482818,0.544248
3,Lypla1,2.589104,2.414491,2.4718,2.582855,-2.425538,-1.165188,-1.897567,-0.670856,-1.857018,...,-0.321839,-0.550151,-0.335635,-0.542558,0.675211,0.700275,0.887434,0.178034,0.226903,0.303148
4,Tcea1,3.245931,2.598721,2.97907,2.583667,0.353291,1.189432,0.882811,1.387751,0.395353,...,0.317906,0.471053,0.304456,0.153812,1.097066,1.097284,1.131684,0.045049,0.320208,0.302174


In [15]:
data_log.to_feather(OUTPUT_FILES["mice_gene_expr_data_feather_log"])

# Save data log2 z-scaled

In [16]:
# Standardize the features
scaler = StandardScaler().set_output(transform="pandas")
data_scaled = scaler.fit_transform(data_log.set_index("index").T)

In [17]:
data_scaled.head()

Unnamed: 0,Xkr4,Sox17,Mrpl15,Lypla1,Tcea1,Rgs20,Atp6v1h,St18,Pcmtd1,Vcpip1,...,Ofd1,Trappc2,Rab9,Egfl6,Tmsb4x,Tlr8,Tlr7,Msl3,Amelx,Uty
SRR1557113,-0.429182,-0.429328,2.753134,2.123783,2.618757,0.727303,1.801635,-0.44855,1.24703,0.881229,...,-1.848654,-1.389378,-2.324218,-0.363045,-0.952396,-0.33607,-0.421933,-1.668208,-0.030982,-0.68213
SRR1557114,-0.257182,-0.453618,2.018248,2.00474,2.159124,1.115287,1.451226,-0.45273,1.224444,0.729137,...,-1.848654,-1.130084,-1.514492,-0.363045,-0.95247,-0.33607,-0.421933,-1.676506,-0.030982,-0.910123
SRR1557115,-0.198167,-0.216779,2.196409,2.04381,2.429239,1.104565,1.653104,-0.459262,1.076886,0.956651,...,-1.754938,-1.403754,-2.324218,-0.363045,-0.823909,-0.33607,-0.421933,-1.676506,-0.030982,-0.93773
SRR1557116,-0.045728,3.900443,1.705158,2.119522,2.148432,0.691757,1.729303,-0.457606,0.998382,1.136852,...,-1.848654,-1.38546,-2.297488,-0.363045,-0.556139,-0.33607,0.898542,-1.664434,-0.030982,-0.93773
SRR1557117,-0.476075,-0.515764,-0.204416,-1.29495,0.564471,0.821685,-1.11708,-0.426688,0.628403,1.141757,...,0.4374,0.473904,-2.289214,-0.363045,-0.680415,-0.33607,-0.407301,-1.605079,-0.030982,-0.93773


In [18]:
data_scaled.iloc[:, 1:10].describe()

Unnamed: 0,Sox17,Mrpl15,Lypla1,Tcea1,Rgs20,Atp6v1h,St18,Pcmtd1,Vcpip1
count,190110.0,187365.0,184591.0,181883.0,190110.0,172643.0,190110.0,169092.0,170515.0
mean,3.309998e-13,1.170908e-15,-4.452156e-15,-9.917119e-15,-4.683856e-14,-3.141496e-15,-3.444093e-13,8.81266e-16,-4.427396e-15
std,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003
min,-0.5157635,-13.05303,-11.06513,-9.773286,-0.5861746,-10.11388,-0.463957,-9.949662,-10.02758
25%,-0.5157635,-0.597733,-0.5493207,-0.5048116,-0.5861746,-0.5301493,-0.463957,-0.4576467,-0.4652393
50%,-0.3406386,0.01314759,0.1181438,0.09200356,-0.5337198,0.1507604,-0.443047,0.1881375,0.1652093
75%,0.06866814,0.5733822,0.6585958,0.6579146,0.1696115,0.714791,-0.07258752,0.6670549,0.6519798
max,16.96133,3.97585,3.345479,4.589644,8.93091,3.875423,13.63901,3.058039,3.524685


In [19]:
data_output = data_scaled.T.reset_index()
data_output.head()

Unnamed: 0,index,SRR1557113,SRR1557114,SRR1557115,SRR1557116,SRR1557117,SRR1557118,SRR1557119,SRR1557120,SRR1557121,...,ERR1082767,ERR1082768,ERR1082769,ERR1082770,SRR5445512,SRR5445513,SRR5445514,SRR5445515,SRR5445516,SRR5445517
0,Xkr4,-0.429182,-0.257182,-0.198167,-0.045728,-0.476075,-0.49008,-0.502932,-0.505263,-0.488195,...,1.043512,1.100569,0.56399,0.65967,-0.498113,-0.508899,-0.508899,-0.508899,-0.508899,-0.49271
1,Sox17,-0.429328,-0.453618,-0.216779,3.900443,-0.515764,-0.515764,-0.51344,-0.515764,-0.515764,...,0.114654,0.06759,0.142261,0.143125,-0.466626,-0.515764,-0.490392,-0.480517,-0.468365,-0.515764
2,Mrpl15,2.753134,2.018248,2.196409,1.705158,-0.204416,0.547462,0.198022,1.307274,0.617059,...,-0.603062,-0.75004,-0.711241,-0.706849,1.107244,0.951584,1.160024,0.764789,0.751925,0.802689
3,Lypla1,2.123783,2.00474,2.04381,2.119522,-1.29495,-0.435707,-0.935006,-0.098696,-0.907362,...,0.139246,-0.016405,0.129841,-0.011229,0.818986,0.836073,0.963669,0.480035,0.513351,0.565331
4,Tcea1,2.618757,2.159124,2.429239,2.148432,0.564471,1.158279,0.940524,1.299121,0.594343,...,0.539341,0.648103,0.529789,0.422805,1.092683,1.092838,1.117268,0.345564,0.540976,0.528169


In [20]:
data_output.to_feather(OUTPUT_FILES["mice_gene_expr_data_feather_std"])