In [101]:
import pandas as pd
import numpy as np
import os
import sys
from scipy.stats import zscore
import scipy.io

import matplotlib.pyplot as plt
import seaborn as sns

# Load MyoD Data

In [87]:
# Raw MyoD reprogramming RNA-seq counts, stored as a MATLAB .mat file.
fpath = "/nfs/turbo/umms-indikar/shared/projects/myod/archive/myod_reprogramming/processed/rnaseq/MyoD_gene_Rna_raw.mat"
mat = scipy.io.loadmat(fpath)

# Sample labels (time point + replicate id); spaces are replaced by
# underscores so each label is a single token such as "T0_R1".
columns = [
    "".join(label_cell[0]).replace(" ", "_")
    for label_cell in mat['run_info_indiv'][0]
]

# One gene name per row of the expression matrix.
gene_names = [f"{name_cell[0][0]}" for name_cell in mat['gene_names']]

# Expression matrix, laid out as genes (rows) x samples (columns).
data = mat['RNA_gene_indiv']

# Sanity check: the label counts should match the matrix dimensions.
print(f"{len(columns)=}")
print(f"{len(gene_names)=}")
print(f"{data.shape=}")

# Build the frame as genes x samples, then flip it so that every row is a
# sample and every column is a gene.
df = pd.DataFrame(data, index=gene_names, columns=columns).T

# Persist the cleaned table, keeping the sample labels as the index column.
outpath = "/nfs/turbo/umms-indikar/shared/projects/myod/clean_data/time_series_rna.csv"
df.to_csv(outpath, index=True)

df.head()

len(columns)=48
len(gene_names)=23614
data.shape=(23614, 48)


Unnamed: 0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A4GALT,A4GNT,AA06,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
T0_R1,269.456,86.4078,2.27389,12246.0,20.465,5.68472,0.0,1058.5,34.1083,0.0,...,766.297,790.176,108.01,370.644,1078.96,40.93,1583.76,8782.9,1965.78,1799.78
T0_R2,250.517,82.75,3.40069,12650.6,11.3356,9.0685,0.0,941.99,23.8048,0.0,...,711.877,757.219,114.49,363.873,1257.12,40.8082,1965.6,7666.28,2357.81,1912.32
T0_R3,335.694,85.9457,1.01113,11036.4,13.1446,6.06676,0.0,1231.55,36.4005,0.0,...,691.61,768.456,106.168,277.049,1046.52,33.3672,1342.78,9807.92,1801.83,1457.03
T1_R1,222.507,91.352,0.652514,10989.0,36.5408,6.52514,0.0,811.727,19.5754,0.0,...,1092.96,1048.59,110.275,299.504,1192.14,33.9307,2291.63,7691.83,2130.46,1710.24
T1_R2,274.679,112.329,3.51027,10698.4,22.8168,7.02054,0.877568,799.464,26.327,0.0,...,1089.06,1016.22,105.308,354.537,1161.02,39.4906,2292.21,8713.37,2012.26,1720.03


In [102]:
""" use the initial time point as a control"""

# Select the control replicates by their "T0_" sample label rather than by
# position, so the baseline stays correct even if the row order or the
# number of replicates per time point changes.
is_control = df.index.str.startswith("T0_")
if not is_control.any():
    raise ValueError("no 'T0_' samples found in df.index; cannot build the control baseline")

# Per-gene mean expression across the control replicates.
init = df.loc[is_control].mean().to_numpy()

# Repeat the baseline once per sample so it lines up with df row-for-row.
control_matrix = np.tile(init, [len(df), 1])

print(f"{control_matrix.shape=}")
print(f"{df.shape=}")

# Fold change of every sample relative to the control baseline. The +1
# pseudocount on both sides keeps the denominator non-zero for genes whose
# control mean is 0.
fold_changes = (df + 1) / (control_matrix + 1)
print(f"{fold_changes.shape=}")

# Persist the fold changes, keeping the sample labels as the index column.
outpath = f"/nfs/turbo/umms-indikar/shared/projects/myod/clean_data/time_series_rna_fold_changes.csv"
fold_changes.to_csv(outpath, index=True)

fold_changes.head()

control_matrix.shape=(48, 23614)
df.shape=(48, 23614)
fold_changes.shape=(48, 23614)


Unnamed: 0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A4GALT,A4GNT,AA06,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
T0_R1,0.944916,1.015962,1.014037,1.022401,1.343096,0.841905,1.0,0.982523,1.082325,1.0,...,1.05942,1.023579,0.986016,1.098925,0.956959,1.065066,0.971225,1.003488,0.962783,1.04451
T0_R2,0.878747,0.973447,1.363046,1.056178,0.771856,1.268074,1.0,0.874478,0.764687,1.0,...,0.984281,0.980941,1.044629,1.078904,1.114828,1.061972,1.205236,0.875924,1.154691,1.109786
T0_R3,1.176337,1.010591,0.622917,0.921421,0.885048,0.890021,1.0,1.143,1.152989,1.0,...,0.956298,0.995479,0.969355,0.822171,0.928214,0.872963,0.823539,1.120588,0.882526,0.845704
T1_R1,0.780886,1.07343,0.511841,0.917464,2.348982,0.947751,1.0,0.753679,0.634302,1.0,...,1.510449,1.357901,1.006503,0.888569,1.057248,0.887276,1.405045,0.878842,1.043398,0.992574
T1_R2,0.963164,1.317251,1.396987,0.893205,1.490251,1.010144,1.877568,0.742307,0.842441,1.0,...,1.505064,1.316022,0.961576,1.051298,1.029673,1.028503,1.4054,0.995545,0.985536,0.998252


In [111]:
""" compute z-scores """

# Standardize every gene (column) across samples using the population
# standard deviation (ddof=0).
gene_mean = fold_changes.mean()
gene_std = fold_changes.std(ddof=0)
Z = fold_changes.sub(gene_mean, axis="columns").div(gene_std, axis="columns")

# Genes whose fold change never varies have zero standard deviation, so
# their z-scores are NaN in every sample; drop those all-NaN columns.
Z = Z.dropna(axis="columns", how="all")
print(f"{Z.shape=}")

# Persist the z-scores, keeping the sample labels as the index column.
outpath = "/nfs/turbo/umms-indikar/shared/projects/myod/clean_data/time_series_rna_Z_scores.csv"
Z.to_csv(outpath, index=True)

Z.head()

Z.shape=(48, 20967)


Unnamed: 0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A4GALT,A4GNT,AAAS,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
T0_R1,1.405404,1.354427,-0.350276,1.712226,-0.702826,-0.118115,-0.470459,1.108334,2.080126,0.176877,...,-0.550108,-0.502374,-0.971989,-0.494887,-1.431415,-0.642975,-2.125625,0.878478,-1.223717,-0.486207
T0_R2,1.039275,1.099166,0.086865,1.994941,-1.442005,1.298854,-0.470459,0.605171,0.948936,-0.060846,...,-0.648207,-0.549161,-0.830686,-0.571403,-0.355441,-0.649817,-1.159188,0.271296,0.179373,-0.310175
T0_R3,2.685915,1.322179,-0.840162,0.867014,-1.295536,0.041865,-0.470459,1.855671,2.33178,1.593111,...,-0.684741,-0.533209,-1.012156,-1.552568,-1.627332,-1.067803,-2.735546,1.43585,-1.8105,-1.022328
T1_R1,0.497787,1.699461,-0.979287,0.833893,0.598781,0.233814,-0.470459,0.042615,0.484603,-0.166841,...,0.038745,-0.135519,-0.922598,-1.298813,-0.747879,-1.036149,-0.334006,0.285189,-0.634321,-0.626263
T1_R2,1.506375,3.163352,0.129377,0.630835,-0.512408,0.441264,1.27921,-0.010344,1.22584,0.477273,...,0.031715,-0.181473,-1.030909,-0.676906,-0.935824,-0.723831,-0.332538,0.840669,-1.057363,-0.61095
