In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
import os
import sys
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

from pydmd import DMD

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# local imports
sys.path.append("../python/")
sys.path.append("../")

import nb_util as nb

# 2015 Data

In [2]:
data_path = f"/nfs/turbo/umms-indikar/shared/projects/cell_cycle/data/RNA_pipeline_ouputs/countMatrix/counts.raw.txt"
gene_path = f"/nfs/turbo/umms-indikar/shared/projects/cell_cycle/data/RNA_pipeline_ouputs/references/geneTable.csv"

""" Load the raw expression """
df = pd.read_csv(data_path, index_col=0)

# remove MT and ribosomal genes
all_genes = df.index.to_list()
mt_genes = [x for x in all_genes if x.startswith('MT-')]
rp_genes = [x for x in all_genes if x.startswith('RP')]

print(f"{df.shape=}")
df = df.drop(mt_genes) # drop MT genes
df = df.drop(rp_genes) # drop ribosomal genes
print(f"{df.shape=}")

# rewrite the list without MT genes
gene_names = df.index.to_list()

print(f"{len(all_genes)=} {len(mt_genes)=} {len(gene_names)=}")

""" Load gene lengths """
gf = nb.getGeneLengths(gene_path, gene_names)
print(f"{gf.shape=}")

df.head()

df.shape=(19393, 18)
df.shape=(19235, 18)
len(all_genes)=19393 len(mt_genes)=13 len(gene_names)=19235


  gf = nb.getGeneLengths(gene_path, gene_names)


gf.shape=(19235, 2)


Unnamed: 0_level_0,S1a,S1b,S2a,S2b,S3a,S3b,S4a,S4b,S5a,S5b,S6a,S6b,S7a,S7b,S8a,S8b,S9a,S9b
geneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
A1BG,12,5,5,9,2,6,7,3,6,5,5,5,4,3,4,5,5,3
A1CF,0,0,0,0,0,0,0,0,0,0,0,1,2,0,2,0,0,0
A2M,0,0,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,2
A2ML1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
A3GALT2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [3]:
target = 1e6
tpm = nb.TPM(df, gf, target=target)
tpm = tpm.reset_index(drop=False)
print(f"{tpm.shape=}")
tpm = nb.meltDf(tpm)
print(f"{tpm.shape=}")

column_names = [
    'gene_name', 
    'time_id',
    'tpm',
    'time_point',
    'replicate',
    'control',
    'hours',
]
tpm.columns = column_names
tpm.to_csv('../../data/raw_data/2015_tpm.csv', index=False)
tpm.head()

tpm.shape=(19235, 19)
tpm.shape=(346230, 7)


Unnamed: 0,gene_name,time_id,tpm,time_point,replicate,control,hours
0,A1BG,S1a,0.204405,0,r1c,control,0
1,A1CF,S1a,0.0,0,r1c,control,0
2,A2M,S1a,0.0,0,r1c,control,0
3,A2ML1,S1a,0.0,0,r1c,control,0
4,A3GALT2,S1a,0.0,0,r1c,control,0


In [4]:
print(tpm['control'].value_counts())

timecourse    307760
control        38470
Name: control, dtype: int64


# 2018 Data

In [5]:
data_path = f"/nfs/turbo/umms-indikar/shared/projects/myod/data/rnaseq/2018_rna/countMatrix/counts.raw.txt"
gene_path = f"/nfs/turbo/umms-indikar/shared/projects/myod/data/rnaseq/2018_rna/references/geneTable.csv"

""" Load the raw expression """
df = pd.read_csv(data_path, index_col=0)

# remove MT and ribosomal genes
all_genes = df.index.to_list()
mt_genes = [x for x in all_genes if x.startswith('MT-')]
rp_genes = [x for x in all_genes if x.startswith('RP')]

print(f"{df.shape=}")
df = df.drop(mt_genes) # drop MT genes
df = df.drop(rp_genes) # drop ribosomal genes
print(f"{df.shape=}")

# rewrite the list without MT genes
gene_names = df.index.to_list()

print(f"{len(all_genes)=} {len(mt_genes)=} {len(gene_names)=}")

""" Load gene lengths """
gf = nb.getGeneLengths(gene_path, gene_names)
print(f"{gf.shape=}")

df.head()

df.shape=(19393, 48)
df.shape=(19235, 48)
len(all_genes)=19393 len(mt_genes)=13 len(gene_names)=19235


  gf = nb.getGeneLengths(gene_path, gene_names)


gf.shape=(19235, 2)


Unnamed: 0_level_0,63246_T0R1,63252_T1R1,63249_T2R1,63261_T3R1,63258_T4R1,63255_T5R1,63270_T6R1,63267_T7R1,63264_T8R1,63279_T9R1,...,63272_T6R3,63269_T7R3,63266_T8R3,63281_T9R3,63278_T10R3,63275_T11R3,63290_T12R3,63287_T13R3,63284_T14R3,63293_T15R3
geneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,12,26,6,14,13,11,30,13,17,11,...,19,18,27,15,22,14,14,8,14,11
A1CF,1,1,0,0,0,1,4,1,1,0,...,2,2,1,1,0,3,0,5,0,1
A2M,3595,5795,3903,4714,2423,2915,4399,2681,3036,3123,...,3952,3457,3674,3769,3974,3461,2359,2805,3002,3481
A2ML1,0,1,0,1,0,2,2,1,3,1,...,0,1,0,0,0,0,0,0,1,0
A3GALT2,1,0,0,3,0,1,0,0,1,1,...,4,0,0,1,1,0,2,3,1,0


In [6]:
target = 1e6
tpm = nb.TPM(df, gf, target=target)
tpm = tpm.T
tpm = tpm.reset_index(drop=False)

tpm['sample_id'] = tpm['index'].apply(lambda x: x.split("_")[0])
tpm = tpm.drop(columns=['index'])
tpm['sample_id'] = tpm['sample_id'].astype(int)
print(f"{tpm.shape=}")

tpm.head()

tpm.shape=(48, 19236)


geneName,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,sample_id
0,0.126512,0.001016,6.489032,0.0,0.006116,1.388033,0.191932,1.63149,0.342097,0.006081,...,6.043442,0.871569,2.741955,0.54716,0.016701,0.554385,28.146263,0.478839,0.565802,63246
1,0.179995,0.000667,6.868637,0.000894,0.0,1.262624,0.086233,1.656584,0.858654,0.0,...,8.534292,0.91571,2.16904,0.676496,0.015354,0.927215,27.76455,0.581927,0.616531,63252
2,0.068018,0.0,7.575361,0.0,0.0,1.628216,0.043449,1.521492,0.569929,0.0,...,6.102463,1.799396,3.500129,0.775445,0.016163,0.757767,17.398688,0.580185,0.488433,63249
3,0.104575,0.0,6.028612,0.000965,0.013,0.461917,0.143143,1.598322,0.475989,0.0,...,25.399485,1.11153,3.078792,0.676394,0.060349,1.227617,21.352814,0.518082,0.827932,63261
4,0.196855,0.0,6.281842,0.0,0.0,1.143553,0.145093,1.482675,0.748358,0.0,...,19.880786,0.726076,2.602527,0.710269,0.026388,0.984738,40.824771,0.59964,0.643065,63258


In [7]:
meta_path = "/nfs/turbo/umms-indikar/shared/projects/myod/data/rnaseq_metadata.csv"
meta = pd.read_csv(meta_path)
meta['sample_id'] = meta['sample_id'].astype(int)
meta.head()

Unnamed: 0,sample_id,group_num,description,code,day,timepoint,replicate,hour,time_id
0,63246,1,T1R1,D1_T1R1,1,1,1,-48,0
1,63252,1,T1R7,D1_T2R1,1,2,1,0,1
2,63249,2,T1R4,D1_T3R1,1,3,1,8,2
3,63261,2,T1R16,D2_T1R1,2,1,1,16,3
4,63258,1,T1R13,D2_T2R1,2,2,1,24,4


In [8]:
# merge the metadata
tpm = pd.merge(tpm, 
               meta,
               how='left',
               left_on='sample_id',
               right_on='sample_id')


tpm.head()

Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZZZ3,sample_id,group_num,description,code,day,timepoint,replicate,hour,time_id
0,0.126512,0.001016,6.489032,0.0,0.006116,1.388033,0.191932,1.63149,0.342097,0.006081,...,0.565802,63246,1,T1R1,D1_T1R1,1,1,1,-48,0
1,0.179995,0.000667,6.868637,0.000894,0.0,1.262624,0.086233,1.656584,0.858654,0.0,...,0.616531,63252,1,T1R7,D1_T2R1,1,2,1,0,1
2,0.068018,0.0,7.575361,0.0,0.0,1.628216,0.043449,1.521492,0.569929,0.0,...,0.488433,63249,2,T1R4,D1_T3R1,1,3,1,8,2
3,0.104575,0.0,6.028612,0.000965,0.013,0.461917,0.143143,1.598322,0.475989,0.0,...,0.827932,63261,2,T1R16,D2_T1R1,2,1,1,16,3
4,0.196855,0.0,6.281842,0.0,0.0,1.143553,0.145093,1.482675,0.748358,0.0,...,0.643065,63258,1,T1R13,D2_T2R1,2,2,1,24,4


In [10]:
id_vars = [
    'sample_id', 
    'group_num', 
    'description',	
    'code',	
    'day',	
    'timepoint',	
    'replicate',	
    'hour',	
    'time_id',
]

test = pd.melt(tpm, id_vars=id_vars,)
test = test[['variable', 'code', 'value', 'timepoint', 'replicate', 'hour']]
test.columns = ['gene_name', 'time_id', 'tpm', 'time_point',  'replicate', 'hours']
test['control'] = np.where(test['hours'] < 0, "control", "timecourse")

test.to_csv('../../data/raw_data/2018_tpm.csv', index=False)
test.head()

Unnamed: 0,gene_name,time_id,tpm,time_point,replicate,hours,control
0,A1BG,D1_T1R1,0.126512,1,1,-48,control
1,A1BG,D1_T2R1,0.179995,2,1,0,timecourse
2,A1BG,D1_T3R1,0.068018,3,1,8,timecourse
3,A1BG,D2_T1R1,0.104575,1,1,16,timecourse
4,A1BG,D2_T2R1,0.196855,2,1,24,timecourse
