# Coupled Sensor Selection

This notebook aims to perform sensor selection on reprogrammed/reprogramming cells in the 2017 data set by utilizing information from the 2015 data set.

In [1]:
# Imports
import pandas as pd
import numpy as np
from copy import deepcopy
import os
import sys
from importlib import reload
from scipy.stats import zscore
import scipy.io
from scipy.interpolate import make_interp_spline, BSpline
from scipy.signal import savgol_filter as savgol 
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
import leidenalg
import umap
import time
import gget
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from scipy.signal import savgol_filter
import scipy
import textwrap
from scipy import sparse

from pydmd import DMD

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# local imports
sys.path.append("../python/")

import nb_util as nb
import utils as ut
reload(ut)

import hasnain_DMD
reload(hasnain_DMD)

def getC(n, idxs):
    """Build a row-selection matrix with one row per entry of ``idxs``.

    Row ``i`` of the result is all zeros except for a single 1 in column
    ``int(idxs[i])``.

    Parameters
    ----------
    n : int
        Number of columns of the returned matrix.
    idxs : sequence
        Column positions to mark; each entry is converted with ``int``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(idxs), n)``.
    """
    n_rows = len(idxs)
    C = np.zeros((n_rows, n))
    # One column position per output row, converted exactly like int(idxs[i]).
    cols = np.array([int(idxs[r]) for r in range(n_rows)], dtype=int)
    # Pair row r with column cols[r] and set all of them in one assignment.
    C[np.arange(n_rows), cols] = 1
    return C

## Load/Filter/reprocess Data

### 2015 Cell Cycle Data

In [2]:
# 2015 cell-cycle data: read the raw count matrix, remove genes by name
# prefix, then look up gene lengths for the genes that remain.
# (Loading steps taken exactly from Cooper.)
data_path = "/nfs/turbo/umms-indikar/shared/projects/cell_cycle/data/RNA_pipeline_ouputs/countMatrix/counts.raw.txt"
gene_path = "/nfs/turbo/umms-indikar/shared/projects/cell_cycle/data/RNA_pipeline_ouputs/references/geneTable.csv"

# Raw expression counts; the first CSV column becomes the row index.
df = pd.read_csv(data_path, index_col=0)

# Genes to discard, selected purely by name prefix.
# NOTE(review): the 'RP' prefix matches every gene whose name starts with
# "RP", which is broader than ribosomal-protein genes alone -- confirm that
# this breadth is intended before relying on the "ribosomal" label.
all_genes = df.index.to_list()
mt_genes = [gene for gene in all_genes if gene.startswith('MT-')]
rp_genes = [gene for gene in all_genes if gene.startswith('RP')]

# Report the matrix shape before and after the two drops.
print(f"{df.shape=}")
df = df.drop(index=mt_genes).drop(index=rp_genes)
print(f"{df.shape=}")

# Gene list after both the MT- and RP-prefixed genes have been removed.
gene_names = df.index.to_list()

print(f"{len(all_genes)=} {len(mt_genes)=} {len(gene_names)=}")

# Gene lengths for the retained genes (used by the TPM normalisation cell).
gf = nb.getGeneLengths(gene_path, gene_names)
print(f"{gf.shape=}")

df.shape=(19393, 18)
df.shape=(19235, 18)
len(all_genes)=19393 len(mt_genes)=13 len(gene_names)=19235


  gf = pd.read_csv(gene_table_path)


gf.shape=(19235, 2)


Unnamed: 0_level_0,S1a,S1b,S2a,S2b,S3a,S3b,S4a,S4b,S5a,S5b,S6a,S6b,S7a,S7b,S8a,S8b,S9a,S9b
geneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
A1BG,12,5,5,9,2,6,7,3,6,5,5,5,4,3,4,5,5,3
A1CF,0,0,0,0,0,0,0,0,0,0,0,1,2,0,2,0,0,0
A2M,0,0,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,2
A2ML1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
A3GALT2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [3]:
# Settings for normalisation, expression filtering and the DMD fit.
target = 1e6      # forwarded to nb.TPM as `target`
threshold = 0.5   # a gene is kept when its mean TPM is strictly above this
rank = 7          # forwarded to hasnain_DMD.dmd as `rank`

tpm = nb.TPM(df, gf, target=target)

# Row-wise mean (one value per gene) and the genes that clear the threshold.
tpm_dist = tpm.mean(axis=1)
mask = tpm_dist > threshold
high_exp_genes = tpm_dist.loc[mask].index.to_list()

# Keep only the highly expressed genes, then hand the table to nb.data2DMD
# (the original author's note says this step converts to fold changes).
d = tpm.loc[tpm.index.isin(high_exp_genes)]
dmd_data = nb.data2DMD(d)
print(f"{dmd_data.shape=}")

dmd_res = hasnain_DMD.dmd(dmd_data, rank=rank)

# Shapes of the individual entries of the DMD result.
print(f"{dmd_res['A'].shape=}")
print(f"{dmd_res['Atilde'].shape=}")
print(f"{dmd_res['u_r'].shape=}")
print(f"{dmd_res['L'].shape=}")
print(f"{dmd_res['W'].shape=}")
print(f"{dmd_res['Phi'].shape=}")
print(f"{dmd_res['amplitudes'][0].shape=}")
print('done')

dmd_data.shape=(8112, 8, 2)
dmd_res['A'].shape=(8112, 8112)
dmd_res['Atilde'].shape=(7, 7)
dmd_res['u_r'].shape=(8112, 7)
dmd_res['L'].shape=(7,)
dmd_res['W'].shape=(7, 7)
dmd_res['Phi'].shape=(8112, 7)
dmd_res['amplitudes'][0].shape=(7, 8)
done


### 2017 Reprogramming Data

In [4]:
# Reprogramming data: read the raw count matrix, remove genes by name
# prefix, then look up gene lengths for the genes that remain.
# (Loading steps taken exactly from Cooper.)
# NOTE(review): the counts file sits under a "2018_rna" directory while this
# section is titled 2017 -- confirm this is the intended file.
data_path = "/nfs/turbo/umms-indikar/shared/projects/myod/data/rnaseq/2018_rna/countMatrix/counts.raw.txt"
gene_path = "/nfs/turbo/umms-indikar/shared/projects/cell_cycle/data/RNA_pipeline_ouputs/references/geneTable.csv"

# Raw expression counts; the first CSV column becomes the row index.
df = pd.read_csv(data_path, index_col=0)

# Genes to discard, selected purely by name prefix.
# NOTE(review): the 'RP' prefix matches every gene whose name starts with
# "RP", which is broader than ribosomal-protein genes alone -- confirm that
# this breadth is intended before relying on the "ribosomal" label.
all_genes = df.index.to_list()
mt_genes = [gene for gene in all_genes if gene.startswith('MT-')]
rp_genes = [gene for gene in all_genes if gene.startswith('RP')]

# Report the matrix shape before and after the two drops.
print(f"{df.shape=}")
df = df.drop(index=mt_genes).drop(index=rp_genes)
print(f"{df.shape=}")

# Gene list after both the MT- and RP-prefixed genes have been removed.
gene_names = df.index.to_list()

print(f"{len(all_genes)=} {len(mt_genes)=} {len(gene_names)=}")

# Gene lengths for the retained genes (used by the TPM normalisation cell).
gf = nb.getGeneLengths(gene_path, gene_names)
print(f"{gf.shape=}")

df.shape=(19393, 48)
df.shape=(19235, 48)
len(all_genes)=19393 len(mt_genes)=13 len(gene_names)=19235


  gf = pd.read_csv(gene_table_path)


gf.shape=(19235, 2)


In [7]:
# TPM-normalise the reprogramming counts, keep highly expressed genes, and
# fit DMD -- the same steps as the 2015 cell above, applied to the current
# `df` / `gf`.
# NOTE(review): the recorded output of this cell is
# "IndexError: list index out of range". The traceback is not shown, so the
# failing call is unknown. The reprogramming columns are labelled like
# "63246_T0R1" whereas the 2015 columns are labelled like "S1a"; presumably a
# helper (nb.data2DMD?) parses the 2015 label format -- TODO confirm and adapt.
target = 1e6
threshold = 0.5
rank = 7
tpm = nb.TPM(df, gf, target=target)

# get highly expressed genes
# (row-wise mean per gene, kept when strictly above `threshold`)
tpm_dist = tpm.mean(axis=1)
mask = (tpm_dist > threshold)
high_exp_genes = tpm_dist[mask].index.to_list()    

# filter and convert to fold changes
d = tpm[tpm.index.isin(high_exp_genes)]
dmd_data = nb.data2DMD(d) 
print(f"{dmd_data.shape=}")

dmd_res = hasnain_DMD.dmd(dmd_data, rank=rank)

# Shapes of the individual entries of the DMD result.
print(f"{dmd_res['A'].shape=}")
print(f"{dmd_res['Atilde'].shape=}")
print(f"{dmd_res['u_r'].shape=}")
print(f"{dmd_res['L'].shape=}")
print(f"{dmd_res['W'].shape=}")
print(f"{dmd_res['Phi'].shape=}")
print(f"{dmd_res['amplitudes'][0].shape=}")
print('done')

IndexError: list index out of range

## Scratch

In [5]:
# Scratch: show the first lines of the 2015 raw count file to inspect its column labels.
!head /nfs/turbo/umms-indikar/shared/projects/cell_cycle/data/RNA_pipeline_ouputs/countMatrix/counts.raw.txt

geneName,S1a,S1b,S2a,S2b,S3a,S3b,S4a,S4b,S5a,S5b,S6a,S6b,S7a,S7b,S8a,S8b,S9a,S9b
A1BG,12,5,5,9,2,6,7,3,6,5,5,5,4,3,4,5,5,3
A1CF,0,0,0,0,0,0,0,0,0,0,0,1,2,0,2,0,0,0
A2M,0,0,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,2
A2ML1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
A3GALT2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
A4GALT,150,133,122,137,143,153,116,131,111,109,117,141,137,138,94,68,155,134
A4GNT,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0
AAAS,330,342,507,563,535,548,346,328,438,415,387,387,341,338,409,437,416,469
AACS,473,464,565,519,481,483,526,484,563,635,480,478,467,523,413,456,485,503


In [6]:
# Scratch: show the first lines of the reprogramming raw count file to inspect its column labels.
!head /nfs/turbo/umms-indikar/shared/projects/myod/data/rnaseq/2018_rna/countMatrix/counts.raw.txt

geneName,63246_T0R1,63252_T1R1,63249_T2R1,63261_T3R1,63258_T4R1,63255_T5R1,63270_T6R1,63267_T7R1,63264_T8R1,63279_T9R1,63276_T10R1,63273_T11R1,63288_T12R1,63285_T13R1,63282_T14R1,63291_T15R1,63247_T0R2,63253_T1R2,63250_T2R2,63262_T3R2,63259_T4R2,63256_T5R2,63271_T6R2,63268_T7R2,63265_T8R2,63280_T9R2,63277_T10R2,63274_T11R2,63289_T12R2,63286_T13R2,63283_T14R2,63292_T15R2,63248_T0R3,63254_T1R3,63251_T2R3,63263_T3R3,63260_T4R3,63257_T5R3,63272_T6R3,63269_T7R3,63266_T8R3,63281_T9R3,63278_T10R3,63275_T11R3,63290_T12R3,63287_T13R3,63284_T14R3,63293_T15R3
A1BG,12,26,6,14,13,11,30,13,17,11,13,17,22,3,7,7,17,21,5,13,11,17,22,14,18,28,23,11,16,3,2,9,17,19,8,17,15,5,19,18,27,15,22,14,14,8,14,11
A1CF,1,1,0,0,0,1,4,1,1,0,1,0,1,7,0,0,0,1,0,0,1,4,0,5,0,2,1,1,2,3,0,0,0,0,2,1,0,1,2,2,1,1,0,3,0,5,0,1
A2M,3595,5795,3903,4714,2423,2915,4399,2681,3036,3123,4190,2473,3763,3175,2525,3150,3820,4189,4162,3073,2746,3736,3397,3171,3737,3834,4964,2631,3831,2714,1513,2974,3607,2751,3929,3332,2845,2806,3952,3457,36