# Coupled Sensor Selection

This notebook aims to perform sensor selection on reprogrammed/reprogramming cells in the 2017 data set by utilizing information in the 2015 data set.

In [1]:
# Imports
import pandas as pd
import numpy as np
from copy import deepcopy
import os
import sys
from importlib import reload
from scipy.stats import zscore
import scipy.io
from scipy.interpolate import make_interp_spline, BSpline
from scipy.signal import savgol_filter as savgol 
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
import leidenalg
import umap
import time
import gget
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from scipy.signal import savgol_filter
import scipy
import textwrap
from scipy import sparse
import importlib

from pydmd import DMD

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# local imports
sys.path.append("../python/")

import nb_util as nb
import utils as ut
reload(ut)

import hasnain_DMD
reload(hasnain_DMD)

import sensorSelection
reload(sensorSelection)

def getC(n, idxs):
    """Build a 0/1 selection matrix with one row per requested column index.

    Parameters
    ----------
    n : int
        Number of columns of the returned matrix.
    idxs : sequence
        Column positions to mark; entry ``i`` is read as ``idxs[i]`` and
        cast with ``int()``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(idxs), n)`` filled with zeros except for a
        single 1 in row ``i`` at column ``int(idxs[i])``.
    """
    n_rows = len(idxs)
    selector = np.zeros((n_rows, n))
    # Resolve every column position up front, then set all entries at once.
    columns = np.array([int(idxs[row]) for row in range(n_rows)], dtype=int)
    selector[np.arange(n_rows), columns] = 1
    return selector

## Load/Filter/Preprocess Data

### 2015 Cell Cycle Data

In [2]:
# 2015 cell-cycle data: load raw counts, drop unwanted genes, normalise with
# nb.TPM, keep highly expressed genes and reshape for DMD.
# (Pipeline taken exactly from Cooper.)
data_path = "/nfs/turbo/umms-indikar/shared/projects/cell_cycle/data/RNA_pipeline_ouputs/countMatrix/counts.raw.txt"
gene_path = "/nfs/turbo/umms-indikar/shared/projects/cell_cycle/data/RNA_pipeline_ouputs/references/geneTable.csv"

# Settings for this cell.
target = 1e6      # forwarded to nb.TPM as `target`
threshold = 0.5   # a gene is kept when its mean TPM across columns exceeds this
rank = 7          # defined here but not used in this cell

# Raw expression table; the first CSV column becomes the index.
df = pd.read_csv(data_path, index_col=0)

# Genes to exclude, chosen by name prefix.
# NOTE(review): the 'RP' prefix matches every name starting with "RP", which
# is broader than RPL*/RPS* ribosomal genes (e.g. RPA1 or RPE65 would match
# too) - confirm this is intended. The 2017 cell applies the same filter and
# then indexes with `high_exp_genes`, so any change must be made in both.
all_genes = df.index.to_list()
mt_genes = [gene for gene in all_genes if gene.startswith('MT-')]
rp_genes = [gene for gene in all_genes if gene.startswith('RP')]

print(f"{df.shape=}")
df = df.drop(mt_genes).drop(rp_genes)
print(f"{df.shape=}")

# Gene list after both the MT- and RP* rows were removed.
# (The summary below does not report len(rp_genes).)
gene_names = df.index.to_list()
print(f"{len(all_genes)=} {len(mt_genes)=} {len(gene_names)=}")

# Gene lengths for the remaining genes, then TPM normalisation.
gf = nb.getGeneLengths(gene_path, gene_names)
tpm = nb.TPM(df, gf, target=target)

# Highly expressed genes: mean TPM over all columns above `threshold`.
tpm_dist = tpm.mean(axis=1)
mask = tpm_dist > threshold
high_exp_genes = tpm_dist.loc[mask].index.to_list()

# Restrict to those genes and convert to the DMD input array.
d = tpm.loc[tpm.index.isin(high_exp_genes)]
dmd_data_2015 = nb.data2DMD(d)


df.shape=(19393, 18)
df.shape=(19235, 18)
len(all_genes)=19393 len(mt_genes)=13 len(gene_names)=19235


  gf = pd.read_csv(gene_table_path)


### 2017 Reprogramming Data

In [3]:
# Reprogramming data: load raw counts, drop unwanted genes, normalise with
# nb.TPM and restrict to the gene set chosen from the 2015 data.
# (Pipeline taken exactly from Cooper.)
# NOTE(review): the section header says 2017 but the directory is `2018_rna` -
# confirm this is the intended data set.
data_path = "/nfs/turbo/umms-indikar/shared/projects/myod/data/rnaseq/2018_rna/countMatrix/counts.raw.txt"
gene_path = "/nfs/turbo/umms-indikar/shared/projects/cell_cycle/data/RNA_pipeline_ouputs/references/geneTable.csv"
target = 1e6  # forwarded to nb.TPM as `target`

# Raw expression table; the first CSV column becomes the index.
df = pd.read_csv(data_path, index_col=0)

# Genes to exclude, chosen by name prefix (same filter as the 2015 cell).
# NOTE(review): 'RP' matches more than RPL*/RPS* ribosomal genes - confirm.
all_genes = df.index.to_list()
mt_genes = [gene for gene in all_genes if gene.startswith('MT-')]
rp_genes = [gene for gene in all_genes if gene.startswith('RP')]

print(f"{df.shape=}")
df = df.drop(mt_genes).drop(rp_genes)
print(f"{df.shape=}")

# Gene list after both the MT- and RP* rows were removed.
gene_names = df.index.to_list()
print(f"{len(all_genes)=} {len(mt_genes)=} {len(gene_names)=}")

# Gene lengths for the remaining genes, then TPM normalisation.
gf = nb.getGeneLengths(gene_path, gene_names)

# No expression threshold is applied here: rows are selected with
# `high_exp_genes`, which must already exist in the kernel (the 2015 cell
# defines it). `.loc` raises KeyError if any of those genes is missing.
tpm = nb.TPM(df, gf, target=target).loc[high_exp_genes]
dmd_data_2017 = nb.data2DMD2017(tpm)

df.shape=(19393, 48)
df.shape=(19235, 48)
len(all_genes)=19393 len(mt_genes)=13 len(gene_names)=19235


  gf = pd.read_csv(gene_table_path)


In [4]:
# Print the shape of each DMD input array: 2017 first, then 2015.
for dmd_array in (dmd_data_2017, dmd_data_2015):
    print(dmd_array.shape)

(8112, 15, 2)
(8112, 8, 2)


## Comparative Sensor Selection

In [6]:
# Re-import the local module so edits to sensorSelection.py take effect.
importlib.reload(sensorSelection)

<module 'sensorSelection' from '/home/jpic/DMD_gene/notebooks/../python/sensorSelection.py'>

In [7]:
# Sensor selection on the 2015 DMD array; `high_exp_genes` is passed as vxNames.
ss2015 = sensorSelection.hasnain2023(
    dmd_data_2015,
    dmd_rank=7,
    gramT=10,
    vxNames=high_exp_genes,
)


In [10]:
# Sensor selection on the 2017 DMD array, with the same settings and gene
# names as the 2015 run.
ss2017 = sensorSelection.hasnain2023(
    dmd_data_2017,
    dmd_rank=7,
    gramT=10,
    vxNames=high_exp_genes,
)


In [9]:
# Display the "sensors" table of the 2015 result (gene, ev1, weight, rank).
ss2015["sensors"]

Unnamed: 0,gene,ev1,weight,rank
2480,GAPDH,0.451304+0.000000j,0.451304,1.0
6966,TMSB4X,0.343204+0.000000j,0.343204,2.0
573,B2M,0.250874+0.000000j,0.250874,3.0
2927,HLA-DRA,0.238704+0.000000j,0.238704,4.0
2433,FTL,0.212981+0.000000j,0.212981,5.0
...,...,...,...,...
4711,PCDHGC3,-0.002543+0.000000j,-0.002543,8108.0
4390,NMB,-0.002548+0.000000j,-0.002548,8109.0
4617,OXCT2,-0.002552+0.000000j,-0.002552,8110.0
6994,TNFSF13,-0.002555+0.000000j,-0.002555,8111.0


In [13]:
# Display the "sensors" table of the 2017 result (gene, ev1, weight, rank).
# NOTE(review): in the saved output the largest-magnitude weights are negative
# and sit at the bottom of the ranking (e.g. GAPDH, VIM, S100A6), while many
# genes tie at rank 22 with a tiny positive weight. If `weight` comes from an
# eigenvector its overall sign may be arbitrary - confirm whether the ranking
# in sensorSelection should use the absolute value.
ss2017["sensors"]

Unnamed: 0,gene,ev1,weight,rank
6042,SIRPG,0.002753+0.000000j,0.002753,22.0
104,ACY3,0.002753+0.000000j,0.002753,22.0
3493,LGALS14,0.002753+0.000000j,0.002753,22.0
3441,LAIR1,0.002753+0.000000j,0.002753,22.0
996,CCL22,0.002753+0.000000j,0.002753,22.0
...,...,...,...,...
6965,TMSB10,-0.186987+0.000000j,-0.186987,8108.0
2480,GAPDH,-0.233729+0.000000j,-0.233729,8109.0
7497,VIM,-0.271955+0.000000j,-0.271955,8110.0
5790,S100A6,-0.294759+0.000000j,-0.294759,8111.0


In [None]:
# Rows of `tpm` for the genes in `high_exp_genes`, shown as the cell output.
subset_tpm = tpm.loc[high_exp_genes, :]
subset_tpm

In [None]:
# Stand-alone DMD of the reprogramming data using its own expression filter.
# Expects `df` and `gf` to be the count table and gene lengths left in the
# kernel by the 2017 loading cell.
#
# WARNING: this cell rebinds `tpm`, `high_exp_genes` and `dmd_data_2017`.
# After running it, `high_exp_genes` is derived from this data set rather than
# from the 2015 data, so re-run the loading cells before repeating the
# comparative sensor selection.
target = 1e6      # forwarded to nb.TPM as `target`
threshold = 0.5   # a gene is kept when its mean TPM across columns exceeds this
rank = 7          # forwarded to hasnain_DMD.dmd as `rank`
tpm = nb.TPM(df, gf, target=target)

# get highly expressed genes
tpm_dist = tpm.mean(axis=1)
mask = (tpm_dist > threshold)
high_exp_genes = tpm_dist[mask].index.to_list()

# filter and convert to fold changes
d = tpm[tpm.index.isin(high_exp_genes)]
dmd_data_2017 = nb.data2DMD2017(d)
# Fix: the original referenced an undefined name `dmd_data` here and in the
# DMD call below, which raised NameError; the array is `dmd_data_2017`.
print(f"{dmd_data_2017.shape=}")

dmd_res = hasnain_DMD.dmd(dmd_data_2017, rank=rank)

# Report the shape of each entry of the DMD result.
print(f"{dmd_res['A'].shape=}")
print(f"{dmd_res['Atilde'].shape=}")
print(f"{dmd_res['u_r'].shape=}")
print(f"{dmd_res['L'].shape=}")
print(f"{dmd_res['W'].shape=}")
print(f"{dmd_res['Phi'].shape=}")
print(f"{dmd_res['amplitudes'][0].shape=}")
print('done')

## Scratch

In [None]:
# Scratch: peek at the first lines of the raw count file used for the 2015 cell-cycle data.
!head /nfs/turbo/umms-indikar/shared/projects/cell_cycle/data/RNA_pipeline_ouputs/countMatrix/counts.raw.txt

In [None]:
# Scratch: peek at the first lines of the raw count file used for the reprogramming data.
!head /nfs/turbo/umms-indikar/shared/projects/myod/data/rnaseq/2018_rna/countMatrix/counts.raw.txt