In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

import scipy.io
import sklearn.preprocessing
import time
import hashlib

from collections import Counter

### define required functions

In [55]:
## convert the sparse matrix into a log x+1 normalized sparse tensor.
def make_log_maxn_tensor(sparseMatrix):
  """Turn a scipy sparse count matrix into a log1p / row-max normalized ``tf.SparseTensor``.

  Steps, applied to the stored (non-zero) entries only:
    1. ``log1p`` transform of every stored value,
    2. division of every row by that row's maximum absolute value
       (``sklearn.preprocessing.normalize`` with ``norm="max", axis=1``),
    3. conversion to a ``tf.SparseTensor`` whose indices are put in canonical
       row-major order by ``tf.sparse.reorder``.

  Args:
    sparseMatrix: a scipy sparse matrix in any format that provides ``tocsr``.
      The notebook passes a cells (rows) x genes (columns) matrix.
      The input is NOT modified; all work happens on a private CSR copy.

  Returns:
    A ``tf.SparseTensor`` with the same shape as ``sparseMatrix``.
  """
  # Work on a private CSR copy: the caller's matrix stays untouched (so re-running
  # the cell cannot log-transform the same data twice) and CSR is the format on
  # which sklearn can normalize rows in place.
  normalized = sparseMatrix.tocsr(copy=True)

  # get the log transform...
  normalized.data = np.log1p(normalized.data)
  # normalize to the max...  Keep the return value: sklearn only works in place
  # when it does not have to convert the input, so relying on the side effect alone
  # can silently skip the normalization.
  normalized = sklearn.preprocessing.normalize(normalized, norm="max", axis=1, copy=False)

  # convert the normalized matrix to COO format to get explicit (row, col) pairs
  coo = normalized.tocoo()
  # one [row, col] pair per stored value; tf.SparseTensor expects int64 indices
  indices = np.column_stack((coo.row, coo.col)).astype(np.int64)

  # make a sparse tensor and re-order it...
  sparse_tensor = tf.SparseTensor(indices, coo.data, coo.shape)
  return tf.sparse.reorder(sparse_tensor)

Copy the data from GitHub to the VM.  We need to install git-lfs because the count matrix is too large to be placed in GitHub directly, so we use Git LFS (Large File Storage).

In [57]:
%%bash
# Download the count matrix, labels, barcodes, genes, and the label encoder used
# during training, then download and unpack the trained model.
# Anything that already exists locally is left alone.

DATA_URL=https://raw.githubusercontent.com/ArielLevineLabNINDS/Seq-Seek-classifyData/master
MODEL_URL=https://raw.githubusercontent.com/ArielLevineLabNINDS/SeqSeek-Classify-NN/master

# fetch URL DEST: download URL into DEST.
# Uses wget when it is installed and falls back to curl otherwise (e.g. on a mac),
# so the cell no longer has to be edited by hand.  On failure the partial/empty
# file is removed and a non-zero status is returned.
fetch() {
  local url=$1 dest=$2 status
  if command -v wget >/dev/null 2>&1; then
    wget -q -O "$dest" "$url"
  else
    curl -fsSL -o "$dest" "$url"
  fi
  status=$?
  if [ "$status" -ne 0 ]; then
    rm -f "$dest"
  fi
  return "$status"
}

FILES=( dirty_neurons_test.mtx.gz dirty_neuron_encoder.npy dirty_neurons_test_barcodes.csv.gz dirty_neurons_test_labels.csv.gz dirty_neurons_genes.csv.gz )
for FILE in "${FILES[@]}"
do
    if [ ! -f "$FILE" ]; then
      echo "$FILE not found...  downloading...."
      # report a failed download here instead of failing later when the file is read
      fetch "$DATA_URL/$FILE" "$FILE" || echo "download of $FILE failed!" >&2
    else
      echo "$FILE found..."
    fi
done

# download the model
FILE=neurons_doublets.model.tgz
DIR=neurons_doublets.model

if [ ! -d "$DIR" ]; then
  echo "model $DIR not found...  downloading from github ...."
  if [ ! -f "$FILE" ]; then
      echo "model $FILE not found...  downloading...."
      fetch "$MODEL_URL/$FILE" "$FILE" || echo "download of $FILE failed!" >&2
  else
      echo "$FILE found..."
  fi
  ## untar the file... (only when there is an archive to unpack)
  if [ -f "$FILE" ]; then
    tar -xvf "$FILE"
    # remove the archive even when extraction failed, so that a corrupt download
    # is fetched again on the next run
    rm -f "$FILE"
  fi

  if [ ! -d "$DIR" ]; then
    echo "download failed!"
  else
    echo "model $DIR downloaded successfully"
  fi
else
  echo "model $DIR found"
fi



dirty_neurons_test.mtx.gz not found...  downloading....
dirty_neuron_encoder.npy not found...  downloading....
dirty_neurons_test_barcodes.csv.gz not found...  downloading....
dirty_neurons_test_labels.csv.gz not found...  downloading....
dirty_neurons_genes.csv.gz not found...  downloading....
model neurons_doublets.model not found...  downloading from github ....
model neurons_doublets.model.tgz not found...  downloading....
model neurons_doublets.model downloaded successfully


x neurons_doublets.model/
x neurons_doublets.model/variables/
x neurons_doublets.model/saved_model.pb
x neurons_doublets.model/assets/
x neurons_doublets.model/variables/variables.data-00000-of-00001
x neurons_doublets.model/variables/variables.index


# step two: 
### data loading...

We are going to re-create the test data from the analysis.  In the future, I will create a 
dataset that is only test data, then I will try to create a function that will convert your data
into the appropriate format.  After all, it's more fun to play with your own data.  Keep in mind that
this data is mouse spinal cord neurons and doublets only.  

The input data is a sparse matrix created in R from Seurat data.  In Seurat, the count matrices are genes (rows) by cells (columns).  
Tensorflow requires cells (rows) by genes (columns).  I performed the matrix transpose in R, so you won't see it here.

note: it takes around 10 sec to read the data on my laptop (macbook pro 2018).

In [58]:
# Read the gene names, the barcodes of the cells in the test matrix, and the labels.
t0 = time.time()
genes = pd.read_csv("dirty_neurons_genes.csv.gz",index_col=0)
df = pd.read_csv("dirty_neurons_test_barcodes.csv.gz",index_col=0)
lbl = pd.read_csv("dirty_neurons_test_labels.csv.gz",index_col=0)
t1 = time.time()
print(f"time to read data files: {t1-t0:.3f} s")

# The labels are only usable if they list the same cells, in the same order, as the
# barcode file.  Comparing the raw arrays (rather than Series == Series) also gives a
# clean False when the two files differ in length instead of raising inside pandas.
cell_ids_match = np.array_equal(df.cell_id.values, lbl.cell_id.values)
print(f"all cell_ids from the labels == cell_ids from matrix: {cell_ids_match}")
assert cell_ids_match, "the label file and the barcode file do not list the same cells in the same order"

t0 = time.time()
dirty_neurons = scipy.io.mmread("dirty_neurons_test.mtx.gz")
t1 = time.time()
print(f"time to read matrix: {t1-t0:.3f} s")

# Every matrix row is one cell, so there must be exactly one label per row;
# otherwise the predicted-vs-called comparison further down is meaningless.
assert dirty_neurons.shape[0] == len(lbl), (
    f"matrix has {dirty_neurons.shape[0]} rows but there are {len(lbl)} labels"
)

# mmread returns a COO matrix for sparse files; CSR is the row-oriented format
# that the normalization step works on.
if scipy.sparse.isspmatrix_coo(dirty_neurons):
  print("converting raw counts to CSR")
  t0 = time.time()
  dirty_neurons = dirty_neurons.tocsr()
  t1 = time.time()
  print(f"time to convert matrix: {t1-t0:.3f} s")

x_test = make_log_maxn_tensor(dirty_neurons)

time to read data files: 0.037 s
all cell_ids from the labels == cell_ids from matrix: True
time to read matrix: 10.147 s
converting raw counts to CSR
time to convert matrix: 0.053 s


### load the label encoder
The labels must be in the same order as the one used by the model.

In [59]:
# Class names saved from the label encoder used during training; their order defines
# which integer class index maps to which cell-type name.  Read the file once and
# share the array between both encoders.
encoder_classes = np.load('dirty_neuron_encoder.npy')

label_encoder = sklearn.preprocessing.LabelEncoder()
label_encoder.classes_ = encoder_classes

# NOTE(review): only classes_ is restored on the binarizer (it is never fitted here);
# confirm that is sufficient before calling its transform methods.
ohe_encoder = sklearn.preprocessing.LabelBinarizer()
ohe_encoder.classes_ = encoder_classes

Normally at prediction time, we would not have labels.  But since we have them, let's use them...

In [60]:
# called (reference) cluster assignment of every test cell, as a plain array
y_test = lbl["final_cluster_assignment"].values

we now have the testing data and are ready to run it through the model...

# Step Three:
load the Neural Network

In [61]:
# Load the trained Keras model from the directory unpacked by the download cell,
# then print its layer-by-layer summary.
model = tf.keras.models.load_model('neurons_doublets.model')
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               6507520   
_________________________________________________________________
cell_type (Dense)            (None, 70)                17990     
Total params: 6,525,510
Trainable params: 6,525,510
Non-trainable params: 0
_________________________________________________________________


In [62]:
# one row of class scores per cell
pred = model.predict(x_test)

# index of the highest-scoring class for each cell, and the matching class name
cell_class = pred.argmax(axis=1)
cell_type = label_encoder.inverse_transform(cell_class)
# integer index of the called (reference) label, for a like-for-like comparison
called_class = label_encoder.transform(y_test)

df = pd.DataFrame(
    {
        "predicted_class": cell_class,
        "called_class": called_class,
        "predicted_type": cell_type,
        "called_type": y_test,
        "probability": pred.max(axis=1),
    }
)
# True where the model's top class equals the called class
df["agree"] = df["predicted_class"] == df["called_class"]
df

Unnamed: 0,predicted_class,called_class,predicted_type,called_type,probability,agree
0,39,39,Garbage,Garbage,0.947911,True
1,39,39,Garbage,Garbage,0.964306,True
2,39,39,Garbage,Garbage,0.097964,True
3,39,39,Garbage,Garbage,0.957435,True
4,1,1,Excit-01,Excit-01,0.538859,True
...,...,...,...,...,...,...
2805,35,35,Excit-35,Excit-35,0.972334,True
2806,62,62,Inhib-23,Inhib-23,0.951047,True
2807,22,22,Excit-22,Excit-22,0.453575,True
2808,35,39,Excit-35,Garbage,0.799530,False


# So you Now want to run your data...

## background work Step 1
Let's take a second to talk about data formats.  Your data should be a sparse matrix in the form where rows are the cells and the columns are the genes.  If you are running on Seurat data, your Seurat object has the matrix saved in seuratObject.data.  In my version of Seurat (Seurat_3.1.5), here is how I get the matrix.  If you have a better way, use that way.  The sparse matrix format, called Matrix Market format, does not save the gene names or barcodes, so you need to write those out too.  

```
library(Seurat)
library(dplyr)
library(readr)
library(Matrix)

sparseMatrix <- GetAssayData(seuratObj,slot = "counts",assay = "RNA") 
writeMM(  t(sparseMatrix) ,"myData.mtx")
write_lines(rownames(seuratObj),"genes.txt")
write_lines(colnames(seuratObj),"barcodes.txt")

```

## background work Step 2
Congratulations, you saved a sparse matrix with gene names and barcodes.  The next step is to read the data into this colab notebook.

I assume you are running this in colab.  If you are running this on your laptop in jupyter notebook/labs. The process is the same, but it may look different.  

You need to upload the matrix to the notebook virtual machine. 
The left panel of the colab window should say "Files" at the top.  If it says "Table of Contents", select the folder icon and you should see the files in colab. (See the screenshot below; the red arrow points to the folder icon.)
Notice the upload button in the colab notebook; I circled it in blue in the screenshot. Click the upload button and select the files: myData.mtx, genes.txt, and barcodes.txt.  You still need the label encoder used during training of the neural network, and the neural network itself. The next code cell has code to get the network and the label encoder.  If you ran the code above, you already have the encoder and the network.  Don't worry, the cell checks whether you've already downloaded the files.

Obviously, if you gave the files different names select the appropriate files.

![](https://raw.githubusercontent.com/ArielLevineLabNINDS/SeqSeek-Classify-NN/master/colabScreenShot1.png)

## Load the data into the code
now we are ready to get started...
Make sure that you ran the code above.  We're working to make it so you don't have to run the above code.

In [52]:
%%bash
# Download the label encoder used during training and the trained model.
# Anything that already exists locally is left alone.

DATA_URL=https://raw.githubusercontent.com/ArielLevineLabNINDS/Seq-Seek-classifyData/master
MODEL_URL=https://raw.githubusercontent.com/ArielLevineLabNINDS/SeqSeek-Classify-NN/master

# fetch URL DEST: download URL into DEST.
# Uses wget when it is installed and falls back to curl otherwise (e.g. on a mac),
# so the cell no longer has to be edited by hand.  On failure the partial/empty
# file is removed and a non-zero status is returned.
fetch() {
  local url=$1 dest=$2 status
  if command -v wget >/dev/null 2>&1; then
    wget -q -O "$dest" "$url"
  else
    curl -fsSL -o "$dest" "$url"
  fi
  status=$?
  if [ "$status" -ne 0 ]; then
    rm -f "$dest"
  fi
  return "$status"
}

FILE=dirty_neuron_encoder.npy
if [ ! -f "$FILE" ]; then
  echo "$FILE not found...  downloading...."
  # report a failed download here instead of failing later when the file is read
  fetch "$DATA_URL/$FILE" "$FILE" || echo "download of $FILE failed!" >&2
else
  echo "$FILE found..."
fi

## download the model (same code as above)
FILE=neurons_doublets.model.tgz
DIR=neurons_doublets.model

if [ ! -d "$DIR" ]; then
  echo "model $DIR not found...  downloading from github ...."
  if [ ! -f "$FILE" ]; then
      echo "model $FILE not found...  downloading...."
      fetch "$MODEL_URL/$FILE" "$FILE" || echo "download of $FILE failed!" >&2
  else
      echo "$FILE found..."
  fi
  ## untar the file... (only when there is an archive to unpack)
  if [ -f "$FILE" ]; then
    tar -xvf "$FILE"
    # remove the archive even when extraction failed, so that a corrupt download
    # is fetched again on the next run
    rm -f "$FILE"
  fi

  if [ ! -d "$DIR" ]; then
    echo "download failed!"
  else
    echo "model $DIR downloaded successfully"
  fi
else
  echo "model $DIR found"
fi


dirty_neuron_encoder.npy found...
model neurons_doublets.model found


In [53]:
# Read the gene names and cell barcodes that were written out next to the matrix
# (one name per line, no header row).
t0 = time.time()
genes = pd.read_csv("genes.txt",names=["gene"])
df = pd.read_csv("barcodes.txt",names=["cell_id"])
t1 = time.time()
print(f"time to read data files: {t1-t0:.3f} s")

t0 = time.time()
data = scipy.io.mmread("myData.mtx")
t1 = time.time()
print(f"time to read matrix: {t1-t0:.3f} s")

# The matrix must be cells (rows) x genes (columns).  Catch a forgotten transpose,
# or name files that do not belong to this matrix, here rather than as an obscure
# shape error further down.
expected_shape = (len(df), len(genes))
if data.shape != expected_shape:
  raise ValueError(
      f"myData.mtx has shape {data.shape}, but barcodes.txt/genes.txt describe "
      f"{expected_shape[0]} cells x {expected_shape[1]} genes; "
      "make sure the matrix was transposed to cells x genes before it was written"
  )

# mmread returns a COO matrix for sparse files; CSR is the row-oriented format
# that the normalization step works on.
if scipy.sparse.isspmatrix_coo(data):
  print("converting raw counts to CSR")
  t0 = time.time()
  data = data.tocsr()
  t1 = time.time()
  print(f"time to convert matrix: {t1-t0:.3f} s")



FileNotFoundError: [Errno 2] No such file or directory: 'genes.txt'