# Disease Type Classifier Evaluation

- BME 230A class project winter 2019
- Andrew E. Davidson
- [aedavids@ucsc.edu](mailto:aedavids@edu?subject=SimpleModel.ipynb)

identify genes that maximal activate and compare to known cancer causing genes

In [6]:
from keras.models import load_model

import matplotlib.pyplot as plt
import numpy as np
import os 
import pandas as pd

from sklearn.preprocessing import LabelEncoder
import sys

import tensorflow as tf

# fix random seed for reproducibility
theMeaningOfLife = 42

import keras
if "2.1.6" != keras.__version__ :
    emsg = "ERROR keras version {} != 2.1.6, new version can not save and restore models".format(keras.__version__)
    raise ValueError(emsg)

# add path to our local modules
# assume they are in the same directory we launched the juypter server in
# /home/ubuntu/BME-230a
!pwd
localModuleDir = "."
sys.path.append(localModuleDir)

/home/ubuntu/BME-230a


## load model and data

In [22]:
%%time
rootDir = "/bme-230a-ebs"
sourceDataFilePath = "{}/data/tcga_target_gtex.h5".format(rootDir)
print(sourceDataFilePath)
if not os.path.isfile(sourceDataFilePath) :
    emsg = "ERROR: {} not found".format(sourceDataFilePath)
    print(emsg)
    print("change rootDir")
    sys.stdout.flush() # force error message to print
    raise ValueError(emsg)
    
from loadData import loadDiseaseTypeData
# literate programing, name all the return values even though we do not
# use them
XTrainNumpy, yTrainNumpy, XTestNumpy, yTestNumpy = loadDiseaseTypeData(rootDir)
#XTestNumpy = yTestNumpy = None # clear memory

/bme-230a-ebs/data/tcga_target_gtex.h5
sourceDataFilePath:/bme-230a-ebs/data/tcga_target_gtex.h5
<class 'numpy.ndarray'>
(19126, 93)
<class 'numpy.ndarray'>
(19126, 1)
CPU times: user 1.12 s, sys: 7.46 s, total: 8.58 s
Wall time: 25.6 s


## <span style="color:red"> AEDWIP the data looks suprisingly clean</span>
- was there a bug in the original data set prepration ?
- was a bug introduced while creating the tidy data set?
    * maybe sklearn.model_selection.StratifiedShuffleSplit samples with replacement by default?

In [17]:
print(XTrainNumpy[0:10, 0:3])

[[-9.96604119 -9.96581639 -9.96588062]
 [-9.96604119 -9.96581639 -9.96588062]
 [-9.96604119 -9.96581639 -2.38840045]
 [-9.96604119 -9.96581639 -2.93240066]
 [-9.96604119 -9.96581639 -9.96588062]
 [-9.96604119 -9.96581639 -9.96588062]
 [ 4.78870011 -9.96581639  0.42331692]
 [-9.96604119 -9.96581639 -9.96588062]
 [-9.96604119 -9.96581639 -0.28450013]
 [-9.96604119 -9.96581639 -9.96588062]]


In [7]:
modelName="diseaseClassifier"
modelRootDir = "{}/models".format(rootDir)
fullModelPath = "{}/full{}.h5".format(modelRootDir, modelName)
diseaseClassifierModel = load_model(fullModelPath)
diseaseClassifierModel.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 58581)             0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 58581)             234324    
_________________________________________________________________
dense_1 (Dense)              (None, 93)                5448126   
_________________________________________________________________
activation_1 (Activation)    (None, 93)                0         
Total params: 5,682,450
Trainable params: 5,565,288
Non-trainable params: 117,162
_________________________________________________________________


In [30]:
%%time
# find which class a gene maximal activates
def findAEDWIP(model, m, batchSize):
    '''
    input:
        model:
        #n: the number of examples
        m: the number of features
        batchSize
    '''
    ret = np.zeros((m,2))
    q,r = divmod(m, batchSize)
    startRowIdx = 0
    i = 0
    while i < q:
        if i < q:
            bs = batchSize
        else:
            bs = r

        print("{} {}".format(startRowIdx, startRowIdx + bs))
        
        # create a feature vector one hot. We only want the activation f
        # value for a single gene
        batch = np.zeros((bs, m))
        for j in range(bs):
            batch[j, startRowIdx + j] = 1
        
        predictedOnHots = model.predict(batch)
        predictedValuesTensor = keras.backend.argmax(predictedOnHots)
    
        # use keras escape hatch to tensor flow
        sess = tf.Session()
        with sess.as_default():
            predictedValuesNumpy = predictedValuesTensor.eval()        
        
            i+=1
            startRowIdx += bs
        
        return ret
    

# findAEDWIP(diseaseClassifierModel, XTrainNumpy[0:10, :])
# why hold on to all the memory we can probably pick these values out of the mode
xShape= XTrainNumpy.shape
yShape = yTrainNumpy.shape
#findAEDWIP(diseaseClassifierModel, n=xShape[0], m=xShape[1], k=yShape[1])
ret = findAEDWIP(diseaseClassifierModel, m=xShape[1],  batchSize=6000)
print(ret.shape)
print(ret[3])

0 2

batch i:0
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]
2 4

batch i:1
[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]
4 6

batch i:2
[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]
6 8

batch i:3
[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]]
8 10

batch i:4
[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
