# Disease Type Classifier Evaluation

- BME 230A class project winter 2019
- Andrew E. Davidson
- [aedavids@ucsc.edu](mailto:aedavids@ucsc.edu?subject=SimpleModel.ipynb)

Identify which genes maximally activate the classifier and compare them to known cancer-causing genes.

In [1]:
from keras.models import load_model

import matplotlib.pyplot as plt
import numpy as np
import os 
import pandas as pd

from sklearn.preprocessing import LabelEncoder
import sys

import tensorflow as tf

# fix random seed for reproducibility
# NOTE(review): this cell only assigns the constant; no seeding call that uses
# it is visible here -- confirm it is actually passed to the random number
# generators somewhere
theMeaningOfLife = 42

import keras
# fail fast unless the installed keras is exactly 2.1.6; per the error message
# below, newer versions can not save and restore models
if "2.1.6" != keras.__version__ :
    emsg = "ERROR keras version {} != 2.1.6, new version can not save and restore models".format(keras.__version__)
    raise ValueError(emsg)

# add path to our local modules
# assume they are in the same directory we launched the jupyter server in
# /home/ubuntu/BME-230a
# print the current working directory so the assumption above can be checked
!pwd
localModuleDir = "."
sys.path.append(localModuleDir)

Using TensorFlow backend.


/home/ubuntu/BME-230a


## load model and data

In [2]:
%%time
# build the path of the source data file under rootDir and make sure the file
# exists before attempting the load
rootDir = "/bme-230a-ebs"
sourceDataFilePath = "{}/data/tcga_target_gtex.h5".format(rootDir)
print(sourceDataFilePath)
if not os.path.isfile(sourceDataFilePath) :
    # report the missing file, hint at the fix, then abort the cell
    emsg = "ERROR: {} not found".format(sourceDataFilePath)
    print(emsg)
    print("change rootDir")
    sys.stdout.flush() # force error message to print
    raise ValueError(emsg)
    
# local module; found through the "." entry appended to sys.path
from loadData import loadDiseaseTypeData
# literate programming, name all the return values even though we do not
# use them
# NOTE(review): loadDiseaseTypeData is given only rootDir, so it presumably
# rebuilds the data file path itself -- confirm against loadData.py
XTrainNumpy, yTrainNumpy, XTestNumpy, yTestNumpy = loadDiseaseTypeData(rootDir)
#XTestNumpy = yTestNumpy = None # clear memory

/bme-230a-ebs/data/tcga_target_gtex.h5
sourceDataFilePath:/bme-230a-ebs/data/tcga_target_gtex.h5
CPU times: user 1.27 s, sys: 5.56 s, total: 6.83 s
Wall time: 53.8 s


## <span style="color:red"> AEDWIP the data looks surprisingly clean</span>
- was there a bug in the original data set preparation?
- was a bug introduced while creating the tidy data set?
    * maybe sklearn.model_selection.StratifiedShuffleSplit samples with replacement by default?

In [3]:
# peek at the top-left corner of the training matrix: first 10 rows, first 3 columns
print(XTrainNumpy[:10, :3])

[[-9.96604119 -9.96581639 -9.96588062]
 [-9.96604119 -9.96581639 -9.96588062]
 [-9.96604119 -9.96581639 -2.38840045]
 [-9.96604119 -9.96581639 -2.93240066]
 [-9.96604119 -9.96581639 -9.96588062]
 [-9.96604119 -9.96581639 -9.96588062]
 [ 4.78870011 -9.96581639  0.42331692]
 [-9.96604119 -9.96581639 -9.96588062]
 [-9.96604119 -9.96581639 -0.28450013]
 [-9.96604119 -9.96581639 -9.96588062]]


In [4]:
# build the path of the saved model file: <rootDir>/models/full<modelName>.h5
modelName="diseaseClassifier"
modelRootDir = "{}/models".format(rootDir)
fullModelPath = "{}/full{}.h5".format(modelRootDir, modelName)
# restore the saved keras model from disk and print its layer summary
diseaseClassifierModel = load_model(fullModelPath)
diseaseClassifierModel.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 58581)             0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 58581)             234324    
_________________________________________________________________
dense_1 (Dense)              (None, 93)                5448126   
_________________________________________________________________
activation_1 (Activation)    (None, 93)                0         
Total params: 5,682,450
Trainable params: 5,565,288
Non-trainable params: 117,162
_________________________________________________________________


In [24]:
%%time
# find which class a gene maximal activates
def findAEDWIP(model, m, batchSize, activationValue=1000000):
    '''
    Find which class each individual feature (gene) maximally activates.

    For every feature index i in [0, m) a synthetic example is built whose
    only non-zero entry is feature i. The model scores the example and the
    index of the highest scoring output is recorded for that feature.

    input:
        model: object with a predict(batch) method that returns one row of
            class scores for each row of batch
        m: the number of features
        batchSize: maximum number of synthetic examples passed to a single
            predict() call. Must be >= 1
        activationValue: value stored in the single non-zero feature of each
            synthetic example. The default is a high value because when the
            value was 1 all predictions were class 27

    return:
        numpy array with shape (m, 1). Row i holds the index of the class
        with the highest score for feature i (stored as a float)

    raises:
        ValueError: if batchSize is less than 1
    '''
    if batchSize < 1:
        raise ValueError("ERROR batchSize {} must be >= 1".format(batchSize))

    ret = np.zeros((m, 1))
    numBatches, r = divmod(m, batchSize)
    print("m:{} batchSize:{} numBatches:{} r:{}".format(m, batchSize, numBatches, r))

    # walk over the features one batch at a time. When m is not a multiple of
    # batchSize the final batch is short (r rows); it must still be scored or
    # the last r features would keep their initial value of zero
    for startRowIdx in range(0, m, batchSize):
        bs = min(batchSize, m - startRowIdx)

        # create one hot style feature vectors. We only want the activation
        # value for a single gene: row j activates feature startRowIdx + j
        batch = np.zeros((bs, m))
        rows = np.arange(bs)
        batch[rows, startRowIdx + rows] = activationValue

        predictions = np.asarray(model.predict(batch))

        # predictions is already a numpy array, so take the argmax over the
        # class axis directly in numpy. This avoids adding a new node to the
        # tensorflow graph and opening a new session for every batch
        predictedClasses = np.argmax(predictions, axis=-1)
        ret[startRowIdx: startRowIdx + bs] = np.reshape(predictedClasses, (bs, -1))

    return ret
    

# only the number of features (columns) of the training matrix is needed here
# why hold on to all the memory? we can probably pick these values out of the model
xShape = XTrainNumpy.shape
ret = findAEDWIP(diseaseClassifierModel, m=xShape[1], batchSize=6000)
print("ret.shape:{}".format(ret.shape))
# show the predicted class for features 100..299 as a single row
print(ret[100:300].T)

m:58581 batchSize:6000 numBatches:9 r:4581
ret.shape:(58581, 1)
[[27. 51. 51. 52. 56. 72. 31. 27. 57. 25. 51. 25. 11. 57.  1. 58. 51. 71.
  25. 52. 27. 27.  6. 80. 27. 51. 27. 51. 52. 52. 51. 55. 61. 23. 27. 71.
  64. 75. 27. 27. 51. 80.  8. 27. 51.  7. 57. 27. 51. 73.  9. 80. 80. 27.
  76. 67. 25. 31. 27. 51. 52. 27. 27. 27. 76. 67. 87. 11. 55. 27.  6. 51.
  25. 51. 27. 25. 25. 27. 31. 66. 71. 52. 51. 25. 14. 30. 40. 57. 66. 71.
  51. 27. 27. 27. 27. 76. 80. 51. 89. 80. 48. 27. 27. 27. 27. 51. 27. 27.
  71. 27. 27. 27. 80. 68.  1. 61. 87. 51. 27. 51. 80. 23. 71. 64. 51. 40.
  64. 27. 51. 27. 27. 85. 46. 80. 27. 58. 31. 27. 80. 51. 47. 57. 27. 85.
  92. 25. 27. 27. 25. 27. 51. 61. 57. 51. 51. 27. 27. 71. 38. 40. 71. 72.
  51. 85. 51. 58. 76. 71. 51. 51. 64. 51. 46. 27. 51. 62. 51. 25. 76. 71.
  27. 25. 27. 68. 84. 27. 51. 64. 82. 21. 51. 51. 27. 76. 25. 64. 31. 27.
   1. 71.]]
CPU times: user 1min 30s, sys: 12.9 s, total: 1min 43s
Wall time: 29.1 s
