# Disease Type Classifier Evaluation

- BME 230A class project winter 2019
- Andrew E. Davidson
- [aedavids@ucsc.edu](mailto:aedavids@ucsc.edu?subject=SimpleModel.ipynb)

ref: diseaseTypeClassifier.ipynb

Identify the genes that maximally activate each disease class and compare them to known cancer-causing genes.

In [None]:
from keras.models import load_model

import matplotlib.pyplot as plt
import numpy as np
import os 
import pandas as pd

from sklearn.preprocessing import LabelEncoder
import sys

import tensorflow as tf

# fix random seed for reproducibility
# The constant alone does nothing; it has to be handed to the RNG.
theMeaningOfLife = 42
np.random.seed(theMeaningOfLife)

import keras
# Abort unless keras is exactly 2.1.6 -- per the error text, newer versions
# can not save and restore the models.
if keras.__version__ != "2.1.6":
    emsg = "ERROR keras version {} != 2.1.6, new version can not save and restore models".format(
        keras.__version__
    )
    raise ValueError(emsg)

# Add the path to our local modules.
# Assumes they are in the same directory the Jupyter server was launched in,
# e.g. /home/ubuntu/BME-230a
# Print the working directory so that assumption can be checked by eye.
!pwd
localModuleDir = "."
sys.path.append(localModuleDir)

## load model and data

In [None]:
%%time
# Everything (data set and saved models) is looked up relative to rootDir.
rootDir = "/bme-230a-ebs"
sourceDataFilePath = "{}/data/tcga_target_gtex.h5".format(rootDir)
print(sourceDataFilePath)

# Fail fast, with a hint, when the data file is not where we expect it.
sourceDataFileExists = os.path.isfile(sourceDataFilePath)
if not sourceDataFileExists:
    emsg = "ERROR: {} not found".format(sourceDataFilePath)
    print(emsg, "change rootDir", sep="\n")
    sys.stdout.flush()  # force error message to print
    raise ValueError(emsg)

from loadData import loadCancerDiseaseTypeTidyDataSet

# The loader returns the label encoder followed by the train/test arrays.
ret = loadCancerDiseaseTypeTidyDataSet(rootDir)
(diseaseLabelEncoder,
 XTrainNumpy, yTrainNumpy,
 XTestNumpy, yTestNumpy) = ret
#XTestNumpy = yTestNumpy = None # clean up memory
ret = None  # clean up memory: drop the extra reference to the loaded tuple

## <span style="color:red"> AEDWIP the data looks surprisingly clean</span>
- was there a bug in the original data set preparation?
- was a bug introduced while creating the tidy data set?
    * maybe sklearn.model_selection.StratifiedShuffleSplit samples with replacement by default?
- significant figures?

In [None]:
# Eyeball the first 10 rows x 3 columns of the training matrix.
print(XTrainNumpy[:10, :3])

In [None]:
# print(yTrainNumpy[0:3,:]) shape (1, 1 + k ) [0] is disease type, rest is one hot
# looks like maybe a bug
# load XDF and YDF , what is the disease value if normal? I get it imputed to either NaN or 0

In [None]:
# The saved classifier is expected at <rootDir>/models/<modelName>.h5
modelName = "diseaseClassifier"
modelRootDir = "{}/models".format(rootDir)
fullModelPath = "{}/{}.h5".format(modelRootDir, modelName)

diseaseClassifierModel = load_model(fullModelPath)
diseaseClassifierModel.summary()

In [None]:
%%time
# find which class each gene maximally activates
def findAEDWIP(model, m, batchSize, activationValue=1000000):
    '''
    For every feature (gene), find the class the model predicts when that
    feature alone is switched on.

    Each probe is a length-m vector that is zero everywhere except for
    activationValue at the feature's own index. Probes are sent to the model
    in batches so that the full (m, m) probe matrix never has to be held in
    memory at once.

    input:
        model: object with a predict(batch) method returning one row of
            per-class scores for each input row
        m: the number of features
        batchSize: maximum number of probes per predict() call, must be >= 1
        activationValue: value written at the probed feature. Defaults to
            1000000; when the value was 1 all predictions were class 27

    returns:
        float array of shape (m, 1); row i holds the index of the
        highest-scoring class for feature i

    raises:
        ValueError: if batchSize is less than 1
    '''
    if batchSize < 1:
        raise ValueError("batchSize must be >= 1, got {}".format(batchSize))

    ret = np.zeros((m, 1))
    numBatches, r = divmod(m, batchSize)
    print("m:{} batchSize:{} numBatches:{} r:{}".format(m, batchSize, numBatches, r))

    # Stepping by batchSize up to m also covers the final short batch of r
    # features, which the old `while batchCount < numBatches` loop never reached.
    for startRowIdx in range(0, m, batchSize):
        bs = min(batchSize, m - startRowIdx)

        # One-hot style probes: row j activates only feature startRowIdx + j.
        batch = np.zeros((bs, m))
        rows = np.arange(bs)
        batch[rows, startRowIdx + rows] = activationValue

        predictions = np.asarray(model.predict(batch))

        # predict() already hands back numpy, so take the argmax in numpy
        # instead of building a backend tensor and a new session per batch.
        ret[startRowIdx:startRowIdx + bs, 0] = np.argmax(predictions, axis=-1)

    return ret
    

# Why hold on to all the memory? The training matrix is only used here for
# its feature count; we can probably pick that value out of the model instead.
xShape = XTrainNumpy.shape
numFeatures = xShape[1]
maxActivations = findAEDWIP(diseaseClassifierModel, m=numFeatures, batchSize=6000)
print("maxActivations.shape:{}".format(maxActivations.shape))

In [None]:
# Show features 100..299 as a single row rather than a long column.
print(maxActivations[100:300].T)

In [None]:
# geneIdx is the feature column index
# print(xShape[1])
# geneIdx = np.arange(xShape[1]) 
# print(geneIdx.shape)
# print(geneIdx)

# One row per feature; the single column holds the predicted class id.
xxxDF = pd.DataFrame(data=maxActivations, columns=["diseaseValue"])
for view in (xxxDF.head(), xxxDF.tail(), xxxDF.index, xxxDF.shape):
    print(view)

In [None]:
%%capture --no-display
# turn off deprecation warnings
# https://ipython.readthedocs.io/en/stable/interactive/magics.html?highlight=capture#cellmagic-capture
# Map "<classId>_<decoded label list>" -> (1, 1) array holding the number of
# features whose probe was classified as that class.
countsDict = {}
for diseaseValue, group in xxxDF.groupby("diseaseValue"):
    dv = int(diseaseValue)
    decodedLabel = diseaseLabelEncoder.inverse_transform([dv])
    countsDict["{}_{}".format(dv, decodedLabel)] = group.agg(['count']).values

In [None]:
# Look up the count for one disease of interest.
pancreaticCount = countsDict["24_['Pancreatic Adenocarcinoma']"]
print("key:24_['Pancreatic Adenocarcinoma'] maximal activated gene count:{}".format(pancreaticCount))

In [None]:
%%capture --no-display
# turn off deprecation warnings
# https://ipython.readthedocs.io/en/stable/interactive/magics.html?highlight=capture#cellmagic-capture
# Decode every class id the encoder knows about, in class-id order.
# The count comes from the encoder itself rather than a hard-coded 38, so the
# cell keeps working if the number of disease types changes.
numDiseaseTypes = len(diseaseLabelEncoder.classes_)
dt = diseaseLabelEncoder.inverse_transform(range(numDiseaseTypes))

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(8, 10)

# countsDict only has entries for classes that at least one gene maximally
# activates, while dt holds a label for every class. Pairing them by position
# misaligns (or fails) as soon as one class is missing, so look each label up
# by the class id at the front of the key ("<classId>_[...]").
classIds = [int(key.split("_", 1)[0]) for key in countsDict]
barLabels = [dt[classId] for classId in classIds]
countsList = [v[0][0] for v in countsDict.values()]

ax.barh(barLabels, countsList)
ax.set_xlabel("number of maximally activating genes")
ax.set_ylabel("disease type")
ax.set_title("Genes that maximally activate each disease class")
plt.show()

In [47]:
# how can we get back the Hugo gene names

# Load training set
# Reuse the path that was validated when the data was loaded instead of
# repeating the hard-coded location, so changing rootDir is enough.
XDF = pd.read_hdf(sourceDataFilePath, "expression")

#https://stackoverflow.com/a/45327097/4586180
# Lay the per-feature class ids out as a single row so that each column can
# carry the matching column name from the expression table.
df = pd.DataFrame(maxActivations.reshape(1, -1)
                  , columns=XDF.columns.values
                 )
print(df.shape)
print(df.iloc[:, 0:3])

print()
print(XDF.iloc[0, 0:3])

(1, 58581)
   5S_rRNA  5_8S_rRNA  7SK
0     23.0       16.0  6.0

5S_rRNA     -9.966041
5_8S_rRNA   -9.965816
7SK         -9.965881
Name: GTEX-1117F-0226-SM-5GZZ7, dtype: float64


In [52]:
# Inspect what is stored under the pancreatic key: type, shape, an
# out-of-range slice, and finally the value itself as the cell output.
paNumpy = countsDict["24_['Pancreatic Adenocarcinoma']"]
for item in (type(paNumpy), paNumpy.shape, paNumpy[30:32]):
    print(item)
paNumpy

<class 'numpy.ndarray'>
(1, 1)
[]


array([[44]])

In [114]:
# First six predicted class ids as integers, then the overall shape.
firstSixClassIds = maxActivations[:6].astype(int)
print(firstSixClassIds)
print(maxActivations.shape)

[[23]
 [16]
 [ 6]
 [20]
 [ 6]
 [ 6]]
(58581, 1)


In [109]:
#%%capture --no-display
# Turn the predicted class ids back into disease names.
maxActClassIds = maxActivations.astype(int)
disease = diseaseLabelEncoder.inverse_transform(maxActClassIds)
print(type(disease))
# The shape is printed twice because the reshape between the prints is disabled.
print(disease.shape)
#disease = np.reshape(disease, (1,-1))
print(disease.shape)
print(disease[:3])

<class 'numpy.ndarray'>
(58581, 1)
(58581, 1)
[['Ovarian Serous Cystadenocarcinoma']
 ['Kidney Clear Cell Carcinoma']
 ['Breast Invasive Carcinoma']]


  if diff:


In [127]:
# turn off deprecation warnings
# https://ipython.readthedocs.io/en/stable/interactive/magics.html?highlight=capture#cellmagic-capture
# df2 = pd.DataFrame(maxActivations.astype(int)

# One row per feature: the expression-table column name, the decoded disease
# name, and the integer class id it came from.
hugoIds = XDF.columns.values
diseaseNames = disease.flatten()
maxActClassIdsFlat = maxActivations.astype(int).flatten()

dataDict = {"HugoId": hugoIds, "disease": diseaseNames, "maxActDisease": maxActClassIdsFlat}
aedwipDF = pd.DataFrame(data=dataDict)

aedwipDF.head()

Unnamed: 0,HugoId,disease,maxActDisease
0,5S_rRNA,Ovarian Serous Cystadenocarcinoma,23
1,5_8S_rRNA,Kidney Clear Cell Carcinoma,16
2,7SK,Breast Invasive Carcinoma,6
3,A1BG,Lung Squamous Cell Carcinoma,20
4,A1BG-AS1,Breast Invasive Carcinoma,6


In [132]:
#https://www.tutorialspoint.com/python_pandas/python_pandas_groupby.htm
# Group the per-gene table by decoded disease name.
groupedDF = aedwipDF.groupby(by='disease')
print(type(groupedDF))
#print(groupedDF.groups)

# Cell output: the rows assigned to pancreatic adenocarcinoma.
groupedDF.get_group('Pancreatic Adenocarcinoma')

<class 'pandas.core.groupby.DataFrameGroupBy'>


Unnamed: 0,HugoId,disease,maxActDisease
2835,AC079235.1,Pancreatic Adenocarcinoma,24
2993,AC087499.9,Pancreatic Adenocarcinoma,24
3021,AC090311.1,Pancreatic Adenocarcinoma,24
4291,AC231645.1,Pancreatic Adenocarcinoma,24
5069,AL008708.1,Pancreatic Adenocarcinoma,24
5533,AL354931.1,Pancreatic Adenocarcinoma,24
5819,AL603650.4,Pancreatic Adenocarcinoma,24
6785,ARAF,Pancreatic Adenocarcinoma,24
7334,ATP6V1D,Pancreatic Adenocarcinoma,24
8132,C14orf93,Pancreatic Adenocarcinoma,24


In [None]:
# https://www.kegg.jp/kegg-bin/search?q=%09Pancreatic+Adenocarcinoma&display=disease&from=disease
# https://www.kegg.jp/dbget-bin/www_bget?ds:H00019
#
#
# google CYB5D2 Pancreatic Adenocarcinoma
# https://www.proteinatlas.org/ENSG00000167740-CYB5D2/pathology
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4753160/