# Dimensionality Reduced Disease Type Classifier

- BME 230A class project winter 2019
- Andrew E. Davidson
- [aedavids@ucsc.edu](mailto:aedavids@ucsc.edu?subject=SimpleModel.ipynb)

ref:
- [https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html)
- Chapter 8, "Dimensionality Reduction", in "Hands-On Machine Learning with Scikit-learn & TensorFlow" by Aurelien Geron
- [https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60](https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60)

In [1]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, ProgbarLogger
from keras.layers import Dense
from keras.layers import Dense,Input,BatchNormalization, InputLayer, Activation
from keras.models import Sequential
from keras.optimizers import Adam, SGD, Adadelta, Adagrad
from keras.regularizers import L1L2
from keras.utils import np_utils
import matplotlib.pyplot as plt
import numpy as np
import os 
import pandas as pd

from sklearn.preprocessing import LabelEncoder
import sys

import tensorflow as tf

# fix random seed for reproducibility
theMeaningOfLife = 42

import keras
if "2.1.6" != keras.__version__ :
    emsg = "ERROR keras version {} != 2.1.6, new version can not save and restore models".format(keras.__version__)
    raise ValueError(emsg)

# add path to our local modules
# assume they are in the same directory we launched the juypter server in
# /home/ubuntu/BME-230a
!pwd
localModuleDir = "."
sys.path.append(localModuleDir)

Using TensorFlow backend.


/home/ubuntu/BME-230a


## load data

In [2]:
%%time
rootDir = "/bme-230a-ebs"
sourceDataFilePath = "{}/data/tcga_target_gtex.h5".format(rootDir)
print(sourceDataFilePath)
if not os.path.isfile(sourceDataFilePath) :
    emsg = "ERROR: {} not found".format(sourceDataFilePath)
    print(emsg)
    print("change rootDir")
    sys.stdout.flush() # force error message to print
    raise ValueError(emsg)
    
from loadData import loadCancerDiseaseTypeTidyDataSet

ret = loadCancerDiseaseTypeTidyDataSet(rootDir)
hugoIds, diseaseLabelEncoder, XTrainNumpy, yTrainNumpy, XTestNumpy, yTestNumpy = ret
#XTestNumpy = yTestNumpy = None # clean up memory
ret = None # clean up memory

/bme-230a-ebs/data/tcga_target_gtex.h5
sourceDataFilePath:/bme-230a-ebs/data/tcga_target_gtex.h5
CPU times: user 1.3 s, sys: 4.89 s, total: 6.18 s
Wall time: 6.18 s


In [3]:
# Sanity check: report the dimensions of the training features and labels.
print("XTrainNumpy.shape: {}".format(XTrainNumpy.shape),
      "yTrainNumpy.shape: {}".format(yTrainNumpy.shape),
      sep="\n")

XTrainNumpy.shape: (8424, 58581)
yTrainNumpy.shape: (8424, 39)


In [4]:
%%time
# Reduce the dimensionality of the training features with PCA.
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as are
# needed to account for that share of the variance.
pca = PCA(n_components=0.95) # account for 95% of the variance
XTrainReducedNumpy = pca.fit_transform(XTrainNumpy) 
# NOTE: once XTrainNumpy is set to None, this cell and any earlier cell that
# reads XTrainNumpy can not be re-run without reloading the data first.
# NOTE(review): XTestNumpy is not projected in the cells visible here; it will
# need pca.transform() (not fit_transform) before the model can score it.
XTrainNumpy = None # prevent bugs and clean up memory

CPU times: user 13min 24s, sys: 5.86 s, total: 13min 30s
Wall time: 3min 33s


In [5]:
# Summarize the fitted PCA: result type and shape, number of components kept,
# and the variance (absolute and as a ratio) explained by each component.
print("type(XTrainReducedNumpy:{}".format(type(XTrainReducedNumpy)),
      "XTrainReducedNumpy.shape: {}".format(XTrainReducedNumpy.shape),
      "pca.n_components_: {}".format(pca.n_components_),
      "pca.explained_variance_: {}".format(pca.explained_variance_),
      "pca.explained_variance_ratio_: {}".format(pca.explained_variance_ratio_),
      sep="\n")

type(XTrainReducedNumpy:<class 'numpy.ndarray'>
XTrainReducedNumpy.shape: (8424, 5895)
pca.n_components_: 5895
pca.explained_variance_: [2.15254990e+04 1.86303089e+04 1.64745828e+04 ... 8.72403837e+00
 8.72019841e+00 8.71888428e+00]
pca.explained_variance_ratio_: [7.24130749e-02 6.26734811e-02 5.54214888e-02 ... 2.93481904e-05
 2.93352726e-05 2.93308518e-05]


## Create Model
this should be the same as the model in disaseTypeClassifier.ipynb

In [6]:
def multiClassClassifier(inputDim=None, outputDim=None, learningRate=0.001):
    '''
    Build and compile a single dense layer softmax classifier.

    The layers are: input -> batch normalization -> dense(outputDim) -> softmax

    arguments:
        inputDim:     number of features in each sample, a positive integer
        outputDim:    number of classes to predict, a positive integer
        learningRate: learning rate passed to the Adam optimizer

    returns:
        a compiled keras Sequential model. The loss is categorical
        cross entropy and accuracy is tracked as a metric

    raises:
        ValueError if inputDim or outputDim is missing or is not a
        positive integer
    '''
    # Both dimensions default to None only so they can be passed by keyword.
    # Check them here so the caller gets a clear message instead of an
    # error from deep inside keras.
    for argName, argValue in (("inputDim", inputDim), ("outputDim", outputDim)):
        isInteger = (isinstance(argValue, (int, np.integer))
                     and not isinstance(argValue, bool))
        if not isInteger or argValue < 1:
            emsg = "ERROR {} must be a positive integer, got {!r}".format(argName, argValue)
            raise ValueError(emsg)

    classify = [
        InputLayer(input_shape=(inputDim,)),
        BatchNormalization(),
        Dense(outputDim), # dot(input, kernel) + bias
        Activation('softmax') 
    ]
    
    model = Sequential(classify)   
    # https://keras.io/backend/#categorical_crossentropy
    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=learningRate), metrics=['accuracy']) 
    
    return model 

In [None]:
%%time
modelName="reducedDiseaseClassifier"
# the first col of yTrainNumpy is the disease value. 
# numCases is size of the prediction output
numCases = yTrainNumpy.shape[1] -1

reducedDiseaseClassifierModel = multiClassClassifier(
                                inputDim=XTrainReducedNumpy.shape[1],
                                outputDim=numCases,
                                )
reducedDiseaseClassifierModel.summary()

# https://keras.io/callbacks/
checkPointPath="./models/{}.chkPt".format(modelName)
callbacks = [
    # monitor valuse either 'acc' for accuracy or 'loss'
    # 'val_loss' is loss on hold if valaidation_split is set
    # 'loss' is loss on training
    # same for 'acc' and 'val_acc'
    EarlyStopping(monitor='loss', patience=2, verbose=0) 
    ,ModelCheckpoint(checkPointPath, monitor='loss', save_best_only=False, verbose=0)
    # FIXME: progbar generates run time error
    #,ProgbarLogger(count_mode='samples', stateful_metrics=None)
]

trainOneHots = yTrainNumpy[:,1:]
history = reducedDiseaseClassifierModel.fit(XTrainReducedNumpy,trainOneHots,        
                                        shuffle=None, # we already shuffled
                                        epochs= 8, #20, #100
                                        batch_size=1024, 
                                        # we already split the data         
                                        validation_split=0.0, 
                                        verbose=0,
                                        callbacks=callbacks
                                     )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 5895)              0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 5895)              23580     
_________________________________________________________________
dense_2 (Dense)              (None, 38)                224048    
_________________________________________________________________
activation_2 (Activation)    (None, 38)                0         
Total params: 247,628
Trainable params: 235,838
Non-trainable params: 11,790
_________________________________________________________________
