# Dimensionality Reduced Disease Type Classifier

- BME 230A class project winter 2019
- Andrew E. Davidson
- [aedavids@ucsc.edu](mailto:aedavids@ucsc.edu?subject=SimpleModel.ipynb)

ref:
- [https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html)
- Chapter 8 "Dimensionality Reduction" in "Hands-On Machine Learning with Scikit-learn & TensorFlow" by Aurelien Geron
- [https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60](https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60)
- [https://scikit-learn.org/stable/modules/decomposition.html#pca](https://scikit-learn.org/stable/modules/decomposition.html#pca)
- [https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_fa_model_selection.html#sphx-glr-auto-examples-decomposition-plot-pca-vs-fa-model-selection-py](https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_fa_model_selection.html#sphx-glr-auto-examples-decomposition-plot-pca-vs-fa-model-selection-py)

TODO:
- save pca to disk [https://stackoverflow.com/a/42503036/4586180](https://stackoverflow.com/a/42503036/4586180)

In [1]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, ProgbarLogger
from keras.layers import Dense
from keras.layers import Dense,Input,BatchNormalization, InputLayer, Activation
from keras.models import Sequential
from keras.optimizers import Adam, SGD, Adadelta, Adagrad
from keras.regularizers import L1L2
from keras.utils import np_utils
import matplotlib.pyplot as plt
import numpy as np
import os 
import pandas as pd

from sklearn.preprocessing import LabelEncoder
import sys

import tensorflow as tf

# Fix the random seed for reproducibility.
# The constant used to be defined but never applied, so nothing was actually
# seeded. Seeding NumPy's global generator makes NumPy-driven randomness
# repeatable from run to run.
# NOTE(review): TensorFlow keeps its own seed state; if results still vary
# between runs, seed it as well -- confirm against the installed TF version.
theMeaningOfLife = 42
np.random.seed(theMeaningOfLife)

import keras
# The notebook is pinned to one keras release: per the message below, newer
# versions could not save and restore models. Fail immediately on a mismatch
# rather than after training.
if "2.1.6" != keras.__version__ :
    emsg = "ERROR keras version {} != 2.1.6, new version can not save and restore models".format(keras.__version__)
    raise ValueError(emsg)

# add path to our local modules
# assume they are in the same directory we launched the Jupyter server in
# /home/ubuntu/BME-230a
!pwd
# Directory holding our local modules; "." is the current working directory.
localModuleDir = "."
# Add it to the module search path so later cells can import from it.
sys.path.append(localModuleDir)

Using TensorFlow backend.


/home/ubuntu/BME-230a


## load data

In [2]:
%%time
# Root directory under which the data files live; change it if the data is
# stored somewhere else.
rootDir = "/bme-230a-ebs"
sourceDataFilePath = "{}/data/tcga_target_gtex.h5".format(rootDir)
print(sourceDataFilePath)
# Stop right away, with a hint, when the source data file cannot be found.
if not os.path.isfile(sourceDataFilePath) :
    emsg = "ERROR: {} not found".format(sourceDataFilePath)
    print(emsg)
    print("change rootDir")
    sys.stdout.flush() # force error message to print
    raise ValueError(emsg)
    
# loadData is a project-local module (not shown in this notebook).
from loadData import loadCancerDiseaseTypeTidyDataSet

# The loader's return value is unpacked into six names on the next line.
# NOTE(review): the meaning of hugoIds and diseaseLabelEncoder is defined in
# loadData, not here -- confirm there before relying on them.
ret = loadCancerDiseaseTypeTidyDataSet(rootDir)
hugoIds, diseaseLabelEncoder, XTrainNumpy, yTrainNumpy, XTestNumpy, yTestNumpy = ret
#XTestNumpy = yTestNumpy = None # clean up memory
ret = None # drop the extra reference to the returned tuple

/bme-230a-ebs/data/tcga_target_gtex.h5
sourceDataFilePath:/bme-230a-ebs/data/tcga_target_gtex.h5
CPU times: user 1.33 s, sys: 4.87 s, total: 6.2 s
Wall time: 6.2 s


In [6]:
# Sanity check: show the dimensions of the arrays loaded above.
print("XTrainNumpy.shape: {}".format(XTrainNumpy.shape))
print("yTrainNumpy.shape: {}".format(yTrainNumpy.shape))
print("XTestNumpy.shape: {}".format(XTestNumpy.shape))

XTrainNumpy.shape: (8424, 58581)
yTrainNumpy.shape: (8424, 39)
XTestNumpy.shape: (2106, 58581)


In [32]:
%%time
from sklearn.decomposition import PCA
# NOTE: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23, so this import only works with the older scikit-learn this
# notebook was written against.
from sklearn.externals import joblib

# Cache files: the fitted PCA object and the PCA-transformed train/test arrays.
XTrainReducedPCAFilePath = "{}/data/reducedXDiseaseTypePCA.pkl".format(rootDir)
XTrainReducedNumpyDataFilePath = "{}/data/reducedXDiseaseTypeDataSet.npz".format(rootDir)

if os.path.isfile(XTrainReducedNumpyDataFilePath) :
    # Cached run: reuse the reduced arrays and the fitted PCA from disk.
    print("loading XTrainReducedNumpy and XTestReducedNumpy from {}".format(XTrainReducedNumpyDataFilePath))
    # np.load on an .npz returns an NpzFile that keeps the archive open; the
    # with block closes it once both arrays have been read into memory.
    with np.load(XTrainReducedNumpyDataFilePath) as reducedFiles:
        #print(reducedFiles.files)
        # np.savez names positional arrays arr_0, arr_1, ... in argument order
        XTrainReducedNumpy = reducedFiles['arr_0']
        XTestReducedNumpy  = reducedFiles['arr_1']

    if os.path.isfile(XTrainReducedPCAFilePath) :
        print("loading pca and pca from {}".format(XTrainReducedPCAFilePath))
        pca = joblib.load(XTrainReducedPCAFilePath)
    else:
        raise ValueError("pca is missing path:{}".format(XTrainReducedPCAFilePath))

else :
    # First run: fit PCA on the training set only, then apply the same
    # projection to the test set.
    print("running PCA")
    pca = PCA(n_components=0.95) # account for 95% of the variance
    XTrainReducedNumpy = pca.fit_transform(XTrainNumpy)
    XTestReducedNumpy = pca.transform(XTestNumpy)

    # Write the PCA before the arrays. The load branch above is keyed on the
    # .npz file and raises when the PCA file is absent, so the .npz must never
    # exist without its PCA. If either save fails, the next run simply
    # recomputes instead of hitting that error.
    joblib.dump(pca, XTrainReducedPCAFilePath)
    print("saved pca to :{}".format(XTrainReducedPCAFilePath))

    np.savez(XTrainReducedNumpyDataFilePath, XTrainReducedNumpy, XTestReducedNumpy)
    print("saved numpy arrays to :{}".format(XTrainReducedNumpyDataFilePath))

loading XTrainReducedNumpy and XTestReducedNumpy from /bme-230a-ebs/data/reducedXDiseaseTypeDataSet.npz
['arr_0', 'arr_1']
loading pca and pca from /bme-230a-ebs/data/reducedXDiseaseTypePCA.pkl
CPU times: user 652 ms, sys: 1.14 s, total: 1.8 s
Wall time: 1.79 s


In [16]:
# Report the result of the reduction: array type and shapes, the number of
# components PCA kept, and the variance (absolute and as a ratio) per component.
print("      type(XTrainReducedNumpy:{}".format(type(XTrainReducedNumpy)))
print("     XTrainReducedNumpy.shape: {}".format(XTrainReducedNumpy.shape))
print("      XTestReducedNumpy.shape: {}".format(XTestReducedNumpy.shape))
print("            pca.n_components_: {}".format(pca.n_components_))
print("      pca.explained_variance_: {}".format(pca.explained_variance_))
print("pca.explained_variance_ratio_: {}".format(pca.explained_variance_ratio_))

      type(XTrainReducedNumpy:<class 'numpy.ndarray'>
     XTrainReducedNumpy.shape: (8424, 5895)
      XTestReducedNumpy.shape: (2106, 5895)
            pca.n_components_: 5895
      pca.explained_variance_: [2.15254990e+04 1.86303089e+04 1.64745828e+04 ... 8.72403837e+00
 8.72019841e+00 8.71888428e+00]
pca.explained_variance_ratio_: [7.24130749e-02 6.26734811e-02 5.54214888e-02 ... 2.93481904e-05
 2.93352726e-05 2.93308518e-05]


In [None]:
# test PCA save and restore
# Show the top-left 3x3 corner of each matrix. Chaining `[0:3][0:3]` slices the
# rows twice and never restricts the columns; `[0:3, 0:3]` selects both rows
# and columns. The printed labels are updated to match the expression.
print("       XTrainNumpy[0:3, 0:3]: {}".format(XTrainNumpy[0:3, 0:3]))
print("XTrainReducedNumpy[0:3, 0:3]: {}".format(XTrainReducedNumpy[0:3, 0:3]))
# NOTE(review): BREAKED is not defined anywhere in the visible notebook, so
# this line raises NameError. Presumably a deliberate stop so that "run all"
# halts here -- confirm before removing.
BREAKED

In [25]:
# NOTE(review): none of these three names is defined in the visible notebook,
# so this line raises NameError and nothing below it runs. Presumably a guard
# against re-running the cell and overwriting the baseline -- confirm.
ldfg;ldkfg;ldf
# Keep the first two training rows as a fixed probe input (the trailing [:]
# is a full slice of the rows already selected).
SAVE= XTrainNumpy[0:2][:]
print(SAVE.shape)
# NOTE(review): compares SAVE with itself and discards the result.
np.array_equal(SAVE, SAVE)

# Baseline projection of the probe rows with the current pca object; the
# following cells compare fresh transforms against it.
REDUCED_SAVE = pca.transform(SAVE)
print(REDUCED_SAVE.shape)

(2, 58581)
(2, 5895)


In [33]:
# Project the probe rows again with the current pca object and check that the
# result is element-for-element equal to the REDUCED_SAVE baseline.
test = pca.transform(SAVE)
print(np.array_equal(REDUCED_SAVE, test))

True


In [29]:
# Disabled code, kept for reference: it loaded the pca from
# XTrainReducedPCAFilePath when that file existed and saved it otherwise.
# %%time
# from sklearn.externals import joblib

# print(XTrainReducedPCAFilePath)
# if os.path.isfile(XTrainReducedPCAFilePath) :
#     print("loading pca and pca from {}".format(XTrainReducedPCAFilePath))
#     pca = joblib.load(XTrainReducedPCAFilePath)
# else :
#     print("save PCA")
#     joblib.dump(pca, XTrainReducedPCAFilePath)
    
# Project the probe rows with the current pca object and check that the result
# is element-for-element equal to the REDUCED_SAVE baseline.
test = pca.transform(SAVE)
print(np.array_equal(REDUCED_SAVE, test))

/bme-230a-ebs/data/reducedXDiseaseTypePCA.pkl
loading pca and pca from /bme-230a-ebs/data/reducedXDiseaseTypePCA.pkl
True
CPU times: user 708 ms, sys: 924 ms, total: 1.63 s
Wall time: 1.63 s


## Create Model
this should be the same as the model in disaseTypeClassifier.ipynb

In [None]:
def multiClassClassifier(inputDim=None, outputDim=None, learningRate=0.001):
    '''
    Build and compile a single-layer softmax classifier.

    The network is: input -> batch normalization -> one dense layer -> softmax.

    arguments:
        inputDim:     number of input features per sample
        outputDim:    number of units in the dense (output) layer
        learningRate: learning rate passed to the Adam optimizer

    returns:
        a keras Sequential model compiled with categorical cross-entropy loss
        and the accuracy metric
    '''
    model = Sequential()
    model.add(InputLayer(input_shape=(inputDim,)))
    model.add(BatchNormalization())
    # dot(input, kernel) + bias
    model.add(Dense(outputDim))
    model.add(Activation('softmax'))

    # https://keras.io/backend/#categorical_crossentropy
    optimizer = Adam(lr=learningRate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model

In [None]:
%%time
# Name of the model; the training cell uses it to build the checkpoint path.
modelName="reducedDiseaseClassifier"
# the first col of yTrainNumpy is the disease value.
# numCases is size of the prediction output, i.e. the number of remaining
# columns (the training cell uses yTrainNumpy[:,1:] as the targets)
numCases = yTrainNumpy.shape[1] -1

# Build the classifier on the PCA-reduced features: one input per retained
# component, one output per remaining label column.
reducedDiseaseClassifierModel = multiClassClassifier(
                                inputDim=XTrainReducedNumpy.shape[1],
                                outputDim=numCases,
                                )
reducedDiseaseClassifierModel.summary()

In [None]:
# https://keras.io/callbacks/
checkPointPath="./models/{}.chkPt".format(modelName)
# The checkpoint file cannot be written if its directory does not exist, and
# the failure would only surface at the end of the first epoch. Create the
# directory up front; exist_ok makes this a no-op when it is already there.
os.makedirs(os.path.dirname(checkPointPath), exist_ok=True)
callbacks = [
    # monitor values are either 'acc' for accuracy or 'loss'
    # 'val_loss' is loss on the hold-out set if validation_split is set
    # 'loss' is loss on training
    # same for 'acc' and 'val_acc'
    #EarlyStopping(monitor='loss', patience=2, verbose=0),
    # save_best_only=False: write a checkpoint after every epoch
    ModelCheckpoint(checkPointPath, monitor='loss', save_best_only=False, verbose=0)
    # FIXME: progbar generates run time error
    #,ProgbarLogger(count_mode='samples', stateful_metrics=None)
]

# Column 0 of yTrainNumpy is the disease value; the remaining columns are the
# one-hot targets the softmax output is trained against.
trainOneHots = yTrainNumpy[:,1:]
history = reducedDiseaseClassifierModel.fit(XTrainReducedNumpy,trainOneHots,
                                        # fit() documents shuffle as a boolean (or 'batch');
                                        # False states "do not shuffle" explicitly instead
                                        # of relying on None being falsy
                                        shuffle=False, # we already shuffled
                                        epochs= 8, #20, #100
                                        batch_size=1024,
                                        # we already split the data
                                        validation_split=0.0,
                                        verbose=0,
                                        callbacks=callbacks
                                     )

In [None]:
# Print the layer-by-layer summary of the model once more, after the training cell.
reducedDiseaseClassifierModel.summary()