# Cancer Type Classifier
- BME 230A class project winter 2019
- Andrew E. Davidson
- [aedavids@ucsc.edu](mailto:aedavids@ucsc.edu?subject=SimpleModel.ipynb)

Classify cancer type from gene expression data.

In [None]:
import os
import sys

from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.layers import Activation
from keras.layers import BatchNormalization
from keras.layers import Dense
from keras.layers import InputLayer
from keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# fix random seed for reproducibility
theMeaningOfLife = 42

import keras
if "2.1.6" != keras.__version__ :
    emsg = "ERROR keras version {} != 2.1.6, new version can not save and restore models".format(keras.__version__)
    raise ValueError(emsg)

# add path to our local modules
# assume they are in the same directory we launched the juypter server in
# /home/ubuntu/BME-230a
!pwd
localModuleDir = "."
sys.path.append(localModuleDir)
from loadData import loadTumorNormalData

In [None]:
# Where the HDF5 data file is expected to be
rootDir = "/bme-230a-ebs"
sourceDataFilePath = "{}/data/tcga_target_gtex.h5".format(rootDir)
print(sourceDataFilePath)

# Stop right away, with a hint, when the data file is missing
dataFileFound = os.path.isfile(sourceDataFilePath)
if not dataFileFound:
    emsg = "ERROR: {} not found".format(sourceDataFilePath)
    for line in (emsg, "change rootDir"):
        print(line)
    sys.stdout.flush()  # get the messages out before the exception is reported
    raise ValueError(emsg)

In [None]:
%%time
# Load training set
XDF = pd.read_hdf(sourceDataFilePath, "expression")
print("XDF.shape:{}".format(XDF.shape))

yDF = pd.read_hdf(sourceDataFilePath, "labels")
print("yDF.shape:{} type(yDF):".format(yDF.shape)) 

In [None]:
# Show the first row of the expression data
XDF.head(n=1)

In [None]:
# Show the first row of the labels
yDF.head(n=1)

In [None]:
# Do the "disease" and "category" columns hold the same sequence of unique values?
uniqueDiseases = pd.unique(yDF["disease"])
uniqueCategories = pd.unique(yDF["category"])
np.array_equal(uniqueDiseases, uniqueCategories)

In [None]:
# The distinct disease labels, and how many of them there are
diseaseClasses = pd.unique(yDF["disease"])
diseaseK = len(diseaseClasses)

summaryFmt = "diseaseK: {}, type(diseaseClasses): {}"
print(summaryFmt.format(diseaseK, type(diseaseClasses)))
print(diseaseClasses)

In [None]:
def plotCategoryCounts(df, colNameStr, figSize=(5, 18)):
    '''
    Draw a horizontal bar chart of how often each value occurs in one column.

    TODO: make this more generic

    arguments
        df: a pandas data frame
        colNameStr: name of the column in df whose values are counted
        figSize: (width, height) of the figure in inches. The default is the
            5 x 18 size this function used before the size was configurable

    returns
        (fig, ax): the matplotlib figure and axes the bars were drawn on
    '''
    # value_counts() gives a Series: index = distinct value, data = row count
    counts = df[colNameStr].value_counts()
    fig, ax = plt.subplots()
    fig.set_size_inches(*figSize)
    ax.barh(counts.index, counts)

    return (fig, ax)

# Bar chart of the number of rows per disease label
fig, ax = plotCategoryCounts(df=yDF, colNameStr="disease")
ax.set_title(label="disease counts")

## Prepare data

In [None]:
# Give every disease name an integer code and store it next to the name
encoder = LabelEncoder()
diseaseCodes = encoder.fit_transform(yDF["disease"])
yDF["disease_value"] = pd.Series(diseaseCodes, index=yDF.index)
yDF.loc[:, ["disease", "disease_value"]].head(3)

In [None]:
#%%time
# Split into stratified training and test sets based on the disease class so
# that each disease type appears in the same proportion in the train and test sets
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=theMeaningOfLife)

# Pull features and labels out as numpy arrays once. The indices produced by
# split() are row positions, so they are applied to the arrays rather than to
# the pandas Series: Series[...] is label-based and its positional fallback for
# integer keys is deprecated.
XNumpy = XDF.values
yNumpy = yDF["disease_value"].values

# n_splits=1, so the generator yields exactly one (train, test) index pair
train_index, test_index = next(split.split(XNumpy, yNumpy))
XTrainNumpy, XTestNumpy = XNumpy[train_index], XNumpy[test_index]
yTrainNumpy, yTestNumpy = yNumpy[train_index], yNumpy[test_index]

print("XTrainNumpy.shape: {} XTestNumpy.shape: {}".format(XTrainNumpy.shape, XTestNumpy.shape))
# The label arrays are 1-D at this point, e.g. yTestNumpy.shape: (3826,), which
# causes bugs in other packages. Reshape them into explicit column vectors;
# the -1 lets numpy work out the size of the last axis.
yTrainNumpy = np.reshape(yTrainNumpy, (yTrainNumpy.shape[0], -1))
yTestNumpy = np.reshape(yTestNumpy, (yTestNumpy.shape[0], -1))
print("yTrainNumpy.shape: {} yTestNumpy.shape: {}".format(yTrainNumpy.shape, yTestNumpy.shape))
# was type(yTestSeries), a name that is never defined in this notebook
print(type(yTestNumpy))

## Create Model

In [None]:
def multiClassClassifier(inputDim=None, outputDim=None, lr=0.001):
    '''
    Build and compile a softmax classifier with a single dense layer.

    The layers are: input -> batch normalization -> dense (outputDim units)
    -> softmax.

    arguments
        inputDim: number of input features per sample (required)
        outputDim: number of classes, i.e. units in the dense layer (required)
        lr: learning rate passed to the Adam optimizer

    returns
        a compiled keras Sequential model. It is compiled with
        'categorical_crossentropy' loss and the 'accuracy' metric, so it
        should be trained on one-hot encoded targets.

    raises
        ValueError if inputDim or outputDim is not given
    '''
    # fail with a clear message instead of an obscure error from inside keras
    if inputDim is None or outputDim is None:
        raise ValueError(
            "ERROR inputDim and outputDim are required, got inputDim={} outputDim={}".format(
                inputDim, outputDim))

    classify = [
        InputLayer(input_shape=(inputDim,)),
        BatchNormalization(),
        Dense(outputDim), # dot(input, kernel) + bias
        Activation('softmax')
    ]

    model = Sequential(classify)

    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=lr), metrics=['accuracy'])

    return model

In [None]:
%%time
modelName="diseaseClassifier"
diseaseClassifierModel = multiClassClassifier(
                                input_dim=X_train.shape[1],
                                output_dim=diseaseK,
                                )
diseaseClassifierModel.summary()

# https://keras.io/callbacks/
checkPointPath="./models/{}.chkPt".format(modelName)
callbacks = [
    # monitor valuse either 'acc' for accuracy or 'loss'
    # 'val_loss' is loss on hold if valaidation_split is set
    # 'loss' is loss on training
    # same for 'acc' and 'val_acc'
    EarlyStopping(monitor='loss', patience=2, verbose=0) 
    ,ModelCheckpoint(checkPointPath, monitor='loss', save_best_only=False, verbose=0)
    # FIXME: progbar generates run time error
    #,ProgbarLogger(count_mode='samples', stateful_metrics=None)
]

history = diseaseClassifierModel.fit(X_train, y_train.values,        
                                        shuffle=None, # we already shuffled
                                        epochs= 20, #100
                                        batch_size=1024, 
                                        # we already split the data         
                                        validation_split=0.0, 
                                        verbose=0,
                                        callbacks=callbacks
                                     )