# Logistic Regression Tumor/normal Feature Selection
Identify the genes that contribute most to the model's activation and compare them to known cancer-causing genes.

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

# fix random seed for reproducibility
# (the same value is also passed as random_state to the train/test split)
theMeaningOfLife = 42
np.random.seed(theMeaningOfLife)

In [8]:
# Running the Jupyter notebook on my MacBook Pro was really slow.
# Work-around: the project files were copied to Google Drive.
# Google Drive does not respect the symbolic link I made:
#   ln -s rcurrie-tumornormal/data project/data
# Work-around: move the data.
from google.colab import drive
drive.mount('/content/drive') # force_remount=True
# projectDirUnix quotes 'My Drive' so the space survives the shell command below;
# projectDir is the same location written as a plain Python string
projectDirUnix="drive/'My Drive'/GD_BME230a/project"
projectDir="drive/My Drive/GD_BME230a/project"
!ls $projectDirUnix

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 data				  README.md~
 loadData-googleSyncProblem.py	  requirements.txt
 loadData-googleSyncProblem.py~   sandbox
 loadData.py			  simpleModelFeatureSelection.ipynb
 models				  simpleModel.ipynb
 __pycache__			  simpleModel.pdf
'__pycache__ (1)'		  t
 rcurrie-tumornormal		  tensorFlowForDeepLearning
 README.md			  test.h5


In [9]:
%%time
# load model
from keras.models import load_model
modelName="logisticRegressionTumorNormal"
fullModelPath = "{}/models/full{}.h5".format(projectDir,modelName)
print("fullModelPath:{}".format(fullModelPath))

model = load_model(fullModelPath)
model.summary()
#model.get_weights()
print("model.optimizer:{}".format(model.optimizer))

fullModelPath:drive/My Drive/GD_BME230a/project/models/fulllogisticRegressionTumorNormal.h5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_1 (Batch (None, 58581)             234324    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 58582     
_________________________________________________________________
activation_1 (Activation)    (None, 1)                 0         
Total params: 292,906
Trainable params: 175,744
Non-trainable params: 117,162
_________________________________________________________________
model.optimizer:<keras.optimizers.Adam object at 0x7f29cd57d0f0>
CPU times: user 879 ms, sys: 34.2 ms, total: 913 ms
Wall time: 929 ms


In [0]:
# data file locations, relative to the project root directory
# _sourceDataFile is the HDF5 file read by loadTumorNormalData()
_sourceDataFile = "data/tcga_target_gtex.h5"            
_tumorNormalFile = "data/logisticRegressionTumorNormal.h5" 

def loadTumorNormalData(projectDir):
    '''
    Load the expression data and labels and return a stratified train/test split.

    This function is defined in the notebook as a work-around for a Google
    sync issue. Without the work-around it would be imported from the
    project's loadData module:

        import sys
        sys.path.append(projectDir)
        from loadData import loadTumorNormalData

    arguments:
        projectDir: root of project

    returns:
        a tuple (XTrainDF, y_train, XTestDF, y_test)
            XTrainDF, XTestDF:
                dataframes with the columns of the "expression" table and a
                new default integer index
            y_train, y_test:
                pandas series of the encoded "tumor_normal" labels; they keep
                the index of the "labels" table
    '''
    # imported here so the function can be copied between the notebook and
    # loadData.py without touching the notebook's import cell
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.preprocessing import LabelEncoder

    sourceDataFilePath = "{}/{}".format(projectDir, _sourceDataFile)
    print("sourceDataFilePath:{}".format(sourceDataFilePath))

    # pd.read_hdf() opens and closes the file itself. The original code also
    # opened a pd.HDFStore that was never used or closed, leaking the handle.
    X = pd.read_hdf(sourceDataFilePath, "expression")
    Y = pd.read_hdf(sourceDataFilePath, "labels")

    # Convert tumor_normal into numerical values
    encoder = LabelEncoder()
    Y["tumor_normal_value"] = pd.Series(encoder.fit_transform(Y["tumor_normal"]), index=Y.index)
    labels = Y["tumor_normal_value"]

    # Split into stratified training and test sets based on the tumor/normal
    # class so that both sets have the same proportion of each class.
    # n_splits=1, so the generator yields exactly one (train, test) pair.
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=theMeaningOfLife)
    train_index, test_index = next(split.split(X.values, labels))

    # X.values[...] are numpy arrays; wrap them so the column names are kept
    XTrainDF = pd.DataFrame(X.values[train_index], columns=X.columns)
    XTestDF = pd.DataFrame(X.values[test_index], columns=X.columns)

    # train_index/test_index are row positions, not index labels, so select
    # with .iloc; plain [] on a non-integer index is not reliably positional
    y_train = labels.iloc[train_index]
    y_test = labels.iloc[test_index]

    return (XTrainDF, y_train, XTestDF, y_test)


In [11]:
 %%time

# # add local python modules to path
# import sys
# sys.path.append(projectDir)

# # load data
# from loadData import loadTumorNormalData

# only the training split is used in this notebook; the test split is discarded
XTrainDF, yTrainSeries, _, _ = loadTumorNormalData(projectDir)
# wrap the label series in a dataframe so it can later be concatenated with the features
yTrainDF = pd.DataFrame(yTrainSeries)
yTrainSeries = None # clean up memory

sourceDataFilePath:drive/My Drive/GD_BME230a/project/data/tcga_target_gtex.h5


OSError: ignored

# start eval

In [0]:
%%time
# https://keras.io/getting-started/faq/#how-can-i-obtain-the-output-of-an-intermediate-layer
# We want to identify how much each feature contributes to Z the value passed to the activation layer
# its important that we use the batch_normalization layer from our logistic regression model
# else we will potentially scale the data to a distribution with a different mean and variance

from keras.models import Model

layerName="batch_normalization_1"
normalizationModel = Model(inputs=model.input,
                                 outputs=model.get_layer(layerName).output)

normalizedXTrain = normalizationModel.predict(XTrainDF)

print("normalizedXTrain.shape:{}".format(XTrainDF.shape))
print("type(normalizedXTrain):{}".format(type(normalizedXTrain)))

In [0]:
# make sure batch normalization was applied: print the same 5x5 corner of the
# raw and of the normalized training data, separated by an empty line
print(XTrainDF.iloc[:5, :5], "", normalizedXTrain[:5, :5], sep="\n")

In [0]:
%%time
# create a new dataFrame that combines our features with out labels
# the index for XTrainDF are integers
# teh index for yTrainDF GTEX values e.g. GTEX-ZQG8-2426-SM-57WEE
normalizedXTrainDF = pd.DataFrame(normalizedXTrain, columns=XTrainDF.columns)
evalDF = pd.concat([normalizedXTrainDF, yTrainDF.reset_index(drop=True)], axis=1)
normalizedXTrainDF = None # clean up memory
XTrainDF = None # clean up memory

In [0]:
def calcStats(df, categoryColName, listOfAggFunctions):
    '''
    Compute aggregate statistics separately for each class of a category column.

    # https://stackoverflow.com/a/14734627/4586180
    The rows of df are grouped by categoryColName and every aggregate
    function is applied to each group.

    arguments:
        df:
            dataframe

        categoryColName:
            a string identifying the column to group by

        listOfAggFunctions:
            a list of string names of the aggregate functions to run

    returns
        a dictionary.
            The keys are the classes found in the column identified by categoryColName
            Each value is a dataframe with the aggregate values for that class

    '''
    return {
        category: rows.agg(listOfAggFunctions)
        for category, rows in df.groupby(categoryColName)
    }
        
   
# it is easier to look at a small example than to explain what calcStats() does
tn = pd.Series(["normal", "tumor", "normal", "tumor"], dtype="category")
df = pd.DataFrame({
    "gene1": [1, 2, 3, 4],
    "gene2": [11, 22, 33, 44],
    "tumorNormal": tn,
})

ret = calcStats(df, "tumorNormal", ['min', 'max', 'mean'])

# one block of statistics per class
print()
for key in ret:
    print("key:{}\n{}\n".format(key, ret[key]))

In [0]:
%%time
# for each class, calculate the min, max, std and mean of every column
statsDict = calcStats(df=evalDF, categoryColName="tumor_normal_value", 
                      listOfAggFunctions=['min', 'max', 'std', 'mean'])
evalDF = None # clean up memory

In [0]:
# statsDict is indexed here with the integer class labels 0 and 1
# AEDWIP make sure 0 is normal
normalDF, tumorDF = statsDict[0], statsDict[1]
print("normalDF.shape:{}\n tumorDF.shape:{}".format(normalDF.shape, tumorDF.shape))

In [0]:
# peek at the statistics: first four rows of the first three columns ...
print(normalDF.iloc[0:4, 0:3])
last = normalDF.columns[-3:]
print()
# ... and the same rows of the last three columns
print(normalDF.iloc[0:4,:].loc[:,last])

In [0]:
%%time
# identify which gene have the greatest effect on z
# by multiplying the means by the dense layer weight
denseLayer = model.get_layer('dense_1')
print(denseLayer.get_config())
weights = denseLayer.get_weights()
print("\ntype(weights):{}".format( type(weights) ))
print("len(weights):{}".format(len(weights)))
print("weights[0][0:3]:{}".format(weights[0][0:3]))
print("len(weights[0]):{}".format(len(weights[0])))
print("type(weights[0]):{}".format(type(weights[0])))
print("len(weights[1]):{}".format(len(weights[1])))
coeficients = weights[0]
print("coeficients.shape:{}".format(coeficients.shape))

In [0]:
# last is the list of features: every column except the final one
# (presumably the tumor_normal_value label column -- TODO confirm)
last = normalDF.columns[:-1]
print(last)

# the 'mean' row of the statistics, restricted to the feature columns
normalMeansSeries = normalDF.loc['mean', last]

print("\nnormalMeansSeries.head:\n{}".format(normalMeansSeries.head()))
      
print("\nnormalMeansSeries.shape:{}".format(normalMeansSeries.shape))
print("type(normalMeansSeries):{}".format(type(normalMeansSeries)))

In [0]:
%%time
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.multiply.html
# element wise multiplication
v = normalMeansSeries.values
print(v.shape)
print(type(v))

# total crushes memory swap goes to 19.85 GB
# <class 'numpy.ndarray'>
# CPU times: user 11.6 s, sys: 12.8 s, total: 24.4 s
# Wall time: 1min 15s
normalContribution = np.multiply(v,coeficients)

# very slow
#normalContribution = np.multiply(normalMeansSeries,coeficients)

In [0]:
# display the first four entries of normalContribution
normalContribution[0:4]

In [0]:
%%time
# bins = numpy.linspace(-10, 10, 100)

plt.hist(normalContribution, bins=50, alpha=0.5, label='normal')
#pyplot.hist(y, bins=50, alpha=0.5, label='y')
plt.legend(loc='upper right')
plt.show()

In [0]:
# %%time
# # print(XTrainDF.shape)
# # print(yTrainDF.shape)

# # print()
# # print(XTrainDF.head(2))
# # print(yTrainDF.head(2))

# aaa = XTrainDF.iloc[0:3,0:3]
# bbb = yTrainDF.iloc[0:3,:]

# print("aaa.shape():{}".format(aaa.shape))
# print("bbb.shape():{}".format(bbb.shape))

# print(aaa)
# print("\nbbb")
# print(bbb)
# print(bbb.loc[:,'tumor_normal_value'].values)


# # append the label column
# ccc = pd.concat([aaa, bbb.reset_index(drop=True)], axis=1)
# print("\n\nccc:\n{}".format(ccc))

# print("\n\n, did bbb loose its index?\n{}".format(bbb))


# # print(bbb.loc[:,'id'])

# # aaa['tumor_normal_value'] = bbb.loc[:,'tumor_normal_value'].values
# # print("\n\n new and improve\n:{}".format(aaa))
# # print("\n\n******* did cat work")

# #print(pd.concat([aaa, bbb.values], axis=1))
# #print(evalDF)
# # print(evalDF.iloc[0:3, 0:3])
# # print(evalDF.iloc[0:3, -3])

In [0]:
# print(yTrainDF.columns)
# print(yTrainDF.iloc[0:3,-1].values)
# print(yTrainDF.index[0:3])

In [0]:
# # append label to training set
# XTrainDF['tumor_normal_value'] = yTrainDF.loc[:,'tumor_normal_value'].values
# print(XTrainDF.iloc[0:3,0:3])
# print()
# print(XTrainDF.iloc[0:3,-3])
# print()
# print(XTrainDF.loc[0:3, 'tumor_normal_value'])

In [0]:
#statsDict = calcStats(evalDF, categoryColName="tumorNormal", listOfAggFunctions=['min', 'max', 'mean'])

In [0]:
# %%time
# from sklearn.metrics import confusion_matrix

# yPredict = model.predict(XTestDF)
# print(type(yPredict))
# print(yPredict[0:3])

# yTestPredict = [1 if p > 0.5 else 0 for p in yPredict]
# #print(yTestPredict[0:3])
# cf = confusion_matrix(yTestDF, yTestPredict)
# print(cf)
# expected = [[1707, 13],[8, 2098]]
# np.testing.assert_array_equal(cf, expected)