# Logistic Regression Tumor/normal Feature Selection
Identify the genes that maximally activate the model and compare them to known cancer-causing genes.

In [1]:
# Print the installed Keras version next to the version this notebook expects.
import keras
print("keras version should be Keras==2.1.6, new version can not save and restore models")
print("keras version:{}".format(keras.__version__))
 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import sys

# fix random seed for reproducibility
# NOTE(review): only NumPy's global RNG is seeded in this cell; TensorFlow is
# imported but not seeded here -- confirm whether that matters for this notebook.
theMeaningOfLife = 42
np.random.seed(theMeaningOfLife)

Using TensorFlow backend.


keras version should be Keras==2.1.6, new version can not save and restore models
keras version:2.1.6


In [2]:
# add path to our local modules
# assume they are in the same directory we launched the Jupyter server in
# /home/ubuntu/BME-230a
!pwd
# Append the local module directory to the import search path, then import the
# project's data-loading helper from the loadData module.
localModuleDir = "."
sys.path.append(localModuleDir)
from loadData import loadTumorNormalData

/home/ubuntu/BME-230a


In [3]:
# save this code example 

# do not use google drive it is really slow
# load data from AWS EBS volume: BME-230a-project volume id: vol-026c8e33988a1475b

# from google.colab import drive
# drive.mount('/content/drive') # force_remount=True
# projectDirUnix="drive/'My Drive'/GD_BME230a/project"
# projectDir="drive/My Drive/GD_BME230a/project"
# !ls $projectDirUnix

# rootDir = "/bme-230a-ebs"
# dataFile = "{}/data/tcga_target_gtex.h5".format(rootDir)
# store = pd.HDFStore(dataFile, mode="r")
# print("store.info():{}".format(store.info()))
# print("store.keys():{}".format(store.keys()))

In [4]:
%%time

# Root of the mounted data volume; also used later to build the model path.
rootDir = "/bme-230a-ebs"
# Only the first two return values are kept; the last two are discarded
# (presumably the test split -- verify against loadTumorNormalData).
XTrainDF, yTrainSeries, _, _ = loadTumorNormalData(rootDir)
# Wrap the labels in a DataFrame so they can be concatenated with the features later.
yTrainDF = pd.DataFrame(yTrainSeries)
yTrainSeries = None # clean up memory

sourceDataFilePath:/bme-230a-ebs/data/tcga_target_gtex.h5
CPU times: user 1.24 s, sys: 4.7 s, total: 5.93 s
Wall time: 5.93 s


In [5]:
# Report the dimensions of the feature matrix, then display its top-left 3x3
# corner as a quick sanity check of the loaded values.
print("XTrainDF.shape:{}".format(XTrainDF.shape))
XTrainDF.iloc[0:3,0:3]

XTrainDF.shape:(15300, 58581)


Unnamed: 0,5S_rRNA,5_8S_rRNA,7SK
0,-9.966041,-9.965816,-0.687321
1,-9.966041,-9.965816,-9.965881
2,-9.966041,-9.965816,-9.965881


# Load trained model

In [6]:
%%time
# Load the previously trained and saved Keras model from <rootDir>/models.
from keras.models import load_model
modelName="logisticRegressionTumorNormal"
modelRootDir = "{}/models".format(rootDir)
# The saved file name is the model name prefixed with "full" and suffixed ".h5".
fullModelPath = "{}/full{}.h5".format(modelRootDir, modelName)
print("fullModelPath:{}".format(fullModelPath))

model = load_model(fullModelPath)
# Print the layer stack so the layer names used later in the notebook
# ('batch_normalization_1', 'dense_1') can be checked against the loaded model.
model.summary()
#model.get_weights()
print("model.optimizer:{}".format(model.optimizer))

fullModelPath:/bme-230a-ebs/models/fulllogisticRegressionTumorNormal.h5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 58581)             0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 58581)             234324    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 58582     
_________________________________________________________________
activation_1 (Activation)    (None, 1)                 0         
Total params: 292,906
Trainable params: 175,744
Non-trainable params: 117,162
_________________________________________________________________
model.optimizer:<keras.optimizers.Adam object at 0x7fa30ab95da0>
CPU times: user 680 ms, sys: 4 ms, total: 684 ms
Wall time: 687 ms


# start eval

In [7]:
%%time
# https://keras.io/getting-started/faq/#how-can-i-obtain-the-output-of-an-intermediate-layer
# We want to identify how much each feature contributes to Z the value passed to the activation layer
# its important that we use the batch_normalization layer from our logistic regression model
# else we will potentially scale the data to a distribution with a different mean and variance

from keras.models import Model

# Build a sub-model that shares the loaded model's input and stops at its
# batch-normalization layer, so the features are scaled by that layer of the
# loaded model rather than by a freshly fitted scaler.
layerName="batch_normalization_1"
normalizationModel = Model(inputs=model.input,
                           outputs=model.get_layer(layerName).output)

normalizedXTrain = normalizationModel.predict(XTrainDF)

# Report the shape of the array that was just computed. The previous version
# printed XTrainDF.shape under this label, so the check could never reveal a
# mismatch between the input and the normalized output.
print("normalizedXTrain.shape:{}".format(normalizedXTrain.shape))
print("type(normalizedXTrain):{}".format(type(normalizedXTrain)))

normalizedXTrain.shape:(15300, 58581)
type(normalizedXTrain):<class 'numpy.ndarray'>
CPU times: user 19.9 s, sys: 872 ms, total: 20.7 s
Wall time: 6.23 s


In [8]:
# Make sure batch normalization was applied: print the same 5x5 corner of the
# raw features and of the normalized array so the two can be compared by eye.
print(XTrainDF.iloc[0:5,0:5])
print()
print(normalizedXTrain[0:5,0:5])

    5S_rRNA  5_8S_rRNA       7SK    A1BG  A1BG-AS1
0 -9.966041  -9.965816 -0.687321  2.7487    0.6425
1 -9.966041  -9.965816 -9.965881  1.7489    0.4552
2 -9.966041  -9.965816 -9.965881  3.1393   -0.5332
3 -9.966041  -9.965816 -9.965881  1.8957    0.3346
4 -9.966041  -9.965816 -9.965881  6.1639    2.7951

[[-0.22909236 -0.01842499  1.7634513  -0.4423349  -0.33213907]
 [-0.22909236 -0.01842499 -0.59044564 -0.93999165 -0.45341104]
 [-0.22909236 -0.01842499 -0.59044564 -0.24791121 -1.0933752 ]
 [-0.22909236 -0.01842499 -0.59044564 -0.86692107 -0.5314965 ]
 [-0.22909236 -0.01842499 -0.59044564  1.2576027   1.0616152 ]]


In [9]:
%%time
# Create a new DataFrame that combines our features with our labels.
# The index of XTrainDF is integers.
# The index of yTrainDF is GTEX sample ids, e.g. GTEX-ZQG8-2426-SM-57WEE, so it is
# reset (drop=True) before the column-wise concat; otherwise the two frames would
# be aligned on non-matching index labels.
# NOTE(review): this assumes the label rows are in the same order as the feature
# rows -- confirm against loadTumorNormalData.
normalizedXTrainDF = pd.DataFrame(normalizedXTrain, columns=XTrainDF.columns)
evalTrainDF = pd.concat([normalizedXTrainDF, yTrainDF.reset_index(drop=True)], axis=1)
yTrainDF = None # clean up memory
normalizedXTrainDF = None # clean up memory
XTrainDF = None # clean up memory

CPU times: user 2.22 s, sys: 696 ms, total: 2.91 s
Wall time: 2.92 s


In [10]:
def calcStats(df, categoryColName, listOfAggFunctions):
    '''
    Compute per-class aggregate statistics.

    The rows of df are split into groups that share a value in the
    categoryColName column, and every aggregate in listOfAggFunctions is
    evaluated on each group separately.
    (approach based on https://stackoverflow.com/a/14734627/4586180)

    arguments:
        df:
            the dataframe to summarize

        categoryColName:
            name of the column whose values define the groups

        listOfAggFunctions:
            list of aggregate function names, e.g. ['min', 'max', 'mean']

    returns
        a dictionary with one entry per distinct value found in the
        categoryColName column. Each value is the dataframe produced by
        aggregating that group: one row per requested aggregate function.
    '''
    return {
        category: members.agg(listOfAggFunctions)
        for category, members in df.groupby(categoryColName)
    }
        
   
# Toy demonstration of calcStats(): two genes, four samples, two classes.
tn = pd.Series(["normal","tumor","normal","tumor"], dtype="category")
df = pd.DataFrame({
    "gene1":[1,2,3,4],
    "gene2":[11, 22, 33, 44],
    "tumorNormal":tn,
})

ret = calcStats(
    df=df,
    categoryColName="tumorNormal",
    listOfAggFunctions=['min', 'max', 'mean'],
)

# Show the per-class statistics table, one class at a time.
print()
for key, classStats in ret.items():
    print("key:{}\n{}\n".format(key, classStats))


key:normal
      gene1  gene2
min     1.0   11.0
max     3.0   33.0
mean    2.0   22.0

key:tumor
      gene1  gene2
min     2.0   22.0
max     4.0   44.0
mean    3.0   33.0



In [11]:
%%time
# For each class, calculate the min, max, standard deviation and mean of every
# column (every gene plus the label column itself).
statsTrainDict = calcStats(df=evalTrainDF, categoryColName="tumor_normal_value", 
                      listOfAggFunctions=['min', 'max', 'std', 'mean'])
evalTrainDF = None # clean up memory

CPU times: user 1min 12s, sys: 884 ms, total: 1min 13s
Wall time: 1min 13s


In [12]:
# The dictionary is indexed here with the integer class labels 0 and 1
# (not the strings '0' / '1').
# TODO(AEDWIP): confirm that label 0 really is "normal" and label 1 is "tumor";
# the variable names below assume it.
normalTrainDF = statsTrainDict[0]
tumorTrainDF = statsTrainDict[1]
print("normalTrainDF.shape:{}".format(normalTrainDF.shape))
print(" tumorTrainDF.shape:{}".format(tumorTrainDF.shape))

normalTrainDF.shape:(4, 58582)
 tumorTrainDF.shape:(4, 58582)


In [13]:
# Show the statistics rows for the first three columns ...
print(normalTrainDF.iloc[0:4, 0:3])
# ... and for the last three columns, selected by label.
last3ColLabels = normalTrainDF.columns[-3:]
print()
print(normalTrainDF.iloc[0:4,:].loc[:,last3ColLabels])

       5S_rRNA  5_8S_rRNA       7SK
min  -0.229092  -0.018425 -0.590446
max   5.554689  68.263947  2.793195
std   1.095426   0.823410  1.058365
mean  0.045851  -0.008494  0.078725

        uc_338  yR211F11.2  tumor_normal_value
min  -2.405998   -0.343928                 0.0
max   1.359166    4.647242                 0.0
std   0.827414    0.876714                 0.0
mean  0.305524   -0.090991                 0.0


In [14]:
%%time
# Identify which genes have the greatest effect on z
# by multiplying the per-gene means by the dense layer weights.
# This cell only extracts and inspects the weights; the multiplication happens
# in a later cell.
denseLayer = model.get_layer('dense_1')
print(denseLayer.get_config())
# get_weights() returns a list; the prints below show its length and the length
# of each entry. The first entry is used as the per-gene coefficients.
# NOTE(review): the second entry is presumably the bias -- confirm.
weights = denseLayer.get_weights()
print("\ntype(weights):{}".format( type(weights) ))
print("len(weights):{}".format(len(weights)))
print("weights[0][0:3]:{}".format(weights[0][0:3]))
print("len(weights[0]):{}".format(len(weights[0])))
print("type(weights[0]):{}".format(type(weights[0])))
print("len(weights[1]):{}".format(len(weights[1])))
coeficients = weights[0]
print("coeficients.shape:{}".format(coeficients.shape))

{'name': 'dense_1', 'trainable': True, 'units': 1, 'activation': 'linear', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}

type(weights):<class 'list'>
len(weights):2
weights[0][0:3]:[[ 0.00080593]
 [-0.00365209]
 [ 0.00453593]]
len(weights[0]):58581
type(weights[0]):<class 'numpy.ndarray'>
len(weights[1]):1
coeficients.shape:(58581, 1)
CPU times: user 8 ms, sys: 4 ms, total: 12 ms
Wall time: 8.53 ms


In [15]:
# Spot-check the first two coefficients; each entry is itself a 1-element array.
print(coeficients[0])
print(coeficients[1])

[0.00080593]
[-0.00365209]


In [16]:
# last is the list of features,
#lastTrain = normalTrainDF.columns[:-1]

# Select all the gene column labels. The last column is tumor_normal_value (the
# label), which must be excluded so only gene columns remain.
normalTrainFeatureCols =  normalTrainDF.columns[:-1]
# print("type(lastTrain):{}".format(type(lastTrain)))
# print("lastTrain.shape:{}".format(lastTrain.shape))
# print("lastTrain:{}".format(lastTrain))

# Keep only the 'mean' row of the statistics table, restricted to the gene columns.
# normalTrainMeansSeries = normalTrainDF.loc['mean', last3ColLabels]
normalTrainMeansSeries = normalTrainDF.loc['mean', normalTrainFeatureCols]
normalTrainDF = None # clear memory

# print("\nnormalTrainMeansSeries.head:\n{}".format(normalTrainMeansSeries.head()))
      
# print("\normalTrainMeansSeries.shape:{}".format(normalTrainMeansSeries.shape))
# print("type(normalTrainMeansSeries):{}".format(type(normalTrainMeansSeries)))

In [17]:
%%time
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.multiply.html
# Prepare the per-gene means for element-wise multiplication with the coefficients.
normalTrainMeansValues = normalTrainMeansSeries.values
normalTrainMeansSeries = None # clean memory
print("normalTrainMeansValues.shape:{}".format(normalTrainMeansValues.shape))

# np.multiply with shapes (58581,) * (58581, 1) exhausts memory: NumPy broadcasts
# a 1-D array against a column vector into a full (58581, 58581) result.
# Reshaping the means to a column vector gives both operands the same (n, 1)
# shape, so the product is element-wise.
s = normalTrainMeansValues.shape
normalTrainMeansValues = np.reshape(normalTrainMeansValues, (s[0], 1))
print("type(normalTrainMeansValues):{}".format(type(normalTrainMeansValues)))
# print(normalTrainMeansValues[0])
# print(normalTrainMeansValues[1])
# print(normalTrainMeansValues[2])
print("valuenormalTrainMeansValues:{}".format(normalTrainMeansValues))

normalTrainMeansValues.shape:(58581,)
type(normalTrainMeansValues):<class 'numpy.ndarray'>
valuenormalTrainMeansValues:[[ 0.04585141]
 [-0.00849445]
 [ 0.07872539]
 ...
 [-0.00280762]
 [ 0.30552363]
 [-0.09099111]]
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 418 Âµs


In [19]:
%%time
# total crushes memory swap goes to 19.85 GB
# <class 'numpy.ndarray'>
# CPU times: user 11.6 s, sys: 12.8 s, total: 24.4 s
# Wall time: 1min 15s
# Show the shapes and types of both operands before multiplying them.
print("normalTrainMeansValues.shape:{}".format(normalTrainMeansValues.shape))
print("type(normalTrainMeansValues):{}".format(type(normalTrainMeansValues)))
print("coeficients.shape:{}".format(coeficients.shape))
print("type(coeficients):{}".format(type(coeficients)))

# https://www.geeksforgeeks.org/garbage-collection-python/
import gc
# (threshold, allocations, deallocations)
print("Garbage collection thresholds:", gc.get_threshold())

# Force a collection so memory released by the earlier "= None" assignments is
# reclaimed before allocating the result; collect() returns the number of
# objects it collected.
collected = gc.collect()
print("Garbage collector: collected", "%d objects." % collected)

# Guard against accidental broadcasting. If the means are still 1-D, shape (n,),
# while the coefficients are (n, 1), np.multiply silently broadcasts to an
# (n, n) array, which is what previously exhausted memory. Fail fast with a
# clear message instead of swapping or raising MemoryError.
if normalTrainMeansValues.shape != coeficients.shape:
    raise ValueError(
        "shape mismatch: normalTrainMeansValues{} vs coeficients{}; "
        "reshape the means to a column vector before multiplying".format(
            normalTrainMeansValues.shape, coeficients.shape))

# Per-gene contribution to z: class mean of the normalized gene * its weight.
normalContribution = np.multiply(normalTrainMeansValues, coeficients) # element wise mult
#normalContribution = normalTrainMeansValues * coeficients
# n = normalTrainMeansValues.shape[0]
# bucketSize = 5000
# quotient, remainder = divmod(n,bucketSize)
# print("\nn:{} bucketSize:{} quotient:{} remainder:{}\n".format(n, bucketSize, quotient, remainder))
# i = 0
# q = 0
# normalContribution = np.empty((n,1))
# while i < n:
#     if q < quotient:
#         ll = bucketSize
#     else:
#         ll = remainder
        
#     ntmv = normalTrainMeansValues[i:i+ll]
#     c = coeficients[i:i+ll]
#     print("ntmv.shape{} c.shape:{}".format(ntmv.reshape(ll,1).shape, c.shape))

#     tmp = np.multiply(ntmv.reshape(ll,1), c)
#     print("tmp.shape:{}".format(tmp.shape))
#     normalContribution[i:i+ll] = tmp
#     q += 1
#     i += ll
#     print("i:{}, q:{}, ll:{}".format(i, q, ll))



normalTrainMeansValues.shape:(58581, 1)
type(normalTrainMeansValues):<class 'numpy.ndarray'>
coeficients.shape:(58581, 1)
type(coeficients):<class 'numpy.ndarray'>
Garbage collection thresholds: (700, 10, 10)
Garbage collector: collected 0 objects.
CPU times: user 64 ms, sys: 0 ns, total: 64 ms
Wall time: 64.6 ms


In [20]:
# Sanity check the result: its shape plus the first four and last three rows.
print("normalContribution.shape:{}".format(normalContribution.shape))
print("normalContribution[0:4]:{}".format(normalContribution[0:4]))
print("normalContribution[-3:]:{}".format(normalContribution[-3:]))

normalContribution.shape:(58581, 1)
normalContribution[0:4]:[[ 3.69530185e-05]
 [ 3.10225339e-05]
 [ 3.57093244e-04]
 [-2.48970414e-04]]
normalContribution[-3:]:[[ 9.34205485e-06]
 [-4.02751563e-03]
 [ 3.43294116e-06]]


In [None]:
# Scratch check: element-wise product of two small arrays, and assigning a
# slice of one array into an uninitialized buffer.
a = np.array([1, 2, 3])
b = a + 2
print(a)
print(b)
print(np.multiply(a, b))
# Only the first element of e is assigned; the rest is whatever np.empty left.
e = np.empty(a.shape)
e[:1] = a[:1]
print(e)

In [None]:
# Scratch check: indexing, slicing and updating a slice of the element-wise product.
c = np.multiply(a, b)
print(c)
print(c[0])
print(c[:2])
c[:2] += 10
print(c)

In [None]:
%%time
# bins = numpy.linspace(-10, 10, 100)

# Histogram (50 bins) of the per-gene contributions computed for the normal class.
plt.hist(normalContribution, bins=50, alpha=0.5, label='normal')
#pyplot.hist(y, bins=50, alpha=0.5, label='y')
plt.legend(loc='upper right')
plt.show()

In [None]:
# %%time
# from sklearn.metrics import confusion_matrix

# yPredict = model.predict(XTestDF)
# print(type(yPredict))
# print(yPredict[0:3])

# yTestPredict = [1 if p > 0.5 else 0 for p in yPredict]
# #print(yTestPredict[0:3])
# cf = confusion_matrix(yTestDF, yTestPredict)
# print(cf)
# expected = [[1707, 13],[8, 2098]]
# np.testing.assert_array_equal(cf, expected)