In [None]:
import numpy as np
import matplotlib.pyplot as plt
import util

from keras.models import Model
from keras.layers import Dense
from keras.layers import LSTM, Input

import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split


In [None]:

# Traces whose maximum intensity does not exceed this value are relabelled as
# negatives further down. The identifier keeps its original "treshold" spelling
# because later cells reference it by that name.
intensity_treshold = 8000

# Training inputs: one peak-annotation CSV and one mzML file per sample.
data_folder = '../data/sara_training/'
peak_clarkii= data_folder + 'clarkiiT0I1_peak_annotations.csv' # annotated peaks: mz, rt, Y/N
mz_clarkii = data_folder + 'clarkiiT0I1.mzML'
peak_viridis= data_folder + 'viridisT0I2_peak_annotations.csv' # annotated peaks: mz, rt, Y/N
mz_viridis = data_folder + 'viridisT0I2.mzML'

# Location of the cached training arrays (written and re-read by later cells).
pickle_path = './output/pickle_training.npy'

In [None]:
class Factory:
    """Collects training examples from several samples and caches them on disk.

    The four attributes are parallel lists with one entry per example:

    * ``X_``  -- first item returned by ``util.build_data_ml`` (presumably the
      signal of each candidate peak; confirm against ``util``).
    * ``y_``  -- second item (presumably the label).
    * ``mz_`` -- third item (presumably the m/z value).
    * ``rt_`` -- fourth item (presumably the retention time).
    """

    def __init__(self):
        self.X_ = []
        self.y_ = []
        self.mz_ = []
        self.rt_ = []

    def add_data(self, path_annotation, path_mz):
        """Append the examples built from one annotation file / mzML file pair.

        Args:
            path_annotation: path of the peak-annotation CSV.
            path_mz: path of the matching mzML file.
        """
        d = util.build_data_ml(path_annotation, path_mz)
        self.X_.extend(d[0])
        self.y_.extend(d[1])
        self.mz_.extend(d[2])
        self.rt_.extend(d[3])

    def save(self, file_out):
        """Write the four lists to ``file_out`` as one pickled object array."""
        with open(file_out, 'wb') as fi:
            np.save(fi, np.asanyarray([self.X_, self.y_, self.mz_, self.rt_], dtype=object))

    def load(self, file_out):
        """Replace the current content with the arrays stored in ``file_out``.

        The rows are converted back to plain lists so the instance ends up in
        the same state ``__init__``/``add_data`` produce; leaving them as NumPy
        arrays would make a later ``add_data`` perform element-wise arithmetic
        instead of appending.
        """
        # SECURITY: allow_pickle=True unpickles arbitrary objects, which can
        # execute code. Only load files produced by ``save`` from a trusted source.
        data = np.load(file_out, allow_pickle=True)
        self.X_ = list(data[0])
        self.y_ = list(data[1])
        self.mz_ = list(data[2])
        self.rt_ = list(data[3])


In [None]:
# Parse every (annotation CSV, mzML) pair and cache the result on disk.
# This only has to be executed once; later cells read the cache.
data = Factory()
for annotation_csv, mzml_file in (
    (peak_clarkii, mz_clarkii),
    (peak_viridis, mz_viridis),
):
    data.add_data(annotation_csv, mzml_file)
data.save(pickle_path)

In [None]:
# Read the cached training arrays back from disk.
data = np.load(pickle_path, allow_pickle=True)

# The cache stores four rows, in this order.
X_, y_, mz_, rt_ = data[0], data[1], data[2], data[3]

'''
for i in range(len(X_)):
    color = 'green' if y_[i] == 1 else 'red'
    yn = 'YES' if y_[i] == 1 else 'NO'
    plt.title(  str(i) + ' ' + yn + ' - ' + str(mz_[i]) + ' // ' + str(float(rt_[i])))
    plt.plot(X_[i], color=color)
    plt.show()
'''


In [None]:
def smooth(x,window_len=11,window='hanning'):
    """Smooth a 1-D signal by convolving it with a normalised window.

    The signal is extended at both ends with reflected copies of itself
    (``window_len - 1`` samples per side) so that transients at the start and
    end of the output are minimised, then convolved with the window scaled to
    unit sum.

    input:
        x: the input signal (1-D sequence or array)
        window_len: the dimension of the smoothing window; should be an odd integer
        window: the type of window from 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'
            flat window will produce a moving average smoothing.

    output:
        the smoothed signal, of length ``len(x) + window_len - 1``.
        ``x`` itself is returned unchanged when ``window_len < 3``.

    raises:
        ValueError: if ``x`` is shorter than ``window_len`` (the reflected
            padding cannot be built) or if ``window`` is not a supported name.

    example:

    t = np.linspace(-2, 2, 41)
    x = np.sin(t) + np.random.randn(len(t)) * 0.1
    y = smooth(x)

    see also:

    numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman, numpy.convolve
    scipy.signal.lfilter

    TODO: the window parameter could be the window itself if an array instead of a string
    NOTE: length(output) != length(input); for an odd window_len, slice the result with
          y[window_len//2:-(window_len//2)] to get back to the input length.
    """
    if window_len<3:
        return x

    if len(x) < window_len:
        raise ValueError("Input vector needs to be bigger than window size.")

    # Resolve the window by name from a fixed set instead of eval(), so that an
    # arbitrary string can never be executed as code.
    if window == 'flat':  # moving average
        w = np.ones(window_len, 'd')
    elif window in ('hanning', 'hamming', 'bartlett', 'blackman'):
        w = getattr(np, window)(window_len)
    else:
        raise ValueError(
            "Window is one of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'"
        )

    # Reflect window_len-1 samples at each end before the 'valid' convolution.
    s = np.r_[x[window_len-1:0:-1], x, x[-2:-window_len-1:-1]]

    y = np.convolve(w/w.sum(), s, mode='valid')
    return y

def NormalizeData(data):
    """Min-max scale ``data`` into [0, 1].

    When every value is identical (zero range) the input object is returned
    unchanged rather than dividing by zero.
    """
    highest = np.max(data)
    lowest = np.min(data)
    span = highest - lowest
    if span == 0.0:
        return data
    return (data - lowest) / span


In [None]:
# Build the feature table. The code below reads column 0 as one intensity trace
# per row -- assumes X_ is a 1-D object array of arrays; TODO confirm.
df = pd.DataFrame(X_)

df['y']= y_
df['mz']= mz_
df['rt']= rt_

# Flag traces whose maximum exceeds the intensity threshold, then keep a positive
# label only where the flag is also set; finally coerce the result to plain 0/1.
df['treshold_satisfied'] = df[0].apply(lambda x: 1 if np.amax(x) > intensity_treshold else 0)
df['y'] = df['y'] & df['treshold_satisfied']
df['y'] = df['y'].apply(lambda x: 1 if x == True else 0)

# Min-max scaled copy of each raw trace.
df['norm'] = df[0].apply(lambda x: NormalizeData(x) )

# smooth() with its defaults (window_len=11) returns len(x) + 10 samples, so
# dropping 5 samples from each end brings the trace back to its original length.
df['smooth'] = df[0].apply(lambda x: smooth(x) )
df['smooth'] = df['smooth'].apply(lambda x: x[5:-5] )

# First and second numerical derivative of the smoothed trace.
df['grad1'] = df['smooth'].apply(lambda x: np.gradient(x) )
df['grad2'] = df['grad1'].apply(lambda x: np.gradient(x) )

# FFT of the raw (un-normalised) trace, split into real and imaginary parts.
# np.imag already returns real numbers, so the outer np.real is a no-op.
df['fft'] = df[0].apply(lambda x: np.fft.fft(x) )
df['fftr'] = df['fft'].apply(lambda x: np.real(x) )
df['ffti'] = df['fft'].apply(lambda x: np.real(np.imag(x)) )

In [None]:
# Flat feature matrix: threshold flag, then real FFT part, then imaginary FFT part.
threshold_column = np.array(df['treshold_satisfied']).reshape((len(df), -1))
fft_real = np.vstack(df['fftr'])
fft_imag = np.vstack(df['ffti'])
xfft = np.hstack([threshold_column, fft_real, fft_imag])
print(xfft.shape)

# Sequence tensor: one channel per derived trace, stacked along the last axis.
channels = [np.vstack(df[column]) for column in ('norm', 'smooth', 'grad1', 'grad2')]
x = np.stack(channels, axis=2)
print(x.shape)

In [None]:
# Display the feature table with the derived columns for a visual check.
df

In [None]:
def ret_mats(df):
    """Turn a feature table into the arrays fed to the model.

    Args:
        df: DataFrame holding the columns 'treshold_satisfied', 'fftr', 'ffti',
            'norm', 'smooth', 'grad1', 'grad2' and 'y'.

    Returns:
        (x, xfft, y) where
        x    -- array of shape (rows, samples, 4): norm, smooth, grad1, grad2
                stacked along the last axis;
        xfft -- 2-D array: threshold flag followed by the real and imaginary
                FFT columns;
        y    -- the labels as a float Series that keeps the index of ``df``.
    """
    def rows(column):
        # One array per row in the column -> one 2-D matrix.
        return np.vstack(df[column])

    flag = np.array(df['treshold_satisfied']).reshape((len(df), -1))
    xfft = np.hstack([flag, rows('fftr'), rows('ffti')])

    sequence_columns = ('norm', 'smooth', 'grad1', 'grad2')
    x = np.stack([rows(column) for column in sequence_columns], axis=2)

    y = df.y.map(float)

    return x, xfft, y

# Hold out 30% of the rows for testing. The fixed random_state makes the split,
# and therefore every metric reported below, reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.3, random_state=42)

# (x, xfft, y) triples for each partition.
mats={'train':ret_mats(train), 'test':ret_mats(test)}

In [None]:
model = None
# Eager execution of tf.functions: easier to debug, slower to train.
tf.config.run_functions_eagerly(True)

# Sequence branch: 120 time steps x 4 channels (norm, smooth, grad1, grad2).
inputvec = Input(shape=(120,4))
# Flat branch: 1 threshold flag + 120 real + 120 imaginary FFT coefficients.
# The shape must be a tuple -- "(241)" is just the integer 241.
inputfft = Input(shape=(241,))

# Three stacked LSTMs reading the sequence backwards; the first two emit the
# whole sequence so that the next layer can consume it.
lstm = LSTM(50,  return_sequences = True , return_state = False, dropout=0.2,
    recurrent_dropout=0.2, go_backwards=True)

whole_seq_output = lstm(inputvec)

lstm2 = LSTM(40, return_sequences = True , return_state = False, dropout=0.2,
    recurrent_dropout=0.2, go_backwards=True )

whole_seq_output2 = lstm2(whole_seq_output)

lstm3 = LSTM(30, return_sequences = False , return_state = True, dropout=0.2,
    recurrent_dropout=0.2, go_backwards=True )

# With return_state=True an LSTM returns (output, memory state h, carry state c),
# in that order. With return_sequences=False the output is the last hidden state.
last_output, final_memory_state, final_carry_state = lstm3(whole_seq_output2)

# The tensors are concatenated in the same order as before, so the network is
# unchanged; only the variable names now match what each tensor really is.
dense_input = tf.keras.layers.Concatenate()([last_output, final_memory_state, final_carry_state, inputfft])
dense_input = tf.keras.layers.Dropout(.2)(dense_input)
# NOTE(review): the two hidden Dense layers have no activation, so they are
# linear -- confirm this is intended.
output1 = Dense(50)(dense_input)
output1 = tf.keras.layers.Dropout(.2)(output1)
output = Dense(25)(output1)
final = Dense(1,activation = 'sigmoid')(output)


model = Model( inputs = [inputvec,inputfft] , outputs = final)

opt = tf.keras.optimizers.Adam( learning_rate=0.001)

model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
model.summary()

In [None]:
# Train on both inputs (sequence tensor and flat FFT features) against the labels;
# the returned History object is plotted in the following cells.
history = model.fit([mats['train'][0], mats['train'][1]], mats['train'][2], epochs=100, batch_size=250)


In [None]:
# Training loss per epoch.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(history.history['loss'])
loss_ax.set_title('model loss')
loss_ax.set_ylabel('loss')
loss_ax.set_xlabel('epoch')
plt.show()

In [None]:
# Training accuracy per epoch.
acc_fig, acc_ax = plt.subplots()
acc_ax.plot(history.history['accuracy'])
acc_ax.set_title('model accuracy')
acc_ax.set_ylabel('accuracy')
acc_ax.set_xlabel('epoch')
plt.show()


In [40]:
# Loss and accuracy on the held-out partition.
test_x, test_xfft, test_y = mats['test']
scores = model.evaluate([test_x, test_xfft], test_y, verbose=1)
print(scores)



[33670.69921875, 0.9297658801078796]


In [None]:
# Persist the trained model under output/.
# NOTE(review): the path has no file extension; some Keras versions require
# '.keras' or '.h5' -- confirm against the installed version.
model.save('output/model_peak')

In [None]:
# Sigmoid scores for the held-out partition, one row per example.
pre_raw = model.predict([mats['test'][0], mats['test'][1]])

# Hard decision: scores strictly above 0.5 become class 1, everything else class 0.
pre = [int(score > 0.5) for score in pre_raw[:, 0]]


In [None]:
# Counters read by the report cell below.
false = 0        # number of wrong predictions
small_all = 0    # test traces whose maximum is below the intensity threshold

fp = 0
tp = 0
fn = 0
tn = 0

# Errors made on traces below the intensity threshold.
fp_small = 0
fn_small = 0

# Materialise the test index and labels once; rebuilding these lists inside
# the loop made the evaluation quadratic in the number of test rows.
test_labels = mats['test'][2]
test_keys = list(test_labels.keys())
test_truth = list(test_labels)

for p, df_key, truth in zip(pre, test_keys, test_truth):

    # Raw trace of this test row, looked up by its original DataFrame index.
    dfdf = df.loc[df_key, 0]

    is_small = max(dfdf) < intensity_treshold
    if is_small:
        small_all +=1

    is_correct = (p == truth)

    if is_correct:
        if p == 1:
            tp +=1
        else:
            tn +=1
    else:
        false +=1
        if p == 1:
            fp +=1
            if is_small:
                fp_small +=1
        else:
            fn +=1
            if is_small:
                fn_small +=1

    # One figure per test row: green when predicted as a peak, red otherwise.
    plt.title( ('GOOD' if is_correct else 'WRONG') + str(df.loc[df_key, 'mz']) )
    color = 'green' if p == 1 else 'red'
    plt.plot(dfdf , color = color)
    plt.show()



In [39]:
def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# Summary of the evaluation loop above. Every division goes through _ratio so a
# run without predicted or actual positives prints nan instead of raising
# ZeroDivisionError.
total = len( mats['test'][0])
print('Predicted Features: ', total)
print('False Predictions: ', false)
print('% Good: ', _ratio(total-false, total)*100)
print('\n')

print('TP', tp)
print('TN', tn)
print('FP', fp)
print('FN', fn)
print('\n')

print('Precision: ', _ratio(100 * tp, tp+fp) )
print('Recall: ', _ratio(100 * tp, tp+fn) )
print('\n')

print('< 8000  FP: ', fp_small)
print('< 8000  FN: ', fn_small)
print('< 8000 Totals: ', small_all)
print('\n')

# "Corrected" figures leave out the errors made on low-intensity traces.
print('Corrected Precision: ', _ratio(100 * tp, tp+fp-fp_small) )
print('Corrected Recall: ', _ratio(100 * tp, tp+fn-fn_small) )
print('Corrected % Good: ', _ratio(total-false+fp_small, total)*100)

Predicted Features:  299
False Predictions:  21
% Good:  92.97658862876254


TP 7
TN 271
FP 14
FN 7


Precision:  33.333333333333336
Recall:  50.0


< 8000  FP:  8
< 8000  FN:  0
< 8000 Totals:  259


Corrected Precision:  53.84615384615385
Corrected Recall:  50.0
Corrected % Good:  95.65217391304348
